#!/usr/bin/env python3
r"""
docx_editor.py - replace text, recolor keywords and swap images in a .docx
file while keeping the original formatting.

Usage:
    # List every image in the document
    python3 docx_editor.py input.docx --list-images

    # Text replacement + keyword colour
    python3 docx_editor.py input.docx output.docx \
        --replace "old text" "new text" \
        --color "keyword" "FF0000"

    # Image replacement (by order of appearance in the document, 1-based)
    python3 docx_editor.py input.docx output.docx \
        --image 1 new_chart.png \
        --image 2 new_photo.jpg

    # Replace text and images in one run
    python3 docx_editor.py input.docx output.docx \
        --replace "old title" "new title" \
        --image 1 new_image.png \
        --color "important" "FF0000"

    # Colour can also be requested inline in the replacement text
    python3 docx_editor.py input.docx output.docx \
        --replace "old" 'new <span color="FF0000">text</span>'
"""

import argparse
import copy
import os
import re
import shutil
import tempfile
import zipfile

from lxml import etree
from PIL import Image

# XML namespaces used inside a .docx package.
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
CT = 'http://schemas.openxmlformats.org/package/2006/content-types'

REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

# Qualified name of the xml:space attribute.
XML_SPACE = '{http://www.w3.org/XML/1998/namespace}space'

EXT_TO_MIME = {
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'gif': 'image/gif',
    'bmp': 'image/bmp',
    'tiff': 'image/tiff',
    'webp': 'image/webp',
}

# Canonical file extension -> Pillow format name.
_PIL_FORMATS = {
    'jpeg': 'JPEG',
    'png': 'PNG',
    'gif': 'GIF',
    'bmp': 'BMP',
    'tiff': 'TIFF',
    'webp': 'WEBP',
}

# Aliases that denote the same image format as their canonical extension.
_EXT_ALIASES = {'jpg': 'jpeg', 'tif': 'tiff'}

# <span ... color="RRGGBB" ...>keyword</span>; quotes and '#' are optional.
_SPAN_COLOR_RE = re.compile(
    r'<span[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
    re.IGNORECASE | re.DOTALL,
)

_HEX_COLOR_RE = re.compile(r'[0-9a-fA-F]{6}')


def unpack(docx_path, out_dir):
    """Extract the .docx archive at *docx_path* into *out_dir*."""
    with zipfile.ZipFile(docx_path, 'r') as zf:
        zf.extractall(out_dir)


def pack(unpacked_dir, output_docx, original_docx):
    """
    Zip the contents of *unpacked_dir* into *output_docx*.

    Members are written in a deterministic order with ``[Content_Types].xml``
    first.  *original_docx* is accepted only for compatibility with the
    original function signature and is not used.
    """
    # Make sure the destination directory exists.
    out_dir = os.path.dirname(os.path.abspath(output_docx))
    os.makedirs(out_dir, exist_ok=True)

    # Collect the members before opening the archive so that the listing is
    # complete and sortable.
    members = []
    for root, _, files in os.walk(unpacked_dir):
        for fname in files:
            abs_path = os.path.join(root, fname)
            # Archive member names always use '/' as the separator.
            arcname = os.path.relpath(abs_path, unpacked_dir).replace(os.sep, '/')
            members.append((arcname, abs_path))
    members.sort(key=lambda m: (m[0] != '[Content_Types].xml', m[0]))

    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        for arcname, abs_path in members:
            zf.write(abs_path, arcname)


def _resolve_target(unpacked_dir, word_dir, target):
    """Map a relationship Target to a path on disk inside the unpacked package."""
    native = target.replace('/', os.sep)
    if target.startswith('/'):
        # A leading '/' means the target is relative to the package root,
        # not to the word/ part directory.
        return os.path.join(unpacked_dir, native.lstrip(os.sep))
    return os.path.join(word_dir, native)


def build_image_index(unpacked_dir):
    """
    Return the embedded images of word/document.xml in document order.

    Each entry is a dict with the keys ``index`` (1-based), ``rid``,
    ``media_file`` (the relationship Target), ``abs_path``, ``ext``
    (lower-case, without the dot), ``docpr_name``, ``width_cm`` and
    ``height_cm`` (``None`` when no extent element is found).
    """
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')

    # Relationship id -> media target, image relationships only.
    rid_to_media = {}
    for rel in etree.parse(rels_xml).getroot():
        if rel.get('Type', '') != REL_TYPE_IMAGE:
            continue
        if rel.get('TargetMode') == 'External':
            # External targets are URLs, not files inside the package.
            continue
        target = rel.get('Target')
        if target:
            rid_to_media[rel.get('Id')] = target

    container_tags = (f'{{{WD}}}inline', f'{{{WD}}}anchor')
    results = []
    for blip in etree.parse(doc_xml).getroot().iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]

        # Walk up to the enclosing inline/anchor element, which carries the
        # display size and the docPr name.
        container = blip
        while container is not None and container.tag not in container_tags:
            container = container.getparent()

        w_cm = h_cm = None
        docpr_name = ''
        if container is not None:
            ext_el = container.find(f'{{{WD}}}extent')
            if ext_el is not None:
                # 360000 EMU per centimetre.
                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
            dp = container.find(f'{{{WD}}}docPr')
            if dp is not None:
                docpr_name = dp.get('name', '')

        results.append({
            'index': len(results) + 1,
            'rid': rid,
            'media_file': media_rel,
            'abs_path': _resolve_target(unpacked_dir, word_dir, media_rel),
            'ext': os.path.splitext(media_rel)[1].lstrip('.').lower(),
            'docpr_name': docpr_name,
            'width_cm': w_cm,
            'height_cm': h_cm,
        })
    return results


def list_images(docx_path):
    """Print a table of the images found in *docx_path*."""
    imgs = get_images_info(docx_path)
    if not imgs:
        print("文档中没有找到图片。")
        return
    print(f"共找到 {len(imgs)} 张图片:\n")
    print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print(" " + "-" * 62)
    for img in imgs:
        size = f"{img['width_cm']}×{img['height_cm']}cm" if img['width_cm'] else "未知"
        print(f" {img['index']:<4} {os.path.basename(img['media_file']):<20} {size:<18} {img['docpr_name']}")


def get_images_info(docx_path):
    """
    Return structured information about every image in *docx_path*.

    This is the non-printing counterpart of ``list_images``, intended for
    reuse by other modules.  The ``abs_path`` values point into a temporary
    directory that no longer exists once this function returns.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        unpack(docx_path, tmpdir)
        return build_image_index(tmpdir)


def _normalize_ext(ext):
    """Lower-case *ext* and fold aliases (jpg, tif) onto their canonical form."""
    ext = ext.lower()
    return _EXT_ALIASES.get(ext, ext)


def replace_image(unpacked_dir, index, new_image_path):
    """
    Replace image number *index* (1-based, document order) with *new_image_path*.

    When the new file has the same format as the old one it is copied over
    the existing media file.  Otherwise it is re-saved with Pillow under the
    new extension, and the relationship target and ``[Content_Types].xml``
    are updated to match.

    Raises:
        ValueError: if *index* is outside the range of images in the document.
    """
    imgs = build_image_index(unpacked_dir)
    if index < 1 or index > len(imgs):
        raise ValueError(f"图片序号 {index} 超出范围(共 {len(imgs)} 张)")

    info = imgs[index - 1]
    old_abs = info['abs_path']
    old_ext = info['ext']
    new_ext = _normalize_ext(os.path.splitext(new_image_path)[1].lstrip('.'))

    print(f" 图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
          f" ← {os.path.basename(new_image_path)}({new_ext.upper()})")

    # Compare canonical extensions so that .jpg -> .jpg (or .jpg -> .jpeg)
    # is a plain copy instead of a lossy re-encode plus rename.
    if _normalize_ext(old_ext) == new_ext:
        shutil.copy2(new_image_path, old_abs)
        return

    # Different format: write the new media file next to the old one.
    new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
    fmt = _PIL_FORMATS.get(new_ext, new_ext.upper())
    with Image.open(new_image_path) as img:
        # JPEG cannot store alpha or palette images.
        out_img = img.convert('RGB') if fmt == 'JPEG' and img.mode in ('RGBA', 'LA', 'P', 'PA') else img
        out_img.save(new_abs, format=fmt)

    # Point this image's relationship at the new file, and note whether any
    # other relationship still uses the old one.
    old_media = info['media_file']
    new_media = os.path.splitext(old_media)[0] + '.' + new_ext
    rels_path = os.path.join(unpacked_dir, 'word', '_rels', 'document.xml.rels')
    rels_tree = etree.parse(rels_path)
    still_referenced = False
    for rel in rels_tree.getroot():
        if rel.get('Id') == info['rid']:
            rel.set('Target', new_media)
        elif rel.get('Target') == old_media:
            still_referenced = True
    rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)

    # Only delete the old file when nothing in document.xml.rels needs it.
    if not still_referenced and os.path.abspath(new_abs) != os.path.abspath(old_abs):
        os.remove(old_abs)

    # Register the new extension in [Content_Types].xml if it is missing.
    ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
    ct_tree = etree.parse(ct_path)
    ct_root = ct_tree.getroot()
    existing = {(el.get('Extension') or '').lower() for el in ct_root}
    if new_ext not in existing:
        # The element must live in the content-types namespace like its
        # siblings; a bare 'Default' tag would be created without one.
        etree.SubElement(ct_root, f'{{{CT}}}Default',
                         Extension=new_ext,
                         ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
        ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)

    print(f" 格式转换 {old_ext}→{new_ext},rels 和 ContentTypes 已更新")


def _set_text(t_el, text):
    """Set the text of a w:t element, keeping leading/trailing spaces significant."""
    t_el.text = text
    if text and (text[0] == ' ' or text[-1] == ' '):
        t_el.set(XML_SPACE, 'preserve')


def paragraph_replace(para_el, replacements):
    """
    Replace text at paragraph level so that matches may span several runs.

    Strategy:
      1. Collect every w:t element of every w:r run in the paragraph.
      2. Join their text and apply all ``(old, new)`` replacements.
      3. If anything changed, redistribute the new text over the existing
         w:t elements in proportion to their original lengths, so each run
         keeps its own formatting.

    Pairs whose ``old`` string is empty are ignored.
    """
    runs = para_el.findall(f'.//{{{W}}}r')
    if not runs:
        return

    t_nodes = [t_el for run in runs for t_el in run.findall(f'{{{W}}}t')]
    if not t_nodes:
        return

    original_text = ''.join(t_el.text or '' for t_el in t_nodes)
    full_text = original_text
    for old, new in replacements:
        # str.replace('', x) would insert x between every character.
        if old:
            full_text = full_text.replace(old, new)

    if full_text == original_text:
        return

    print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")

    original_lengths = [len(t_el.text or '') for t_el in t_nodes]
    total_original = sum(original_lengths)

    if total_original == 0:
        # Nothing to base proportions on: put everything in the first node.
        chunks = [full_text] + [''] * (len(t_nodes) - 1)
    else:
        chunks = []
        pos = 0
        for length in original_lengths[:-1]:
            ratio = length / total_original
            chunk_len = int(len(full_text) * ratio)
            chunks.append(full_text[pos:pos + chunk_len])
            pos += chunk_len
        # The last node takes whatever is left, absorbing rounding loss.
        chunks.append(full_text[pos:])

    for t_el, chunk in zip(t_nodes, chunks):
        _set_text(t_el, chunk)


def ensure_rpr(run_el):
    """Return the w:rPr child of *run_el*, creating it as first child if absent."""
    rpr = run_el.find(f'{{{W}}}rPr')
    if rpr is None:
        rpr = etree.Element(f'{{{W}}}rPr')
        run_el.insert(0, rpr)
    return rpr


def set_color_on_rpr(rpr_el, hex_color):
    """Set (or add) the w:color value on *rpr_el*; a leading '#' is dropped."""
    c = rpr_el.find(f'{{{W}}}color')
    if c is None:
        c = etree.SubElement(rpr_el, f'{{{W}}}color')
    c.set(f'{{{W}}}val', hex_color.lstrip('#'))


def _split_by_keyword(text, keyword):
    """
    Split *text* into ``(segment, is_keyword)`` pieces around *keyword*.

    *keyword* must be non-empty.  No empty segments are produced.
    """
    segments = []
    start = 0
    while True:
        idx = text.find(keyword, start)
        if idx == -1:
            if start < len(text):
                segments.append((text[start:], False))
            return segments
        if idx > start:
            segments.append((text[start:idx], False))
        segments.append((keyword, True))
        start = idx + len(keyword)


def _make_run(text, rpr, hex_color=None):
    """Build a w:r holding *text*, a copy of *rpr* (may be None) and an optional colour."""
    new_r = etree.Element(f'{{{W}}}r')
    if rpr is not None:
        new_r.append(copy.deepcopy(rpr))
    _set_text(etree.SubElement(new_r, f'{{{W}}}t'), text)
    if hex_color is not None:
        set_color_on_rpr(ensure_rpr(new_r), hex_color)
    return new_r


def apply_color_to_keyword(doc_el, keyword, hex_color):
    """
    Colour only the occurrences of *keyword*, not the whole run.

    A run that contains the keyword is split into several runs,
    [prefix][keyword][suffix], each carrying a copy of the original run
    properties; only the keyword runs get the colour.

    Only occurrences that lie entirely inside one run are found.  A run
    that holds anything besides run properties and text (a drawing, tab,
    line break, ...) is coloured as a whole instead of being split, so that
    its other content is not lost.  An empty *keyword* is a no-op.
    """
    if not keyword:
        # An empty keyword matches everywhere and would never advance.
        return

    rpr_tag = f'{{{W}}}rPr'
    t_tag = f'{{{W}}}t'

    # Snapshot the runs first: the tree is modified while we go.
    for run in list(doc_el.iter(f'{{{W}}}r')):
        t_nodes = run.findall(t_tag)
        if not t_nodes:
            continue
        full_text = ''.join(t.text or '' for t in t_nodes)
        if keyword not in full_text:
            continue
        parent = run.getparent()
        if parent is None:
            continue

        if any(child.tag not in (rpr_tag, t_tag) for child in run):
            set_color_on_rpr(ensure_rpr(run), hex_color)
            continue

        orig_rpr = run.find(rpr_tag)
        new_runs = [
            _make_run(seg_text, orig_rpr, hex_color if colored else None)
            for seg_text, colored in _split_by_keyword(full_text, keyword)
        ]

        # Swap the original run for the new ones at the same position.
        # lxml drops an element's tail on removal, so carry it over.
        insert_pos = parent.index(run)
        tail = run.tail
        parent.remove(run)
        for offset, new_run in enumerate(new_runs):
            parent.insert(insert_pos + offset, new_run)
        new_runs[-1].tail = tail


def process(input_docx, output_docx, replacements, image_replacements, color_keywords):
    """
    Run the full edit pipeline and write the result to *output_docx*.

    Args:
        input_docx: path of the source .docx.
        output_docx: path of the .docx to write.
        replacements: iterable of ``(old, new)`` text pairs.
        image_replacements: iterable of ``(index, image_path)`` pairs,
            *index* being 1-based in document order.
        color_keywords: iterable of ``(keyword, hex_color)`` pairs.

    ``None`` is accepted for any of the three iterables and means "none".
    Images are replaced first, then text, then colours are applied.
    """
    replacements = replacements or []
    image_replacements = image_replacements or []
    color_keywords = color_keywords or []

    with tempfile.TemporaryDirectory() as tmpdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, tmpdir)
        doc_xml_path = os.path.join(tmpdir, 'word', 'document.xml')

        if image_replacements:
            print(f"🖼️ 替换 {len(image_replacements)} 张图片...")
            for idx, new_img in image_replacements:
                replace_image(tmpdir, idx, new_img)

        tree = etree.parse(doc_xml_path)
        root = tree.getroot()

        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            for para in root.iter(f'{{{W}}}p'):
                paragraph_replace(para, replacements)

        # Colour after replacing, so keywords introduced by a replacement
        # are found.
        for keyword, color in color_keywords:
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(root, keyword, color)

        tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)

        print(f"📦 打包 → {output_docx} ...")
        pack(tmpdir, output_docx, input_docx)

    print(f"✅ 完成!输出: {output_docx}")


def _parse_span_replacement(new_text):
    """
    Strip colour ``<span>`` markup from a replacement text.

    Accepted forms (case-insensitive; quotes and '#' are optional):
        <span color="FF0000">keyword</span>
        <span color='#FF0000'>keyword</span>

    Returns:
        ``(plain_text, [(keyword, hex_color), ...])`` where *plain_text* is
        *new_text* with every matched span replaced by its inner text and
        *hex_color* has no leading '#'.  Spans with empty content are
        removed from the text and produce no keyword.
    """
    color_keywords = []

    def _repl(m):
        keyword = m.group(2)
        if keyword:
            color_keywords.append((keyword, m.group(1).lstrip('#')))
        return keyword

    plain_text = _SPAN_COLOR_RE.sub(_repl, new_text)
    return plain_text, color_keywords


def main():
    """Command-line entry point."""
    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/图片/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
                        action='append', default=[], help='图片替换')
    parser.add_argument('--color', nargs=2, metavar=('KEYWORD', 'HEX'),
                        action='append', default=[], help='关键词着色')
    args = parser.parse_args()

    if args.list_images:
        list_images(args.input)
        return

    if not args.output:
        parser.error("需要指定输出文件")

    # Pull <span color=...> markup out of each NEW text: the plain text is
    # what gets inserted, the spans become colour keywords.
    replacements = []
    color_keywords = []
    for old, new_raw in args.replace:
        new_plain, spans = _parse_span_replacement(new_raw)
        replacements.append((old, new_plain))
        color_keywords.extend(spans)

    for keyword, raw_color in args.color:
        hex_color = raw_color.lstrip('#')
        if not _HEX_COLOR_RE.fullmatch(hex_color):
            parser.error(f"颜色必须是 6 位十六进制: {raw_color}")
        color_keywords.append((keyword, hex_color))

    image_replacements = []
    for raw_index, image_path in args.image:
        try:
            image_replacements.append((int(raw_index), image_path))
        except ValueError:
            parser.error(f"图片序号必须是整数: {raw_index}")

    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=replacements,
        image_replacements=image_replacements,
        color_keywords=color_keywords,
    )


if __name__ == '__main__':
    main()