first commit

2026-02-12 16:24:41 +08:00
commit 1b4f81a9bc
6 changed files with 674 additions and 0 deletions
--- a/mcp_docx.py
+++ b/mcp_docx.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python3
+"""
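+mcp_docx.py — replace text while keeping the original formatting, recolor
+text, and replace images in a .docx file.
+
+Usage:
+  # List every image in the document
+  python3 mcp_docx.py input.docx --list-images
+
+  # Replace text. To color part of the new text, wrap it in
+  # <span color="RRGGBB">...</span>; the tag is stripped from the inserted
+  # text and every occurrence of the wrapped text in the document is colored.
+  python3 mcp_docx.py input.docx output.docx \
+    --replace "原文" "新文" \
+    --replace "状态" '<span color="FF0000">待补充</span>'
+
+  # Replace images (numbered in document order, starting at 1)
+  python3 mcp_docx.py input.docx output.docx \
+    --image 1 new_chart.png \
+    --image 2 new_photo.jpg
+
+  # Replace text and images in one run
+  python3 mcp_docx.py input.docx output.docx \
+    --replace "旧标题" "新标题" \
+    --image 1 new_image.png \
+    --replace "重点" '<span color="FF0000">重点</span>'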
+"""
+
+import argparse
+import os
+import tempfile
+import zipfile
+from lxml import etree
+from PIL import Image
+
+# XML namespaces used to address WordprocessingML / DrawingML / relationship nodes.
+W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
+WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
+A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
+R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
+# Relationship "Type" value that marks a .rels entry as an image.
+REL_TYPE_IMAGE = R + '/image'
+
+# Lower-case file extension -> content type used for [Content_Types].xml.
+EXT_TO_MIME = {
+    'png': 'image/png',
+    'jpg': 'image/jpeg',
+    'jpeg': 'image/jpeg',
+    'gif': 'image/gif',
+    'bmp': 'image/bmp',
+    'tiff': 'image/tiff',
+    'webp': 'image/webp',
+}
+
+
+def unpack(docx_path, out_dir):
+    """Extract every member of a .docx (a ZIP archive) into ``out_dir``.
+
+    Uses ``zipfile`` directly in place of an external unpack.py script.
+    """
+    archive = zipfile.ZipFile(docx_path, mode='r')
+    try:
+        archive.extractall(path=out_dir)
+    finally:
+        archive.close()
+
+
+def pack(unpacked_dir, output_docx, original_docx):
+    """Zip the contents of ``unpacked_dir`` into ``output_docx``.
+
+    Args:
+        unpacked_dir: Directory tree to archive; member names are the file
+            paths relative to this directory.
+        output_docx: Path of the archive to write. Its parent directory is
+            created when it does not exist yet.
+        original_docx: Unused; kept only so the signature stays compatible
+            with the previous implementation.
+    """
+    # Make sure the destination directory exists before opening the archive.
+    target_dir = os.path.dirname(os.path.abspath(output_docx))
+    if target_dir and not os.path.exists(target_dir):
+        os.makedirs(target_dir, exist_ok=True)
+
+    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
+        for current_dir, _subdirs, filenames in os.walk(unpacked_dir):
+            for filename in filenames:
+                file_path = os.path.join(current_dir, filename)
+                relative = os.path.relpath(file_path, unpacked_dir)
+                # Members inside a .docx are addressed with forward slashes.
+                member_name = relative.replace(os.sep, '/')
+                archive.write(file_path, member_name)
+
+
+def build_image_index(unpacked_dir):
+    """Return the embedded images of word/document.xml in document order.
+
+    Args:
+        unpacked_dir: Directory holding the extracted .docx package.
+
+    Returns:
+        A list of dicts, one per ``<a:blip r:embed="...">`` whose relationship
+        is an image stored inside the package. Keys: ``index`` (1-based),
+        ``rid``, ``media_file`` (the relationship Target exactly as written),
+        ``abs_path``, ``ext`` (lower-case, without the dot), ``docpr_name``,
+        ``width_cm`` and ``height_cm`` (both ``None`` when no extent element
+        is found). The list is empty when the document has no relationships
+        part.
+    """
+    word_dir = os.path.join(unpacked_dir, 'word')
+    doc_xml = os.path.join(word_dir, 'document.xml')
+    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')
+
+    # Without a relationships part nothing can be embedded, so report
+    # "no images" instead of failing while parsing a missing file.
+    if not os.path.exists(rels_xml):
+        return []
+
+    rid_to_media = {}
+    for rel in etree.parse(rels_xml).getroot():
+        if rel.get('Type', '') != REL_TYPE_IMAGE:
+            continue
+        # Externally linked images have no file inside the package.
+        if rel.get('TargetMode') == 'External':
+            continue
+        rel_id, target = rel.get('Id'), rel.get('Target')
+        if rel_id and target:
+            rid_to_media[rel_id] = target
+
+    container_tags = (f'{{{WD}}}inline', f'{{{WD}}}anchor')
+    results = []
+    for blip in etree.parse(doc_xml).getroot().iter(f'{{{A}}}blip'):
+        rid = blip.get(f'{{{R}}}embed')
+        if not rid or rid not in rid_to_media:
+            continue
+        media_rel = rid_to_media[rid]
+        if media_rel.startswith('/'):
+            # A Target with a leading slash is resolved from the package
+            # root rather than from the word/ directory.
+            media_abs = os.path.join(
+                unpacked_dir, media_rel.lstrip('/').replace('/', os.sep))
+        else:
+            media_abs = os.path.join(word_dir, media_rel.replace('/', os.sep))
+        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()
+
+        # The nearest wp:inline / wp:anchor ancestor carries size and name.
+        container = next(
+            (anc for anc in blip.iterancestors() if anc.tag in container_tags),
+            None,
+        )
+        w_cm = h_cm = None
+        docpr_name = ''
+        if container is not None:
+            ext_el = container.find(f'{{{WD}}}extent')
+            if ext_el is not None:
+                # Extents are stored in EMU; 360000 EMU equal one centimetre.
+                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
+                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
+            dp = container.find(f'{{{WD}}}docPr')
+            if dp is not None:
+                docpr_name = dp.get('name', '')
+
+        results.append({
+            'index': len(results) + 1, 'rid': rid,
+            'media_file': media_rel, 'abs_path': media_abs,
+            'ext': ext, 'docpr_name': docpr_name,
+            'width_cm': w_cm, 'height_cm': h_cm,
+        })
+    return results
+
+
+def list_images(docx_path):
+    """Print a table of the images found in ``docx_path``.
+
+    One row per image: its 1-based index, media file name, displayed size
+    and the name Word stores for it. Prints a single notice instead when
+    the document has no images.
+    """
+    images = get_images_info(docx_path)
+    if len(images) == 0:
+        print("文档中没有找到图片。")
+        return
+
+    print(f"共找到 {len(images)} 张图片：\n")
+    print(f"  {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
+    print("  " + "-" * 62)
+    for img in images:
+        # A missing (or zero) width means the size could not be determined.
+        if img['width_cm']:
+            size = f"{img['width_cm']}×{img['height_cm']}cm"
+        else:
+            size = "未知"
+        print(f"  {img['index']:<4} {os.path.basename(img['media_file']):<20} {size:<18} {img['docpr_name']}")
+
+
+def get_images_info(docx_path):
+    """Return structured information about every image in a .docx file.
+
+    Intended for reuse by other modules (for example an MCP server): it
+    yields the same data ``list_images`` displays, but prints nothing. The
+    document is unpacked into a temporary directory that is removed before
+    returning.
+    """
+    workdir = tempfile.TemporaryDirectory()
+    try:
+        unpack(docx_path, workdir.name)
+        images = build_image_index(workdir.name)
+    finally:
+        workdir.cleanup()
+    return images
+
+
+def replace_image(unpacked_dir, index, new_image_path):
+    """Replace the ``index``-th image (1-based, document order) of the package.
+
+    When the new file has the same format as the stored one, its bytes are
+    copied over the existing media file. Otherwise the new image is converted
+    with Pillow, stored under the old base name with the new extension, and
+    the relationships part and [Content_Types].xml are updated to match.
+
+    Args:
+        unpacked_dir: Directory holding the extracted .docx package.
+        index: 1-based position of the image as reported by
+            ``build_image_index``.
+        new_image_path: Path of the replacement image file.
+
+    Raises:
+        ValueError: If ``index`` is outside the range of available images.
+    """
+    import shutil
+
+    imgs = build_image_index(unpacked_dir)
+    if index < 1 or index > len(imgs):
+        raise ValueError(f"图片序号 {index} 超出范围（共 {len(imgs)} 张）")
+
+    # Spelling variants of one format. Both sides are normalised before they
+    # are compared, so e.g. image1.jpg replaced by photo.jpg is a plain copy
+    # rather than a lossy re-encode plus rename.
+    aliases = {'jpg': 'jpeg', 'tif': 'tiff'}
+
+    info = imgs[index - 1]
+    old_abs = info['abs_path']
+    old_ext = info['ext']
+    new_ext = os.path.splitext(new_image_path)[1].lstrip('.').lower()
+    new_ext = aliases.get(new_ext, new_ext)
+
+    print(f"    图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
+          f" ← {os.path.basename(new_image_path)}({new_ext.upper()})")
+
+    if aliases.get(old_ext, old_ext) == new_ext:
+        # Same format: overwrite the media file in place.
+        shutil.copy2(new_image_path, old_abs)
+        return
+
+    # Different format: convert with Pillow, then update rels + ContentTypes.
+    new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
+    fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
+           'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
+    with Image.open(new_image_path) as img:
+        # JPEG cannot store alpha or palette data.
+        if fmt == 'JPEG' and img.mode in ('RGBA', 'LA', 'P', 'PA'):
+            img = img.convert('RGB')
+        img.save(new_abs, format=fmt)
+    if os.path.abspath(new_abs) != os.path.abspath(old_abs):
+        os.remove(old_abs)
+
+    # Update rels. The media file itself was renamed, so every relationship
+    # that targets it has to follow, not only the one that was looked up.
+    old_media = info['media_file']
+    new_media = os.path.splitext(old_media)[0] + '.' + new_ext
+    word_dir = os.path.join(unpacked_dir, 'word')
+    rels_path = os.path.join(word_dir, '_rels', 'document.xml.rels')
+    rels_tree = etree.parse(rels_path)
+    for rel in rels_tree.getroot():
+        if rel.get('Id') == info['rid'] or rel.get('Target') == old_media:
+            rel.set('Target', new_media)
+    rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)
+
+    # Update ContentTypes: register the new extension when it is unknown.
+    ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
+    ct_tree = etree.parse(ct_path)
+    ct_root = ct_tree.getroot()
+    existing = {(el.get('Extension') or '').lower() for el in ct_root}
+    if new_ext not in existing:
+        # Create the element in the root's own namespace instead of relying
+        # on the serializer to make an un-namespaced child inherit it.
+        ct_ns = etree.QName(ct_root).namespace
+        default_tag = f'{{{ct_ns}}}Default' if ct_ns else 'Default'
+        etree.SubElement(ct_root, default_tag, Extension=new_ext,
+                         ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
+    ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
+    print(f"      格式转换 {old_ext}→{new_ext}，rels 和 ContentTypes 已更新")
+
+
+def paragraph_replace(para_el, replacements):
+    """Apply text replacements inside each ``<w:t>`` of a paragraph.
+
+    Only the text of ``<w:t>`` nodes is rewritten, so images and run
+    formatting are left alone. Each ``<w:t>`` is handled on its own: text
+    that is split across several ``<w:t>`` nodes is not matched.
+
+    Args:
+        para_el: Element whose descendant ``<w:t>`` nodes are processed.
+        replacements: Iterable of ``(old, new)`` string pairs, applied in
+            order. Pairs with an empty ``old`` are ignored.
+    """
+    # str.replace('', new) would splice `new` between every character, so
+    # empty search strings are dropped up front.
+    active = [(old, new) for old, new in replacements if old]
+    if not active:
+        return
+
+    for t_el in para_el.iter(f'{{{W}}}t'):
+        original = t_el.text
+        if not original:
+            continue
+        updated = original
+        for old, new in active:
+            updated = updated.replace(old, new)
+        if updated == original:
+            continue
+        t_el.text = updated
+        # Mark leading/trailing whitespace as significant so it is kept.
+        if updated and (updated[0].isspace() or updated[-1].isspace()):
+            t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+
+
+def ensure_rpr(run_el):
+    """Return the run's ``<w:rPr>``, inserting one as first child if absent."""
+    rpr_tag = f'{{{W}}}rPr'
+    existing = run_el.find(rpr_tag)
+    if existing is not None:
+        return existing
+    created = etree.Element(rpr_tag)
+    run_el.insert(0, created)
+    return created
+
+def set_color_on_rpr(rpr_el, hex_color):
+    """Set the font colour of a ``<w:rPr>`` to an explicit RGB value.
+
+    Args:
+        rpr_el: The ``<w:rPr>`` element to modify.
+        hex_color: Colour as ``RRGGBB``; a leading ``#`` is stripped.
+    """
+    color_tag = f'{{{W}}}color'
+    color_el = rpr_el.find(color_tag)
+    if color_el is None:
+        color_el = etree.Element(color_tag)
+        # <w:rPrChange> (tracked formatting change) is declared as the last
+        # child of rPr, so a new colour element goes in front of it.
+        change_el = rpr_el.find(f'{{{W}}}rPrChange')
+        if change_el is not None:
+            change_el.addprevious(color_el)
+        else:
+            rpr_el.append(color_el)
+    color_el.set(f'{{{W}}}val', hex_color.lstrip('#'))
+    # A theme colour reference takes precedence over w:val (per the w:color
+    # definition in ECMA-376), so it must go for the new value to show.
+    for attr in ('themeColor', 'themeTint', 'themeShade'):
+        color_el.attrib.pop(f'{{{W}}}{attr}', None)
+
+def _blank_run_like(rpr_bytes):
+    """Create an empty ``<w:r>`` carrying a copy of the serialized rPr, if any."""
+    new_r = etree.Element(f'{{{W}}}r')
+    if rpr_bytes is not None:
+        new_r.append(etree.fromstring(rpr_bytes))
+    return new_r
+
+
+def _text_run(rpr_bytes, text, hex_color=None):
+    """Create a ``<w:r>`` holding ``text``; colour it when ``hex_color`` is set."""
+    new_r = _blank_run_like(rpr_bytes)
+    t_el = etree.SubElement(new_r, f'{{{W}}}t')
+    t_el.text = text
+    if text and (text[0] == ' ' or text[-1] == ' '):
+        t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+    if hex_color is not None:
+        set_color_on_rpr(ensure_rpr(new_r), hex_color)
+    return new_r
+
+
+def apply_color_to_keyword(doc_el, keyword, hex_color):
+    """Colour every occurrence of ``keyword``, and only the keyword itself.
+
+    A run whose text contains the keyword is split into several runs
+    ([prefix][keyword][suffix] ...), each carrying a copy of the original run
+    properties; only the keyword runs get the colour. Matching is done on
+    the concatenated text of the ``<w:t>`` children of a single run, so a
+    keyword spread over several runs is not found.
+
+    Non-text children of a split run (drawings, tabs, breaks, ...) are kept:
+    each is moved, in its original position, into a run of its own.
+
+    Args:
+        doc_el: Root element to search for ``<w:r>`` runs.
+        keyword: Text to colour. An empty keyword is ignored.
+        hex_color: Colour as ``RRGGBB`` (a leading ``#`` is accepted).
+    """
+    # An empty keyword matches at every position and would never advance.
+    if not keyword:
+        return
+
+    r_tag = f'{{{W}}}r'
+    t_tag = f'{{{W}}}t'
+    rpr_tag = f'{{{W}}}rPr'
+    klen = len(keyword)
+
+    # Snapshot the runs first: the tree is modified while walking them.
+    for run in list(doc_el.iter(r_tag)):
+        parent = run.getparent()
+        if parent is None:
+            continue
+        full_text = ''.join(t.text or '' for t in run.findall(t_tag))
+        if keyword not in full_text:
+            continue
+
+        # Non-overlapping [start, end) offsets of the keyword in full_text.
+        spans = []
+        hit = full_text.find(keyword)
+        while hit != -1:
+            spans.append((hit, hit + klen))
+            hit = full_text.find(keyword, hit + klen)
+
+        # Serialize the run properties once; every new run parses its own copy.
+        orig_rpr = run.find(rpr_tag)
+        rpr_bytes = None
+        if orig_rpr is not None:
+            rpr_bytes = etree.tostring(orig_rpr, with_tail=False)
+
+        new_runs = []
+        base = 0  # offset in full_text of the current <w:t>'s first character
+        for child in list(run):
+            if child.tag == rpr_tag:
+                continue
+            if child.tag != t_tag:
+                # Keep drawings, tabs, breaks etc. by moving them to a new run.
+                carrier = _blank_run_like(rpr_bytes)
+                carrier.append(child)
+                new_runs.append(carrier)
+                continue
+            text = child.text or ''
+            if not text:
+                continue
+            end = base + len(text)
+            cursor = base
+            for start, stop in spans:
+                if stop <= cursor:
+                    continue
+                if start >= end:
+                    break
+                if start > cursor:
+                    new_runs.append(
+                        _text_run(rpr_bytes, text[cursor - base:start - base]))
+                # A match may begin in the previous <w:t> or end in the next.
+                seg_start = max(start, cursor)
+                seg_stop = min(stop, end)
+                new_runs.append(
+                    _text_run(rpr_bytes, text[seg_start - base:seg_stop - base],
+                              hex_color))
+                cursor = seg_stop
+            if cursor < end:
+                new_runs.append(_text_run(rpr_bytes, text[cursor - base:]))
+            base = end
+
+        # Swap the original run for the rebuilt sequence at the same position.
+        insert_pos = parent.index(run)
+        parent.remove(run)
+        for offset, piece in enumerate(new_runs):
+            parent.insert(insert_pos + offset, piece)
+
+def process(input_docx, output_docx, replacements, image_replacements,
+            color_keywords):
+    """Run the full edit pipeline from ``input_docx`` to ``output_docx``.
+
+    The document is unpacked into a temporary directory, images are swapped,
+    word/document.xml is rewritten with the text replacements and keyword
+    colours, and the directory is packed into the output file.
+
+    Args:
+        input_docx: Path of the source .docx.
+        output_docx: Path of the .docx to write.
+        replacements: ``(old, new)`` text pairs for ``paragraph_replace``.
+        image_replacements: ``(index, file)`` pairs for ``replace_image``.
+        color_keywords: ``(keyword, hex_color)`` pairs for
+            ``apply_color_to_keyword``.
+    """
+    with tempfile.TemporaryDirectory() as workdir:
+        print(f"📂 解包 {input_docx} ...")
+        unpack(input_docx, workdir)
+
+        document_path = os.path.join(workdir, 'word', 'document.xml')
+
+        # Image files are swapped before document.xml is parsed.
+        if image_replacements:
+            print(f"🖼️  替换 {len(image_replacements)} 张图片...")
+            for image_index, image_file in image_replacements:
+                replace_image(workdir, image_index, image_file)
+
+        xml_tree = etree.parse(document_path)
+        body_root = xml_tree.getroot()
+
+        if replacements:
+            print(f"✏️  替换 {len(replacements)} 条文本...")
+            for paragraph in body_root.iter(f'{{{W}}}p'):
+                paragraph_replace(paragraph, replacements)
+
+        # Colour the keywords that were extracted from <span> tags.
+        for keyword, color in color_keywords:
+            print(f"🎨 关键词「{keyword}」→ #{color}")
+            apply_color_to_keyword(body_root, keyword, color)
+
+        xml_tree.write(document_path, xml_declaration=True, encoding='UTF-8', standalone=True)
+        print(f"📦 打包 → {output_docx} ...")
+        pack(workdir, output_docx, input_docx)
+        print(f"✅ 完成！输出: {output_docx}")
+
+
+def _parse_span_replacement(new_text):
+    """Strip ``<span color=...>`` tags from NEW text and collect their colours.
+
+    Accepted forms (tag and attribute names are matched case-insensitively):
+      <span color="FF0000">text</span>
+      <span color="#FF0000">text</span>
+
+    Args:
+        new_text: Replacement text that may contain span tags.
+
+    Returns:
+        ``(plain_text, color_keywords)``: the text with every matched span
+        replaced by its inner text, and a list of ``(keyword, hex_color)``
+        pairs with the ``#`` removed. Spans with empty content are stripped
+        from the text but contribute no pair, because there is nothing to
+        colour.
+    """
+    import re
+
+    span_pattern = re.compile(
+        r'<span\s+[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
+        re.IGNORECASE | re.DOTALL,
+    )
+
+    color_keywords = []
+
+    def _repl(m):
+        hex_color = m.group(1).lstrip('#')
+        keyword = m.group(2)
+        # An empty keyword would match everywhere when colouring later on.
+        if keyword:
+            color_keywords.append((keyword, hex_color))
+        return keyword
+
+    plain_text = span_pattern.sub(_repl, new_text)
+    return plain_text, color_keywords
+
+
+def main():
+    """Command-line entry point: parse arguments and run the requested action.
+
+    With ``--list-images`` the images of the input file are listed and
+    nothing is written. Otherwise an output path is required and the text /
+    image replacements are applied through ``process``.
+    """
+    parser = argparse.ArgumentParser(description='DOCX 格式保留：替换文本/图片/颜色')
+    parser.add_argument('input', help='输入 .docx')
+    parser.add_argument('output', nargs='?', help='输出 .docx')
+    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
+    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
+                        action='append', default=[])
+    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
+                        action='append', default=[], help='图片替换')
+    args = parser.parse_args()
+
+    if args.list_images:
+        list_images(args.input)
+        return
+    if not args.output:
+        parser.error("需要指定输出文件")
+
+    # Validate image indexes up front so that a typo is reported as a usage
+    # error instead of an uncaught ValueError traceback.
+    image_replacements = []
+    for raw_index, image_file in args.image:
+        try:
+            image_index = int(raw_index)
+        except ValueError:
+            parser.error(f"--image 的序号必须是整数: {raw_index!r}")
+        image_replacements.append((image_index, image_file))
+
+    # Pull <span color="...">text</span> out of each NEW value: the plain
+    # text becomes the replacement, the spans become colour keywords.
+    replacements = []
+    color_keywords = []
+    for old, new_raw in args.replace:
+        new_plain, spans = _parse_span_replacement(new_raw)
+        replacements.append((old, new_plain))
+        color_keywords.extend(spans)
+
+    process(
+        input_docx=args.input,
+        output_docx=args.output,
+        replacements=replacements,
+        image_replacements=image_replacements,
+        color_keywords=color_keywords,
+    )
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == '__main__':
+    main()