# mcp/mcp_docx.py

#!/usr/bin/env python3
"""
docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图片

用法:
  # 列出文档中所有图片
  python3 docx_editor.py input.docx --list-images

  # 文本替换 + 颜色
  python3 docx_editor.py input.docx output.docx \
    --replace "原文" "新文" \
    --color "关键词" "FF0000"

  # 图片替换（按文档中出现的顺序，从1开始）
  python3 docx_editor.py input.docx output.docx \
    --image 1 new_chart.png \
    --image 2 new_photo.jpg

  # 同时替换文字和图片
  python3 docx_editor.py input.docx output.docx \
    --replace "旧标题" "新标题" \
    --image 1 new_image.png \
    --color "重点" "FF0000"
"""

import argparse
import os
import tempfile
import zipfile
from lxml import etree
from PIL import Image

# XML namespace URIs, used below to build Clark-notation names ("{uri}tag").
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

# Relationship "Type" value that marks an image part in a .rels file.
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

# Lower-case file extension (no dot) -> MIME type written to [Content_Types].xml.
EXT_TO_MIME = {
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'gif': 'image/gif',
    'bmp': 'image/bmp',
    'tiff': 'image/tiff',
    'webp': 'image/webp',
}


def unpack(docx_path, out_dir):
    """Extract every member of the .docx (ZIP) archive into ``out_dir``.

    Uses the standard ``zipfile`` module directly, so no external unpack
    script is needed.
    """
    with zipfile.ZipFile(docx_path) as archive:
        archive.extractall(path=out_dir)


def pack(unpacked_dir, output_docx, original_docx):
    """Zip the contents of ``unpacked_dir`` into the file ``output_docx``.

    Every file found under ``unpacked_dir`` is stored (deflated) under its
    path relative to that directory, written with ``/`` separators. The
    directory that will contain ``output_docx`` is created when missing.

    ``original_docx`` is accepted only to keep the historical signature; it is
    not used.
    """
    def members():
        # Yield (path on disk, archive name) for each file below unpacked_dir.
        for folder, _subdirs, names in os.walk(unpacked_dir):
            for name in names:
                source = os.path.join(folder, name)
                relative = os.path.relpath(source, unpacked_dir)
                yield source, relative.replace(os.sep, '/')

    target_dir = os.path.dirname(os.path.abspath(output_docx))
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        for source, member in members():
            archive.write(source, member)


def build_image_index(unpacked_dir):
    """Return the embedded images of an unpacked DOCX in document order.

    Image relationships are read from ``word/_rels/document.xml.rels`` and
    matched against every ``<a:blip r:embed="...">`` found in
    ``word/document.xml``. Blips without an ``r:embed`` id, or whose id is not
    an image relationship, are skipped.

    Args:
        unpacked_dir: Directory holding the extracted DOCX package.

    Returns:
        A list with one dict per matching blip. Keys: ``index`` (1-based
        position in the list), ``rid``, ``media_file`` (the relationship
        target exactly as written), ``abs_path`` (location of that part on
        disk), ``ext`` (lower-case extension without the dot),
        ``docpr_name`` (``name`` of the sibling ``docPr``, or ``''``),
        ``width_cm`` / ``height_cm`` (extent ``cx`` / ``cy`` divided by
        360000, or ``None`` when no extent element is found).
    """
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')

    rid_to_media = {
        rel.get('Id'): rel.get('Target')
        for rel in etree.parse(rels_xml).getroot()
        if rel.get('Type', '') == REL_TYPE_IMAGE
    }

    container_tags = (f'{{{WD}}}inline', f'{{{WD}}}anchor')
    results = []
    for blip in etree.parse(doc_xml).getroot().iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]

        # A target starting with "/" is relative to the package root, not to
        # word/. Joining it onto word_dir as-is would make os.path.join drop
        # the base directory and point at the filesystem root.
        if media_rel.startswith('/'):
            base_dir, rel_path = unpacked_dir, media_rel.lstrip('/')
        else:
            base_dir, rel_path = word_dir, media_rel
        media_abs = os.path.join(base_dir, rel_path.replace('/', os.sep))
        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()

        # The nearest enclosing <wp:inline>/<wp:anchor> carries size and name.
        container = next(blip.iterancestors(*container_tags), None)
        w_cm = h_cm = None
        docpr_name = ''
        if container is not None:
            ext_el = container.find(f'{{{WD}}}extent')
            if ext_el is not None:
                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
            dp = container.find(f'{{{WD}}}docPr')
            if dp is not None:
                docpr_name = dp.get('name', '')

        results.append({
            'index': len(results) + 1, 'rid': rid,
            'media_file': media_rel, 'abs_path': media_abs,
            'ext': ext, 'docpr_name': docpr_name,
            'width_cm': w_cm, 'height_cm': h_cm,
        })
    return results


def list_images(docx_path):
    """Print a table of the images found in the DOCX at ``docx_path``.

    Each row shows the 1-based index, the media file name, the displayed size
    and Word's internal picture name. A single notice is printed instead when
    the document has no images.
    """
    images = get_images_info(docx_path)
    if not images:
        print("文档中没有找到图片。")
        return

    print(f"共找到 {len(images)} 张图片：\n")
    print(f"  {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print("  " + "-" * 62)
    for entry in images:
        # A missing (or zero) width is reported as unknown size.
        if entry['width_cm']:
            dims = f"{entry['width_cm']}×{entry['height_cm']}cm"
        else:
            dims = "未知"
        file_name = os.path.basename(entry['media_file'])
        print(f"  {entry['index']:<4} {file_name:<20} {dims:<18} {entry['docpr_name']}")


def get_images_info(docx_path):
    """Return structured information about every image in a DOCX file.

    Intended for reuse by other modules (for example an MCP server): it
    produces the same data ``list_images`` displays, without printing
    anything. The archive is extracted into a temporary directory that is
    removed before the function returns.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        unpack(docx_path, scratch_dir)
        images = build_image_index(scratch_dir)
    return images


def replace_image(unpacked_dir, index, new_image_path):
    """Replace the ``index``-th image (1-based, document order) in place.

    When the replacement has the same format as the existing media file, the
    file is simply overwritten. Otherwise the replacement is re-encoded with
    Pillow under the old base name plus the new extension, the old file is
    removed, and both ``word/_rels/document.xml.rels`` and
    ``[Content_Types].xml`` are updated to match.

    Args:
        unpacked_dir: Directory holding the extracted DOCX package.
        index: 1-based position of the image as reported by
            ``build_image_index``.
        new_image_path: Path of the replacement image file.

    Raises:
        ValueError: If ``index`` is outside ``1..number of images``.
    """
    import shutil

    imgs = build_image_index(unpacked_dir)
    if index < 1 or index > len(imgs):
        raise ValueError(f"图片序号 {index} 超出范围（共 {len(imgs)} 张）")

    # Alternative spellings of one format. Both sides must be normalised
    # before comparing, otherwise .jpg -> .jpg looks like a format change.
    aliases = {'jpg': 'jpeg', 'tif': 'tiff'}

    info = imgs[index - 1]
    old_abs = info['abs_path']
    old_ext = info['ext']
    new_ext = os.path.splitext(new_image_path)[1].lstrip('.').lower()
    new_ext = aliases.get(new_ext, new_ext)

    print(f"    图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
          f" ← {os.path.basename(new_image_path)}({new_ext.upper()})")

    if aliases.get(old_ext, old_ext) == new_ext:
        # Same format: overwrite the bytes and keep the existing part name, so
        # neither the relationships nor the content types need to change.
        shutil.copy2(new_image_path, old_abs)
        return

    # Different format: convert with Pillow, then update rels + content types.
    new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
    fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
           'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
    with Image.open(new_image_path) as source:
        # JPEG has no alpha channel or palette; flatten such images to RGB.
        needs_rgb = fmt == 'JPEG' and source.mode in ('RGBA', 'LA', 'P')
        converted = source.convert('RGB') if needs_rgb else source
        converted.save(new_abs, format=fmt)
    if os.path.abspath(new_abs) != os.path.abspath(old_abs):
        os.remove(old_abs)

    # Point the relationship at the renamed part. Any other image relationship
    # that used the same target is updated too, since that file no longer
    # exists under its old name.
    old_media = info['media_file']
    new_media = os.path.splitext(old_media)[0] + '.' + new_ext
    rels_path = os.path.join(unpacked_dir, 'word', '_rels', 'document.xml.rels')
    rels_tree = etree.parse(rels_path)
    for rel in rels_tree.getroot():
        same_target = (rel.get('Type') == REL_TYPE_IMAGE
                       and rel.get('Target') == old_media)
        if rel.get('Id') == info['rid'] or same_target:
            rel.set('Target', new_media)
    rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)

    # Register the new extension. The element must live in the same namespace
    # as the root; a bare 'Default' tag would be serialised with xmlns="" and
    # would not count as a content-type declaration.
    ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
    ct_tree = etree.parse(ct_path)
    ct_root = ct_tree.getroot()
    ct_ns = etree.QName(ct_root).namespace
    default_tag = f'{{{ct_ns}}}Default' if ct_ns else 'Default'
    existing = {(el.get('Extension') or '').lower() for el in ct_root}
    if new_ext not in existing:
        etree.SubElement(ct_root, default_tag, Extension=new_ext,
                         ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
    ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
    print(f"      格式转换 {old_ext}→{new_ext}，rels 和 ContentTypes 已更新")


def paragraph_replace(para_el, replacements):
    """Apply text replacements to each ``<w:t>`` under ``para_el`` in place.

    Only the text of ``<w:t>`` elements is rewritten; no other element is
    touched. ``replacements`` is an iterable of ``(old, new)`` pairs applied
    in order to the text of each element separately. When the resulting text
    starts or ends with a space, ``xml:space="preserve"`` is set on the
    element.
    """
    xml_space = '{http://www.w3.org/XML/1998/namespace}space'
    for text_node in para_el.iter(f'{{{W}}}t'):
        before = text_node.text
        if not before:
            continue

        after = before
        for old, new in replacements:
            after = after.replace(old, new)
        if after == before:
            continue

        text_node.text = after
        if after.startswith(' ') or after.endswith(' '):
            text_node.set(xml_space, 'preserve')


def ensure_rpr(run_el):
    """Return the ``<w:rPr>`` child of ``run_el``, creating it first if absent.

    A newly created ``<w:rPr>`` is inserted as the first child of the run.
    """
    rpr_tag = f'{{{W}}}rPr'
    existing = run_el.find(rpr_tag)
    if existing is not None:
        return existing
    created = etree.Element(rpr_tag)
    run_el.insert(0, created)
    return created

def set_color_on_rpr(rpr_el, hex_color):
    """Set the ``w:val`` of the ``<w:color>`` child of ``rpr_el``.

    The ``<w:color>`` element is appended when missing. Leading ``#``
    characters are stripped from ``hex_color`` before it is stored.
    """
    color_tag = f'{{{W}}}color'
    color_el = rpr_el.find(color_tag)
    if color_el is None:
        color_el = etree.SubElement(rpr_el, color_tag)
    color_el.set(f'{{{W}}}val', hex_color.lstrip('#'))

def _split_on_keyword(text, keyword):
    """Split ``text`` into ordered ``(segment, is_keyword)`` pairs, skipping empty segments."""
    pieces = []
    for position, part in enumerate(text.split(keyword)):
        if position:
            pieces.append((keyword, True))
        if part:
            pieces.append((part, False))
    return pieces


def _clone_run_shell(run):
    """Create an empty run with the attributes of ``run`` and a copy of its ``<w:rPr>``."""
    shell = etree.Element(run.tag, attrib=dict(run.attrib))
    rpr = run.find(f'{{{W}}}rPr')
    if rpr is not None:
        shell.append(etree.fromstring(etree.tostring(rpr, with_tail=False)))
    return shell


def apply_color_to_keyword(doc_el, keyword, hex_color):
    """Colour every occurrence of ``keyword`` without colouring the whole run.

    Each run whose text contains the keyword is replaced by a sequence of
    runs: ``[prefix][keyword][suffix]...``. Every new run gets a copy of the
    original run properties; only the keyword runs additionally receive the
    colour. Non-text children of the run (drawings, breaks, tabs, ...) are
    moved into the new runs in their original order instead of being dropped.

    The keyword is matched inside the text of one run only: directly adjacent
    ``<w:t>`` elements are treated as one string, but a keyword that spans
    several runs, or is interrupted by a non-text child, is not matched.

    Args:
        doc_el: Root (or any ancestor) element to search for ``<w:r>`` runs.
        keyword: Text to colour. An empty keyword is ignored.
        hex_color: Colour value; a leading ``#`` is accepted.
    """
    if not keyword:
        # An empty string matches at every offset, so scanning for it would
        # never advance.
        return

    t_tag = f'{{{W}}}t'
    rpr_tag = f'{{{W}}}rPr'
    xml_space = '{http://www.w3.org/XML/1998/namespace}space'

    # Snapshot the runs first: the tree is restructured while walking it.
    for run in list(doc_el.iter(f'{{{W}}}r')):
        parent = run.getparent()
        if parent is None:
            continue

        # Describe the run content as an ordered list of text chunks (str) and
        # non-text child elements; adjacent <w:t> texts are merged.
        chunks = []
        for child in run:
            if child.tag == rpr_tag:
                continue
            if child.tag == t_tag:
                text = child.text or ''
                if chunks and isinstance(chunks[-1], str):
                    chunks[-1] += text
                else:
                    chunks.append(text)
            else:
                chunks.append(child)
        if not any(isinstance(c, str) and keyword in c for c in chunks):
            continue

        new_runs = []
        plain_run = None  # uncoloured run currently being filled
        for chunk in chunks:
            if not isinstance(chunk, str):
                if plain_run is None:
                    plain_run = _clone_run_shell(run)
                    new_runs.append(plain_run)
                plain_run.append(chunk)  # moves the element out of the old run
                continue
            for segment, is_keyword in _split_on_keyword(chunk, keyword):
                if is_keyword:
                    target = _clone_run_shell(run)
                    set_color_on_rpr(ensure_rpr(target), hex_color)
                    new_runs.append(target)
                    plain_run = None
                else:
                    if plain_run is None:
                        plain_run = _clone_run_shell(run)
                        new_runs.append(plain_run)
                    target = plain_run
                t_el = etree.SubElement(target, t_tag)
                t_el.text = segment
                if segment != segment.strip():
                    # Edge whitespace is only kept when explicitly preserved.
                    t_el.set(xml_space, 'preserve')

        # Swap the original run for the new runs at the same position.
        position = parent.index(run)
        new_runs[-1].tail = run.tail
        parent.remove(run)
        for offset, new_run in enumerate(new_runs):
            parent.insert(position + offset, new_run)

def process(input_docx, output_docx, replacements, image_replacements,
            color_keywords):
    """Run the full edit pipeline on one DOCX file.

    Steps, in order: unpack ``input_docx`` into a temporary directory,
    replace images, apply text replacements to every paragraph of
    ``word/document.xml``, colour keywords, write the XML back and repack
    everything as ``output_docx``.

    Args:
        input_docx: Path of the source document.
        output_docx: Path of the document to write.
        replacements: Sequence of ``(old, new)`` text pairs.
        image_replacements: Sequence of ``(index, file)`` pairs, index 1-based.
        color_keywords: Iterable of ``(keyword, hex_color)`` pairs.
    """
    with tempfile.TemporaryDirectory() as workdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, workdir)

        # Images are swapped on disk before document.xml is parsed.
        if image_replacements:
            print(f"🖼️  替换 {len(image_replacements)} 张图片...")
            for position, image_file in image_replacements:
                replace_image(workdir, position, image_file)

        document_path = os.path.join(workdir, 'word', 'document.xml')
        document = etree.parse(document_path)
        body = document.getroot()

        if replacements:
            print(f"✏️  替换 {len(replacements)} 条文本...")
            for paragraph in body.iter(f'{{{W}}}p'):
                paragraph_replace(paragraph, replacements)

        # Colour the keywords collected by the caller.
        for keyword, color in color_keywords:
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(body, keyword, color)

        document.write(document_path, xml_declaration=True, encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(workdir, output_docx, input_docx)
        print(f"✅ 完成！输出: {output_docx}")


def _parse_span_replacement(new_text):
    """Strip ``<span color=...>`` markup from ``new_text`` and collect colours.

    Accepted forms (tag and attribute names are matched case-insensitively)::

        <span color="FF0000">text</span>
        <span color="#FF0000">text</span>

    Each span is replaced by its inner text. Spans with empty inner text are
    removed but produce no colour entry, because an empty keyword cannot be
    searched for in the document.

    Returns:
        ``(plain_text, [(keyword, hex_color), ...])`` where ``hex_color`` has
        no leading ``#``.
    """
    import re

    span_pattern = re.compile(
        r'<span\s+[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
        re.IGNORECASE | re.DOTALL,
    )

    color_keywords = []

    def _repl(m):
        keyword = m.group(2)
        if keyword:
            color_keywords.append((keyword, m.group(1).lstrip('#')))
        return keyword

    plain_text = span_pattern.sub(_repl, new_text)
    return plain_text, color_keywords


def main():
    """Command-line entry point.

    ``--list-images`` prints the image table for the input file and exits.
    Otherwise an output path is required, and the text replacements
    (``--replace OLD NEW``), keyword colours (``--color KEYWORD HEX``) and
    image replacements (``--image INDEX FILE``) are applied. ``<span
    color="...">`` markup inside a ``--replace`` NEW value is removed from the
    text and turned into additional keyword colours.
    """
    parser = argparse.ArgumentParser(description='DOCX 格式保留：替换文本/图片/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    parser.add_argument('--color', nargs=2, metavar=('KEYWORD', 'HEX'),
                        action='append', default=[], help='关键词着色')
    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
                        action='append', default=[], help='图片替换')
    args = parser.parse_args()

    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")

    # Pull <span color="...">text</span> out of each NEW value.
    replacements = []
    color_keywords = []
    for old, new_raw in args.replace:
        new_plain, spans = _parse_span_replacement(new_raw)
        replacements.append((old, new_plain))
        color_keywords.extend(spans)

    # Explicit --color pairs; the stored colour carries no leading '#'.
    for keyword, hex_color in args.color:
        if not keyword:
            parser.error("--color 的关键词不能为空")
        color_keywords.append((keyword, hex_color.lstrip('#')))

    # Report a bad index as a usage error instead of a traceback.
    image_replacements = []
    for raw_index, image_file in args.image:
        try:
            image_replacements.append((int(raw_index), image_file))
        except ValueError:
            parser.error(f"--image 的序号必须是整数: {raw_index!r}")

    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=replacements,
        image_replacements=image_replacements,
        color_keywords=color_keywords,
    )

# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()