Files
mcp/mcp_docx.py
2026-03-26 22:56:58 +08:00

633 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
docx_editor.py — 保留原格式替换文本 + 修改字体颜色
用法:
# 列出文档中所有图片
python3 docx_editor.py input.docx --list-images
# 文本替换 + 颜色
python3 docx_editor.py input.docx output.docx \
--replace "原文" "新文" \
--color "关键词" "FF0000"
"""
import argparse
import copy
import os
import tempfile
import zipfile
from lxml import etree
import re
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
def unpack(docx_path, out_dir):
    """Extract a .docx archive into *out_dir* (replaces the external unpack.py script)."""
    with zipfile.ZipFile(docx_path, 'r') as archive:
        archive.extractall(out_dir)
def pack(unpacked_dir, output_docx, original_docx):
    """
    Re-zip the modified directory tree into a .docx archive.

    *original_docx* is accepted only for compatibility with the old
    function signature; it is never used.
    """
    # Make sure the destination directory exists.
    target_dir = os.path.dirname(os.path.abspath(output_docx))
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)
    # Zip every file under the unpacked tree, keeping relative paths.
    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _, filenames in os.walk(unpacked_dir):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                # Entries inside a .docx always use '/' as the separator.
                entry_name = os.path.relpath(full_path, unpacked_dir).replace(os.sep, '/')
                archive.write(full_path, entry_name)
def build_image_index(unpacked_dir):
    """Return a list of image records in document order.

    Each record is a dict with: index (1-based, document order), rid
    (relationship id), media_file (target path from the rels file),
    abs_path (on-disk path inside *unpacked_dir*), ext (lowercased
    extension without dot), docpr_name (Word's internal picture name, ''
    when absent) and width_cm/height_cm (None when no extent is found).
    """
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')
    rels_root = etree.parse(rels_xml).getroot()
    # Map relationship id -> media target (e.g. "media/image1.png").
    rid_to_media = {}
    for rel in rels_root:
        if rel.get('Type', '') == REL_TYPE_IMAGE:
            rid_to_media[rel.get('Id')] = rel.get('Target')
    doc_root = etree.parse(doc_xml).getroot()
    results = []
    # Each <a:blip r:embed="rIdN"> references one embedded picture.
    for blip in doc_root.iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]
        media_abs = os.path.join(word_dir, media_rel.replace('/', os.sep))
        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()
        # Walk up to the enclosing <wp:inline>/<wp:anchor>, which carries
        # the display extent and the docPr name.
        inline = blip
        while inline is not None and inline.tag not in (f'{{{WD}}}inline', f'{{{WD}}}anchor'):
            inline = inline.getparent()
        w_cm = h_cm = None
        docpr_name = ''
        if inline is not None:
            ext_el = inline.find(f'{{{WD}}}extent')
            if ext_el is not None:
                # EMU -> centimetres: 360000 EMU == 1 cm.
                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
            dp = inline.find(f'{{{WD}}}docPr')
            if dp is not None:
                docpr_name = dp.get('name', '')
        results.append({
            'index': len(results) + 1, 'rid': rid,
            'media_file': media_rel, 'abs_path': media_abs,
            'ext': ext, 'docpr_name': docpr_name,
            'width_cm': w_cm, 'height_cm': h_cm,
        })
    return results
def list_images(docx_path):
    """Print a human-readable table of every image found in *docx_path*."""
    images = get_images_info(docx_path)
    if not images:
        print("文档中没有找到图片。")
        return
    print(f"共找到 {len(images)} 张图片:\n")
    print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print(" " + "-" * 62)
    for entry in images:
        dims = "未知" if not entry['width_cm'] else f"{entry['width_cm']}×{entry['height_cm']}cm"
        print(f" {entry['index']:<4} {os.path.basename(entry['media_file']):<20} {dims:<18} {entry['docpr_name']}")
def get_images_info(docx_path):
    """
    Collect structured information about every image in a DOCX file.

    Designed for reuse by other modules (e.g. the MCP server): same
    scanning logic as the original list_images, but with no printing.
    """
    with tempfile.TemporaryDirectory() as workdir:
        unpack(docx_path, workdir)
        return build_image_index(workdir)
def _normalize_newlines(text):
if text is None:
return ''
return str(text).replace('\r\n', '\n').replace('\r', '\n')
def _is_text_node(el):
    """True when *el* is a <w:t> text element."""
    return el.tag == '{%s}t' % W
def _is_break_node(el):
    """True when *el* is a line-break element (<w:br> or <w:cr>)."""
    return el.tag in {f'{{{W}}}br', f'{{{W}}}cr'}
def _is_tab_node(el):
    """True when *el* is a <w:tab> element."""
    return el.tag == '{' + W + '}tab'
def _iter_run_text_parts(run_el):
    """Yield (child, text) for each text-like child of a <w:r> run.

    <w:t> yields its newline-normalized text, <w:br>/<w:cr> yield '\\n'
    and <w:tab> yields '\\t'; any other child is skipped.
    """
    for node in run_el:
        if _is_text_node(node):
            yield node, _normalize_newlines(node.text or '')
        elif _is_break_node(node):
            yield node, '\n'
        elif _is_tab_node(node):
            yield node, '\t'
def _run_text(run_el):
    """Concatenate the visible text of a single <w:r> run."""
    pieces = [text for _, text in _iter_run_text_parts(run_el)]
    return ''.join(pieces)
def _paragraph_text(para_el):
    """Concatenate the visible text of every run in a paragraph."""
    return ''.join(map(_run_text, para_el.iter(f'{{{W}}}r')))
def _clear_run_text_like_children(run_el):
    """Remove every text/break/tab child from a run, leaving rPr etc. intact."""
    removable = [child for child in run_el
                 if _is_text_node(child) or _is_break_node(child) or _is_tab_node(child)]
    for child in removable:
        run_el.remove(child)
def _append_text_to_run(run_el, text):
    """Append *text* to a run, converting '\\n' into <w:br> elements.

    Each line becomes a <w:t>; xml:space="preserve" is set whenever the
    line starts or ends with whitespace so Word does not trim it.  The
    original only flagged literal spaces, but _iter_run_text_parts can
    feed tab characters into this text (a <w:tab> round-trips as '\\t'),
    and an unflagged edge tab would be lost — hence the generalized
    whitespace check.
    """
    text = _normalize_newlines(text)
    parts = text.split('\n')

    def _emit_t(content):
        # One <w:t> per line; flag significant edge whitespace.
        t_el = etree.SubElement(run_el, f'{{{W}}}t')
        t_el.text = content
        if content and (content[0].isspace() or content[-1].isspace()):
            t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')

    if len(parts) == 1:
        # Single line: always emit a <w:t>, even for empty text
        # (matches the original behavior).
        _emit_t(parts[0])
        return
    for idx, part in enumerate(parts):
        if part:
            _emit_t(part)
        if idx < len(parts) - 1:
            etree.SubElement(run_el, f'{{{W}}}br')
def _ensure_paragraph_run(para_el):
    """Return the paragraph's first run, creating one (after pPr) if absent."""
    existing = para_el.findall(f'.//{{{W}}}r')
    if existing:
        return existing[0]
    new_run = etree.Element(f'{{{W}}}r')
    ppr = para_el.find(f'{{{W}}}pPr')
    # Paragraph properties must stay first; insert the run right after them.
    position = 0 if ppr is None else para_el.index(ppr) + 1
    para_el.insert(position, new_run)
    return new_run
def _set_paragraph_text(para_el, text):
    """Replace a paragraph's entire visible text via its first text-bearing run."""
    runs = para_el.findall(f'.//{{{W}}}r')
    carriers = [run for run in runs if any(True for _ in _iter_run_text_parts(run))]
    if carriers:
        target = carriers[0]
        for run in carriers:
            _clear_run_text_like_children(run)
    else:
        target = _ensure_paragraph_run(para_el)
        _clear_run_text_like_children(target)
    _append_text_to_run(target, text)
def _paragraph_list(doc_el):
    """Snapshot every <w:p> under *doc_el* into a list (document order)."""
    return [para for para in doc_el.iter(f'{{{W}}}p')]
def _replace_paragraph_block(doc_el, old_text, new_text):
    """Replace a run of consecutive paragraphs matched by *old_text*.

    Both texts are split into segments on blank lines ('\\n\\n'); each
    segment corresponds to one paragraph.  Returns False unless old_text
    spans at least two segments and some consecutive paragraph window
    matches them exactly.  Shared paragraphs are rewritten in place;
    extra new segments are inserted as deep copies of the last matched
    paragraph (inheriting its formatting); surplus matched paragraphs
    are removed.  Returns True when a replacement happened.
    """
    old_segments = _normalize_newlines(old_text).split('\n\n')
    new_segments = _normalize_newlines(new_text).split('\n\n')
    # Single-segment replacements are paragraph_replace's job.
    if len(old_segments) <= 1:
        return False
    paras = _paragraph_list(doc_el)
    para_texts = [_paragraph_text(p) for p in paras]
    # Find the first paragraph window whose texts equal the old segments.
    match_start = None
    for i in range(0, len(para_texts) - len(old_segments) + 1):
        if para_texts[i:i + len(old_segments)] == old_segments:
            match_start = i
            break
    if match_start is None:
        return False
    matched_paras = paras[match_start:match_start + len(old_segments)]
    parent = matched_paras[0].getparent()
    if parent is None:
        return False
    # New paragraphs (if any) are inserted right after the last match.
    anchor_index = parent.index(matched_paras[-1])
    shared_count = min(len(matched_paras), len(new_segments))
    for idx in range(shared_count):
        _set_paragraph_text(matched_paras[idx], new_segments[idx])
    if len(new_segments) > len(matched_paras):
        # More new segments than matched paragraphs: clone the last
        # matched paragraph as a formatting template for each extra one.
        template_para = matched_paras[-1]
        insert_at = anchor_index + 1
        for seg in new_segments[len(matched_paras):]:
            new_para = copy.deepcopy(template_para)
            _set_paragraph_text(new_para, seg)
            parent.insert(insert_at, new_para)
            insert_at += 1
    elif len(new_segments) < len(matched_paras):
        # Fewer new segments: delete the leftover matched paragraphs.
        for para in matched_paras[len(new_segments):]:
            para_parent = para.getparent()
            if para_parent is not None:
                para_parent.remove(para)
    return True
def paragraph_replace(para_el, replacements):
    """
    Replace text at paragraph level, matching across <w:t> boundaries.

    Strategy:
    1. Collect the paragraph's runs and concatenate their visible text.
    2. Apply every (old, new) replacement on the joined string.
    3. If anything changed, clear all text-bearing runs and write the
       whole result back into the first one ('\\n' becomes <w:br> via
       _append_text_to_run).

    NOTE: writing everything into the first run means the replaced text
    takes that run's formatting; per-run formatting of later runs in the
    paragraph is not preserved.
    """
    # Collect the run elements (<w:r>) in order.
    runs = list(para_el.findall(f'.//{{{W}}}r'))
    if not runs:
        return
    # Keep only runs that actually carry text/break/tab content.
    text_runs = []
    for run in runs:
        if any(True for _ in _iter_run_text_parts(run)):
            text_runs.append(run)
    if not text_runs:
        return
    # Join the full paragraph text.
    full_text = _paragraph_text(para_el)
    original_text = full_text
    normalized_replacements = []
    for old, new in replacements:
        normalized_replacements.append((
            _normalize_newlines(old),
            _normalize_newlines(new),
        ))
    # Apply every replacement.
    for old, new in normalized_replacements:
        if old in full_text:
            full_text = full_text.replace(old, new)
    # Nothing changed: leave the paragraph untouched.
    if full_text == original_text:
        return
    print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")
    # Write the normalized text back into the first text run; '\n' is
    # converted into Word line-break nodes by _append_text_to_run.
    first_run = text_runs[0]
    for run in text_runs:
        _clear_run_text_like_children(run)
    _append_text_to_run(first_run, full_text)
def ensure_rpr(run_el):
    """Return the run's <w:rPr>, creating it as the first child when missing."""
    rpr = run_el.find(f'{{{W}}}rPr')
    if rpr is not None:
        return rpr
    rpr = etree.Element(f'{{{W}}}rPr')
    run_el.insert(0, rpr)
    return rpr
def set_color_on_rpr(rpr_el, hex_color):
    """Set (or overwrite) the <w:color> value on a run-properties element."""
    color_el = rpr_el.find(f'{{{W}}}color')
    if color_el is None:
        color_el = etree.SubElement(rpr_el, f'{{{W}}}color')
    color_el.set(f'{{{W}}}val', hex_color.lstrip('#'))
def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
    """
    Color only the matched keyword text itself, not the whole run.

    Each run containing the keyword is split into [prefix][keyword][suffix]
    runs, and only the keyword runs get *hex_color*.  When *context_text*
    is given, only paragraphs whose full text contains it are touched, so
    the same keyword in unrelated paragraphs (e.g. a lone digit 0) is
    left alone.
    """
    keyword = _normalize_newlines(keyword)
    # BUGFIX: guard against an empty keyword.  '' is "in" every string
    # and s.find('', start) == start, so the segment-splitting loop below
    # would append an empty segment and never advance — an infinite loop.
    if not keyword:
        return
    context_text = _normalize_newlines(context_text) if context_text is not None else None
    # With a context, restrict coloring to paragraphs that contain it.
    allowed_paras = None
    if context_text:
        allowed_paras = set()
        for p in doc_el.iter(f'{{{W}}}p'):
            full = _paragraph_text(p)
            if context_text in full:
                allowed_paras.add(p)
    def _find_ancestor_para(el):
        # Walk up to the enclosing <w:p>, or None if there is none.
        cur = el
        while cur is not None and cur.tag != f'{{{W}}}p':
            cur = cur.getparent()
        return cur
    # Snapshot the runs first so mutating the tree cannot upset iteration.
    runs = list(doc_el.iter(f'{{{W}}}r'))
    for run in runs:
        if allowed_paras is not None:
            para = _find_ancestor_para(run)
            if para not in allowed_paras:
                continue
        full_text = _run_text(run)
        if not full_text:
            continue
        if keyword not in full_text:
            continue
        parent = run.getparent()
        if parent is None:
            continue
        insert_pos = parent.index(run)
        # Copy the original run's rPr so the split runs keep its formatting.
        orig_rpr = run.find(f'{{{W}}}rPr')
        if orig_rpr is not None:
            rpr_bytes = etree.tostring(orig_rpr)
        else:
            rpr_bytes = None
        def make_run(text, colored):
            new_r = etree.Element(f'{{{W}}}r')
            if rpr_bytes is not None:
                new_r.append(etree.fromstring(rpr_bytes))
            _append_text_to_run(new_r, text)
            if colored:
                set_color_on_rpr(ensure_rpr(new_r), hex_color)
            return new_r
        # Split the run text into alternating (text, colored?) segments.
        segments = []
        s = full_text
        start = 0
        klen = len(keyword)
        while True:
            idx = s.find(keyword, start)
            if idx == -1:
                if start < len(s):
                    segments.append((s[start:], False))
                break
            if idx > start:
                segments.append((s[start:idx], False))
            segments.append((keyword, True))
            start = idx + klen
        # Swap the original run for the new segment runs.
        parent.remove(run)
        for offset, (seg_text, colored) in enumerate(segments):
            if seg_text:
                parent.insert(insert_pos + offset, make_run(seg_text, colored))
def remove_rule_blocks(doc_el):
    """
    Delete every paragraph between <global_rule>...</global_rule>,
    <rule>...</rule> and <chart_rule>...</chart_rule> tags (inclusive).

    Notes:
    - Tag content may span paragraphs: the walk tracks whether we are
      inside each block kind, and from an opening tag until its closing
      tag every paragraph (including empty ones) is removed.
    - Neither the tags nor their content are meant to survive in the
      final document.
    """
    inside = {'global_rule': False, 'rule': False, 'chart_rule': False}
    doomed = []
    # list(...) so tree mutation cannot interfere with the iteration.
    for para in list(doc_el.iter(f'{{{W}}}p')):
        text = ''.join(t.text or '' for t in para.iter(f'{{{W}}}t'))
        if not text:
            # Empty paragraphs inside a block are deleted too.
            if any(inside.values()):
                doomed.append(para)
            continue
        # Any paragraph while inside a block is marked for deletion.
        if any(inside.values()):
            doomed.append(para)
        # Open/close each block kind; the paragraph carrying a tag is
        # itself deleted (de-duplicated via the membership check).
        for tag in ('global_rule', 'rule', 'chart_rule'):
            if f'<{tag}>' in text:
                inside[tag] = True
                if para not in doomed:
                    doomed.append(para)
            if f'</{tag}>' in text:
                inside[tag] = False
    for para in doomed:
        holder = para.getparent()
        if holder is not None:
            holder.remove(para)
def process(input_docx, output_docx, replacements, color_keywords):
    """Full pipeline: unpack, strip rule blocks, replace text, colorize, repack."""
    with tempfile.TemporaryDirectory() as workdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, workdir)
        doc_xml_path = os.path.join(workdir, 'word', 'document.xml')
        tree = etree.parse(doc_xml_path)
        root = tree.getroot()
        # Drop global/normal/chart rule blocks first (tags may span paragraphs).
        remove_rule_blocks(root)
        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            single_para = []
            for old, new in replacements:
                # Multi-paragraph OLD texts go through the block matcher;
                # on a miss they fall back to per-paragraph replacement.
                if '\n\n' in _normalize_newlines(old) and _replace_paragraph_block(root, old, new):
                    print("🧩 跨段替换命中")
                    continue
                single_para.append((old, new))
            if single_para:
                for para in root.iter(f'{{{W}}}p'):
                    paragraph_replace(para, single_para)
        # Apply the colors parsed out of <span> markup.
        for item in color_keywords:
            # Tolerate the legacy 2-tuple form: (keyword, color).
            if len(item) == 2:
                keyword, color = item
                context_text = None
            else:
                keyword, color, context_text = item
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(root, keyword, color, context_text)
        tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(workdir, output_docx, input_docx)
        print(f"✅ 完成!输出: {output_docx}")
def _parse_span_replacement(new_text):
    """
    Parse <span> tags out of a NEW replacement string to derive colors.

    Accepted form (case-insensitive):
        <span color="red">待补充</span>

    Returns (plain_text, [(keyword, hex_color, paragraph_context), ...])
    where paragraph_context is the plain text of the '\\n\\n'-separated
    segment the span occurred in — apply_color_to_keyword uses it to
    scope coloring to that paragraph.  (The old docstring claimed
    2-tuples; the code has always emitted 3-tuples.)
    """
    # NOTE: the redundant function-local `import re` was removed — `re`
    # is already imported at module level.
    new_text = _normalize_newlines(new_text)
    # Small named-color -> hex map; extend as needed.
    named_colors = {
        'red': 'FF0000',
        'blue': '0000FF',
        'green': '00FF00',
        'yellow': 'FFFF00',
        'black': '000000',
        'white': 'FFFFFF',
        'gray': '808080',
        'grey': '808080',
    }
    def _normalize_color(raw_color: str) -> str:
        """
        Normalize a color spec to a 6-digit hex string without '#'.

        Accepts:
        - FFFFFF / ffffff
        - #FFFFFF / #ffffff
        - named colors such as red / blue (see named_colors)
        Unknown names fall through unchanged (minus '#', uppercased).
        """
        c = (raw_color or '').strip()
        if not c:
            return ''
        # Strip a leading '#'.
        if c.startswith('#'):
            c = c[1:]
        # Plain 6-digit hex.
        if re.fullmatch(r'[0-9a-fA-F]{6}', c):
            return c.upper()
        # Named color.
        mapped = named_colors.get(c.lower())
        if mapped:
            return mapped
        # Fallback: return the '#'-stripped original.
        return c.upper()
    # The color attribute may be:
    # - 6-digit hex (optionally '#'-prefixed)
    # - a named color (red / blue ...)
    span_pattern = re.compile(
        r'<span\s+[^>]*?color=["\']?([^"\'\s>]+)["\']?[^>]*>(.*?)</span>',
        re.IGNORECASE | re.DOTALL,
    )
    # Split on paragraph boundaries first so each span can use its own
    # paragraph's plain text as coloring context.
    def _strip_repl(m):
        return m.group(2)
    plain_segments = []
    color_keywords = []
    for segment in new_text.split('\n\n'):
        plain_segment = span_pattern.sub(_strip_repl, segment)
        plain_segments.append(plain_segment)
        for m in span_pattern.finditer(segment):
            raw_color = m.group(1)
            hex_color = _normalize_color(raw_color)
            keyword = m.group(2)
            # Triple: (keyword, color, plain-text context of its paragraph).
            color_keywords.append((keyword, hex_color, plain_segment))
    plain_text = '\n\n'.join(plain_segments)
    return plain_text, color_keywords
def main():
    """CLI entry point: list images, or run text replacement + coloring."""
    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    args = parser.parse_args()
    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")
    # Pull <span color="...">text</span> markers out of every NEW value.
    plain_replacements = []
    span_keywords = []
    for old, raw_new in args.replace:
        plain_new, spans = _parse_span_replacement(raw_new)
        plain_replacements.append((old, plain_new))
        span_keywords.extend(spans)
    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=plain_replacements,
        color_keywords=span_keywords,
    )
if __name__ == '__main__':
    main()