1.0

2026-03-26 22:56:58 +08:00
parent a337aa4540
commit 2bb850e8e2
2 changed files with 564 additions and 265 deletions
--- a/mcp_docx.py
+++ b/mcp_docx.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图片
+docx_editor.py — 保留原格式替换文本 + 修改字体颜色

 用法:
  # 列出文档中所有图片
@@ -10,25 +10,14 @@ docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图
  python3 docx_editor.py input.docx output.docx \
    --replace "原文" "新文" \
    --color "关键词" "FF0000"
-
-  # 图片替换（按文档中出现的顺序，从1开始）
-  python3 docx_editor.py input.docx output.docx \
-    --image 1 new_chart.png \
-    --image 2 new_photo.jpg
-
-  # 同时替换文字和图片
-  python3 docx_editor.py input.docx output.docx \
-    --replace "旧标题" "新标题" \
-    --image 1 new_image.png \
-    --color "重点" "FF0000"
 """

 import argparse
+import copy
 import os
 import tempfile
 import zipfile
 from lxml import etree
-from PIL import Image
 import re

 W   = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
@@ -37,12 +26,6 @@ A   = 'http://schemas.openxmlformats.org/drawingml/2006/main'
 R   = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
 REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

-EXT_TO_MIME = {
-    'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg',
-    'gif': 'image/gif', 'bmp': 'image/bmp', 'tiff': 'image/tiff',
-    'webp': 'image/webp',
-}
-

 def unpack(docx_path, out_dir):
    """使用 zipfile 直接解包 .docx 到临时目录，替代外部 unpack.py 脚本。"""
@@ -141,61 +124,146 @@ def get_images_info(docx_path):
        return build_image_index(tmpdir)


-def replace_image(unpacked_dir, index, new_image_path):
-    """替换第 index 张图片（1-based）"""
-    imgs = build_image_index(unpacked_dir)
-    if index < 1 or index > len(imgs):
-        raise ValueError(f"图片序号 {index} 超出范围（共 {len(imgs)} 张）")
+def _normalize_newlines(text):
+    """Return *text* as ``str`` with CRLF and bare CR folded to LF; ``None`` becomes ''."""
+    as_str = '' if text is None else str(text)
+    return as_str.replace('\r\n', '\n').replace('\r', '\n')

-    info     = imgs[index - 1]
-    old_abs  = info['abs_path']
-    old_ext  = info['ext']
-    new_ext  = os.path.splitext(new_image_path)[1].lstrip('.').lower()
-    if new_ext == 'jpg':
-        new_ext = 'jpeg'

-    print(f"    图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
-          f" ← {os.path.basename(new_image_path)}({new_ext.upper()})")
+def _is_text_node(el):  # True for a WordprocessingML text element (<w:t>)
+    return el.tag == etree.QName(W, 't').text

-    if old_ext == new_ext:
-        # ── 同格式：直接覆盖 ──────────────────────────────
-        import shutil
-        shutil.copy2(new_image_path, old_abs)

+def _is_break_node(el):  # line breaks only ('\n'); page/column <w:br> must survive text rewrites
+    return el.tag == f'{{{W}}}cr' or (el.tag == f'{{{W}}}br' and el.get(f'{{{W}}}type') in (None, 'textWrapping'))
+
+
+def _is_tab_node(el):  # True for a <w:tab> element
+    return el.tag == etree.QName(W, 'tab').text
+
+
+def _iter_run_text_parts(run_el):
+    """Yield ``(child, text)`` for each direct text, break or tab child of *run_el*, in order."""
+    for node in run_el:
+        if _is_text_node(node):
+            yield node, _normalize_newlines(node.text or '')
+        elif _is_break_node(node) or _is_tab_node(node):
+            # Breaks and tabs hold no text of their own; they read as '\n' and '\t'.
+            yield node, '\n' if _is_break_node(node) else '\t'
+
+
+def _run_text(run_el):  # text of one run, joined from the pieces of _iter_run_text_parts
+    return ''.join([text for _node, text in _iter_run_text_parts(run_el)])
+
+
+def _paragraph_text(para_el):  # concatenated text of every <w:r> found anywhere below para_el
+    return ''.join(map(_run_text, para_el.iter(etree.QName(W, 'r').text)))
+
+
+def _clear_run_text_like_children(run_el):
+    """Remove every direct text, break and tab child of *run_el*; other children are kept."""
+    for node, _text in list(_iter_run_text_parts(run_el)):
+        run_el.remove(node)
+
+
+def _append_text_to_run(run_el, text):
+    """Append *text* to *run_el*: '\\n' becomes <w:br/>, '\\t' becomes <w:tab/>, the rest <w:t>.
+
+    This mirrors _iter_run_text_parts, so a tab read from a run is written back as a tab
+    element rather than a raw character in <w:t>. Empty *text* yields one empty <w:t>.
+    """
+    text = _normalize_newlines(text)
+    if not text:
+        etree.SubElement(run_el, f'{{{W}}}t').text = ''
+        return
+    # The capturing group makes re.split keep the '\n' / '\t' separators as list items.
+    for piece in re.split(r'([\n\t])', text):
+        if piece in ('\n', '\t'):
+            etree.SubElement(run_el, f'{{{W}}}br' if piece == '\n' else f'{{{W}}}tab')
+        elif piece:
+            t_el = etree.SubElement(run_el, f'{{{W}}}t')
+            t_el.text = piece
+            if piece[0] == ' ' or piece[-1] == ' ':
+                t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+
+
+def _ensure_paragraph_run(para_el):
+    """Return the first <w:r> below *para_el*, inserting an empty one (after <w:pPr> if present) when none exists."""
+    found = para_el.find(f'.//{{{W}}}r')
+    if found is not None:
+        return found
+    props = para_el.find(f'{{{W}}}pPr')
+    created = etree.Element(f'{{{W}}}r')
+    if props is None:
+        para_el.insert(0, created)
    else:
-        # ── 不同格式：Pillow 转换 + 更新 rels + ContentTypes
-        new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
-        img = Image.open(new_image_path)
-        fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
-               'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
-        if fmt == 'JPEG' and img.mode in ('RGBA', 'P'):
-            img = img.convert('RGB')
-        img.save(new_abs, format=fmt)
-        if os.path.abspath(new_abs) != os.path.abspath(old_abs):
-            os.remove(old_abs)
+        para_el.insert(para_el.index(props) + 1, created)
+    return created

-        # 更新 rels
-        old_media = info['media_file']
-        new_media = os.path.splitext(old_media)[0] + '.' + new_ext
-        word_dir  = os.path.join(unpacked_dir, 'word')
-        rels_path = os.path.join(word_dir, '_rels', 'document.xml.rels')
-        rels_tree = etree.parse(rels_path)
-        for rel in rels_tree.getroot():
-            if rel.get('Id') == info['rid']:
-                rel.set('Target', new_media)
-                break
-        rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)

-        # 更新 ContentTypes
-        ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
-        ct_tree = etree.parse(ct_path)
-        ct_root = ct_tree.getroot()
-        existing = {el.get('Extension', '') for el in ct_root}
-        if new_ext not in existing:
-            etree.SubElement(ct_root, 'Default', Extension=new_ext,
-                             ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
-        ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
-        print(f"      格式转换 {old_ext}→{new_ext}，rels 和 ContentTypes 已更新")
+def _set_paragraph_text(para_el, text):
+    """Replace the text of *para_el* with *text*, written into a single run.
+
+    Runs that hold text, break or tab children are emptied; the first one receives *text*.
+    """
+    all_runs = para_el.findall(f'.//{{{W}}}r')
+    carriers = [r for r in all_runs if next(_iter_run_text_parts(r), None) is not None]
+    for carrier in carriers:
+        _clear_run_text_like_children(carrier)
+    target = carriers[0] if carriers else _ensure_paragraph_run(para_el)
+    if not carriers:
+        _clear_run_text_like_children(target)  # target came from _ensure_paragraph_run
+    _append_text_to_run(target, text)
+
+
+def _paragraph_list(doc_el):  # every <w:p> in the subtree of doc_el, in document order
+    return [*doc_el.iter(etree.QName(W, 'p').text)]
+
+
+def _replace_paragraph_block(doc_el, old_text, new_text):
+    """Replace consecutive paragraphs matching *old_text* with the paragraphs of *new_text*.
+
+    Both texts are split on blank lines (two newlines) into per-paragraph segments. The
+    first window of paragraphs under *doc_el* whose texts equal the old segments is
+    rewritten; extra new segments become copies of the last matched paragraph and extra
+    matched paragraphs are removed. Returns True after a replacement, False when
+    *old_text* is a single segment or no window matches.
+    """
+    old_segments = _normalize_newlines(old_text).split('\n\n')
+    new_segments = _normalize_newlines(new_text).split('\n\n')
+    width = len(old_segments)
+    if width <= 1:
+        return False
+
+    paras = _paragraph_list(doc_el)
+    para_texts = [_paragraph_text(p) for p in paras]
+    match_start = next(
+        (i for i in range(len(para_texts) - width + 1) if para_texts[i:i + width] == old_segments),
+        None,
+    )
+    if match_start is None:
+        return False
+
+    matched_paras = paras[match_start:match_start + width]
+    if matched_paras[0].getparent() is None:
+        return False
+    for para, segment in zip(matched_paras, new_segments):
+        _set_paragraph_text(para, segment)
+
+    if len(new_segments) > width:
+        template_para = matched_paras[-1]
+        # Insert next to the template inside the template's own parent: matched paragraphs
+        # can sit in different containers, and index() on another parent raises ValueError.
+        anchor_parent = template_para.getparent()
+        insert_at = anchor_parent.index(template_para) + 1
+        for offset, seg in enumerate(new_segments[width:]):
+            new_para = copy.deepcopy(template_para)
+            _set_paragraph_text(new_para, seg)
+            anchor_parent.insert(insert_at + offset, new_para)
+    for para in matched_paras[len(new_segments):]:
+        if para.getparent() is not None:
+            para.getparent().remove(para)
+    return True


 def paragraph_replace(para_el, replacements):
@@ -213,20 +281,27 @@ def paragraph_replace(para_el, replacements):
        return

    # 收集所有文本元素及其位置信息
-    t_elements = []
+    text_runs = []
    for run in runs:
-        for t_el in run.findall(f'{{{W}}}t'):
-            t_elements.append((run, t_el))
+        if any(True for _ in _iter_run_text_parts(run)):
+            text_runs.append(run)

-    if not t_elements:
+    if not text_runs:
        return

    # 拼接完整文本
-    full_text = ''.join(t_el.text or '' for _, t_el in t_elements)
+    full_text = _paragraph_text(para_el)
    original_text = full_text

-    # 执行所有替换
+    normalized_replacements = []
    for old, new in replacements:
+        normalized_replacements.append((
+            _normalize_newlines(old),
+            _normalize_newlines(new),
+        ))
+
+    # 执行所有替换
+    for old, new in normalized_replacements:
        if old in full_text:
            full_text = full_text.replace(old, new)

@@ -236,16 +311,11 @@ def paragraph_replace(para_el, replacements):

    print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")

-    # 将新文本重新分配到原有的 <w:t> 元素中
-    # 策略：将所有文本放入第一个元素，清空其他元素，避免不当切分导致换行
-    _, first_t_el = t_elements[0]
-    first_t_el.text = full_text
-    if full_text and (full_text[0] == ' ' or full_text[-1] == ' '):
-        first_t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
-
-    # 清空其他 <w:t> 元素
-    for i in range(1, len(t_elements)):
-        t_elements[i][1].text = ''
+    # 将规范化文本重新写回第一个文本 run，\n 会回写成 Word 的换行节点。
+    first_run = text_runs[0]
+    for run in text_runs:
+        _clear_run_text_like_children(run)
+    _append_text_to_run(first_run, full_text)


 def ensure_rpr(run_el):
@@ -271,13 +341,15 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
    当 context_text 不为空时，只在“整段文本包含该 context_text 的段落”中进行上色，
    避免同一个关键字在其他段落里被误伤（例如单独的数字 0）。
    """
+    keyword = _normalize_newlines(keyword)
+    context_text = _normalize_newlines(context_text) if context_text is not None else None
+
    # 如果提供了上下文，只在包含该上下文的段落内着色
    allowed_paras = None
    if context_text:
        allowed_paras = set()
        for p in doc_el.iter(f'{{{W}}}p'):
-            t_nodes = list(p.iter(f'{{{W}}}t'))
-            full = ''.join(t.text or '' for t in t_nodes)
+            full = _paragraph_text(p)
            if context_text in full:
                allowed_paras.add(p)

@@ -294,10 +366,9 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
            para = _find_ancestor_para(run)
            if para not in allowed_paras:
                continue
-        t_nodes = list(run.findall(f'{{{W}}}t'))
-        if not t_nodes:
+        full_text = _run_text(run)
+        if not full_text:
            continue
-        full_text = ''.join(t.text or '' for t in t_nodes)
        if keyword not in full_text:
            continue

@@ -317,10 +388,7 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
            new_r = etree.Element(f'{{{W}}}r')
            if rpr_bytes is not None:
                new_r.append(etree.fromstring(rpr_bytes))
-            t_el = etree.SubElement(new_r, f'{{{W}}}t')
-            t_el.text = text
-            if text and (text[0] == ' ' or text[-1] == ' '):
-                t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+            _append_text_to_run(new_r, text)
            if colored:
                set_color_on_rpr(ensure_rpr(new_r), hex_color)
            return new_r
@@ -405,19 +473,13 @@ def remove_rule_blocks(doc_el):
        if parent is not None:
            parent.remove(p)

-def process(input_docx, output_docx, replacements, image_replacements,
-            color_keywords):
+def process(input_docx, output_docx, replacements, color_keywords):
    with tempfile.TemporaryDirectory() as tmpdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, tmpdir)

        doc_xml_path = os.path.join(tmpdir, 'word', 'document.xml')

-        if image_replacements:
-            print(f"🖼️  替换 {len(image_replacements)} 张图片...")
-            for idx, new_img in image_replacements:
-                replace_image(tmpdir, idx, new_img)
-
        tree = etree.parse(doc_xml_path)
        root = tree.getroot()

@@ -426,8 +488,17 @@ def process(input_docx, output_docx, replacements, image_replacements,

        if replacements:
            print(f"✏️  替换 {len(replacements)} 条文本...")
-            for para in root.iter(f'{{{W}}}p'):
-                paragraph_replace(para, replacements)
+            remaining_replacements = []
+            for old, new in replacements:
+                if '\n\n' in _normalize_newlines(old):
+                    replaced = _replace_paragraph_block(root, old, new)
+                    if replaced:
+                        print("🧩 跨段替换命中")
+                        continue
+                remaining_replacements.append((old, new))
+            if remaining_replacements:
+                for para in root.iter(f'{{{W}}}p'):
+                    paragraph_replace(para, remaining_replacements)

        # 根据 span 解析出的关键字上色
        for item in color_keywords:
@@ -457,6 +528,8 @@ def _parse_span_replacement(new_text):
    """
    import re

+    new_text = _normalize_newlines(new_text)
+
    # 简单的命名颜色到 16 进制的映射，可按需扩展
    named_colors = {
        'red': 'FF0000',
@@ -505,33 +578,33 @@ def _parse_span_replacement(new_text):
        re.IGNORECASE | re.DOTALL,
    )

-    # 先得到去掉 span 标签后的纯文本（也是最终会写入 DOCX 的内容）
+    # 先按段落边界拆分，这样 span 上色时可以使用所在段落作为上下文。
    def _strip_repl(m):
        return m.group(2)

-    plain_text = span_pattern.sub(_strip_repl, new_text)
-
-    # 再次遍历 span，收集颜色关键字，并把“整句纯文本”作为上下文挂在每个关键字上
+    plain_segments = []
    color_keywords = []
-    for m in span_pattern.finditer(new_text):
-        raw_color = m.group(1)
-        hex_color = _normalize_color(raw_color)
-        keyword = m.group(2)
-        # 三元组: (关键字, 颜色, 该 NEW 对应的整句纯文本上下文)
-        color_keywords.append((keyword, hex_color, plain_text))
+    for segment in new_text.split('\n\n'):
+        plain_segment = span_pattern.sub(_strip_repl, segment)
+        plain_segments.append(plain_segment)
+        for m in span_pattern.finditer(segment):
+            raw_color = m.group(1)
+            hex_color = _normalize_color(raw_color)
+            keyword = m.group(2)
+            # 三元组: (关键字, 颜色, 所在段落的纯文本上下文)
+            color_keywords.append((keyword, hex_color, plain_segment))

+    plain_text = '\n\n'.join(plain_segments)
    return plain_text, color_keywords


 def main():
-    parser = argparse.ArgumentParser(description='DOCX 格式保留：替换文本/图片/颜色')
+    parser = argparse.ArgumentParser(description='DOCX 格式保留：替换文本/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
-    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
-                        action='append', default=[], help='图片替换')
    args = parser.parse_args()

    if args.list_images:
@@ -549,12 +622,11 @@ def main():
        color_keywords.extend(spans)

    process(
-        input_docx        = args.input,
-        output_docx       = args.output,
-        replacements      = replacements,
-        image_replacements= [(int(i), f) for i, f in args.image],
-        color_keywords    = color_keywords,
+        input_docx=args.input,
+        output_docx=args.output,
+        replacements=replacements,
+        color_keywords=color_keywords,
    )

 if __name__ == '__main__':
-    main()
+    main()