def _replace_in_segments(segments, old, new):
    """Replace *old* with *new* in the concatenation of *segments*.

    Matching is done on the joined text (so a match may span several
    segments), using the same left-to-right, non-overlapping rule as
    ``str.replace``. The result is a list with the same number of segments:
    characters that are not part of a match stay in the segment they came
    from, and each replacement string is placed in the segment that held the
    first character of its match.

    *old* must be non-empty.
    """
    joined = ''.join(segments)
    # owners[i] is the index of the segment that character i of `joined`
    # originally belonged to.
    owners = [idx for idx, seg in enumerate(segments) for _ in seg]
    pieces = [[] for _ in segments]

    pos = 0
    hit = joined.find(old)
    while hit != -1:
        # Unmatched text before the hit stays with its original owner.
        for i in range(pos, hit):
            pieces[owners[i]].append(joined[i])
        # The replacement goes where the match started; the matched
        # characters owned by later segments are simply dropped.
        pieces[owners[hit]].append(new)
        pos = hit + len(old)
        hit = joined.find(old, pos)
    for i in range(pos, len(joined)):
        pieces[owners[i]].append(joined[i])

    return [''.join(parts) for parts in pieces]


def paragraph_replace(para_el, replacements):
    """Replace text inside a paragraph, matching across text elements.

    The text elements considered are the ``{W}t`` children of every ``{W}r``
    descendant of *para_el*, in document order (``W`` is the namespace
    constant defined at module level elsewhere in this file). Their texts are
    concatenated for matching, so a search string that is split over several
    elements is still found.

    Only the elements that take part in a match are modified: text outside a
    match stays in the element it was in, and the replacement is written into
    the element where the match begins. This keeps the boundaries between
    runs (and therefore their formatting) stable for untouched text.

    Args:
        para_el: the paragraph element to modify in place.
        replacements: iterable of ``(old, new)`` string pairs, applied in
            order; a later pair can match text produced by an earlier one.
            Pairs with an empty ``old`` are ignored.

    Returns:
        None. The paragraph is modified in place; nothing is touched when no
        replacement changes the paragraph text.
    """
    t_elements = [
        t_el
        for run in para_el.findall(f'.//{{{W}}}r')
        for t_el in run.findall(f'{{{W}}}t')
    ]
    if not t_elements:
        return

    original_segments = [t_el.text or '' for t_el in t_elements]
    segments = original_segments
    for old, new in replacements:
        # An empty search string would match between every character.
        if old and old in ''.join(segments):
            segments = _replace_in_segments(segments, old, new)

    original_text = ''.join(original_segments)
    full_text = ''.join(segments)
    if full_text == original_text:
        return

    # NOTE(review): this writes to stdout; confirm that is intended for the
    # way this module is run.
    print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")

    for t_el, before, after in zip(t_elements, original_segments, segments):
        if after == before:
            continue
        t_el.text = after
        # Leading/trailing whitespace is only kept by consumers when the
        # element is marked xml:space="preserve".
        if after != after.strip():
            t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')