Files
mcp/mcp_docx.py
2026-03-26 22:56:58 +08:00

633 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
docx_editor.py — 保留原格式替换文本 + 修改字体颜色
用法:
# 列出文档中所有图片
python3 docx_editor.py input.docx --list-images
# 文本替换 + 颜色
python3 docx_editor.py input.docx output.docx \
--replace "原文" "新文" \
--color "关键词" "FF0000"
"""
import argparse
import copy
import os
import tempfile
import zipfile
from lxml import etree
import re
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
def unpack(docx_path, out_dir):
    """Extract a .docx archive into *out_dir* (replaces the external unpack.py script)."""
    with zipfile.ZipFile(docx_path, 'r') as archive:
        archive.extractall(out_dir)
def pack(unpacked_dir, output_docx, original_docx):
    """
    Re-zip the modified directory tree into a .docx archive.

    *original_docx* is accepted only for compatibility with the old
    function signature; it is never used.
    """
    # Make sure the destination directory exists.
    target_dir = os.path.dirname(os.path.abspath(output_docx))
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)
    # Zip every file under the unpacked tree, keeping relative paths.
    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _, filenames in os.walk(unpacked_dir):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                # Entries inside a .docx always use '/' as the separator.
                entry_name = os.path.relpath(full_path, unpacked_dir).replace(os.sep, '/')
                archive.write(full_path, entry_name)
def build_image_index(unpacked_dir):
    """Return a list of image records in document order.

    Each record is a dict with: index (1-based, document order), rid
    (relationship id), media_file (target path from the rels file),
    abs_path (on-disk path inside *unpacked_dir*), ext (lowercased
    extension without dot), docpr_name (Word's internal picture name, ''
    when absent) and width_cm/height_cm (None when no extent is found).
    """
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')
    rels_root = etree.parse(rels_xml).getroot()
    # Map relationship id -> media target (e.g. "media/image1.png").
    rid_to_media = {}
    for rel in rels_root:
        if rel.get('Type', '') == REL_TYPE_IMAGE:
            rid_to_media[rel.get('Id')] = rel.get('Target')
    doc_root = etree.parse(doc_xml).getroot()
    results = []
    # Each <a:blip r:embed="rIdN"> references one embedded picture.
    for blip in doc_root.iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]
        media_abs = os.path.join(word_dir, media_rel.replace('/', os.sep))
        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()
        # Walk up to the enclosing <wp:inline>/<wp:anchor>, which carries
        # the display extent and the docPr name.
        inline = blip
        while inline is not None and inline.tag not in (f'{{{WD}}}inline', f'{{{WD}}}anchor'):
            inline = inline.getparent()
        w_cm = h_cm = None
        docpr_name = ''
        if inline is not None:
            ext_el = inline.find(f'{{{WD}}}extent')
            if ext_el is not None:
                # EMU -> centimetres: 360000 EMU == 1 cm.
                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
            dp = inline.find(f'{{{WD}}}docPr')
            if dp is not None:
                docpr_name = dp.get('name', '')
        results.append({
            'index': len(results) + 1, 'rid': rid,
            'media_file': media_rel, 'abs_path': media_abs,
            'ext': ext, 'docpr_name': docpr_name,
            'width_cm': w_cm, 'height_cm': h_cm,
        })
    return results
def list_images(docx_path):
    """Print a human-readable table of every image found in *docx_path*."""
    images = get_images_info(docx_path)
    if not images:
        print("文档中没有找到图片。")
        return
    print(f"共找到 {len(images)} 张图片:\n")
    print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print(" " + "-" * 62)
    for entry in images:
        dims = "未知" if not entry['width_cm'] else f"{entry['width_cm']}×{entry['height_cm']}cm"
        print(f" {entry['index']:<4} {os.path.basename(entry['media_file']):<20} {dims:<18} {entry['docpr_name']}")
def get_images_info(docx_path):
    """
    Collect structured information about every image in a DOCX file.

    Designed for reuse by other modules (e.g. the MCP server): same
    scanning logic as the original list_images, but with no printing.
    """
    with tempfile.TemporaryDirectory() as workdir:
        unpack(docx_path, workdir)
        return build_image_index(workdir)
def _normalize_newlines(text):
if text is None:
return ''
return str(text).replace('\r\n', '\n').replace('\r', '\n')
def _is_text_node(el):
    """True when *el* is a <w:t> text element."""
    return el.tag == '{%s}t' % W
def _is_break_node(el):
    """True when *el* is a line-break element (<w:br> or <w:cr>)."""
    return el.tag in {f'{{{W}}}br', f'{{{W}}}cr'}
def _is_tab_node(el):
    """True when *el* is a <w:tab> element."""
    return el.tag == '{' + W + '}tab'
def _iter_run_text_parts(run_el):
    """Yield (child, text) for each text-like child of a <w:r> run.

    <w:t> yields its newline-normalized text, <w:br>/<w:cr> yield '\\n'
    and <w:tab> yields '\\t'; any other child is skipped.
    """
    for node in run_el:
        if _is_text_node(node):
            yield node, _normalize_newlines(node.text or '')
        elif _is_break_node(node):
            yield node, '\n'
        elif _is_tab_node(node):
            yield node, '\t'
def _run_text(run_el):
    """Concatenate the visible text of a single <w:r> run."""
    pieces = [text for _, text in _iter_run_text_parts(run_el)]
    return ''.join(pieces)
def _paragraph_text(para_el):
    """Concatenate the visible text of every run in a paragraph."""
    return ''.join(map(_run_text, para_el.iter(f'{{{W}}}r')))
def _clear_run_text_like_children(run_el):
    """Remove every text/break/tab child from a run, leaving rPr etc. intact."""
    removable = [child for child in run_el
                 if _is_text_node(child) or _is_break_node(child) or _is_tab_node(child)]
    for child in removable:
        run_el.remove(child)
def _append_text_to_run(run_el, text):
    """Append *text* to a run, converting '\\n' into <w:br> elements.

    Each line becomes a <w:t>; xml:space="preserve" is set whenever the
    line starts or ends with whitespace so Word does not trim it.  The
    original only flagged literal spaces, but _iter_run_text_parts can
    feed tab characters into this text (a <w:tab> round-trips as '\\t'),
    and an unflagged edge tab would be lost — hence the generalized
    whitespace check.
    """
    text = _normalize_newlines(text)
    parts = text.split('\n')

    def _emit_t(content):
        # One <w:t> per line; flag significant edge whitespace.
        t_el = etree.SubElement(run_el, f'{{{W}}}t')
        t_el.text = content
        if content and (content[0].isspace() or content[-1].isspace()):
            t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')

    if len(parts) == 1:
        # Single line: always emit a <w:t>, even for empty text
        # (matches the original behavior).
        _emit_t(parts[0])
        return
    for idx, part in enumerate(parts):
        if part:
            _emit_t(part)
        if idx < len(parts) - 1:
            etree.SubElement(run_el, f'{{{W}}}br')
def _ensure_paragraph_run(para_el):
    """Return the paragraph's first run, creating one (after pPr) if absent."""
    existing = para_el.findall(f'.//{{{W}}}r')
    if existing:
        return existing[0]
    new_run = etree.Element(f'{{{W}}}r')
    ppr = para_el.find(f'{{{W}}}pPr')
    # Paragraph properties must stay first; insert the run right after them.
    position = 0 if ppr is None else para_el.index(ppr) + 1
    para_el.insert(position, new_run)
    return new_run
def _set_paragraph_text(para_el, text):
    """Replace a paragraph's entire visible text via its first text-bearing run."""
    runs = para_el.findall(f'.//{{{W}}}r')
    carriers = [run for run in runs if any(True for _ in _iter_run_text_parts(run))]
    if carriers:
        target = carriers[0]
        for run in carriers:
            _clear_run_text_like_children(run)
    else:
        target = _ensure_paragraph_run(para_el)
        _clear_run_text_like_children(target)
    _append_text_to_run(target, text)
def _paragraph_list(doc_el):
    """Snapshot every <w:p> under *doc_el* into a list (document order)."""
    return [para for para in doc_el.iter(f'{{{W}}}p')]
def _replace_paragraph_block(doc_el, old_text, new_text):
    """Replace a run of consecutive paragraphs matched by *old_text*.

    Both texts are split into segments on blank lines ('\\n\\n'); each
    segment corresponds to one paragraph.  Returns False unless old_text
    spans at least two segments and some consecutive paragraph window
    matches them exactly.  Shared paragraphs are rewritten in place;
    extra new segments are inserted as deep copies of the last matched
    paragraph (inheriting its formatting); surplus matched paragraphs
    are removed.  Returns True when a replacement happened.
    """
    old_segments = _normalize_newlines(old_text).split('\n\n')
    new_segments = _normalize_newlines(new_text).split('\n\n')
    # Single-segment replacements are paragraph_replace's job.
    if len(old_segments) <= 1:
        return False
    paras = _paragraph_list(doc_el)
    para_texts = [_paragraph_text(p) for p in paras]
    # Find the first paragraph window whose texts equal the old segments.
    match_start = None
    for i in range(0, len(para_texts) - len(old_segments) + 1):
        if para_texts[i:i + len(old_segments)] == old_segments:
            match_start = i
            break
    if match_start is None:
        return False
    matched_paras = paras[match_start:match_start + len(old_segments)]
    parent = matched_paras[0].getparent()
    if parent is None:
        return False
    # New paragraphs (if any) are inserted right after the last match.
    anchor_index = parent.index(matched_paras[-1])
    shared_count = min(len(matched_paras), len(new_segments))
    for idx in range(shared_count):
        _set_paragraph_text(matched_paras[idx], new_segments[idx])
    if len(new_segments) > len(matched_paras):
        # More new segments than matched paragraphs: clone the last
        # matched paragraph as a formatting template for each extra one.
        template_para = matched_paras[-1]
        insert_at = anchor_index + 1
        for seg in new_segments[len(matched_paras):]:
            new_para = copy.deepcopy(template_para)
            _set_paragraph_text(new_para, seg)
            parent.insert(insert_at, new_para)
            insert_at += 1
    elif len(new_segments) < len(matched_paras):
        # Fewer new segments: delete the leftover matched paragraphs.
        for para in matched_paras[len(new_segments):]:
            para_parent = para.getparent()
            if para_parent is not None:
                para_parent.remove(para)
    return True
def paragraph_replace(para_el, replacements):
    """
    Replace text at paragraph level, matching across <w:t> boundaries.

    Strategy:
    1. Collect the paragraph's runs and concatenate their visible text.
    2. Apply every (old, new) replacement on the joined string.
    3. If anything changed, clear all text-bearing runs and write the
       whole result back into the first one ('\\n' becomes <w:br> via
       _append_text_to_run).

    NOTE: writing everything into the first run means the replaced text
    takes that run's formatting; per-run formatting of later runs in the
    paragraph is not preserved.
    """
    # Collect the run elements (<w:r>) in order.
    runs = list(para_el.findall(f'.//{{{W}}}r'))
    if not runs:
        return
    # Keep only runs that actually carry text/break/tab content.
    text_runs = []
    for run in runs:
        if any(True for _ in _iter_run_text_parts(run)):
            text_runs.append(run)
    if not text_runs:
        return
    # Join the full paragraph text.
    full_text = _paragraph_text(para_el)
    original_text = full_text
    normalized_replacements = []
    for old, new in replacements:
        normalized_replacements.append((
            _normalize_newlines(old),
            _normalize_newlines(new),
        ))
    # Apply every replacement.
    for old, new in normalized_replacements:
        if old in full_text:
            full_text = full_text.replace(old, new)
    # Nothing changed: leave the paragraph untouched.
    if full_text == original_text:
        return
    print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")
    # Write the normalized text back into the first text run; '\n' is
    # converted into Word line-break nodes by _append_text_to_run.
    first_run = text_runs[0]
    for run in text_runs:
        _clear_run_text_like_children(run)
    _append_text_to_run(first_run, full_text)
def ensure_rpr(run_el):
    """Return the run's <w:rPr>, creating it as the first child when missing."""
    rpr = run_el.find(f'{{{W}}}rPr')
    if rpr is not None:
        return rpr
    rpr = etree.Element(f'{{{W}}}rPr')
    run_el.insert(0, rpr)
    return rpr
def set_color_on_rpr(rpr_el, hex_color):
    """Set (or overwrite) the <w:color> value on a run-properties element."""
    color_el = rpr_el.find(f'{{{W}}}color')
    if color_el is None:
        color_el = etree.SubElement(rpr_el, f'{{{W}}}color')
    color_el.set(f'{{{W}}}val', hex_color.lstrip('#'))
def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
    """
    Color only the matched keyword text itself, not the whole run.

    Each run containing the keyword is split into [prefix][keyword][suffix]
    runs, and only the keyword runs get *hex_color*.  When *context_text*
    is given, only paragraphs whose full text contains it are touched, so
    the same keyword in unrelated paragraphs (e.g. a lone digit 0) is
    left alone.
    """
    keyword = _normalize_newlines(keyword)
    # BUGFIX: guard against an empty keyword.  '' is "in" every string
    # and s.find('', start) == start, so the segment-splitting loop below
    # would append an empty segment and never advance — an infinite loop.
    if not keyword:
        return
    context_text = _normalize_newlines(context_text) if context_text is not None else None
    # With a context, restrict coloring to paragraphs that contain it.
    allowed_paras = None
    if context_text:
        allowed_paras = set()
        for p in doc_el.iter(f'{{{W}}}p'):
            full = _paragraph_text(p)
            if context_text in full:
                allowed_paras.add(p)
    def _find_ancestor_para(el):
        # Walk up to the enclosing <w:p>, or None if there is none.
        cur = el
        while cur is not None and cur.tag != f'{{{W}}}p':
            cur = cur.getparent()
        return cur
    # Snapshot the runs first so mutating the tree cannot upset iteration.
    runs = list(doc_el.iter(f'{{{W}}}r'))
    for run in runs:
        if allowed_paras is not None:
            para = _find_ancestor_para(run)
            if para not in allowed_paras:
                continue
        full_text = _run_text(run)
        if not full_text:
            continue
        if keyword not in full_text:
            continue
        parent = run.getparent()
        if parent is None:
            continue
        insert_pos = parent.index(run)
        # Copy the original run's rPr so the split runs keep its formatting.
        orig_rpr = run.find(f'{{{W}}}rPr')
        if orig_rpr is not None:
            rpr_bytes = etree.tostring(orig_rpr)
        else:
            rpr_bytes = None
        def make_run(text, colored):
            new_r = etree.Element(f'{{{W}}}r')
            if rpr_bytes is not None:
                new_r.append(etree.fromstring(rpr_bytes))
            _append_text_to_run(new_r, text)
            if colored:
                set_color_on_rpr(ensure_rpr(new_r), hex_color)
            return new_r
        # Split the run text into alternating (text, colored?) segments.
        segments = []
        s = full_text
        start = 0
        klen = len(keyword)
        while True:
            idx = s.find(keyword, start)
            if idx == -1:
                if start < len(s):
                    segments.append((s[start:], False))
                break
            if idx > start:
                segments.append((s[start:idx], False))
            segments.append((keyword, True))
            start = idx + klen
        # Swap the original run for the new segment runs.
        parent.remove(run)
        for offset, (seg_text, colored) in enumerate(segments):
            if seg_text:
                parent.insert(insert_pos + offset, make_run(seg_text, colored))
def remove_rule_blocks(doc_el):
    """
    Delete every paragraph between <global_rule>...</global_rule>,
    <rule>...</rule> and <chart_rule>...</chart_rule> tags (inclusive).

    Notes:
    - Tag content may span paragraphs: the walk tracks whether we are
      inside each block kind, and from an opening tag until its closing
      tag every paragraph (including empty ones) is removed.
    - Neither the tags nor their content are meant to survive in the
      final document.
    """
    inside = {'global_rule': False, 'rule': False, 'chart_rule': False}
    doomed = []
    # list(...) so tree mutation cannot interfere with the iteration.
    for para in list(doc_el.iter(f'{{{W}}}p')):
        text = ''.join(t.text or '' for t in para.iter(f'{{{W}}}t'))
        if not text:
            # Empty paragraphs inside a block are deleted too.
            if any(inside.values()):
                doomed.append(para)
            continue
        # Any paragraph while inside a block is marked for deletion.
        if any(inside.values()):
            doomed.append(para)
        # Open/close each block kind; the paragraph carrying a tag is
        # itself deleted (de-duplicated via the membership check).
        for tag in ('global_rule', 'rule', 'chart_rule'):
            if f'<{tag}>' in text:
                inside[tag] = True
                if para not in doomed:
                    doomed.append(para)
            if f'</{tag}>' in text:
                inside[tag] = False
    for para in doomed:
        holder = para.getparent()
        if holder is not None:
            holder.remove(para)
def process(input_docx, output_docx, replacements, color_keywords):
    """Full pipeline: unpack, strip rule blocks, replace text, colorize, repack."""
    with tempfile.TemporaryDirectory() as workdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, workdir)
        doc_xml_path = os.path.join(workdir, 'word', 'document.xml')
        tree = etree.parse(doc_xml_path)
        root = tree.getroot()
        # Drop global/normal/chart rule blocks first (tags may span paragraphs).
        remove_rule_blocks(root)
        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            single_para = []
            for old, new in replacements:
                # Multi-paragraph OLD texts go through the block matcher;
                # on a miss they fall back to per-paragraph replacement.
                if '\n\n' in _normalize_newlines(old) and _replace_paragraph_block(root, old, new):
                    print("🧩 跨段替换命中")
                    continue
                single_para.append((old, new))
            if single_para:
                for para in root.iter(f'{{{W}}}p'):
                    paragraph_replace(para, single_para)
        # Apply the colors parsed out of <span> markup.
        for item in color_keywords:
            # Tolerate the legacy 2-tuple form: (keyword, color).
            if len(item) == 2:
                keyword, color = item
                context_text = None
            else:
                keyword, color, context_text = item
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(root, keyword, color, context_text)
        tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(workdir, output_docx, input_docx)
        print(f"✅ 完成!输出: {output_docx}")
def _parse_span_replacement(new_text):
    """
    Parse <span> tags out of a NEW replacement string to derive colors.

    Accepted form (case-insensitive):
        <span color="red">待补充</span>

    Returns (plain_text, [(keyword, hex_color, paragraph_context), ...])
    where paragraph_context is the plain text of the '\\n\\n'-separated
    segment the span occurred in — apply_color_to_keyword uses it to
    scope coloring to that paragraph.  (The old docstring claimed
    2-tuples; the code has always emitted 3-tuples.)
    """
    # NOTE: the redundant function-local `import re` was removed — `re`
    # is already imported at module level.
    new_text = _normalize_newlines(new_text)
    # Small named-color -> hex map; extend as needed.
    named_colors = {
        'red': 'FF0000',
        'blue': '0000FF',
        'green': '00FF00',
        'yellow': 'FFFF00',
        'black': '000000',
        'white': 'FFFFFF',
        'gray': '808080',
        'grey': '808080',
    }
    def _normalize_color(raw_color: str) -> str:
        """
        Normalize a color spec to a 6-digit hex string without '#'.

        Accepts:
        - FFFFFF / ffffff
        - #FFFFFF / #ffffff
        - named colors such as red / blue (see named_colors)
        Unknown names fall through unchanged (minus '#', uppercased).
        """
        c = (raw_color or '').strip()
        if not c:
            return ''
        # Strip a leading '#'.
        if c.startswith('#'):
            c = c[1:]
        # Plain 6-digit hex.
        if re.fullmatch(r'[0-9a-fA-F]{6}', c):
            return c.upper()
        # Named color.
        mapped = named_colors.get(c.lower())
        if mapped:
            return mapped
        # Fallback: return the '#'-stripped original.
        return c.upper()
    # The color attribute may be:
    # - 6-digit hex (optionally '#'-prefixed)
    # - a named color (red / blue ...)
    span_pattern = re.compile(
        r'<span\s+[^>]*?color=["\']?([^"\'\s>]+)["\']?[^>]*>(.*?)</span>',
        re.IGNORECASE | re.DOTALL,
    )
    # Split on paragraph boundaries first so each span can use its own
    # paragraph's plain text as coloring context.
    def _strip_repl(m):
        return m.group(2)
    plain_segments = []
    color_keywords = []
    for segment in new_text.split('\n\n'):
        plain_segment = span_pattern.sub(_strip_repl, segment)
        plain_segments.append(plain_segment)
        for m in span_pattern.finditer(segment):
            raw_color = m.group(1)
            hex_color = _normalize_color(raw_color)
            keyword = m.group(2)
            # Triple: (keyword, color, plain-text context of its paragraph).
            color_keywords.append((keyword, hex_color, plain_segment))
    plain_text = '\n\n'.join(plain_segments)
    return plain_text, color_keywords
def main():
    """CLI entry point: list images, or run text replacement + coloring."""
    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    args = parser.parse_args()
    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")
    # Pull <span color="...">text</span> markers out of every NEW value.
    plain_replacements = []
    span_keywords = []
    for old, raw_new in args.replace:
        plain_new, spans = _parse_span_replacement(raw_new)
        plain_replacements.append((old, plain_new))
        span_keywords.extend(spans)
    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=plain_replacements,
        color_keywords=span_keywords,
    )
if __name__ == '__main__':
    main()