Files
mcp/mcp_docx.py
2026-02-12 16:24:41 +08:00

384 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图片
用法:
# 列出文档中所有图片
python3 docx_editor.py input.docx --list-images
# 文本替换 + 颜色
python3 docx_editor.py input.docx output.docx \
--replace "原文" "新文" \
--color "关键词" "FF0000"
# 图片替换(按文档中出现的顺序,序号从 1 开始)
python3 docx_editor.py input.docx output.docx \
--image 1 new_chart.png \
--image 2 new_photo.jpg
# 同时替换文字和图片
python3 docx_editor.py input.docx output.docx \
--replace "旧标题" "新标题" \
--image 1 new_image.png \
--color "重点" "FF0000"
"""
import argparse
import os
import tempfile
import zipfile
from lxml import etree
from PIL import Image
# XML namespace URIs used to build qualified tag/attribute names below.
# WordprocessingML main namespace (w:p, w:r, w:t, w:rPr, w:color ...).
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# DrawingML "wordprocessingDrawing" namespace (inline / anchor / extent / docPr).
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
# DrawingML main namespace (the blip element that references an image).
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
# Relationships namespace (the embed attribute on blip).
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
# Relationship Type value that marks an entry in document.xml.rels as an image.
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
# File extension -> MIME type, used when registering a new image extension
# in [Content_Types].xml.
EXT_TO_MIME = {
    'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg',
    'gif': 'image/gif', 'bmp': 'image/bmp', 'tiff': 'image/tiff',
    'webp': 'image/webp',
}
def unpack(docx_path, out_dir):
    """Extract every member of the .docx (ZIP) archive into ``out_dir``.

    Uses ``zipfile`` directly instead of an external unpack script.
    """
    archive = zipfile.ZipFile(docx_path, 'r')
    try:
        archive.extractall(path=out_dir)
    finally:
        archive.close()
def pack(unpacked_dir, output_docx, original_docx):
    """Zip the contents of ``unpacked_dir`` into a new .docx at ``output_docx``.

    ``original_docx`` is accepted only to stay compatible with the previous
    function signature; it is not used.
    """
    # Make sure the directory that will hold the output file exists.
    target_dir = os.path.dirname(os.path.abspath(output_docx))
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    # Every regular file below unpacked_dir, in os.walk order.
    member_paths = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(unpacked_dir)
        for file_name in file_names
    )

    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        for member_path in member_paths:
            relative = os.path.relpath(member_path, unpacked_dir)
            # Archive member names always use '/' regardless of the host OS.
            archive.write(member_path, '/'.join(relative.split(os.sep)))
def build_image_index(unpacked_dir):
    """Return the images referenced by word/document.xml, in document order.

    Each ``blip`` element whose ``embed`` relationship id resolves to an
    image relationship in word/_rels/document.xml.rels yields one dict:

    * ``index``       1-based position in document order
    * ``rid``         relationship id used by the blip
    * ``media_file``  the relationship Target exactly as stored in the rels
    * ``abs_path``    path of the media file inside ``unpacked_dir``
    * ``ext``         lower-cased file extension without the dot
    * ``docpr_name``  ``name`` attribute of the enclosing docPr ('' if absent)
    * ``width_cm`` / ``height_cm``  extent cx/cy divided by 360000
      (EMU per centimetre), rounded to 2 decimals; ``None`` when the blip has
      no enclosing inline/anchor element or that element has no extent
    """
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')

    # Relationship id -> Target for embedded image relationships only.
    rid_to_media = {}
    for rel in etree.parse(rels_xml).getroot():
        if rel.get('Type', '') != REL_TYPE_IMAGE:
            continue
        target = rel.get('Target')
        # External (linked) targets and malformed entries have no file in
        # the package, so they cannot be listed or replaced.
        if not target or rel.get('TargetMode') == 'External':
            continue
        rid_to_media[rel.get('Id')] = target

    container_tags = (f'{{{WD}}}inline', f'{{{WD}}}anchor')
    doc_root = etree.parse(doc_xml).getroot()
    results = []
    for blip in doc_root.iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]

        # A Target starting with '/' is relative to the package root, not to
        # word/.  Joining it unchanged would make os.path.join discard the
        # unpack directory and point at an absolute filesystem path.
        if media_rel.startswith('/'):
            base_dir, rel_path = unpacked_dir, media_rel.lstrip('/')
        else:
            base_dir, rel_path = word_dir, media_rel
        media_abs = os.path.join(base_dir, rel_path.replace('/', os.sep))
        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()

        # Walk up to the inline/anchor element that carries size and name.
        container = blip
        while container is not None and container.tag not in container_tags:
            container = container.getparent()

        w_cm = h_cm = None
        docpr_name = ''
        if container is not None:
            ext_el = container.find(f'{{{WD}}}extent')
            if ext_el is not None:
                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
            dp = container.find(f'{{{WD}}}docPr')
            if dp is not None:
                docpr_name = dp.get('name', '')

        results.append({
            'index': len(results) + 1, 'rid': rid,
            'media_file': media_rel, 'abs_path': media_abs,
            'ext': ext, 'docpr_name': docpr_name,
            'width_cm': w_cm, 'height_cm': h_cm,
        })
    return results
def list_images(docx_path):
    """Print a table of the images found in the given .docx file."""
    images = get_images_info(docx_path)
    if not images:
        print("文档中没有找到图片。")
        return

    print(f"共找到 {len(images)} 张图片:\n")
    print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print(" " + "-" * 62)
    for entry in images:
        # A missing (or zero) width is reported as unknown size.
        if entry['width_cm']:
            dims = f"{entry['width_cm']}×{entry['height_cm']}cm"
        else:
            dims = "未知"
        file_name = os.path.basename(entry['media_file'])
        print(f" {entry['index']:<4} {file_name:<20} {dims:<18} {entry['docpr_name']}")
def get_images_info(docx_path):
    """Return structured information about every image in a DOCX file.

    Intended for reuse by other modules: it produces the same data that
    ``list_images`` displays, but prints nothing. The archive is unpacked
    into a temporary directory that is removed before returning.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        unpack(docx_path, scratch_dir)
        images = build_image_index(scratch_dir)
    return images
def replace_image(unpacked_dir, index, new_image_path):
    """Replace the ``index``-th image (1-based, document order) of an unpacked .docx.

    When the new file has the same format as the existing media file, the
    media file is overwritten in place. Otherwise the new image is converted
    with Pillow, stored next to the old one under the new extension, the old
    file is removed, and both word/_rels/document.xml.rels and
    [Content_Types].xml are updated.

    Raises:
        ValueError: if ``index`` is outside the range of images in the document.
    """
    import shutil

    imgs = build_image_index(unpacked_dir)
    if index < 1 or index > len(imgs):
        raise ValueError(f"图片序号 {index} 超出范围(共 {len(imgs)} 张)")
    info = imgs[index - 1]
    old_abs = info['abs_path']
    old_ext = info['ext']

    # Canonical spelling of extensions that name the same format.  Both sides
    # are normalized before comparing so that e.g. .jpg -> .jpg is treated as
    # "same format" instead of being re-encoded and renamed to .jpeg.
    aliases = {'jpg': 'jpeg', 'tif': 'tiff'}
    new_ext = os.path.splitext(new_image_path)[1].lstrip('.').lower()
    new_ext = aliases.get(new_ext, new_ext)
    old_canonical = aliases.get(old_ext, old_ext)

    print(f" 图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
          f"{os.path.basename(new_image_path)}({new_ext.upper()})")

    if old_canonical == new_ext:
        # Same format: overwrite the media file, keeping its existing name.
        shutil.copy2(new_image_path, old_abs)
        return

    # Different format: convert with Pillow, then update rels + content types.
    new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
    fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
           'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
    # The context manager closes the source file handle once saving is done.
    with Image.open(new_image_path) as src_img:
        out_img = src_img
        # Pillow's JPEG writer only accepts these modes; anything else
        # (alpha, palette, ...) is flattened to RGB first.
        if fmt == 'JPEG' and src_img.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            out_img = src_img.convert('RGB')
        out_img.save(new_abs, format=fmt)
    if os.path.abspath(new_abs) != os.path.abspath(old_abs):
        os.remove(old_abs)

    # Update the relationships file.
    old_media = info['media_file']
    new_media = os.path.splitext(old_media)[0] + '.' + new_ext
    word_dir = os.path.join(unpacked_dir, 'word')
    rels_path = os.path.join(word_dir, '_rels', 'document.xml.rels')
    rels_tree = etree.parse(rels_path)
    for rel in rels_tree.getroot():
        # Also retarget any other relationship that pointed at the file that
        # was just renamed, so none is left dangling.
        if rel.get('Id') == info['rid'] or rel.get('Target') == old_media:
            rel.set('Target', new_media)
    rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)

    # Register the new extension in [Content_Types].xml if it is not there yet.
    ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
    ct_tree = etree.parse(ct_path)
    ct_root = ct_tree.getroot()
    existing = {(el.get('Extension') or '').lower() for el in ct_root}
    if new_ext not in existing:
        # Create the element in the root's own namespace so it is a genuine
        # content-types Default entry rather than an un-namespaced element.
        ct_ns = etree.QName(ct_root).namespace
        default_tag = f'{{{ct_ns}}}Default' if ct_ns else 'Default'
        etree.SubElement(ct_root, default_tag, Extension=new_ext,
                         ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
    ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
    print(f" 格式转换 {old_ext}{new_ext}rels 和 ContentTypes 已更新")
def paragraph_replace(para_el, replacements):
    """Apply text replacements to each ``w:t`` node below ``para_el``.

    Only the text of ``w:t`` elements is rewritten; run properties, images
    and every other element are left untouched. ``replacements`` is a
    sequence of ``(old, new)`` pairs applied in order to each text node.
    """
    xml_space_attr = '{http://www.w3.org/XML/1998/namespace}space'
    for text_node in para_el.iter(f'{{{W}}}t'):
        original = text_node.text
        if not original:
            continue

        updated = original
        for old, new in replacements:
            updated = updated.replace(old, new)
        if updated == original:
            continue

        text_node.text = updated
        # Leading/trailing spaces are only kept by Word with xml:space="preserve".
        if updated and (updated.startswith(' ') or updated.endswith(' ')):
            text_node.set(xml_space_attr, 'preserve')
def ensure_rpr(run_el):
    """Return the run's ``w:rPr`` child, inserting an empty one first if missing."""
    rpr_tag = f'{{{W}}}rPr'
    existing = run_el.find(rpr_tag)
    if existing is not None:
        return existing
    created = etree.Element(rpr_tag)
    # Inserted as the first child of the run.
    run_el.insert(0, created)
    return created
def set_color_on_rpr(rpr_el, hex_color):
    """Set the ``w:color`` value of a run-properties element.

    A ``w:color`` child is appended when none exists; any leading ``#`` is
    stripped from ``hex_color`` before it is stored in ``w:val``.
    """
    color_tag = f'{{{W}}}color'
    color_el = rpr_el.find(color_tag)
    if color_el is None:
        color_el = etree.SubElement(rpr_el, color_tag)
    color_el.set(f'{{{W}}}val', hex_color.lstrip('#'))
def _split_by_keyword(text, keyword):
    """Split ``text`` into ordered ``(segment, is_keyword)`` pairs; ``keyword`` must be non-empty."""
    segments = []
    cursor = 0
    step = len(keyword)
    while True:
        hit = text.find(keyword, cursor)
        if hit == -1:
            break
        if hit > cursor:
            segments.append((text[cursor:hit], False))
        segments.append((keyword, True))
        cursor = hit + step
    if cursor < len(text):
        segments.append((text[cursor:], False))
    return segments


def _new_run_with_rpr(rpr_bytes):
    """Create an empty ``w:r`` carrying a copy of the serialized ``w:rPr`` (if any)."""
    new_r = etree.Element(f'{{{W}}}r')
    if rpr_bytes is not None:
        new_r.append(etree.fromstring(rpr_bytes))
    return new_r


def apply_color_to_keyword(doc_el, keyword, hex_color):
    """Colour only the occurrences of ``keyword``, not the whole run.

    Every run below ``doc_el`` whose text contains ``keyword`` is replaced by
    a sequence of runs ``[prefix][keyword][suffix]...`` that all copy the
    original run properties; only the keyword runs additionally get
    ``hex_color``. Non-text children of the run (drawings, tabs, breaks, ...)
    are moved into the replacement runs in their original order rather than
    being dropped. An empty ``keyword`` is a no-op.
    """
    # An empty keyword would match at every position and never advance.
    if not keyword:
        return

    run_tag = f'{{{W}}}r'
    t_tag = f'{{{W}}}t'
    rpr_tag = f'{{{W}}}rPr'
    xml_space_attr = '{http://www.w3.org/XML/1998/namespace}space'

    # Snapshot the runs first: the tree is modified while we go through them.
    for run in list(doc_el.iter(run_tag)):
        parent = run.getparent()
        if parent is None:
            continue

        # Describe the run content in order: a str for each group of
        # consecutive w:t nodes, the element itself for anything else.
        pieces = []
        for child in run:
            if child.tag == rpr_tag:
                continue
            if child.tag == t_tag:
                if pieces and isinstance(pieces[-1], str):
                    pieces[-1] += child.text or ''
                else:
                    pieces.append(child.text or '')
            else:
                pieces.append(child)
        if not any(isinstance(p, str) and keyword in p for p in pieces):
            continue

        # The original run properties are copied onto every replacement run.
        orig_rpr = run.find(rpr_tag)
        rpr_bytes = etree.tostring(orig_rpr) if orig_rpr is not None else None

        new_runs = []
        carrier = None  # run currently collecting consecutive non-text children
        for piece in pieces:
            if not isinstance(piece, str):
                if carrier is None:
                    carrier = _new_run_with_rpr(rpr_bytes)
                    new_runs.append(carrier)
                carrier.append(piece)  # lxml moves the node out of the old run
                continue
            carrier = None
            for seg_text, colored in _split_by_keyword(piece, keyword):
                new_r = _new_run_with_rpr(rpr_bytes)
                t_el = etree.SubElement(new_r, t_tag)
                t_el.text = seg_text
                if seg_text[0] == ' ' or seg_text[-1] == ' ':
                    t_el.set(xml_space_attr, 'preserve')
                if colored:
                    set_color_on_rpr(ensure_rpr(new_r), hex_color)
                new_runs.append(new_r)

        # Swap the original run for the replacement runs at the same position.
        insert_pos = parent.index(run)
        tail = run.tail
        parent.remove(run)
        for offset, new_r in enumerate(new_runs):
            parent.insert(insert_pos + offset, new_r)
        if new_runs and tail:
            new_runs[-1].tail = tail
def process(input_docx, output_docx, replacements, image_replacements,
            color_keywords):
    """Run the full edit pipeline on one .docx file.

    The input is unpacked into a temporary directory, then in order: images
    are replaced, text replacements are applied to word/document.xml,
    keywords are coloured, and the result is packed to ``output_docx``.

    Args:
        input_docx: path of the source .docx.
        output_docx: path of the .docx to write.
        replacements: iterable of ``(old, new)`` text pairs (may be empty/None).
        image_replacements: iterable of ``(index, image_path)`` pairs with a
            1-based image index (may be empty/None).
        color_keywords: iterable of ``(keyword, hex_color)`` pairs
            (may be empty/None).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, tmpdir)
        doc_xml_path = os.path.join(tmpdir, 'word', 'document.xml')

        if image_replacements:
            print(f"🖼️ 替换 {len(image_replacements)} 张图片...")
            for idx, new_img in image_replacements:
                replace_image(tmpdir, idx, new_img)

        tree = etree.parse(doc_xml_path)
        root = tree.getroot()

        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            # One pass over the whole tree: paragraph_replace visits every
            # descendant w:t, so calling it per w:p would process text inside
            # nested paragraphs (e.g. text boxes) more than once.
            paragraph_replace(root, replacements)

        # Colour the keywords extracted from the span markup.
        for keyword, color in (color_keywords or ()):
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(root, keyword, color)

        tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(tmpdir, output_docx, input_docx)
        print(f"✅ 完成!输出: {output_docx}")
def _parse_span_replacement(new_text):
"""
解析 NEW 文本中的 span 标签,用于决定颜色。
约定格式(不区分大小写):
<span color="FF0000">待补充</span>
<span color="#FF0000">待补充</span>
返回: (纯文本, [(keyword, hex_color), ...])
"""
import re
span_pattern = re.compile(
r'<span\s+[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
re.IGNORECASE | re.DOTALL,
)
color_keywords = []
def _repl(m):
hex_color = m.group(1).lstrip('#')
keyword = m.group(2)
color_keywords.append((keyword, hex_color))
return keyword
plain_text = span_pattern.sub(_repl, new_text)
return plain_text, color_keywords
def main():
    """Command-line entry point: parse arguments and run the requested action.

    ``--list-images`` prints the image table and exits. Otherwise an output
    path is required and the text / image / colour edits are applied.
    Colours can be given either with ``--color KEYWORD HEX`` or inline in a
    ``--replace`` NEW value as ``<span color="HEX">text</span>``.
    """
    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/图片/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
                        action='append', default=[], help='图片替换')
    # Documented in the module usage text; without this option argparse
    # rejects the documented command lines as "unrecognized arguments".
    parser.add_argument('--color', nargs=2, metavar=('KEYWORD', 'HEX'),
                        action='append', default=[], help='关键词着色')
    args = parser.parse_args()

    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")

    # Span colours: pull <span color="...">text</span> out of each NEW value.
    replacements = []
    color_keywords = []
    for old, new_raw in args.replace:
        new_plain, spans = _parse_span_replacement(new_raw)
        replacements.append((old, new_plain))
        color_keywords.extend(spans)
    # Explicit --color pairs are applied after the span-derived ones.
    color_keywords.extend((kw, hex_color.lstrip('#')) for kw, hex_color in args.color)

    # Report a bad image index as a usage error instead of a traceback.
    try:
        image_replacements = [(int(i), f) for i, f in args.image]
    except ValueError:
        parser.error("--image INDEX must be an integer")

    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=replacements,
        image_replacements=image_replacements,
        color_keywords=color_keywords,
    )
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()