first commit

This commit is contained in:
2026-02-12 16:24:41 +08:00
commit 1b4f81a9bc
6 changed files with 674 additions and 0 deletions

384
mcp_docx.py Normal file
View File

@@ -0,0 +1,384 @@
#!/usr/bin/env python3
"""
docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图片
用法:
# 列出文档中所有图片
python3 docx_editor.py input.docx --list-images
# 文本替换 + 颜色
python3 docx_editor.py input.docx output.docx \
--replace "原文" "新文" \
--color "关键词" "FF0000"
# 图片替换按文档中出现的顺序从1开始
python3 docx_editor.py input.docx output.docx \
--image 1 new_chart.png \
--image 2 new_photo.jpg
# 同时替换文字和图片
python3 docx_editor.py input.docx output.docx \
--replace "旧标题" "新标题" \
--image 1 new_image.png \
--color "重点" "FF0000"
"""
import argparse
import os
import tempfile
import zipfile
from lxml import etree
from PIL import Image
# XML namespace URIs used when building Clark-notation ("{uri}tag") names.
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

# Relationship "Type" value that marks an entry of document.xml.rels as an image.
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

# File extension (lower case, no dot) -> MIME type written to [Content_Types].xml.
EXT_TO_MIME = {
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'gif': 'image/gif',
    'bmp': 'image/bmp',
    'tiff': 'image/tiff',
    'webp': 'image/webp',
}
def unpack(docx_path, out_dir):
    """Extract every member of the .docx (ZIP) archive into ``out_dir``.

    Uses the standard-library ``zipfile`` module directly instead of an
    external unpack script.
    """
    with zipfile.ZipFile(docx_path) as archive:
        archive.extractall(path=out_dir)
def pack(unpacked_dir, output_docx, original_docx):
    """Zip the contents of ``unpacked_dir`` into a .docx at ``output_docx``.

    Args:
        unpacked_dir: Directory holding the extracted package parts.
        output_docx: Path of the archive to create; missing parent
            directories are created.
        original_docx: Unused; kept only so the signature stays compatible
            with earlier callers.

    Entries are written in a deterministic order with ``[Content_Types].xml``
    first, so repeated runs produce the same member layout and tools that
    sniff the first archive entry see the content-types part.
    """
    out_dir = os.path.dirname(os.path.abspath(output_docx))
    os.makedirs(out_dir, exist_ok=True)

    # Collect the member list BEFORE opening the output archive, so an output
    # path located inside unpacked_dir is never picked up by the walk.
    members = []
    for root, _, files in os.walk(unpacked_dir):
        for fname in files:
            abs_path = os.path.join(root, fname)
            # Part names inside a .docx always use "/" as the separator.
            arcname = os.path.relpath(abs_path, unpacked_dir).replace(os.sep, '/')
            members.append((arcname, abs_path))

    # False sorts before True: the content-types part goes first, the rest
    # follow alphabetically.
    members.sort(key=lambda item: (item[0] != '[Content_Types].xml', item[0]))

    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        for arcname, abs_path in members:
            zf.write(abs_path, arcname)
def build_image_index(unpacked_dir):
    """Return the embedded images of ``word/document.xml`` in document order.

    Args:
        unpacked_dir: Directory containing the extracted .docx package.

    Returns:
        A list of dicts, one per ``<a:blip r:embed=...>`` whose relationship
        is an image relationship, with keys ``index`` (1-based), ``rid``,
        ``media_file`` (the relationship Target, verbatim), ``abs_path``,
        ``ext`` (lower case, no dot), ``docpr_name``, ``width_cm`` and
        ``height_cm`` (``None`` when no extent element was found).
    """
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')

    # Map relationship id -> Target for image relationships only.
    rid_to_media = {}
    for rel in etree.parse(rels_xml).getroot():
        if rel.get('Type', '') != REL_TYPE_IMAGE:
            continue
        target = rel.get('Target')
        if target:  # a relationship without a Target cannot be resolved
            rid_to_media[rel.get('Id')] = target

    inline_tags = (f'{{{WD}}}inline', f'{{{WD}}}anchor')
    results = []
    for blip in etree.parse(doc_xml).getroot().iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]

        # A Target starting with "/" is relative to the package root, not to
        # word/; joining it onto word_dir would make os.path.join discard the
        # prefix and point outside the unpacked tree.
        if media_rel.startswith('/'):
            media_abs = os.path.join(unpacked_dir, *media_rel.lstrip('/').split('/'))
        else:
            media_abs = os.path.join(word_dir, media_rel.replace('/', os.sep))
        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()

        # Walk up to the enclosing inline/anchor element, which carries the
        # extent (size in EMU) and docPr (name) children.
        container = blip
        while container is not None and container.tag not in inline_tags:
            container = container.getparent()

        w_cm = h_cm = None
        docpr_name = ''
        if container is not None:
            ext_el = container.find(f'{{{WD}}}extent')
            if ext_el is not None:
                # 360000 EMU per centimetre.
                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
            dp = container.find(f'{{{WD}}}docPr')
            if dp is not None:
                docpr_name = dp.get('name', '')

        results.append({
            'index': len(results) + 1, 'rid': rid,
            'media_file': media_rel, 'abs_path': media_abs,
            'ext': ext, 'docpr_name': docpr_name,
            'width_cm': w_cm, 'height_cm': h_cm,
        })
    return results
def list_images(docx_path):
    """Print a table of the images found in the given .docx file.

    One row per image: its 1-based index, media file name, size in
    centimetres (or a placeholder when the width is missing/zero) and the
    name Word stores for it.
    """
    images = get_images_info(docx_path)
    if images:
        print(f"共找到 {len(images)} 张图片:\n")
        print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
        print(" " + "-" * 62)
        for entry in images:
            if entry['width_cm']:
                dims = f"{entry['width_cm']}×{entry['height_cm']}cm"
            else:
                dims = "未知"
            media_name = os.path.basename(entry['media_file'])
            print(f" {entry['index']:<4} {media_name:<20} {dims:<18} {entry['docpr_name']}")
    else:
        print("文档中没有找到图片。")
def get_images_info(docx_path):
    """Return structured information about every image in a .docx file.

    Meant for reuse by other modules (for example an MCP server): it unpacks
    the document into a temporary directory, builds the image index and
    returns it without printing anything.
    """
    with tempfile.TemporaryDirectory() as workdir:
        unpack(docx_path, workdir)
        images = build_image_index(workdir)
    return images
def replace_image(unpacked_dir, index, new_image_path):
    """Replace the ``index``-th image (1-based, document order) in place.

    Args:
        unpacked_dir: Directory containing the extracted .docx package.
        index: 1-based position of the image as reported by
            ``build_image_index``.
        new_image_path: Path of the replacement image file.

    Raises:
        ValueError: If ``index`` is outside the range of available images.

    When the replacement has the same format as the existing media file the
    bytes are copied over it. Otherwise the image is re-encoded with Pillow
    under the old base name with the new extension, and the relationship
    Target and ``[Content_Types].xml`` are updated to match.
    """
    import shutil

    def canonical(ext):
        # Treat spelling variants of one format as equal, so that e.g.
        # image1.jpg replaced by photo.jpg is a plain copy, not a re-encode.
        return {'jpg': 'jpeg', 'tif': 'tiff'}.get(ext, ext)

    imgs = build_image_index(unpacked_dir)
    if index < 1 or index > len(imgs):
        raise ValueError(f"图片序号 {index} 超出范围(共 {len(imgs)} 张)")
    info = imgs[index - 1]
    old_abs = info['abs_path']
    old_ext = info['ext']
    new_ext = canonical(os.path.splitext(new_image_path)[1].lstrip('.').lower())
    print(f" 图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
          f"{os.path.basename(new_image_path)}({new_ext.upper()})")

    if canonical(old_ext) == new_ext:
        # Same format: overwrite the media file, nothing else changes.
        shutil.copy2(new_image_path, old_abs)
        return

    # Different format: convert with Pillow, then update rels + content types.
    new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
    fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
           'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
    with Image.open(new_image_path) as img:
        # JPEG cannot store alpha or palette data; convert anything that is
        # not already a plain JPEG-compatible mode.
        if fmt == 'JPEG' and img.mode not in ('RGB', 'L', 'CMYK'):
            converted = img.convert('RGB')
        else:
            converted = img
        converted.save(new_abs, format=fmt)

    # Point the relationship at the renamed media file.
    old_media = info['media_file']
    new_media = os.path.splitext(old_media)[0] + '.' + new_ext
    word_dir = os.path.join(unpacked_dir, 'word')
    rels_path = os.path.join(word_dir, '_rels', 'document.xml.rels')
    rels_tree = etree.parse(rels_path)
    still_referenced = False
    for rel in rels_tree.getroot():
        if rel.get('Id') == info['rid']:
            rel.set('Target', new_media)
        elif rel.get('Target') == old_media:
            # Another relationship in this part still uses the old file.
            still_referenced = True
    rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)

    # Only delete the old media once nothing in document.xml.rels needs it.
    if not still_referenced and os.path.abspath(new_abs) != os.path.abspath(old_abs):
        os.remove(old_abs)

    # Register the new extension in [Content_Types].xml if it is missing.
    ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
    ct_tree = etree.parse(ct_path)
    ct_root = ct_tree.getroot()
    existing = {(el.get('Extension') or '').lower() for el in ct_root}
    if new_ext not in existing:
        # Create the element in the same namespace as the root instead of as
        # an un-namespaced tag.
        ct_ns = etree.QName(ct_root).namespace
        default_tag = f'{{{ct_ns}}}Default' if ct_ns else 'Default'
        etree.SubElement(ct_root, default_tag, Extension=new_ext,
                         ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
    ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
    print(f" 格式转换 {old_ext}{new_ext}rels 和 ContentTypes 已更新")
def paragraph_replace(para_el, replacements):
    """Apply text replacements to every ``<w:t>`` element under ``para_el``.

    Args:
        para_el: Element whose descendant ``<w:t>`` nodes are rewritten.
        replacements: Iterable of ``(old, new)`` string pairs, applied in
            order to each text node. Pairs with an empty ``old`` are ignored.

    Only the text of ``<w:t>`` nodes is touched, so images and run formatting
    are left alone. A search string that is split across several ``<w:t>``
    nodes is not matched.
    """
    # str.replace('', new) would inject `new` between every character, so an
    # empty search string is never a meaningful rule.
    rules = [(old, new) for old, new in replacements if old]
    if not rules:
        return

    for t_el in para_el.iter(f'{{{W}}}t'):
        original = t_el.text
        if not original:
            continue
        updated = original
        for old, new in rules:
            updated = updated.replace(old, new)
        if updated == original:
            continue
        t_el.text = updated
        # Leading/trailing spaces need xml:space="preserve" to be kept.
        if updated and (updated[0] == ' ' or updated[-1] == ' '):
            t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
def ensure_rpr(run_el):
    """Return the ``<w:rPr>`` child of ``run_el``, creating it if absent.

    A newly created ``<w:rPr>`` is inserted as the first child of the run.
    """
    tag = f'{{{W}}}rPr'
    existing = run_el.find(tag)
    if existing is not None:
        return existing
    created = etree.Element(tag)
    run_el.insert(0, created)
    return created
def set_color_on_rpr(rpr_el, hex_color):
    """Set the run colour on a ``<w:rPr>`` element to an explicit RGB value.

    Args:
        rpr_el: The ``<w:rPr>`` element to modify.
        hex_color: Six-digit hex colour; a leading ``#`` is stripped.

    An existing ``<w:color>`` is reused. Its theme-colour attributes are
    removed because a theme colour supersedes ``w:val``; leaving them in
    place would make the new colour invisible.
    """
    color_el = rpr_el.find(f'{{{W}}}color')
    if color_el is None:
        color_el = etree.SubElement(rpr_el, f'{{{W}}}color')
    else:
        for attr in ('themeColor', 'themeTint', 'themeShade'):
            color_el.attrib.pop(f'{{{W}}}{attr}', None)
    color_el.set(f'{{{W}}}val', hex_color.lstrip('#'))
def _split_on_keyword(text, keyword):
    """Split ``text`` into ordered ``(segment, is_keyword)`` pairs.

    ``keyword`` must be non-empty; empty segments are never emitted.
    """
    segments = []
    start = 0
    while True:
        idx = text.find(keyword, start)
        if idx == -1:
            break
        if idx > start:
            segments.append((text[start:idx], False))
        segments.append((keyword, True))
        start = idx + len(keyword)
    if start < len(text):
        segments.append((text[start:], False))
    return segments


def _new_run_like(rpr):
    """Create an empty ``<w:r>`` carrying a copy of ``rpr`` (may be None)."""
    from copy import deepcopy

    new_r = etree.Element(f'{{{W}}}r')
    if rpr is not None:
        new_r.append(deepcopy(rpr))
    return new_r


def _append_text(run_el, text):
    """Append a ``<w:t>`` holding ``text`` to ``run_el``."""
    t_el = etree.SubElement(run_el, f'{{{W}}}t')
    t_el.text = text
    # Leading/trailing spaces need xml:space="preserve" to be kept.
    if text and (text[0] == ' ' or text[-1] == ' '):
        t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')


def apply_color_to_keyword(doc_el, keyword, hex_color):
    """Colour only the occurrences of ``keyword``, not the whole run.

    Every run whose text contains the keyword is replaced by several runs
    ([prefix][keyword][suffix]...), each carrying a copy of the original run
    properties; only the keyword runs get the colour.

    Args:
        doc_el: Root element to search for ``<w:r>`` runs.
        keyword: Text to colour. An empty keyword is a no-op.
        hex_color: Six-digit hex colour, passed to ``set_color_on_rpr``.

    Non-text children of a run (drawings, tabs, breaks, ...) are moved into
    the rebuilt uncoloured runs in their original order instead of being
    dropped. A keyword split across separate runs is not matched.
    """
    if not keyword:
        # str.find('') matches at every offset without advancing, so the
        # split loop would never terminate.
        return

    r_tag = f'{{{W}}}r'
    t_tag = f'{{{W}}}t'
    rpr_tag = f'{{{W}}}rPr'

    # Snapshot the runs first: the tree is modified while we go.
    for run in list(doc_el.iter(r_tag)):
        parent = run.getparent()
        if parent is None:
            continue

        # Group the run content: consecutive <w:t> nodes are merged into one
        # string, every other child is kept as an element. Text is NOT joined
        # across a non-text child, so a tab or image never gets swallowed by
        # a match.
        chunks = []
        for child in run:
            if child.tag == rpr_tag:
                continue
            if child.tag == t_tag:
                if chunks and isinstance(chunks[-1], str):
                    chunks[-1] += child.text or ''
                else:
                    chunks.append(child.text or '')
            else:
                chunks.append(child)
        if not any(isinstance(c, str) and keyword in c for c in chunks):
            continue

        orig_rpr = run.find(rpr_tag)
        new_runs = []
        plain = None  # uncoloured run currently being filled
        for chunk in chunks:
            if not isinstance(chunk, str):
                if plain is None:
                    plain = _new_run_like(orig_rpr)
                    new_runs.append(plain)
                plain.append(chunk)  # moves the element out of the old run
                continue
            for seg_text, colored in _split_on_keyword(chunk, keyword):
                if colored:
                    kw_run = _new_run_like(orig_rpr)
                    _append_text(kw_run, seg_text)
                    set_color_on_rpr(ensure_rpr(kw_run), hex_color)
                    new_runs.append(kw_run)
                    plain = None  # following content starts a fresh run
                else:
                    if plain is None:
                        plain = _new_run_like(orig_rpr)
                        new_runs.append(plain)
                    _append_text(plain, seg_text)

        # Swap the original run for the rebuilt ones at the same position.
        insert_pos = parent.index(run)
        parent.remove(run)
        for offset, new_r in enumerate(new_runs):
            parent.insert(insert_pos + offset, new_r)
def process(input_docx, output_docx, replacements, image_replacements,
            color_keywords):
    """Run the full edit pipeline: unpack, edit, repack.

    Args:
        input_docx: Path of the source .docx.
        output_docx: Path of the .docx to write.
        replacements: List of ``(old, new)`` text pairs (may be empty/None).
        image_replacements: List of ``(index, path)`` pairs, 1-based image
            index and replacement file (may be empty/None).
        color_keywords: List of ``(keyword, hex_color)`` pairs (may be
            empty/None).

    Only ``word/document.xml`` is edited; other parts are repacked unchanged
    apart from the files ``replace_image`` updates.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, tmpdir)
        doc_xml_path = os.path.join(tmpdir, 'word', 'document.xml')

        if image_replacements:
            print(f"🖼️ 替换 {len(image_replacements)} 张图片...")
            for idx, new_img in image_replacements:
                replace_image(tmpdir, idx, new_img)

        tree = etree.parse(doc_xml_path)
        root = tree.getroot()

        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            # One pass over the whole tree. Calling paragraph_replace once per
            # <w:p> would visit text inside nested paragraphs (e.g. text
            # boxes) twice and apply the replacements twice.
            paragraph_replace(root, replacements)

        # Colour the keywords extracted from <span> markup.
        for keyword, color in color_keywords or ():
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(root, keyword, color)

        tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(tmpdir, output_docx, input_docx)
        print(f"✅ 完成!输出: {output_docx}")
def _parse_span_replacement(new_text):
"""
解析 NEW 文本中的 span 标签,用于决定颜色。
约定格式(不区分大小写):
<span color="FF0000">待补充</span>
<span color="#FF0000">待补充</span>
返回: (纯文本, [(keyword, hex_color), ...])
"""
import re
span_pattern = re.compile(
r'<span\s+[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
re.IGNORECASE | re.DOTALL,
)
color_keywords = []
def _repl(m):
hex_color = m.group(1).lstrip('#')
keyword = m.group(2)
color_keywords.append((keyword, hex_color))
return keyword
plain_text = span_pattern.sub(_repl, new_text)
return plain_text, color_keywords
def main():
    """Command-line entry point: parse arguments and run the requested edit.

    ``--list-images`` only prints the image table. Otherwise an output path
    is required and the text, image and colour edits are applied in one pass.
    Colours come from ``<span color=...>`` markup inside ``--replace`` NEW
    values and from explicit ``--color KEYWORD HEX`` pairs.
    """
    import re

    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/图片/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
                        action='append', default=[], help='图片替换')
    # The usage text at the top of the file documents --color, so the parser
    # has to accept it.
    parser.add_argument('--color', nargs=2, metavar=('KEYWORD', 'HEX'),
                        action='append', default=[], help='关键词着色')
    args = parser.parse_args()

    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")

    # Pull <span color="...">text</span> out of each NEW value.
    replacements = []
    color_keywords = []
    for old, new_raw in args.replace:
        new_plain, spans = _parse_span_replacement(new_raw)
        replacements.append((old, new_plain))
        color_keywords.extend(spans)

    # Explicit --color pairs are validated here so a bad value is reported as
    # a usage error instead of producing an invalid colour attribute.
    for keyword, hex_color in args.color:
        if not re.fullmatch(r'#?[0-9a-fA-F]{6}', hex_color):
            parser.error(f"--color: invalid hex colour {hex_color!r}")
        color_keywords.append((keyword, hex_color.lstrip('#')))

    image_replacements = []
    for raw_index, image_file in args.image:
        try:
            image_replacements.append((int(raw_index), image_file))
        except ValueError:
            parser.error(f"--image: INDEX must be an integer, got {raw_index!r}")

    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=replacements,
        image_replacements=image_replacements,
        color_keywords=color_keywords,
    )


if __name__ == '__main__':
    main()