Files
mcp/mcp_docx.py
2026-03-20 19:19:26 +08:00

560 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
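docx_editor.py — replace text while preserving the original formatting,
change font colours, and replace images.

Usage:
    # List every image in the document
    python3 docx_editor.py input.docx --list-images

    # Text replacement + colouring
    python3 docx_editor.py input.docx output.docx \
        --replace "old text" "new text" \
        --color "keyword" "FF0000"

    # Image replacement (images are numbered in document order, starting at 1)
    python3 docx_editor.py input.docx output.docx \
        --image 1 new_chart.png \
        --image 2 new_photo.jpg

    # Replace text and images in the same run
    python3 docx_editor.py input.docx output.docx \
        --replace "old title" "new title" \
        --image 1 new_image.png \
        --color "highlight" "FF0000"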
"""
import argparse
import os
import tempfile
import zipfile
from lxml import etree
from PIL import Image
import re
# XML namespace URIs. They are interpolated into Clark-notation names
# ("{uri}tag") when searching for or creating elements and attributes.

# WordprocessingML main namespace (w:p, w:r, w:t, w:rPr, w:color, ...).
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# DrawingML "wordprocessingDrawing" namespace (inline / anchor / extent / docPr).
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
# DrawingML main namespace (the blip element that references an image).
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
# Relationships namespace; qualifies the "embed" attribute read from each blip.
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
# Relationship "Type" value that marks an entry of document.xml.rels as an image.
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
# Lower-case file extension -> MIME type, used when a new extension has to be
# registered in [Content_Types].xml.
EXT_TO_MIME = {
'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg',
'gif': 'image/gif', 'bmp': 'image/bmp', 'tiff': 'image/tiff',
'webp': 'image/webp',
}
def unpack(docx_path, out_dir):
    """Extract every member of the .docx (ZIP) archive into ``out_dir``.

    Uses the standard ``zipfile`` module directly instead of an external
    unpack script.

    Args:
        docx_path: Path of the .docx file to read.
        out_dir: Directory that receives the extracted package tree.
    """
    with zipfile.ZipFile(docx_path) as archive:
        archive.extractall(path=out_dir)
def pack(unpacked_dir, output_docx, original_docx):
    """Zip the (modified) unpacked tree back into a .docx file.

    Args:
        unpacked_dir: Root directory of the unpacked package.
        output_docx: Path of the archive to create; its parent directory is
            created when it does not exist yet.
        original_docx: Unused. Kept only so the signature stays compatible
            with the earlier implementation.
    """
    def _iter_members():
        # Yield (path on disk, name inside the archive) for every file.
        for folder, _subdirs, filenames in os.walk(unpacked_dir):
            for filename in filenames:
                on_disk = os.path.join(folder, filename)
                relative = os.path.relpath(on_disk, unpacked_dir)
                # Archive member names always use "/" as the separator.
                yield on_disk, relative.replace(os.sep, '/')

    target_dir = os.path.dirname(os.path.abspath(output_docx))
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    with zipfile.ZipFile(output_docx, mode='w',
                         compression=zipfile.ZIP_DEFLATED) as archive:
        for on_disk, member_name in _iter_members():
            archive.write(on_disk, member_name)
def build_image_index(unpacked_dir):
    """Return the images referenced by word/document.xml, in document order.

    Image relationships are read from word/_rels/document.xml.rels; every
    blip element whose ``embed`` id maps to one of them yields one entry.

    Args:
        unpacked_dir: Root directory of an unpacked .docx package.

    Returns:
        A list of dicts with the keys ``index`` (1-based), ``rid``,
        ``media_file`` (relationship target), ``abs_path`` (target joined
        onto the ``word`` directory), ``ext`` (lower-case, no dot),
        ``docpr_name``, ``width_cm`` and ``height_cm``. The two sizes are
        ``None`` when no enclosing inline/anchor element with an extent
        child is found.
    """
    word_dir = os.path.join(unpacked_dir, 'word')
    rels_root = etree.parse(
        os.path.join(word_dir, '_rels', 'document.xml.rels')).getroot()
    image_targets = {
        rel.get('Id'): rel.get('Target')
        for rel in rels_root
        if rel.get('Type', '') == REL_TYPE_IMAGE
    }

    doc_root = etree.parse(os.path.join(word_dir, 'document.xml')).getroot()
    embed_attr = f'{{{R}}}embed'
    container_tags = (f'{{{WD}}}inline', f'{{{WD}}}anchor')

    images = []
    for blip in doc_root.iter(f'{{{A}}}blip'):
        rid = blip.get(embed_attr)
        if not rid or rid not in image_targets:
            continue
        target = image_targets[rid]
        on_disk = os.path.join(word_dir, target.replace('/', os.sep))
        extension = os.path.splitext(target)[1].lstrip('.').lower()

        # Climb to the drawing container that carries the size and name.
        container = blip
        while container is not None and container.tag not in container_tags:
            container = container.getparent()

        width_cm = height_cm = None
        name = ''
        if container is not None:
            extent = container.find(f'{{{WD}}}extent')
            if extent is not None:
                # cx / cy are divided by 360000 and reported as centimetres.
                width_cm = round(int(extent.get('cx', 0)) / 360000, 2)
                height_cm = round(int(extent.get('cy', 0)) / 360000, 2)
            doc_pr = container.find(f'{{{WD}}}docPr')
            if doc_pr is not None:
                name = doc_pr.get('name', '')

        images.append({
            'index': len(images) + 1, 'rid': rid,
            'media_file': target, 'abs_path': on_disk,
            'ext': extension, 'docpr_name': name,
            'width_cm': width_cm, 'height_cm': height_cm,
        })
    return images
def list_images(docx_path):
    """Print a table of every image found in the given .docx file.

    Each row shows the 1-based index, the media file name, the size in
    centimetres (or a placeholder when the width is missing/zero) and the
    name stored in the drawing's docPr element.
    """
    found = get_images_info(docx_path)
    if not found:
        print("文档中没有找到图片。")
        return

    print(f"共找到 {len(found)} 张图片:\n")
    print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print(" " + "-" * 62)
    for img in found:
        if img['width_cm']:
            size = f"{img['width_cm']}×{img['height_cm']}cm"
        else:
            size = "未知"
        print(f" {img['index']:<4} {os.path.basename(img['media_file']):<20} {size:<18} {img['docpr_name']}")
def get_images_info(docx_path):
    """Return structured information about every image in a .docx file.

    Intended for reuse by other modules (for example an MCP server): it
    yields the same data ``list_images`` prints, without printing anything.

    The archive is unpacked into a temporary directory that is removed
    before this function returns, so the ``abs_path`` values in the result
    point at files that no longer exist.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        unpack(docx_path, scratch_dir)
        images = build_image_index(scratch_dir)
    return images
def replace_image(unpacked_dir, index, new_image_path):
    """Replace the ``index``-th image (1-based, document order) in an unpacked DOCX.

    When the replacement has the same format as the embedded image, the
    media file is simply overwritten. Otherwise the replacement is
    re-encoded with Pillow under the old base name with the new extension,
    the old media file is removed, and both word/_rels/document.xml.rels
    and [Content_Types].xml are updated.

    Args:
        unpacked_dir: Root directory of the unpacked .docx package.
        index: 1-based position of the image as reported by
            ``build_image_index``.
        new_image_path: Path of the replacement image; its format is taken
            from the file extension.

    Raises:
        ValueError: If ``index`` is outside the range of available images.
    """
    import shutil

    imgs = build_image_index(unpacked_dir)
    if index < 1 or index > len(imgs):
        raise ValueError(f"图片序号 {index} 超出范围(共 {len(imgs)} 张)")
    info = imgs[index - 1]
    old_abs = info['abs_path']
    old_ext = info['ext']
    new_ext = os.path.splitext(new_image_path)[1].lstrip('.').lower()
    if new_ext == 'jpg':
        new_ext = 'jpeg'
    print(f" 图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
          f"{os.path.basename(new_image_path)}({new_ext.upper()})")

    # "jpg" and "jpeg" are the same format. Normalise the embedded extension
    # as well, so a .jpg original is overwritten in place instead of being
    # re-encoded (lossy) and renamed.
    old_format = 'jpeg' if old_ext == 'jpg' else old_ext
    if old_format == new_ext:
        # Same format: overwrite the media file, nothing else changes.
        shutil.copy2(new_image_path, old_abs)
        return

    # Different format: convert with Pillow, then update rels + ContentTypes.
    new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
    fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
           'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
    # The context manager closes the source file handle once saving is done.
    with Image.open(new_image_path) as source_img:
        out_img = source_img
        # The JPEG writer only accepts a few modes; convert everything else
        # (RGBA, P, LA, ...) to RGB instead of letting save() fail.
        if fmt == 'JPEG' and source_img.mode not in (
                '1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            out_img = source_img.convert('RGB')
        out_img.save(new_abs, format=fmt)
    if os.path.abspath(new_abs) != os.path.abspath(old_abs):
        os.remove(old_abs)

    # Point the relationship at the renamed media file.
    old_media = info['media_file']
    new_media = os.path.splitext(old_media)[0] + '.' + new_ext
    word_dir = os.path.join(unpacked_dir, 'word')
    rels_path = os.path.join(word_dir, '_rels', 'document.xml.rels')
    rels_tree = etree.parse(rels_path)
    for rel in rels_tree.getroot():
        if rel.get('Id') == info['rid']:
            rel.set('Target', new_media)
            break
    rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)

    # Register the new extension in [Content_Types].xml when it is missing.
    ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
    ct_tree = etree.parse(ct_path)
    ct_root = ct_tree.getroot()
    # Compare case-insensitively so an existing "PNG" entry is recognised.
    existing = {(el.get('Extension') or '').lower() for el in ct_root}
    if new_ext not in existing:
        # Create the element in the same namespace as the root rather than
        # as an unqualified "Default".
        ct_ns = etree.QName(ct_root).namespace
        default_tag = f'{{{ct_ns}}}Default' if ct_ns else 'Default'
        etree.SubElement(ct_root, default_tag, Extension=new_ext,
                         ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
    ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
    print(f" 格式转换 {old_ext}{new_ext}rels 和 ContentTypes 已更新")
def paragraph_replace(para_el, replacements):
    """Apply text replacements to one paragraph, matching across <w:t> elements.

    Strategy:
      1. Collect the <w:t> elements of the runs that belong to this paragraph.
      2. Join their text and apply every replacement, in order.
      3. If the text changed, write the whole result into the first <w:t>
         and empty the others, so the text is never split at an awkward
         position.

    Args:
        para_el: The <w:p> element (lxml) to process. Modified in place.
        replacements: Iterable of ``(old, new)`` string pairs. Pairs with
            an empty ``old`` are ignored.
    """
    para_tag = f'{{{W}}}p'
    run_tag = f'{{{W}}}r'
    text_tag = f'{{{W}}}t'

    def _belongs_to_para(run):
        # A run inside a nested paragraph (e.g. a text box anchored in this
        # paragraph) belongs to that inner paragraph, not to para_el.
        # Merging it here would move its text into the outer paragraph.
        cur = run.getparent()
        while cur is not None and cur is not para_el and cur.tag != para_tag:
            cur = cur.getparent()
        return cur is para_el

    text_nodes = [
        t_el
        for run in para_el.iter(run_tag)
        if _belongs_to_para(run)
        for t_el in run.findall(text_tag)
    ]
    if not text_nodes:
        return

    original_text = ''.join(t_el.text or '' for t_el in text_nodes)
    full_text = original_text
    for old, new in replacements:
        # str.replace('', new) would insert `new` between every character.
        if not old:
            continue
        full_text = full_text.replace(old, new)

    if full_text == original_text:
        return
    print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")

    # All text goes into the first <w:t>; the remaining ones are emptied.
    first_t_el = text_nodes[0]
    first_t_el.text = full_text
    if full_text and (full_text[0] == ' ' or full_text[-1] == ' '):
        # Mark leading/trailing spaces as significant.
        first_t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
    for t_el in text_nodes[1:]:
        t_el.text = ''
def ensure_rpr(run_el):
    """Return the <w:rPr> child of ``run_el``, creating it when absent.

    A newly created <w:rPr> is inserted as the first child of the run.
    """
    rpr_tag = f'{{{W}}}rPr'
    existing = run_el.find(rpr_tag)
    if existing is not None:
        return existing
    created = etree.Element(rpr_tag)
    run_el.insert(0, created)
    return created
def set_color_on_rpr(rpr_el, hex_color):
    """Set the font colour on a <w:rPr> element.

    Reuses an existing <w:color> child or appends a new one, then stores
    ``hex_color`` (without any leading ``#``) in its ``w:val`` attribute.

    Args:
        rpr_el: The <w:rPr> element (lxml) to modify in place.
        hex_color: Colour value such as ``"FF0000"`` or ``"#FF0000"``.
    """
    color_el = rpr_el.find(f'{{{W}}}color')
    if color_el is None:
        color_el = etree.SubElement(rpr_el, f'{{{W}}}color')
    else:
        # A theme colour on an existing element takes precedence over w:val,
        # so the explicit colour would be ignored unless these are removed.
        for theme_attr in ('themeColor', 'themeTint', 'themeShade'):
            color_el.attrib.pop(f'{{{W}}}{theme_attr}', None)
    color_el.set(f'{{{W}}}val', hex_color.lstrip('#'))
def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
    """Colour only the occurrences of ``keyword``, not the whole run.

    Every run whose joined <w:t> text contains the keyword is replaced by a
    sequence of new runs ``[prefix][keyword][suffix]...``; each new run gets
    a copy of the original <w:rPr>, and only the keyword runs receive the
    colour.

    NOTE: the replacement runs contain only the copied <w:rPr> and one
    <w:t>; any other children of the original run are not carried over.

    Args:
        doc_el: Root element (lxml) to search. Modified in place.
        keyword: Text to colour. An empty keyword is a no-op.
        hex_color: Colour value passed on to ``set_color_on_rpr``.
        context_text: When given, only paragraphs whose full text contains
            this string are touched. This keeps a short keyword (for
            example a lone digit) from being coloured in unrelated
            paragraphs.
    """
    # An empty keyword "matches" at every position without consuming text:
    # the old scan loop (find + advance by len(keyword)) never terminated.
    if not keyword:
        return

    para_tag = f'{{{W}}}p'
    run_tag = f'{{{W}}}r'
    text_tag = f'{{{W}}}t'
    rpr_tag = f'{{{W}}}rPr'
    xml_space = '{http://www.w3.org/XML/1998/namespace}space'

    # With a context, restrict colouring to paragraphs that contain it.
    allowed_paras = None
    if context_text:
        allowed_paras = {
            p for p in doc_el.iter(para_tag)
            if context_text in ''.join(t.text or '' for t in p.iter(text_tag))
        }

    def _find_ancestor_para(el):
        # Nearest <w:p> at or above `el`, or None.
        cur = el
        while cur is not None and cur.tag != para_tag:
            cur = cur.getparent()
        return cur

    def _make_run(text, colored, rpr_bytes):
        # New run: copy of the original rPr (if any) plus a single <w:t>.
        new_r = etree.Element(run_tag)
        if rpr_bytes is not None:
            new_r.append(etree.fromstring(rpr_bytes))
        t_el = etree.SubElement(new_r, text_tag)
        t_el.text = text
        if text and (text[0] == ' ' or text[-1] == ' '):
            t_el.set(xml_space, 'preserve')
        if colored:
            set_color_on_rpr(ensure_rpr(new_r), hex_color)
        return new_r

    # Snapshot the runs first: the tree is modified while we walk them.
    for run in list(doc_el.iter(run_tag)):
        if allowed_paras is not None and _find_ancestor_para(run) not in allowed_paras:
            continue
        t_nodes = run.findall(text_tag)
        if not t_nodes:
            continue
        full_text = ''.join(t.text or '' for t in t_nodes)
        if keyword not in full_text:
            continue
        parent = run.getparent()
        if parent is None:
            continue

        orig_rpr = run.find(rpr_tag)
        rpr_bytes = etree.tostring(orig_rpr) if orig_rpr is not None else None

        # Split into alternating plain / keyword segments (non-overlapping,
        # left to right); empty plain pieces are dropped.
        segments = []
        for piece_no, piece in enumerate(full_text.split(keyword)):
            if piece_no:
                segments.append((keyword, True))
            if piece:
                segments.append((piece, False))

        # Swap the original run for the new runs at the same position.
        insert_pos = parent.index(run)
        parent.remove(run)
        for offset, (seg_text, colored) in enumerate(segments):
            parent.insert(insert_pos + offset, _make_run(seg_text, colored, rpr_bytes))
def remove_rule_blocks(doc_el):
    """Delete all paragraphs that form <global_rule>, <rule> or <chart_rule> blocks.

    Notes:
      - A block may span several paragraphs, so paragraphs are visited in
        document order while tracking which blocks are currently open.
      - A paragraph is deleted as a whole when it opens a block or when any
        block was already open on reaching it; this includes the paragraph
        holding the closing tag and empty paragraphs inside a block.
      - Neither the tags nor their content are expected in the final
        document.
    """
    tag_pairs = (
        ('<global_rule>', '</global_rule>'),
        ('<rule>', '</rule>'),
        ('<chart_rule>', '</chart_rule>'),
    )
    # One "currently open" flag per entry of tag_pairs.
    is_open = [False] * len(tag_pairs)
    doomed = []

    # Snapshot the paragraphs before anything is removed from the tree.
    for para in list(doc_el.iter(f'{{{W}}}p')):
        text = ''.join(t.text or '' for t in para.iter(f'{{{W}}}t'))
        # Whether a block was open on arrival decides deletion even when
        # this paragraph closes the block.
        delete_it = any(is_open)
        if text:
            for slot, (opening, closing) in enumerate(tag_pairs):
                if opening in text:
                    is_open[slot] = True
                    delete_it = True
                if closing in text:
                    is_open[slot] = False
        if delete_it:
            doomed.append(para)

    for para in doomed:
        holder = para.getparent()
        if holder is not None:
            holder.remove(para)
def process(input_docx, output_docx, replacements, image_replacements,
            color_keywords):
    """Run the full edit pipeline on a .docx file and write the result.

    Steps: unpack into a temporary directory, replace images, drop rule
    blocks, apply text replacements, colour keywords, write document.xml
    back and re-pack.

    Args:
        input_docx: Source .docx path.
        output_docx: Destination .docx path.
        replacements: List of ``(old, new)`` text pairs.
        image_replacements: List of ``(index, file)`` pairs (1-based index).
        color_keywords: List of ``(keyword, color)`` or
            ``(keyword, color, context_text)`` tuples.
    """
    with tempfile.TemporaryDirectory() as workdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, workdir)
        document_path = os.path.join(workdir, 'word', 'document.xml')

        if image_replacements:
            print(f"🖼️ 替换 {len(image_replacements)} 张图片...")
            for position, image_file in image_replacements:
                replace_image(workdir, position, image_file)

        tree = etree.parse(document_path)
        doc_root = tree.getroot()

        # Rule blocks go first (their tags may span several paragraphs).
        remove_rule_blocks(doc_root)

        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            for paragraph in doc_root.iter(f'{{{W}}}p'):
                paragraph_replace(paragraph, replacements)

        # Colour the keywords extracted from the span markup.
        for entry in color_keywords:
            context_text = None
            if len(entry) == 2:
                # Legacy shape: (keyword, color).
                keyword, color = entry
            else:
                keyword, color, context_text = entry
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(doc_root, keyword, color, context_text)

        tree.write(document_path, xml_declaration=True, encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(workdir, output_docx, input_docx)
    print(f"✅ 完成!输出: {output_docx}")
def _parse_span_replacement(new_text):
"""
解析 NEW 文本中的 span 标签,用于决定颜色。
约定格式(不区分大小写):
<span color="red">待补充</span>
返回: (纯文本, [(keyword, hex_color), ...])
"""
import re
# 简单的命名颜色到 16 进制的映射,可按需扩展
named_colors = {
'red': 'FF0000',
'blue': '0000FF',
'green': '00FF00',
'yellow': 'FFFF00',
'black': '000000',
'white': 'FFFFFF',
'gray': '808080',
'grey': '808080',
}
def _normalize_color(raw_color: str) -> str:
"""
支持:
- FFFFFF / ffffff
- #FFFFFF / #ffffff
- red / blue 等命名颜色(见 named_colors
返回不带 # 的大写 16 进制字符串;如果无法识别命名颜色则原样返回(去掉 #)。
"""
c = (raw_color or '').strip()
if not c:
return ''
# 去掉前导 #
if c.startswith('#'):
c = c[1:]
# 纯 16 进制
if re.fullmatch(r'[0-9a-fA-F]{6}', c):
return c.upper()
# 命名颜色
mapped = named_colors.get(c.lower())
if mapped:
return mapped
# 兜底:返回去掉 # 的原值
return c.upper()
# color 属性允许:
# - 6 位 16 进制(可带 #
# - 命名颜色red / blue ...
span_pattern = re.compile(
r'<span\s+[^>]*?color=["\']?([^"\'\s>]+)["\']?[^>]*>(.*?)</span>',
re.IGNORECASE | re.DOTALL,
)
# 先得到去掉 span 标签后的纯文本(也是最终会写入 DOCX 的内容)
def _strip_repl(m):
return m.group(2)
plain_text = span_pattern.sub(_strip_repl, new_text)
# 再次遍历 span收集颜色关键字并把“整句纯文本”作为上下文挂在每个关键字上
color_keywords = []
for m in span_pattern.finditer(new_text):
raw_color = m.group(1)
hex_color = _normalize_color(raw_color)
keyword = m.group(2)
# 三元组: (关键字, 颜色, 该 NEW 对应的整句纯文本上下文)
color_keywords.append((keyword, hex_color, plain_text))
return plain_text, color_keywords
def main():
    """Command-line entry point: parse arguments and run the requested action.

    ``--list-images`` prints the image table and exits. Otherwise an output
    path is required and the text / image / colour edits are applied via
    ``process``.
    """
    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/图片/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
                        action='append', default=[], help='图片替换')
    # The usage text at the top of the file documents --color, and process()
    # accepts (keyword, color) pairs, so the option is defined here.
    parser.add_argument('--color', nargs=2, metavar=('KEYWORD', 'HEX'),
                        action='append', default=[], help='关键词着色')
    args = parser.parse_args()

    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")

    # Span colours: pull <span color="...">text</span> out of each NEW value.
    replacements = []
    color_keywords = []
    for old, new_raw in args.replace:
        new_plain, spans = _parse_span_replacement(new_raw)
        replacements.append((old, new_plain))
        color_keywords.extend(spans)
    # Explicit --color pairs apply document-wide (no context restriction).
    color_keywords.extend((keyword, color) for keyword, color in args.color)

    # Report a bad index as a usage error instead of a raw traceback.
    image_replacements = []
    for raw_index, image_file in args.image:
        try:
            image_replacements.append((int(raw_index), image_file))
        except ValueError:
            parser.error(f"--image INDEX 必须是整数: {raw_index!r}")

    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=replacements,
        image_replacements=image_replacements,
        color_keywords=color_keywords,
    )
if __name__ == '__main__':
main()