1.0
This commit is contained in:
312
mcp_docx.py
312
mcp_docx.py
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图片
|
||||
docx_editor.py — 保留原格式替换文本 + 修改字体颜色
|
||||
|
||||
用法:
|
||||
# 列出文档中所有图片
|
||||
@@ -10,25 +10,14 @@ docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图
|
||||
python3 docx_editor.py input.docx output.docx \
|
||||
--replace "原文" "新文" \
|
||||
--color "关键词" "FF0000"
|
||||
|
||||
# 图片替换(按文档中出现的顺序,从1开始)
|
||||
python3 docx_editor.py input.docx output.docx \
|
||||
--image 1 new_chart.png \
|
||||
--image 2 new_photo.jpg
|
||||
|
||||
# 同时替换文字和图片
|
||||
python3 docx_editor.py input.docx output.docx \
|
||||
--replace "旧标题" "新标题" \
|
||||
--image 1 new_image.png \
|
||||
--color "重点" "FF0000"
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
from lxml import etree
|
||||
from PIL import Image
|
||||
import re
|
||||
|
||||
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
@@ -37,12 +26,6 @@ A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
|
||||
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
|
||||
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
|
||||
|
||||
EXT_TO_MIME = {
|
||||
'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg',
|
||||
'gif': 'image/gif', 'bmp': 'image/bmp', 'tiff': 'image/tiff',
|
||||
'webp': 'image/webp',
|
||||
}
|
||||
|
||||
|
||||
def unpack(docx_path, out_dir):
|
||||
"""使用 zipfile 直接解包 .docx 到临时目录,替代外部 unpack.py 脚本。"""
|
||||
@@ -141,61 +124,146 @@ def get_images_info(docx_path):
|
||||
return build_image_index(tmpdir)
|
||||
|
||||
|
||||
def replace_image(unpacked_dir, index, new_image_path):
|
||||
"""替换第 index 张图片(1-based)"""
|
||||
imgs = build_image_index(unpacked_dir)
|
||||
if index < 1 or index > len(imgs):
|
||||
raise ValueError(f"图片序号 {index} 超出范围(共 {len(imgs)} 张)")
|
||||
def _normalize_newlines(text):
|
||||
if text is None:
|
||||
return ''
|
||||
return str(text).replace('\r\n', '\n').replace('\r', '\n')
|
||||
|
||||
info = imgs[index - 1]
|
||||
old_abs = info['abs_path']
|
||||
old_ext = info['ext']
|
||||
new_ext = os.path.splitext(new_image_path)[1].lstrip('.').lower()
|
||||
if new_ext == 'jpg':
|
||||
new_ext = 'jpeg'
|
||||
|
||||
print(f" 图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
|
||||
f" ← {os.path.basename(new_image_path)}({new_ext.upper()})")
|
||||
def _is_text_node(el):
    """Return True when *el* is a WordprocessingML ``<w:t>`` element."""
    text_tag = '{%s}t' % W
    return el.tag == text_tag
|
||||
|
||||
if old_ext == new_ext:
|
||||
# ── 同格式:直接覆盖 ──────────────────────────────
|
||||
import shutil
|
||||
shutil.copy2(new_image_path, old_abs)
|
||||
|
||||
def _is_break_node(el):
    """Return True when *el* is a line break inside a run.

    ``<w:cr>`` always counts. ``<w:br>`` counts only when it is a
    text-wrapping break, i.e. its ``w:type`` attribute is absent or equal to
    ``"textWrapping"``. Page and column breaks (``w:type="page"`` /
    ``"column"``) are deliberately excluded: treating them as ``\\n`` would
    let the text-rewriting helpers delete them and write back an ordinary
    line break, silently changing the document's pagination.
    """
    if el.tag == f'{{{W}}}cr':
        return True
    if el.tag != f'{{{W}}}br':
        return False
    # ``w:type`` defaults to "textWrapping" when the attribute is omitted.
    return el.get(f'{{{W}}}type', 'textWrapping') == 'textWrapping'
|
||||
|
||||
|
||||
def _is_tab_node(el):
    """Return True when *el* is a WordprocessingML ``<w:tab>`` element."""
    tab_tag = '{%s}tab' % W
    return el.tag == tab_tag
|
||||
|
||||
|
||||
def _iter_run_text_parts(run_el):
    r"""Yield ``(child, text)`` pairs for the text-like children of a run.

    Direct children of *run_el* are visited in order. Text nodes contribute
    their newline-normalized text (empty string when the node has no text),
    break nodes contribute ``"\n"`` and tab nodes contribute ``"\t"``.
    Children of any other kind are skipped.
    """
    for node in run_el:
        if _is_text_node(node):
            fragment = _normalize_newlines(node.text or '')
        elif _is_break_node(node):
            fragment = '\n'
        elif _is_tab_node(node):
            fragment = '\t'
        else:
            # Not text-like (e.g. run properties): contributes nothing.
            continue
        yield node, fragment
|
||||
|
||||
|
||||
def _run_text(run_el):
    """Return the concatenated text of all text-like children of *run_el*."""
    pieces = [fragment for _node, fragment in _iter_run_text_parts(run_el)]
    return ''.join(pieces)
|
||||
|
||||
|
||||
def _paragraph_text(para_el):
    """Return the text of every ``<w:r>`` found anywhere under *para_el*.

    Runs are visited in document order via ``iter`` (so nested runs are
    included) and their texts are joined without a separator.
    """
    run_tag = f'{{{W}}}r'
    return ''.join(map(_run_text, para_el.iter(run_tag)))
|
||||
|
||||
|
||||
def _clear_run_text_like_children(run_el):
    """Remove every text, break and tab child from *run_el* in place.

    Other children (for example run properties) are left untouched.
    """
    # Collect first, then remove, so the run is not mutated while iterating.
    doomed = [
        node for node in run_el
        if _is_text_node(node) or _is_break_node(node) or _is_tab_node(node)
    ]
    for node in doomed:
        run_el.remove(node)
|
||||
|
||||
|
||||
def _append_text_to_run(run_el, text):
    r"""Append *text* to *run_el* as WordprocessingML run content.

    The text is newline-normalized, then written as a sequence of child
    elements appended to the end of the run:

    * each ``"\n"`` becomes a ``<w:br/>`` element;
    * each ``"\t"`` becomes a ``<w:tab/>`` element, mirroring how
      ``_iter_run_text_parts`` reads tabs, so a read/rewrite round trip
      keeps tab elements instead of degrading them to literal characters;
    * every non-empty stretch of text in between becomes a ``<w:t>`` element,
      flagged ``xml:space="preserve"`` when it starts or ends with
      whitespace so the padding survives.

    An entirely empty *text* still produces one empty ``<w:t>``, so the run
    keeps a text node for later passes to find.

    Args:
        run_el: The ``<w:r>`` element to append to (modified in place).
        text: The replacement text; ``None`` is treated as empty.
    """
    xml_space = '{http://www.w3.org/XML/1998/namespace}space'
    text = _normalize_newlines(text)

    if not text:
        placeholder = etree.SubElement(run_el, f'{{{W}}}t')
        placeholder.text = ''
        return

    lines = text.split('\n')
    last_line = len(lines) - 1
    for line_no, line in enumerate(lines):
        cells = line.split('\t')
        last_cell = len(cells) - 1
        for cell_no, cell in enumerate(cells):
            if cell:
                t_el = etree.SubElement(run_el, f'{{{W}}}t')
                t_el.text = cell
                if cell[0].isspace() or cell[-1].isspace():
                    t_el.set(xml_space, 'preserve')
            if cell_no < last_cell:
                etree.SubElement(run_el, f'{{{W}}}tab')
        if line_no < last_line:
            etree.SubElement(run_el, f'{{{W}}}br')
|
||||
|
||||
|
||||
def _ensure_paragraph_run(para_el):
|
||||
runs = list(para_el.findall(f'.//{{{W}}}r'))
|
||||
if runs:
|
||||
return runs[0]
|
||||
|
||||
ppr = para_el.find(f'{{{W}}}pPr')
|
||||
new_r = etree.Element(f'{{{W}}}r')
|
||||
if ppr is None:
|
||||
para_el.insert(0, new_r)
|
||||
else:
|
||||
# ── 不同格式:Pillow 转换 + 更新 rels + ContentTypes
|
||||
new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
|
||||
img = Image.open(new_image_path)
|
||||
fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
|
||||
'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
|
||||
if fmt == 'JPEG' and img.mode in ('RGBA', 'P'):
|
||||
img = img.convert('RGB')
|
||||
img.save(new_abs, format=fmt)
|
||||
if os.path.abspath(new_abs) != os.path.abspath(old_abs):
|
||||
os.remove(old_abs)
|
||||
para_el.insert(para_el.index(ppr) + 1, new_r)
|
||||
return new_r
|
||||
|
||||
# 更新 rels
|
||||
old_media = info['media_file']
|
||||
new_media = os.path.splitext(old_media)[0] + '.' + new_ext
|
||||
word_dir = os.path.join(unpacked_dir, 'word')
|
||||
rels_path = os.path.join(word_dir, '_rels', 'document.xml.rels')
|
||||
rels_tree = etree.parse(rels_path)
|
||||
for rel in rels_tree.getroot():
|
||||
if rel.get('Id') == info['rid']:
|
||||
rel.set('Target', new_media)
|
||||
break
|
||||
rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)
|
||||
|
||||
# 更新 ContentTypes
|
||||
ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
|
||||
ct_tree = etree.parse(ct_path)
|
||||
ct_root = ct_tree.getroot()
|
||||
existing = {el.get('Extension', '') for el in ct_root}
|
||||
if new_ext not in existing:
|
||||
etree.SubElement(ct_root, 'Default', Extension=new_ext,
|
||||
ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
|
||||
ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
|
||||
print(f" 格式转换 {old_ext}→{new_ext},rels 和 ContentTypes 已更新")
|
||||
def _set_paragraph_text(para_el, text):
    """Replace the visible text of *para_el* with *text*.

    All runs that currently carry text-like children are emptied and the new
    text is written into the first of them. If no run carries text, the run
    returned by ``_ensure_paragraph_run`` is emptied and used instead.
    """
    all_runs = para_el.findall(f'.//{{{W}}}r')
    runs_with_text = [
        run for run in all_runs
        if next(_iter_run_text_parts(run), None) is not None
    ]

    if not runs_with_text:
        target = _ensure_paragraph_run(para_el)
        _clear_run_text_like_children(target)
    else:
        target = runs_with_text[0]
        for run in runs_with_text:
            _clear_run_text_like_children(run)

    _append_text_to_run(target, text)
|
||||
|
||||
|
||||
def _paragraph_list(doc_el):
    """Return every ``<w:p>`` element under *doc_el*, in document order."""
    para_tag = f'{{{W}}}p'
    paragraphs = []
    paragraphs.extend(doc_el.iter(para_tag))
    return paragraphs
|
||||
|
||||
|
||||
def _replace_paragraph_block(doc_el, old_text, new_text):
    r"""Replace a run of consecutive paragraphs whose texts match *old_text*.

    Both texts are newline-normalized and split on blank lines (``"\n\n"``);
    each segment corresponds to one paragraph. The first sequence of
    consecutive paragraphs (in document order) whose texts equal the old
    segments is rewritten:

    * paragraphs shared by old and new receive the new segment text;
    * surplus new segments are inserted after the last matched paragraph as
      deep copies of that paragraph with their text replaced;
    * surplus matched paragraphs are removed from their parents.

    Args:
        doc_el: Root element to search for ``<w:p>`` paragraphs.
        old_text: Text to find; must contain at least one blank line.
        new_text: Replacement text.

    Returns:
        True when a block was found and rewritten. False when *old_text* is a
        single segment, no match exists, or the first matched paragraph has
        no parent.
    """
    old_segments = _normalize_newlines(old_text).split('\n\n')
    new_segments = _normalize_newlines(new_text).split('\n\n')
    span = len(old_segments)
    if span <= 1:
        return False

    paras = _paragraph_list(doc_el)
    para_texts = [_paragraph_text(p) for p in paras]

    match_start = next(
        (
            start
            for start in range(len(para_texts) - span + 1)
            if para_texts[start:start + span] == old_segments
        ),
        None,
    )
    if match_start is None:
        return False

    matched_paras = paras[match_start:match_start + span]
    if matched_paras[0].getparent() is None:
        return False

    # zip stops at the shorter sequence, i.e. the shared paragraphs only.
    for para, segment in zip(matched_paras, new_segments):
        _set_paragraph_text(para, segment)

    extra_segments = new_segments[span:]
    if extra_segments:
        template_para = matched_paras[-1]
        # Paragraphs are collected recursively, so the matched paragraphs may
        # live in different containers. Anchor the insertion on the last
        # paragraph's own parent; looking it up in the first paragraph's
        # parent would raise ValueError whenever the two differ.
        tail_parent = template_para.getparent()
        insert_at = tail_parent.index(template_para) + 1
        for seg in extra_segments:
            new_para = copy.deepcopy(template_para)
            _set_paragraph_text(new_para, seg)
            tail_parent.insert(insert_at, new_para)
            insert_at += 1
    else:
        for para in matched_paras[len(new_segments):]:
            para_parent = para.getparent()
            if para_parent is not None:
                para_parent.remove(para)

    return True
|
||||
|
||||
|
||||
def paragraph_replace(para_el, replacements):
|
||||
@@ -213,20 +281,27 @@ def paragraph_replace(para_el, replacements):
|
||||
return
|
||||
|
||||
# 收集所有文本元素及其位置信息
|
||||
t_elements = []
|
||||
text_runs = []
|
||||
for run in runs:
|
||||
for t_el in run.findall(f'{{{W}}}t'):
|
||||
t_elements.append((run, t_el))
|
||||
if any(True for _ in _iter_run_text_parts(run)):
|
||||
text_runs.append(run)
|
||||
|
||||
if not t_elements:
|
||||
if not text_runs:
|
||||
return
|
||||
|
||||
# 拼接完整文本
|
||||
full_text = ''.join(t_el.text or '' for _, t_el in t_elements)
|
||||
full_text = _paragraph_text(para_el)
|
||||
original_text = full_text
|
||||
|
||||
# 执行所有替换
|
||||
normalized_replacements = []
|
||||
for old, new in replacements:
|
||||
normalized_replacements.append((
|
||||
_normalize_newlines(old),
|
||||
_normalize_newlines(new),
|
||||
))
|
||||
|
||||
# 执行所有替换
|
||||
for old, new in normalized_replacements:
|
||||
if old in full_text:
|
||||
full_text = full_text.replace(old, new)
|
||||
|
||||
@@ -236,16 +311,11 @@ def paragraph_replace(para_el, replacements):
|
||||
|
||||
print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")
|
||||
|
||||
# 将新文本重新分配到原有的 <w:t> 元素中
|
||||
# 策略:将所有文本放入第一个元素,清空其他元素,避免不当切分导致换行
|
||||
_, first_t_el = t_elements[0]
|
||||
first_t_el.text = full_text
|
||||
if full_text and (full_text[0] == ' ' or full_text[-1] == ' '):
|
||||
first_t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
|
||||
|
||||
# 清空其他 <w:t> 元素
|
||||
for i in range(1, len(t_elements)):
|
||||
t_elements[i][1].text = ''
|
||||
# 将规范化文本重新写回第一个文本 run,\n 会回写成 Word 的换行节点。
|
||||
first_run = text_runs[0]
|
||||
for run in text_runs:
|
||||
_clear_run_text_like_children(run)
|
||||
_append_text_to_run(first_run, full_text)
|
||||
|
||||
|
||||
def ensure_rpr(run_el):
|
||||
@@ -271,13 +341,15 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
|
||||
当 context_text 不为空时,只在“整段文本包含该 context_text 的段落”中进行上色,
|
||||
避免同一个关键字在其他段落里被误伤(例如单独的数字 0)。
|
||||
"""
|
||||
keyword = _normalize_newlines(keyword)
|
||||
context_text = _normalize_newlines(context_text) if context_text is not None else None
|
||||
|
||||
# 如果提供了上下文,只在包含该上下文的段落内着色
|
||||
allowed_paras = None
|
||||
if context_text:
|
||||
allowed_paras = set()
|
||||
for p in doc_el.iter(f'{{{W}}}p'):
|
||||
t_nodes = list(p.iter(f'{{{W}}}t'))
|
||||
full = ''.join(t.text or '' for t in t_nodes)
|
||||
full = _paragraph_text(p)
|
||||
if context_text in full:
|
||||
allowed_paras.add(p)
|
||||
|
||||
@@ -294,10 +366,9 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
|
||||
para = _find_ancestor_para(run)
|
||||
if para not in allowed_paras:
|
||||
continue
|
||||
t_nodes = list(run.findall(f'{{{W}}}t'))
|
||||
if not t_nodes:
|
||||
full_text = _run_text(run)
|
||||
if not full_text:
|
||||
continue
|
||||
full_text = ''.join(t.text or '' for t in t_nodes)
|
||||
if keyword not in full_text:
|
||||
continue
|
||||
|
||||
@@ -317,10 +388,7 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
|
||||
new_r = etree.Element(f'{{{W}}}r')
|
||||
if rpr_bytes is not None:
|
||||
new_r.append(etree.fromstring(rpr_bytes))
|
||||
t_el = etree.SubElement(new_r, f'{{{W}}}t')
|
||||
t_el.text = text
|
||||
if text and (text[0] == ' ' or text[-1] == ' '):
|
||||
t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
|
||||
_append_text_to_run(new_r, text)
|
||||
if colored:
|
||||
set_color_on_rpr(ensure_rpr(new_r), hex_color)
|
||||
return new_r
|
||||
@@ -405,19 +473,13 @@ def remove_rule_blocks(doc_el):
|
||||
if parent is not None:
|
||||
parent.remove(p)
|
||||
|
||||
def process(input_docx, output_docx, replacements, image_replacements,
|
||||
color_keywords):
|
||||
def process(input_docx, output_docx, replacements, color_keywords):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
print(f"📂 解包 {input_docx} ...")
|
||||
unpack(input_docx, tmpdir)
|
||||
|
||||
doc_xml_path = os.path.join(tmpdir, 'word', 'document.xml')
|
||||
|
||||
if image_replacements:
|
||||
print(f"🖼️ 替换 {len(image_replacements)} 张图片...")
|
||||
for idx, new_img in image_replacements:
|
||||
replace_image(tmpdir, idx, new_img)
|
||||
|
||||
tree = etree.parse(doc_xml_path)
|
||||
root = tree.getroot()
|
||||
|
||||
@@ -426,8 +488,17 @@ def process(input_docx, output_docx, replacements, image_replacements,
|
||||
|
||||
if replacements:
|
||||
print(f"✏️ 替换 {len(replacements)} 条文本...")
|
||||
for para in root.iter(f'{{{W}}}p'):
|
||||
paragraph_replace(para, replacements)
|
||||
remaining_replacements = []
|
||||
for old, new in replacements:
|
||||
if '\n\n' in _normalize_newlines(old):
|
||||
replaced = _replace_paragraph_block(root, old, new)
|
||||
if replaced:
|
||||
print("🧩 跨段替换命中")
|
||||
continue
|
||||
remaining_replacements.append((old, new))
|
||||
if remaining_replacements:
|
||||
for para in root.iter(f'{{{W}}}p'):
|
||||
paragraph_replace(para, remaining_replacements)
|
||||
|
||||
# 根据 span 解析出的关键字上色
|
||||
for item in color_keywords:
|
||||
@@ -457,6 +528,8 @@ def _parse_span_replacement(new_text):
|
||||
"""
|
||||
import re
|
||||
|
||||
new_text = _normalize_newlines(new_text)
|
||||
|
||||
# 简单的命名颜色到 16 进制的映射,可按需扩展
|
||||
named_colors = {
|
||||
'red': 'FF0000',
|
||||
@@ -505,33 +578,33 @@ def _parse_span_replacement(new_text):
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
# 先得到去掉 span 标签后的纯文本(也是最终会写入 DOCX 的内容)
|
||||
# 先按段落边界拆分,这样 span 上色时可以使用所在段落作为上下文。
|
||||
def _strip_repl(m):
|
||||
return m.group(2)
|
||||
|
||||
plain_text = span_pattern.sub(_strip_repl, new_text)
|
||||
|
||||
# 再次遍历 span,收集颜色关键字,并把“整句纯文本”作为上下文挂在每个关键字上
|
||||
plain_segments = []
|
||||
color_keywords = []
|
||||
for m in span_pattern.finditer(new_text):
|
||||
raw_color = m.group(1)
|
||||
hex_color = _normalize_color(raw_color)
|
||||
keyword = m.group(2)
|
||||
# 三元组: (关键字, 颜色, 该 NEW 对应的整句纯文本上下文)
|
||||
color_keywords.append((keyword, hex_color, plain_text))
|
||||
for segment in new_text.split('\n\n'):
|
||||
plain_segment = span_pattern.sub(_strip_repl, segment)
|
||||
plain_segments.append(plain_segment)
|
||||
for m in span_pattern.finditer(segment):
|
||||
raw_color = m.group(1)
|
||||
hex_color = _normalize_color(raw_color)
|
||||
keyword = m.group(2)
|
||||
# 三元组: (关键字, 颜色, 所在段落的纯文本上下文)
|
||||
color_keywords.append((keyword, hex_color, plain_segment))
|
||||
|
||||
plain_text = '\n\n'.join(plain_segments)
|
||||
return plain_text, color_keywords
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/图片/颜色')
|
||||
parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/颜色')
|
||||
parser.add_argument('input', help='输入 .docx')
|
||||
parser.add_argument('output', nargs='?', help='输出 .docx')
|
||||
parser.add_argument('--list-images', action='store_true', help='列出所有图片')
|
||||
parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
|
||||
action='append', default=[])
|
||||
parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
|
||||
action='append', default=[], help='图片替换')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.list_images:
|
||||
@@ -549,12 +622,11 @@ def main():
|
||||
color_keywords.extend(spans)
|
||||
|
||||
process(
|
||||
input_docx = args.input,
|
||||
output_docx = args.output,
|
||||
replacements = replacements,
|
||||
image_replacements= [(int(i), f) for i, f in args.image],
|
||||
color_keywords = color_keywords,
|
||||
input_docx=args.input,
|
||||
output_docx=args.output,
|
||||
replacements=replacements,
|
||||
color_keywords=color_keywords,
|
||||
)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user