This commit is contained in:
2026-03-26 22:56:58 +08:00
parent a337aa4540
commit 2bb850e8e2
2 changed files with 564 additions and 265 deletions

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图片
docx_editor.py — 保留原格式替换文本 + 修改字体颜色
用法:
# 列出文档中所有图片
@@ -10,25 +10,14 @@ docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图
python3 docx_editor.py input.docx output.docx \
--replace "原文" "新文" \
--color "关键词" "FF0000"
# 图片替换按文档中出现的顺序从1开始
python3 docx_editor.py input.docx output.docx \
--image 1 new_chart.png \
--image 2 new_photo.jpg
# 同时替换文字和图片
python3 docx_editor.py input.docx output.docx \
--replace "旧标题" "新标题" \
--image 1 new_image.png \
--color "重点" "FF0000"
"""
import argparse
import copy
import os
import tempfile
import zipfile
from lxml import etree
from PIL import Image
import re
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
@@ -37,12 +26,6 @@ A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
EXT_TO_MIME = {
'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg',
'gif': 'image/gif', 'bmp': 'image/bmp', 'tiff': 'image/tiff',
'webp': 'image/webp',
}
def unpack(docx_path, out_dir):
"""使用 zipfile 直接解包 .docx 到临时目录,替代外部 unpack.py 脚本。"""
@@ -141,61 +124,146 @@ def get_images_info(docx_path):
return build_image_index(tmpdir)
def replace_image(unpacked_dir, index, new_image_path):
"""替换第 index 张图片1-based"""
imgs = build_image_index(unpacked_dir)
if index < 1 or index > len(imgs):
raise ValueError(f"图片序号 {index} 超出范围(共 {len(imgs)} 张)")
def _normalize_newlines(text):
if text is None:
return ''
return str(text).replace('\r\n', '\n').replace('\r', '\n')
info = imgs[index - 1]
old_abs = info['abs_path']
old_ext = info['ext']
new_ext = os.path.splitext(new_image_path)[1].lstrip('.').lower()
if new_ext == 'jpg':
new_ext = 'jpeg'
print(f" 图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
f"{os.path.basename(new_image_path)}({new_ext.upper()})")
def _is_text_node(el):
    """Return True when *el* is a ``<w:t>`` element in the ``W`` namespace."""
    text_tag = f'{{{W}}}t'
    return el.tag == text_tag
if old_ext == new_ext:
# ── 同格式:直接覆盖 ──────────────────────────────
import shutil
shutil.copy2(new_image_path, old_abs)
def _is_break_node(el):
    """Return True when *el* is a ``<w:br>`` or ``<w:cr>`` element.

    Only the tag is compared; attributes such as ``w:type`` are not
    inspected, so every kind of ``<w:br>`` matches.
    """
    break_tags = tuple(f'{{{W}}}{local}' for local in ('br', 'cr'))
    return el.tag in break_tags
def _is_tab_node(el):
    """Return True when *el* is a ``<w:tab>`` element in the ``W`` namespace."""
    tab_tag = f'{{{W}}}tab'
    return el.tag == tab_tag
def _iter_run_text_parts(run_el):
    r"""Yield ``(child, text)`` for each text-like direct child of a run.

    ``<w:t>`` contributes its newline-normalized text (empty string when the
    element has no text), a break element contributes ``'\n'`` and a tab
    element contributes ``'\t'``. Any other child is skipped. Children are
    visited in document order.
    """
    for node in run_el:
        if _is_text_node(node):
            piece = _normalize_newlines(node.text or '')
        elif _is_break_node(node):
            piece = '\n'
        elif _is_tab_node(node):
            piece = '\t'
        else:
            # Not text-like (e.g. run properties): contributes nothing.
            continue
        yield node, piece
def _run_text(run_el):
    """Return the concatenated text of all text-like children of *run_el*."""
    pieces = [piece for _node, piece in _iter_run_text_parts(run_el)]
    return ''.join(pieces)
def _paragraph_text(para_el):
    """Return the text of every ``<w:r>`` found anywhere below *para_el*.

    Runs are collected with ``iter()``, so runs nested at any depth under the
    paragraph are included, in document order.
    """
    run_tag = f'{{{W}}}r'
    return ''.join(map(_run_text, para_el.iter(run_tag)))
def _clear_run_text_like_children(run_el):
    """Remove every text, break and tab child from *run_el* in place.

    Other children (for example run properties) are left untouched.
    """
    predicates = (_is_text_node, _is_break_node, _is_tab_node)
    # Collect first, then remove, so the run is not mutated while iterating.
    doomed = [
        node for node in run_el
        if any(matches(node) for matches in predicates)
    ]
    for node in doomed:
        run_el.remove(node)
def _append_text_to_run(run_el, text):
    r"""Append *text* to *run_el* as WordprocessingML text-like children.

    The text is newline-normalized first, then written as follows:

    * each ``'\n'`` becomes a ``<w:br/>`` element;
    * each ``'\t'`` becomes a ``<w:tab/>`` element, mirroring how
      ``_iter_run_text_parts`` reads ``<w:tab/>`` back as ``'\t'`` so that a
      read/clear/write cycle does not turn tab elements into literal tab
      characters inside ``<w:t>``;
    * every non-empty fragment in between becomes a ``<w:t>`` element, with
      ``xml:space="preserve"`` when the fragment starts or ends with
      whitespace of any kind (not only U+0020).

    Empty input still produces a single empty ``<w:t>``, as before.

    Args:
        run_el: The ``<w:r>`` element to append to; modified in place.
        text: The text to write. ``None`` is treated as the empty string.
    """
    xml_space = '{http://www.w3.org/XML/1998/namespace}space'

    def _add_text(fragment):
        """Append one ``<w:t>`` holding *fragment* to the run."""
        t_el = etree.SubElement(run_el, f'{{{W}}}t')
        t_el.text = fragment
        if fragment != fragment.strip():
            t_el.set(xml_space, 'preserve')

    text = _normalize_newlines(text)
    if not text:
        # Keep the historical behaviour: an empty string yields an empty <w:t>.
        _add_text('')
        return

    for line_no, line in enumerate(text.split('\n')):
        if line_no:
            # A line break sits between consecutive lines, never after the last.
            etree.SubElement(run_el, f'{{{W}}}br')
        for cell_no, cell in enumerate(line.split('\t')):
            if cell_no:
                etree.SubElement(run_el, f'{{{W}}}tab')
            if cell:
                _add_text(cell)
def _ensure_paragraph_run(para_el):
runs = list(para_el.findall(f'.//{{{W}}}r'))
if runs:
return runs[0]
ppr = para_el.find(f'{{{W}}}pPr')
new_r = etree.Element(f'{{{W}}}r')
if ppr is None:
para_el.insert(0, new_r)
else:
# ── 不同格式Pillow 转换 + 更新 rels + ContentTypes
new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
img = Image.open(new_image_path)
fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
if fmt == 'JPEG' and img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
img.save(new_abs, format=fmt)
if os.path.abspath(new_abs) != os.path.abspath(old_abs):
os.remove(old_abs)
para_el.insert(para_el.index(ppr) + 1, new_r)
return new_r
# 更新 rels
old_media = info['media_file']
new_media = os.path.splitext(old_media)[0] + '.' + new_ext
word_dir = os.path.join(unpacked_dir, 'word')
rels_path = os.path.join(word_dir, '_rels', 'document.xml.rels')
rels_tree = etree.parse(rels_path)
for rel in rels_tree.getroot():
if rel.get('Id') == info['rid']:
rel.set('Target', new_media)
break
rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)
# 更新 ContentTypes
ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
ct_tree = etree.parse(ct_path)
ct_root = ct_tree.getroot()
existing = {el.get('Extension', '') for el in ct_root}
if new_ext not in existing:
etree.SubElement(ct_root, 'Default', Extension=new_ext,
ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
print(f" 格式转换 {old_ext}{new_ext}rels 和 ContentTypes 已更新")
def _set_paragraph_text(para_el, text):
    """Replace the visible text of *para_el* with *text*, in place.

    All runs below the paragraph that carry at least one text-like child are
    emptied of those children, and the new text is written into the first of
    them. When no run carries text, the run returned by
    ``_ensure_paragraph_run`` is used as the target instead.
    """
    carriers = []
    for candidate in para_el.findall(f'.//{{{W}}}r'):
        # A run qualifies as soon as it yields one text-like part.
        if next(_iter_run_text_parts(candidate), None) is not None:
            carriers.append(candidate)

    if not carriers:
        target = _ensure_paragraph_run(para_el)
        _clear_run_text_like_children(target)
    else:
        target = carriers[0]
        for carrier in carriers:
            _clear_run_text_like_children(carrier)

    _append_text_to_run(target, text)
def _paragraph_list(doc_el):
    """Return every ``<w:p>`` element at or below *doc_el*, in document order."""
    para_tag = f'{{{W}}}p'
    found = doc_el.iter(para_tag)
    return list(found)
def _replace_paragraph_block(doc_el, old_text, new_text):
    r"""Replace a run of consecutive paragraphs whose texts match *old_text*.

    Both texts are newline-normalized and split on blank lines (``'\n\n'``);
    each resulting segment stands for one paragraph. The first sequence of
    paragraphs (in document order) whose texts equal the old segments is
    rewritten:

    * paragraphs shared by both sides get the corresponding new segment;
    * extra new segments are inserted as deep copies of the last matched
      paragraph, directly after it;
    * surplus matched paragraphs are removed from their own parents.

    The insertion point is resolved against the parent of the last matched
    paragraph, and only when something is inserted. Resolving it against the
    parent of the first paragraph raised ``ValueError`` whenever the match
    spanned different containers.

    Args:
        doc_el: Root element to search; modified in place.
        old_text: Text to look for; must contain at least one blank line.
        new_text: Replacement text.

    Returns:
        True when a match was found and rewritten, otherwise False (also
        when *old_text* describes a single paragraph, or when the first
        matched paragraph has no parent).
    """
    old_segments = _normalize_newlines(old_text).split('\n\n')
    new_segments = _normalize_newlines(new_text).split('\n\n')
    span = len(old_segments)
    if span <= 1:
        return False

    paras = _paragraph_list(doc_el)
    para_texts = [_paragraph_text(p) for p in paras]

    match_start = next(
        (
            start
            for start in range(len(para_texts) - span + 1)
            if para_texts[start:start + span] == old_segments
        ),
        None,
    )
    if match_start is None:
        return False

    matched_paras = paras[match_start:match_start + span]
    if matched_paras[0].getparent() is None:
        return False

    # zip() stops at the shorter side, i.e. at the paragraphs both sides share.
    for para, segment in zip(matched_paras, new_segments):
        _set_paragraph_text(para, segment)

    extra_segments = new_segments[span:]
    if extra_segments:
        template_para = matched_paras[-1]
        # Use the template's own parent: it may differ from the first
        # paragraph's parent when the match crosses container boundaries.
        insert_parent = template_para.getparent()
        insert_at = insert_parent.index(template_para) + 1
        for segment in extra_segments:
            new_para = copy.deepcopy(template_para)
            _set_paragraph_text(new_para, segment)
            insert_parent.insert(insert_at, new_para)
            insert_at += 1
    else:
        # NOTE(review): surplus paragraphs are removed unconditionally; if one
        # is the only paragraph left in its container the result may not be
        # valid WordprocessingML — confirm against real documents.
        for para in matched_paras[len(new_segments):]:
            para_parent = para.getparent()
            if para_parent is not None:
                para_parent.remove(para)

    return True
def paragraph_replace(para_el, replacements):
@@ -213,20 +281,27 @@ def paragraph_replace(para_el, replacements):
return
# 收集所有文本元素及其位置信息
t_elements = []
text_runs = []
for run in runs:
for t_el in run.findall(f'{{{W}}}t'):
t_elements.append((run, t_el))
if any(True for _ in _iter_run_text_parts(run)):
text_runs.append(run)
if not t_elements:
if not text_runs:
return
# 拼接完整文本
full_text = ''.join(t_el.text or '' for _, t_el in t_elements)
full_text = _paragraph_text(para_el)
original_text = full_text
# 执行所有替换
normalized_replacements = []
for old, new in replacements:
normalized_replacements.append((
_normalize_newlines(old),
_normalize_newlines(new),
))
# 执行所有替换
for old, new in normalized_replacements:
if old in full_text:
full_text = full_text.replace(old, new)
@@ -236,16 +311,11 @@ def paragraph_replace(para_el, replacements):
print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")
# 将文本重新分配到原有的 <w:t> 元素中
# 策略:将所有文本放入第一个元素,清空其他元素,避免不当切分导致换行
_, first_t_el = t_elements[0]
first_t_el.text = full_text
if full_text and (full_text[0] == ' ' or full_text[-1] == ' '):
first_t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
# 清空其他 <w:t> 元素
for i in range(1, len(t_elements)):
t_elements[i][1].text = ''
# 将规范化文本重新写回第一个文本 run；\n 会回写成 Word 的换行节点。
first_run = text_runs[0]
for run in text_runs:
_clear_run_text_like_children(run)
_append_text_to_run(first_run, full_text)
def ensure_rpr(run_el):
@@ -271,13 +341,15 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
当 context_text 不为空时,只在“整段文本包含该 context_text 的段落”中进行上色,
避免同一个关键字在其他段落里被误伤(例如单独的数字 0)。
"""
keyword = _normalize_newlines(keyword)
context_text = _normalize_newlines(context_text) if context_text is not None else None
# 如果提供了上下文,只在包含该上下文的段落内着色
allowed_paras = None
if context_text:
allowed_paras = set()
for p in doc_el.iter(f'{{{W}}}p'):
t_nodes = list(p.iter(f'{{{W}}}t'))
full = ''.join(t.text or '' for t in t_nodes)
full = _paragraph_text(p)
if context_text in full:
allowed_paras.add(p)
@@ -294,10 +366,9 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
para = _find_ancestor_para(run)
if para not in allowed_paras:
continue
t_nodes = list(run.findall(f'{{{W}}}t'))
if not t_nodes:
full_text = _run_text(run)
if not full_text:
continue
full_text = ''.join(t.text or '' for t in t_nodes)
if keyword not in full_text:
continue
@@ -317,10 +388,7 @@ def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
new_r = etree.Element(f'{{{W}}}r')
if rpr_bytes is not None:
new_r.append(etree.fromstring(rpr_bytes))
t_el = etree.SubElement(new_r, f'{{{W}}}t')
t_el.text = text
if text and (text[0] == ' ' or text[-1] == ' '):
t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
_append_text_to_run(new_r, text)
if colored:
set_color_on_rpr(ensure_rpr(new_r), hex_color)
return new_r
@@ -405,19 +473,13 @@ def remove_rule_blocks(doc_el):
if parent is not None:
parent.remove(p)
def process(input_docx, output_docx, replacements, image_replacements,
color_keywords):
def process(input_docx, output_docx, replacements, color_keywords):
with tempfile.TemporaryDirectory() as tmpdir:
print(f"📂 解包 {input_docx} ...")
unpack(input_docx, tmpdir)
doc_xml_path = os.path.join(tmpdir, 'word', 'document.xml')
if image_replacements:
print(f"🖼️ 替换 {len(image_replacements)} 张图片...")
for idx, new_img in image_replacements:
replace_image(tmpdir, idx, new_img)
tree = etree.parse(doc_xml_path)
root = tree.getroot()
@@ -426,8 +488,17 @@ def process(input_docx, output_docx, replacements, image_replacements,
if replacements:
print(f"✏️ 替换 {len(replacements)} 条文本...")
for para in root.iter(f'{{{W}}}p'):
paragraph_replace(para, replacements)
remaining_replacements = []
for old, new in replacements:
if '\n\n' in _normalize_newlines(old):
replaced = _replace_paragraph_block(root, old, new)
if replaced:
print("🧩 跨段替换命中")
continue
remaining_replacements.append((old, new))
if remaining_replacements:
for para in root.iter(f'{{{W}}}p'):
paragraph_replace(para, remaining_replacements)
# 根据 span 解析出的关键字上色
for item in color_keywords:
@@ -457,6 +528,8 @@ def _parse_span_replacement(new_text):
"""
import re
new_text = _normalize_newlines(new_text)
# 简单的命名颜色到 16 进制的映射,可按需扩展
named_colors = {
'red': 'FF0000',
@@ -505,33 +578,33 @@ def _parse_span_replacement(new_text):
re.IGNORECASE | re.DOTALL,
)
# 先得到去掉 span 标签后的纯文本(也是最终会写入 DOCX 的内容)
# 先按段落边界拆分,这样 span 上色时可以使用所在段落作为上下文。
def _strip_repl(m):
return m.group(2)
plain_text = span_pattern.sub(_strip_repl, new_text)
# 再次遍历 span收集颜色关键字并把“整句纯文本”作为上下文挂在每个关键字上
plain_segments = []
color_keywords = []
for m in span_pattern.finditer(new_text):
raw_color = m.group(1)
hex_color = _normalize_color(raw_color)
keyword = m.group(2)
# 三元组: (关键字, 颜色, 该 NEW 对应的整句纯文本上下文)
color_keywords.append((keyword, hex_color, plain_text))
for segment in new_text.split('\n\n'):
plain_segment = span_pattern.sub(_strip_repl, segment)
plain_segments.append(plain_segment)
for m in span_pattern.finditer(segment):
raw_color = m.group(1)
hex_color = _normalize_color(raw_color)
keyword = m.group(2)
# 三元组: (关键字, 颜色, 所在段落的纯文本上下文)
color_keywords.append((keyword, hex_color, plain_segment))
plain_text = '\n\n'.join(plain_segments)
return plain_text, color_keywords
def main():
parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/图片/颜色')
parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/颜色')
parser.add_argument('input', help='输入 .docx')
parser.add_argument('output', nargs='?', help='输出 .docx')
parser.add_argument('--list-images', action='store_true', help='列出所有图片')
parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
action='append', default=[])
parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
action='append', default=[], help='图片替换')
args = parser.parse_args()
if args.list_images:
@@ -549,12 +622,11 @@ def main():
color_keywords.extend(spans)
process(
input_docx = args.input,
output_docx = args.output,
replacements = replacements,
image_replacements= [(int(i), f) for i, f in args.image],
color_keywords = color_keywords,
input_docx=args.input,
output_docx=args.output,
replacements=replacements,
color_keywords=color_keywords,
)
if __name__ == '__main__':
main()
main()