#!/usr/bin/env python3
"""
docx_editor.py — replace text in a .docx while preserving formatting,
and optionally recolor keywords.

Usage:
    # List every image in the document
    python3 docx_editor.py input.docx --list-images

    # Text replacement + color
    python3 docx_editor.py input.docx output.docx \
        --replace "原文" "新文" \
        --color "关键词" "FF0000"
"""
import argparse
import copy
import os
import re
import tempfile
import zipfile

from lxml import etree

# WordprocessingML / DrawingML namespace URIs used throughout.
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
REL_TYPE_IMAGE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

# xml:space attribute name, used to preserve leading/trailing spaces in w:t.
XML_SPACE = '{http://www.w3.org/XML/1998/namespace}space'


def unpack(docx_path, out_dir):
    """Extract a .docx (a ZIP archive) into *out_dir* using zipfile."""
    with zipfile.ZipFile(docx_path, 'r') as zf:
        zf.extractall(out_dir)


def pack(unpacked_dir, output_docx, original_docx):
    """Re-zip the (modified) unpacked directory into *output_docx*.

    *original_docx* is kept only for signature compatibility; it is unused.
    """
    # Make sure the output directory exists.
    out_dir = os.path.dirname(os.path.abspath(output_docx))
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)
    # Zip every file under the unpacked dir, keeping relative paths.
    with zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(unpacked_dir):
            for fname in files:
                abs_path = os.path.join(root, fname)
                # docx archives always use '/' as the internal path separator
                arcname = os.path.relpath(abs_path, unpacked_dir).replace(os.sep, '/')
                zf.write(abs_path, arcname)


def build_image_index(unpacked_dir):
    """Return a list of image records, in document order."""
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')

    # Map relationship id -> media target, for image relationships only.
    rels_root = etree.parse(rels_xml).getroot()
    rid_to_media = {}
    for rel in rels_root:
        if rel.get('Type', '') == REL_TYPE_IMAGE:
            rid_to_media[rel.get('Id')] = rel.get('Target')

    doc_root = etree.parse(doc_xml).getroot()
    results = []
    for blip in doc_root.iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]
        media_abs = os.path.join(word_dir, media_rel.replace('/', os.sep))
        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()

        # Walk up to the enclosing wp:inline / wp:anchor for size and name.
        inline = blip
        while inline is not None and inline.tag not in (f'{{{WD}}}inline', f'{{{WD}}}anchor'):
            inline = inline.getparent()
        w_cm = h_cm = None
        docpr_name = ''
        if inline is not None:
            ext_el = inline.find(f'{{{WD}}}extent')
            if ext_el is not None:
                # EMU -> centimetres (1 cm == 360000 EMU)
                w_cm = round(int(ext_el.get('cx', 0)) / 360000, 2)
                h_cm = round(int(ext_el.get('cy', 0)) / 360000, 2)
            dp = inline.find(f'{{{WD}}}docPr')
            if dp is not None:
                docpr_name = dp.get('name', '')
        results.append({
            'index': len(results) + 1,
            'rid': rid,
            'media_file': media_rel,
            'abs_path': media_abs,
            'ext': ext,
            'docpr_name': docpr_name,
            'width_cm': w_cm,
            'height_cm': h_cm,
        })
    return results


def list_images(docx_path):
    """Print a human-readable table of the images found in *docx_path*."""
    imgs = get_images_info(docx_path)
    if not imgs:
        print("文档中没有找到图片。")
        return
    print(f"共找到 {len(imgs)} 张图片:\n")
    print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print(" " + "-" * 62)
    for img in imgs:
        size = f"{img['width_cm']}×{img['height_cm']}cm" if img['width_cm'] else "未知"
        print(f" {img['index']:<4} {os.path.basename(img['media_file']):<20} {size:<18} {img['docpr_name']}")


def get_images_info(docx_path):
    """Return structured info for every image in the given DOCX file.

    Designed for reuse by other modules (e.g. an MCP server): behaves like
    the original list_images logic but returns data instead of printing.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        unpack(docx_path, tmpdir)
        return build_image_index(tmpdir)


def _normalize_newlines(text):
    """Collapse CRLF / CR into LF; None becomes ''."""
    if text is None:
        return ''
    return str(text).replace('\r\n', '\n').replace('\r', '\n')


def _is_text_node(el):
    return el.tag == f'{{{W}}}t'


def _is_break_node(el):
    return el.tag in (f'{{{W}}}br', f'{{{W}}}cr')


def _is_tab_node(el):
    return el.tag == f'{{{W}}}tab'


def _iter_run_text_parts(run_el):
    """Yield (element, text) pairs for the text-like children of a run."""
    for child in run_el:
        if _is_text_node(child):
            yield child, _normalize_newlines(child.text or '')
        elif _is_break_node(child):
            yield child, '\n'
        elif _is_tab_node(child):
            yield child, '\t'


def _run_text(run_el):
    """Concatenate a run's text content (breaks -> '\\n', tabs -> '\\t')."""
    return ''.join(part for _, part in _iter_run_text_parts(run_el))


def _paragraph_text(para_el):
    """Concatenate the text of every run in the paragraph."""
    return ''.join(_run_text(run) for run in para_el.iter(f'{{{W}}}r'))


def _clear_run_text_like_children(run_el):
    """Remove all text, break and tab children from a run."""
    for child in list(run_el):
        if _is_text_node(child) or _is_break_node(child) or _is_tab_node(child):
            run_el.remove(child)


def _append_text_to_run(run_el, text):
    """Append *text* to a run, mapping '\\n' to w:br elements.

    Leading/trailing spaces get xml:space="preserve" so Word keeps them.
    """
    text = _normalize_newlines(text)
    parts = text.split('\n')
    if len(parts) == 1:
        t_el = etree.SubElement(run_el, f'{{{W}}}t')
        t_el.text = parts[0]
        if parts[0] and (parts[0][0] == ' ' or parts[0][-1] == ' '):
            t_el.set(XML_SPACE, 'preserve')
        return
    for idx, part in enumerate(parts):
        if part:
            t_el = etree.SubElement(run_el, f'{{{W}}}t')
            t_el.text = part
            if part[0] == ' ' or part[-1] == ' ':
                t_el.set(XML_SPACE, 'preserve')
        # A break between every pair of consecutive parts.
        if idx < len(parts) - 1:
            etree.SubElement(run_el, f'{{{W}}}br')


def _ensure_paragraph_run(para_el):
    """Return the paragraph's first run, creating one (after pPr) if absent."""
    runs = list(para_el.findall(f'.//{{{W}}}r'))
    if runs:
        return runs[0]
    ppr = para_el.find(f'{{{W}}}pPr')
    new_r = etree.Element(f'{{{W}}}r')
    if ppr is None:
        para_el.insert(0, new_r)
    else:
        para_el.insert(para_el.index(ppr) + 1, new_r)
    return new_r


def _set_paragraph_text(para_el, text):
    """Replace the paragraph's entire text, keeping the first run's format."""
    runs = list(para_el.findall(f'.//{{{W}}}r'))
    text_runs = [run for run in runs if any(True for _ in _iter_run_text_parts(run))]
    if text_runs:
        first_run = text_runs[0]
        for run in text_runs:
            _clear_run_text_like_children(run)
    else:
        first_run = _ensure_paragraph_run(para_el)
        _clear_run_text_like_children(first_run)
    _append_text_to_run(first_run, text)


def _paragraph_list(doc_el):
    """All w:p elements of the document, in document order."""
    return list(doc_el.iter(f'{{{W}}}p'))


def _replace_paragraph_block(doc_el, old_text, new_text):
    """Replace a run of consecutive paragraphs matching *old_text*.

    Both texts are split on blank lines ('\\n\\n'); each segment corresponds
    to one paragraph. Extra new segments are inserted as clones of the last
    matched paragraph; surplus old paragraphs are removed.

    Returns True when a multi-paragraph match was found and rewritten.
    """
    old_segments = _normalize_newlines(old_text).split('\n\n')
    new_segments = _normalize_newlines(new_text).split('\n\n')
    if len(old_segments) <= 1:
        return False
    paras = _paragraph_list(doc_el)
    para_texts = [_paragraph_text(p) for p in paras]
    match_start = None
    for i in range(0, len(para_texts) - len(old_segments) + 1):
        if para_texts[i:i + len(old_segments)] == old_segments:
            match_start = i
            break
    if match_start is None:
        return False
    matched_paras = paras[match_start:match_start + len(old_segments)]
    parent = matched_paras[0].getparent()
    if parent is None:
        return False
    anchor_index = parent.index(matched_paras[-1])
    shared_count = min(len(matched_paras), len(new_segments))
    for idx in range(shared_count):
        _set_paragraph_text(matched_paras[idx], new_segments[idx])
    if len(new_segments) > len(matched_paras):
        # More new paragraphs than old: clone the last matched paragraph
        # as a formatting template for the extras.
        template_para = matched_paras[-1]
        insert_at = anchor_index + 1
        for seg in new_segments[len(matched_paras):]:
            new_para = copy.deepcopy(template_para)
            _set_paragraph_text(new_para, seg)
            parent.insert(insert_at, new_para)
            insert_at += 1
    elif len(new_segments) < len(matched_paras):
        # Fewer new paragraphs: drop the surplus old ones.
        for para in matched_paras[len(new_segments):]:
            para_parent = para.getparent()
            if para_parent is not None:
                para_parent.remove(para)
    return True


def paragraph_replace(para_el, replacements):
    """Apply text replacements at paragraph level, matching across runs.

    Strategy:
      1. collect the runs that carry text-like content, in order
      2. concatenate into the full paragraph text and replace there
      3. if anything changed, write the new text back into the first run
    """
    runs = list(para_el.findall(f'.//{{{W}}}r'))
    if not runs:
        return
    # Runs that contribute text (w:t / w:br / w:cr / w:tab children).
    text_runs = []
    for run in runs:
        if any(True for _ in _iter_run_text_parts(run)):
            text_runs.append(run)
    if not text_runs:
        return
    full_text = _paragraph_text(para_el)
    original_text = full_text
    normalized_replacements = []
    for old, new in replacements:
        normalized_replacements.append((
            _normalize_newlines(old),
            _normalize_newlines(new),
        ))
    # Apply every replacement against the concatenated paragraph text.
    for old, new in normalized_replacements:
        if old in full_text:
            full_text = full_text.replace(old, new)
    if full_text == original_text:
        return
    print(f"段落替换: {len(original_text)} -> {len(full_text)} 字符")
    # Write the whole new text into the first text run; '\n' becomes w:br.
    first_run = text_runs[0]
    for run in text_runs:
        _clear_run_text_like_children(run)
    _append_text_to_run(first_run, full_text)


def ensure_rpr(run_el):
    """Return the run's w:rPr, creating it as the first child if absent."""
    rpr = run_el.find(f'{{{W}}}rPr')
    if rpr is None:
        rpr = etree.Element(f'{{{W}}}rPr')
        run_el.insert(0, rpr)
    return rpr


def set_color_on_rpr(rpr_el, hex_color):
    """Set w:color/@w:val on a run-properties element (strips leading '#')."""
    c = rpr_el.find(f'{{{W}}}color')
    if c is None:
        c = etree.SubElement(rpr_el, f'{{{W}}}color')
    c.set(f'{{{W}}}val', hex_color.lstrip('#'))


def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
    """Color only the matched keyword itself, not the whole run.

    Each run containing the keyword is split into [prefix][keyword][suffix]
    runs; only the keyword run receives the color. When *context_text* is
    given, only paragraphs whose full text contains it are touched — this
    avoids recoloring the same keyword elsewhere (e.g. a lone digit 0).
    """
    keyword = _normalize_newlines(keyword)
    context_text = _normalize_newlines(context_text) if context_text is not None else None

    # Restrict to paragraphs containing the context, when one is provided.
    allowed_paras = None
    if context_text:
        allowed_paras = set()
        for p in doc_el.iter(f'{{{W}}}p'):
            full = _paragraph_text(p)
            if context_text in full:
                allowed_paras.add(p)

    def _find_ancestor_para(el):
        cur = el
        while cur is not None and cur.tag != f'{{{W}}}p':
            cur = cur.getparent()
        return cur

    # Materialize the run list first: we mutate the tree below.
    runs = list(doc_el.iter(f'{{{W}}}r'))
    for run in runs:
        if allowed_paras is not None:
            para = _find_ancestor_para(run)
            if para not in allowed_paras:
                continue
        full_text = _run_text(run)
        if not full_text:
            continue
        if keyword not in full_text:
            continue
        parent = run.getparent()
        if parent is None:
            continue
        insert_pos = parent.index(run)
        # Clone the original run's rPr onto every replacement run.
        orig_rpr = run.find(f'{{{W}}}rPr')
        if orig_rpr is not None:
            rpr_bytes = etree.tostring(orig_rpr)
        else:
            rpr_bytes = None

        def make_run(text, colored):
            new_r = etree.Element(f'{{{W}}}r')
            if rpr_bytes is not None:
                new_r.append(etree.fromstring(rpr_bytes))
            _append_text_to_run(new_r, text)
            if colored:
                set_color_on_rpr(ensure_rpr(new_r), hex_color)
            return new_r

        # Split the run text into alternating plain / keyword segments.
        segments = []
        s = full_text
        start = 0
        klen = len(keyword)
        while True:
            idx = s.find(keyword, start)
            if idx == -1:
                if start < len(s):
                    segments.append((s[start:], False))
                break
            if idx > start:
                segments.append((s[start:idx], False))
            segments.append((keyword, True))
            start = idx + klen

        # Swap the original run for the new segment runs.
        parent.remove(run)
        for offset, (seg_text, colored) in enumerate(segments):
            if seg_text:
                parent.insert(insert_pos + offset, make_run(seg_text, colored))


def remove_rule_blocks(doc_el):
    """Delete every paragraph lying between rule marker tags.

    Removes paragraphs between <global_rule>...</global_rule>,
    <rule>...</rule> and <chart_rule>...</chart_rule>, including the
    paragraphs carrying the tags. A tag pair may span paragraphs, so we
    walk paragraphs in order tracking whether we are inside a block;
    neither the tags nor the enclosed content appear in the output.

    NOTE(review): the marker literals were lost to markup-stripping in the
    received source (every condition read `'' in full`, which is always
    true for non-empty text); the tag spellings here are reconstructed
    from the docstring's wording — confirm against real documents.
    """
    inside_global = False
    inside_rule = False
    inside_chart = False
    paras_to_delete = []
    # list(...) so we do not mutate the tree while iterating it.
    for p in list(doc_el.iter(f'{{{W}}}p')):
        t_nodes = list(p.iter(f'{{{W}}}t'))
        full = ''.join(t.text or '' for t in t_nodes)
        if not full:
            # An empty paragraph inside a block is deleted as well.
            if inside_global or inside_rule or inside_chart:
                paras_to_delete.append(p)
            continue
        # Paragraph inside an open block: delete it.
        if inside_global or inside_rule or inside_chart:
            paras_to_delete.append(p)
        # global_rule block
        if '<global_rule>' in full:
            inside_global = True
            if p not in paras_to_delete:
                paras_to_delete.append(p)
        if '</global_rule>' in full:
            inside_global = False
        # rule block
        if '<rule>' in full:
            inside_rule = True
            if p not in paras_to_delete:
                paras_to_delete.append(p)
        if '</rule>' in full:
            inside_rule = False
        # chart_rule block
        if '<chart_rule>' in full:
            inside_chart = True
            if p not in paras_to_delete:
                paras_to_delete.append(p)
        if '</chart_rule>' in full:
            inside_chart = False
    for p in paras_to_delete:
        parent = p.getparent()
        if parent is not None:
            parent.remove(p)


def process(input_docx, output_docx, replacements, color_keywords):
    """Unpack, transform word/document.xml (rules, replacements, colors), repack."""
    with tempfile.TemporaryDirectory() as tmpdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, tmpdir)
        doc_xml_path = os.path.join(tmpdir, 'word', 'document.xml')
        tree = etree.parse(doc_xml_path)
        root = tree.getroot()

        # Strip global/ordinary rule blocks first (tags may span paragraphs).
        remove_rule_blocks(root)

        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            remaining_replacements = []
            for old, new in replacements:
                # Multi-paragraph OLD text goes through the block replacer.
                if '\n\n' in _normalize_newlines(old):
                    replaced = _replace_paragraph_block(root, old, new)
                    if replaced:
                        print("🧩 跨段替换命中")
                        continue
                remaining_replacements.append((old, new))
            if remaining_replacements:
                for para in root.iter(f'{{{W}}}p'):
                    paragraph_replace(para, remaining_replacements)

        # Colorize keywords parsed from span markup (or given on the CLI).
        for item in color_keywords:
            # Backward compatible: (keyword, color) or (keyword, color, context)
            if len(item) == 2:
                keyword, color = item
                context_text = None
            else:
                keyword, color, context_text = item
            print(f"🎨 关键词「{keyword}」→ #{color}")
            apply_color_to_keyword(root, keyword, color, context_text)

        tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(tmpdir, output_docx, input_docx)
        print(f"✅ 完成!输出: {output_docx}")


def _parse_span_replacement(new_text):
    """Parse span color markup out of a NEW replacement text.

    Convention (case-insensitive):
        <span color="FF0000">keyword</span>
        <span color="red">keyword</span>

    Returns (plain_text, [(keyword, hex_color, paragraph_context), ...]).
    """
    new_text = _normalize_newlines(new_text)

    # Small named-color -> hex map; extend as needed.
    named_colors = {
        'red': 'FF0000',
        'blue': '0000FF',
        'green': '00FF00',
        'yellow': 'FFFF00',
        'black': '000000',
        'white': 'FFFFFF',
        'gray': '808080',
        'grey': '808080',
    }

    def _normalize_color(raw_color: str) -> str:
        """Accept FFFFFF / #FFFFFF / named colors; return bare upper-case hex.

        Unrecognized names fall through unchanged (minus any leading '#').
        """
        c = (raw_color or '').strip()
        if not c:
            return ''
        # Drop a leading '#'.
        if c.startswith('#'):
            c = c[1:]
        # Plain 6-digit hex.
        if re.fullmatch(r'[0-9a-fA-F]{6}', c):
            return c.upper()
        # Named color.
        mapped = named_colors.get(c.lower())
        if mapped:
            return mapped
        # Fallback: return the '#'-stripped original value.
        return c.upper()

    # The color attribute may be 6-digit hex (optionally '#'-prefixed)
    # or a named color (red / blue ...).
    # NOTE(review): this pattern was reconstructed after markup-stripping
    # mangled the original literal — verify against expected inputs.
    span_pattern = re.compile(
        r'<span[^>]*?color=["\']?([^"\'\s>]+)["\']?[^>]*>(.*?)</span>',
        re.IGNORECASE | re.DOTALL,
    )

    def _strip_repl(m):
        return m.group(2)

    # Split on paragraph boundaries first so each span can use its own
    # paragraph's plain text as its coloring context.
    plain_segments = []
    color_keywords = []
    for segment in new_text.split('\n\n'):
        plain_segment = span_pattern.sub(_strip_repl, segment)
        plain_segments.append(plain_segment)
        for m in span_pattern.finditer(segment):
            raw_color = m.group(1)
            hex_color = _normalize_color(raw_color)
            keyword = m.group(2)
            # Triple: (keyword, color, plain-text context of its paragraph)
            color_keywords.append((keyword, hex_color, plain_segment))

    plain_text = '\n\n'.join(plain_segments)
    return plain_text, color_keywords


def main():
    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    # --color was documented in the module usage but missing from the parser.
    parser.add_argument('--color', nargs=2, metavar=('KEYWORD', 'HEX'),
                        action='append', default=[],
                        help='给关键词上色,如 --color "关键词" "FF0000"')
    args = parser.parse_args()

    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")

    # Pull span-colored keywords out of each NEW text.
    replacements = []
    color_keywords = []
    for old, new_raw in args.replace:
        new_plain, spans = _parse_span_replacement(new_raw)
        replacements.append((old, new_plain))
        color_keywords.extend(spans)
    # Explicit CLI keywords: (keyword, color) pairs, no paragraph context.
    for keyword, color in args.color:
        color_keywords.append((keyword, color))

    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=replacements,
        color_keywords=color_keywords,
    )


if __name__ == '__main__':
    main()