This commit is contained in:
2026-03-20 19:19:26 +08:00
parent f009258769
commit 248f7a2637
4 changed files with 457 additions and 53 deletions

View File

@@ -29,6 +29,7 @@ import tempfile
import zipfile
from lxml import etree
from PIL import Image
import re
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
@@ -260,16 +261,39 @@ def set_color_on_rpr(rpr_el, hex_color):
c = etree.SubElement(rpr_el, f'{{{W}}}color')
c.set(f'{{{W}}}val', hex_color.lstrip('#'))
def apply_color_to_keyword(doc_el, keyword, hex_color):
def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
"""
只给匹配到的关键字本身着色,而不是整个 run。
做法:在有关键字的 run 上,把文本拆成多段 run
[前缀][关键字][后缀],只有“关键字”这个 run 设置颜色。
当 context_text 不为空时,只在“整段文本包含该 context_text 的段落”中进行上色,
避免同一个关键字在其他段落里被误伤(例如单独的数字 0
"""
# 如果提供了上下文,只在包含该上下文的段落内着色
allowed_paras = None
if context_text:
allowed_paras = set()
for p in doc_el.iter(f'{{{W}}}p'):
t_nodes = list(p.iter(f'{{{W}}}t'))
full = ''.join(t.text or '' for t in t_nodes)
if context_text in full:
allowed_paras.add(p)
def _find_ancestor_para(el):
cur = el
while cur is not None and cur.tag != f'{{{W}}}p':
cur = cur.getparent()
return cur
# 先 list 一下,避免在遍历时修改树结构导致问题
runs = list(doc_el.iter(f'{{{W}}}r'))
for run in runs:
if allowed_paras is not None:
para = _find_ancestor_para(run)
if para not in allowed_paras:
continue
t_nodes = list(run.findall(f'{{{W}}}t'))
if not t_nodes:
continue
@@ -322,6 +346,65 @@ def apply_color_to_keyword(doc_el, keyword, hex_color):
if seg_text:
parent.insert(insert_pos + offset, make_run(seg_text, colored))
def remove_rule_blocks(doc_el):
    """
    Delete every paragraph that belongs to a rule block.

    A rule block is the content enclosed by one of these marker pairs:
    ``<global_rule>``/``</global_rule>``, ``<rule>``/``</rule>`` or
    ``<chart_rule>``/``</chart_rule>``. Blocks may span several paragraphs,
    so paragraphs are visited in document order while tracking, per marker
    kind, whether a block is currently open.

    A paragraph is removed as a whole when:
    - a block of any kind was already open when the paragraph started
      (this includes empty paragraphs and the paragraph holding the
      closing marker), or
    - the paragraph contains an opening marker.

    Whether a block stays open after a paragraph is decided by which of its
    markers occurs last in that paragraph's text, so ``...</rule><rule>...``
    correctly leaves the second block open.

    Args:
        doc_el: root element to scan; paragraphs are removed from it in place.
    """
    para_tag = f'{{{W}}}p'
    text_tag = f'{{{W}}}t'

    # (opening marker, closing marker) for every supported block kind.
    markers = (
        ('<global_rule>', '</global_rule>'),
        ('<rule>', '</rule>'),
        ('<chart_rule>', '</chart_rule>'),
    )
    # open_state[i] is True while a block of kind markers[i] is open.
    open_state = [False] * len(markers)
    doomed = []

    # The tree is not modified during this pass; removal happens afterwards.
    for para in doc_el.iter(para_tag):
        # Markers can be split over several runs, so match on the joined text.
        text = ''.join(t.text or '' for t in para.iter(text_tag))

        # Anything inside an already-open block goes away, empty or not.
        drop = any(open_state)

        for idx, (start, end) in enumerate(markers):
            last_start = text.rfind(start)
            last_end = text.rfind(end)
            if last_start != -1:
                # A paragraph that opens a block is itself removed.
                drop = True
            if last_start != -1 or last_end != -1:
                # The marker that appears last decides the state carried
                # into the next paragraph.
                open_state[idx] = last_start > last_end

        if drop:
            doomed.append(para)

    for para in doomed:
        parent = para.getparent()
        if parent is not None:
            parent.remove(para)
def process(input_docx, output_docx, replacements, image_replacements,
color_keywords):
with tempfile.TemporaryDirectory() as tmpdir:
@@ -338,15 +421,24 @@ def process(input_docx, output_docx, replacements, image_replacements,
tree = etree.parse(doc_xml_path)
root = tree.getroot()
# 先整体删除全局规则和普通规则块(支持标签跨段落)
remove_rule_blocks(root)
if replacements:
print(f"✏️ 替换 {len(replacements)} 条文本...")
for para in root.iter(f'{{{W}}}p'):
paragraph_replace(para, replacements)
# 根据 span 解析出的关键字上色
for keyword, color in color_keywords:
for item in color_keywords:
# 兼容旧格式: (keyword, color)
if len(item) == 2:
keyword, color = item
context_text = None
else:
keyword, color, context_text = item
print(f"🎨 关键词「{keyword}」→ #{color}")
apply_color_to_keyword(root, keyword, color)
apply_color_to_keyword(root, keyword, color, context_text)
tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)
print(f"📦 打包 → {output_docx} ...")
@@ -359,27 +451,75 @@ def _parse_span_replacement(new_text):
解析 NEW 文本中的 span 标签,用于决定颜色。
约定格式(不区分大小写):
<span color="FF0000">待补充</span>
<span color="#FF0000">待补充</span>
<span color="red">待补充</span>
返回: (纯文本, [(keyword, hex_color), ...])
"""
import re
# 简单的命名颜色到 16 进制的映射,可按需扩展
named_colors = {
'red': 'FF0000',
'blue': '0000FF',
'green': '00FF00',
'yellow': 'FFFF00',
'black': '000000',
'white': 'FFFFFF',
'gray': '808080',
'grey': '808080',
}
def _normalize_color(raw_color: str) -> str:
"""
支持:
- FFFFFF / ffffff
- #FFFFFF / #ffffff
- red / blue 等命名颜色(见 named_colors
返回不带 # 的大写 16 进制字符串;如果无法识别命名颜色则原样返回(去掉 #)。
"""
c = (raw_color or '').strip()
if not c:
return ''
# 去掉前导 #
if c.startswith('#'):
c = c[1:]
# 纯 16 进制
if re.fullmatch(r'[0-9a-fA-F]{6}', c):
return c.upper()
# 命名颜色
mapped = named_colors.get(c.lower())
if mapped:
return mapped
# 兜底:返回去掉 # 的原值
return c.upper()
# color 属性允许:
# - 6 位 16 进制(可带 #
# - 命名颜色red / blue ...
span_pattern = re.compile(
r'<span\s+[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
r'<span\s+[^>]*?color=["\']?([^"\'\s>]+)["\']?[^>]*>(.*?)</span>',
re.IGNORECASE | re.DOTALL,
)
# 先得到去掉 span 标签后的纯文本(也是最终会写入 DOCX 的内容)
def _strip_repl(m):
return m.group(2)
plain_text = span_pattern.sub(_strip_repl, new_text)
# 再次遍历 span收集颜色关键字并把“整句纯文本”作为上下文挂在每个关键字上
color_keywords = []
def _repl(m):
hex_color = m.group(1).lstrip('#')
for m in span_pattern.finditer(new_text):
raw_color = m.group(1)
hex_color = _normalize_color(raw_color)
keyword = m.group(2)
color_keywords.append((keyword, hex_color))
return keyword
# 三元组: (关键字, 颜色, 该 NEW 对应的整句纯文本上下文)
color_keywords.append((keyword, hex_color, plain_text))
plain_text = span_pattern.sub(_repl, new_text)
return plain_text, color_keywords