update
This commit is contained in:
164
mcp_docx.py
164
mcp_docx.py
@@ -29,6 +29,7 @@ import tempfile
|
||||
import zipfile
|
||||
from lxml import etree
|
||||
from PIL import Image
|
||||
import re
|
||||
|
||||
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
|
||||
@@ -260,16 +261,39 @@ def set_color_on_rpr(rpr_el, hex_color):
|
||||
c = etree.SubElement(rpr_el, f'{{{W}}}color')
|
||||
c.set(f'{{{W}}}val', hex_color.lstrip('#'))
|
||||
|
||||
def apply_color_to_keyword(doc_el, keyword, hex_color):
|
||||
def apply_color_to_keyword(doc_el, keyword, hex_color, context_text=None):
|
||||
"""
|
||||
只给匹配到的关键字本身着色,而不是整个 run。
|
||||
|
||||
做法:在有关键字的 run 上,把文本拆成多段 run:
|
||||
[前缀][关键字][后缀],只有“关键字”这个 run 设置颜色。
|
||||
|
||||
当 context_text 不为空时,只在“整段文本包含该 context_text 的段落”中进行上色,
|
||||
避免同一个关键字在其他段落里被误伤(例如单独的数字 0)。
|
||||
"""
|
||||
# 如果提供了上下文,只在包含该上下文的段落内着色
|
||||
allowed_paras = None
|
||||
if context_text:
|
||||
allowed_paras = set()
|
||||
for p in doc_el.iter(f'{{{W}}}p'):
|
||||
t_nodes = list(p.iter(f'{{{W}}}t'))
|
||||
full = ''.join(t.text or '' for t in t_nodes)
|
||||
if context_text in full:
|
||||
allowed_paras.add(p)
|
||||
|
||||
def _find_ancestor_para(el):
|
||||
cur = el
|
||||
while cur is not None and cur.tag != f'{{{W}}}p':
|
||||
cur = cur.getparent()
|
||||
return cur
|
||||
|
||||
# 先 list 一下,避免在遍历时修改树结构导致问题
|
||||
runs = list(doc_el.iter(f'{{{W}}}r'))
|
||||
for run in runs:
|
||||
if allowed_paras is not None:
|
||||
para = _find_ancestor_para(run)
|
||||
if para not in allowed_paras:
|
||||
continue
|
||||
t_nodes = list(run.findall(f'{{{W}}}t'))
|
||||
if not t_nodes:
|
||||
continue
|
||||
@@ -322,6 +346,65 @@ def apply_color_to_keyword(doc_el, keyword, hex_color):
|
||||
if seg_text:
|
||||
parent.insert(insert_pos + offset, make_run(seg_text, colored))
|
||||
|
||||
|
||||
def remove_rule_blocks(doc_el):
    """
    Delete every paragraph that belongs to a ``<global_rule>...</global_rule>``,
    ``<rule>...</rule>`` or ``<chart_rule>...</chart_rule>`` block.

    Paragraphs (``w:p``) are visited in document order and the text of all
    their ``w:t`` descendants is concatenated. A block may span several
    paragraphs, so an "inside" flag is tracked per block type:

    - A paragraph is deleted when any block was already open on entry, or
      when it contains an opening tag itself.
    - Empty paragraphs are deleted only when they sit inside an open block.
    - Deletion is per paragraph: any text sharing a paragraph with an
      opening tag (or lying inside an open block) is removed with it.
    - A paragraph holding only a closing tag while no block is open is kept.

    After a paragraph that mentions a tag, the block is considered open
    only if the last opening tag comes after the last closing tag. This
    keeps the state correct for a paragraph such as ``</rule>...<rule>``
    that ends one block and immediately starts the next.

    Args:
        doc_el: Element whose ``w:p`` descendants are scanned; matching
            paragraphs are removed from their parents in place.

    Returns:
        None.
    """
    block_tags = (
        ('<global_rule>', '</global_rule>'),
        ('<rule>', '</rule>'),
        ('<chart_rule>', '</chart_rule>'),
    )
    # One open/closed flag per entry of block_tags.
    inside = [False] * len(block_tags)
    paras_to_delete = []

    # Removal is deferred to the second loop, so the tree is not modified
    # while it is being traversed here.
    for p in doc_el.iter(f'{{{W}}}p'):
        full = ''.join(t.text or '' for t in p.iter(f'{{{W}}}t'))

        # Decide from the state on entry; an opening tag found below also
        # marks this paragraph. Each paragraph is appended at most once, so
        # no membership scan of the list is needed.
        delete_this = any(inside)

        if full:
            for idx, (open_tag, close_tag) in enumerate(block_tags):
                open_at = full.rfind(open_tag)
                close_at = full.rfind(close_tag)
                if open_at != -1:
                    delete_this = True
                if open_at != -1 or close_at != -1:
                    # Whichever tag appears last decides the state that
                    # carries over to the next paragraph.
                    inside[idx] = open_at > close_at

        if delete_this:
            paras_to_delete.append(p)

    for p in paras_to_delete:
        parent = p.getparent()
        if parent is not None:
            parent.remove(p)
|
||||
|
||||
def process(input_docx, output_docx, replacements, image_replacements,
|
||||
color_keywords):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -338,15 +421,24 @@ def process(input_docx, output_docx, replacements, image_replacements,
|
||||
tree = etree.parse(doc_xml_path)
|
||||
root = tree.getroot()
|
||||
|
||||
# 先整体删除全局规则和普通规则块(支持标签跨段落)
|
||||
remove_rule_blocks(root)
|
||||
|
||||
if replacements:
|
||||
print(f"✏️ 替换 {len(replacements)} 条文本...")
|
||||
for para in root.iter(f'{{{W}}}p'):
|
||||
paragraph_replace(para, replacements)
|
||||
|
||||
# 根据 span 解析出的关键字上色
|
||||
for keyword, color in color_keywords:
|
||||
for item in color_keywords:
|
||||
# 兼容旧格式: (keyword, color)
|
||||
if len(item) == 2:
|
||||
keyword, color = item
|
||||
context_text = None
|
||||
else:
|
||||
keyword, color, context_text = item
|
||||
print(f"🎨 关键词「{keyword}」→ #{color}")
|
||||
apply_color_to_keyword(root, keyword, color)
|
||||
apply_color_to_keyword(root, keyword, color, context_text)
|
||||
|
||||
tree.write(doc_xml_path, xml_declaration=True, encoding='UTF-8', standalone=True)
|
||||
print(f"📦 打包 → {output_docx} ...")
|
||||
@@ -359,27 +451,75 @@ def _parse_span_replacement(new_text):
|
||||
解析 NEW 文本中的 span 标签,用于决定颜色。
|
||||
|
||||
约定格式(不区分大小写):
|
||||
<span color="FF0000">待补充</span>
|
||||
<span color="#FF0000">待补充</span>
|
||||
<span color="red">待补充</span>
|
||||
|
||||
返回: (纯文本, [(keyword, hex_color), ...])
|
||||
"""
|
||||
import re
|
||||
|
||||
# 简单的命名颜色到 16 进制的映射,可按需扩展
|
||||
named_colors = {
|
||||
'red': 'FF0000',
|
||||
'blue': '0000FF',
|
||||
'green': '00FF00',
|
||||
'yellow': 'FFFF00',
|
||||
'black': '000000',
|
||||
'white': 'FFFFFF',
|
||||
'gray': '808080',
|
||||
'grey': '808080',
|
||||
}
|
||||
|
||||
def _normalize_color(raw_color: str) -> str:
|
||||
"""
|
||||
支持:
|
||||
- FFFFFF / ffffff
|
||||
- #FFFFFF / #ffffff
|
||||
- red / blue 等命名颜色(见 named_colors)
|
||||
返回不带 # 的大写 16 进制字符串;如果无法识别命名颜色则原样返回(去掉 #)。
|
||||
"""
|
||||
c = (raw_color or '').strip()
|
||||
if not c:
|
||||
return ''
|
||||
|
||||
# 去掉前导 #
|
||||
if c.startswith('#'):
|
||||
c = c[1:]
|
||||
|
||||
# 纯 16 进制
|
||||
if re.fullmatch(r'[0-9a-fA-F]{6}', c):
|
||||
return c.upper()
|
||||
|
||||
# 命名颜色
|
||||
mapped = named_colors.get(c.lower())
|
||||
if mapped:
|
||||
return mapped
|
||||
|
||||
# 兜底:返回去掉 # 的原值
|
||||
return c.upper()
|
||||
|
||||
# color 属性允许:
|
||||
# - 6 位 16 进制(可带 #)
|
||||
# - 命名颜色(red / blue ...)
|
||||
span_pattern = re.compile(
|
||||
r'<span\s+[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
|
||||
r'<span\s+[^>]*?color=["\']?([^"\'\s>]+)["\']?[^>]*>(.*?)</span>',
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
# 先得到去掉 span 标签后的纯文本(也是最终会写入 DOCX 的内容)
|
||||
def _strip_repl(m):
|
||||
return m.group(2)
|
||||
|
||||
plain_text = span_pattern.sub(_strip_repl, new_text)
|
||||
|
||||
# 再次遍历 span,收集颜色关键字,并把“整句纯文本”作为上下文挂在每个关键字上
|
||||
color_keywords = []
|
||||
|
||||
def _repl(m):
|
||||
hex_color = m.group(1).lstrip('#')
|
||||
for m in span_pattern.finditer(new_text):
|
||||
raw_color = m.group(1)
|
||||
hex_color = _normalize_color(raw_color)
|
||||
keyword = m.group(2)
|
||||
color_keywords.append((keyword, hex_color))
|
||||
return keyword
|
||||
# 三元组: (关键字, 颜色, 该 NEW 对应的整句纯文本上下文)
|
||||
color_keywords.append((keyword, hex_color, plain_text))
|
||||
|
||||
plain_text = span_pattern.sub(_repl, new_text)
|
||||
return plain_text, color_keywords
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user