first commit
This commit is contained in:
24
Dockerfile
Normal file
24
Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
# Do not write .pyc files, and keep stdout/stderr unbuffered so logs show up immediately
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 安装系统依赖(Pandoc)
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends pandoc \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 安装 Python 依赖
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# 拷贝代码和相关资源(包括 ref.docx、color.lua 等)
|
||||
COPY . .
|
||||
|
||||
# 默认启动 MCP 服务器
|
||||
CMD ["python", "mcp_docx_server.py"]
|
||||
|
||||
|
||||
107
README.md
Normal file
107
README.md
Normal file
@@ -0,0 +1,107 @@
|
||||
## DOCX 转换工具 MCP 服务器
|
||||
|
||||
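这是一个基于 MCP (Model Context Protocol) 的服务器,目前**只提供 HTML → DOCX** 的转换能力,底层通过 Pandoc 实现高质量排版。(注意:当前 `mcp_docx_server.py` 实际注册的工具是 `list_docx_images` 与 `edit_docx`,本文档中关于 `html_to_docx_pandoc` 的说明尚未与代码同步。)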
|
||||
|
||||
### 功能
|
||||
|
||||
- **html_to_docx_pandoc**:将包含 HTML 标签的文本转换为 DOCX 文件,支持引用模板、Lua 过滤器等高级格式控制。
|
||||
|
||||
### 安装依赖(本机运行)
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
请确保系统已安装 Pandoc(`pandoc --version` 可正常执行)。
|
||||
|
||||
### 使用方法
|
||||
|
||||
#### 方式一:本机直接运行
|
||||
|
||||
运行服务器:
|
||||
|
||||
```bash
|
||||
python mcp_docx_server.py
|
||||
```
|
||||
|
||||
在 MCP 客户端中连接该服务器后,会看到一个名为 `html_to_docx_pandoc` 的工具。
|
||||
|
||||
#### 方式二:使用 Docker 封装运行
|
||||
|
||||
本项目已提供 `Dockerfile`,可以直接构建镜像并运行:
|
||||
|
||||
```bash
|
||||
# 构建镜像
|
||||
docker build -t mcp-docx-server .
|
||||
|
||||
# 运行容器(前台运行)
|
||||
docker run --rm -it mcp-docx-server
|
||||
```
|
||||
|
||||
如果你希望在容器外部自定义 `ref.docx`、`color.lua` 或输出目录,可以通过挂载卷的方式:
|
||||
|
||||
```bat
|
||||
docker run --rm -it ^
|
||||
-v %cd%/ref.docx:/app/ref.docx ^
|
||||
-v %cd%/color.lua:/app/color.lua ^
|
||||
-v %cd%/output:/app/output ^
|
||||
mcp-docx-server
|
||||
```
|
||||
|
||||
在类 Unix 系统(如 macOS / Linux)中可改为:
|
||||
|
||||
```bash
|
||||
docker run --rm -it \
|
||||
-v "$(pwd)/ref.docx:/app/ref.docx" \
|
||||
-v "$(pwd)/color.lua:/app/color.lua" \
|
||||
-v "$(pwd)/output:/app/output" \
|
||||
mcp-docx-server
|
||||
```
|
||||
|
||||
#### 方式三:使用 docker-compose 运行
|
||||
|
||||
已提供 `docker-compose.yml`,可以一条命令完成构建与运行:
|
||||
|
||||
```bash
|
||||
# 构建并启动(前台)
|
||||
docker-compose up --build
|
||||
```
|
||||
|
||||
默认会:
|
||||
|
||||
- **构建镜像**:使用当前目录下的 `Dockerfile`
|
||||
- **挂载当前目录到容器 `/app`**:方便直接访问 `ref.docx`、`color.lua` 和输出文件
|
||||
- **在容器内执行**:`python mcp_docx_server.py`
|
||||
|
||||
如需在后台运行,可使用:
|
||||
|
||||
```bash
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
### 工具说明:html_to_docx_pandoc
|
||||
|
||||
**作用:** 使用 Pandoc 将 HTML 文本转换为 DOCX 文件,尽可能保留原始样式,并支持:
|
||||
- 使用 `ref.docx` 作为参考模板(如果文件存在)
|
||||
- 使用 `color.lua` 作为 Lua 过滤器(如果文件存在)
|
||||
- 独立 HTML 模式、图片提取、自定义 CSS 等选项
|
||||
|
||||
**参数:**
|
||||
- `html_text`(必需):需要转换的 HTML 文本内容
|
||||
- `output_path`(必需):输出 DOCX 文件的完整路径
|
||||
- `standalone`(可选,默认 `true`):是否以独立 HTML 模式调用 Pandoc
|
||||
- `extract_media`(可选):图片提取目录(如 `./media`),不需要提取可不传
|
||||
- `css_file`(可选):CSS 样式文件路径
|
||||
|
||||
### 依赖项
|
||||
|
||||
- `mcp`: MCP Python SDK
|
||||
- `lxml`、`Pillow`:`mcp_docx.py` 解析 DOCX 内部 XML、转换图片格式时使用;`python-docx`:已列入 `requirements.txt`,但当前代码未直接导入
|
||||
- **外部工具**:Pandoc(必须预先在系统中安装)
|
||||
|
||||
### 注意事项
|
||||
|
||||
- 必须安装 MCP SDK 才能运行服务器
|
||||
- 确保有足够的权限读取输入文件和写入输出文件
|
||||
- 大文件转换可能需要较长时间
|
||||
|
||||
18
docker-compose.yml
Normal file
18
docker-compose.yml
Normal file
@@ -0,0 +1,18 @@
|
||||
version: "3.9"
|
||||
|
||||
services:
|
||||
mcp-docx-server:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
container_name: mcp-docx-server
|
||||
working_dir: /app
|
||||
# MCP 通常通过标准输入/输出与客户端通信,因此不需要暴露端口
|
||||
stdin_open: true
|
||||
tty: true
|
||||
volumes:
|
||||
# 可选:将当前目录挂载到容器内,便于共享 ref.docx、color.lua 和输出文件
|
||||
- ./:/app
|
||||
command: ["python", "mcp_docx_server.py"]
|
||||
|
||||
|
||||
384
mcp_docx.py
Normal file
384
mcp_docx.py
Normal file
@@ -0,0 +1,384 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
docx_editor.py — 保留原格式替换文本 + 修改字体颜色 + 替换图片
|
||||
|
||||
用法:
|
||||
# 列出文档中所有图片
|
||||
python3 docx_editor.py input.docx --list-images
|
||||
|
||||
# 文本替换 + 颜色
|
||||
python3 docx_editor.py input.docx output.docx \
|
||||
--replace "原文" "新文" \
|
||||
--color "关键词" "FF0000"
|
||||
|
||||
# 图片替换(按文档中出现的顺序,从1开始)
|
||||
python3 docx_editor.py input.docx output.docx \
|
||||
--image 1 new_chart.png \
|
||||
--image 2 new_photo.jpg
|
||||
|
||||
# 同时替换文字和图片
|
||||
python3 docx_editor.py input.docx output.docx \
|
||||
--replace "旧标题" "新标题" \
|
||||
--image 1 new_image.png \
|
||||
--color "重点" "FF0000"
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
from lxml import etree
|
||||
from PIL import Image
|
||||
|
||||
# XML namespace URIs used to build qualified tag/attribute names ("{uri}local")
# when walking the OOXML parts of a .docx package.
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
WD = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

# Relationship "Type" value that identifies an image entry in a .rels part.
REL_TYPE_IMAGE = R + '/image'

# File extension (lower case, no dot) -> MIME type written to
# [Content_Types].xml when a new image extension is introduced.
EXT_TO_MIME = {
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'gif': 'image/gif',
    'bmp': 'image/bmp',
    'tiff': 'image/tiff',
    'webp': 'image/webp',
}
|
||||
|
||||
|
||||
def unpack(docx_path, out_dir):
    """Extract every member of the .docx (ZIP) archive into ``out_dir``.

    Uses the standard-library ``zipfile`` module, so no external unpack
    script is needed.
    """
    with zipfile.ZipFile(docx_path, mode='r') as archive:
        archive.extractall(path=out_dir)
|
||||
|
||||
|
||||
def pack(unpacked_dir, output_docx, original_docx):
    """Zip the contents of ``unpacked_dir`` into a .docx at ``output_docx``.

    The directory that will hold ``output_docx`` is created when it does not
    exist yet. Every file below ``unpacked_dir`` is stored deflated under its
    path relative to ``unpacked_dir``, using ``/`` as separator.

    ``original_docx`` is accepted only to keep the historical signature; it
    is not used.
    """
    target_dir = os.path.dirname(os.path.abspath(output_docx))
    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    archive = zipfile.ZipFile(output_docx, 'w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for folder, _subdirs, filenames in os.walk(unpacked_dir):
            for filename in filenames:
                source = os.path.join(folder, filename)
                member = os.path.relpath(source, unpacked_dir)
                # ZIP member names always use forward slashes.
                archive.write(source, member.replace(os.sep, '/'))
|
||||
|
||||
|
||||
def _resolve_media_path(package_root, target):
    """Map a relationship Target to a file path inside ``package_root``.

    Relative targets are resolved against ``word/`` (the folder of
    document.xml); targets starting with ``/`` are resolved against the
    package root. Returns ``None`` when the resolved path would fall outside
    ``package_root``, so callers never touch files outside the unpacked tree.
    """
    if target.startswith('/'):
        parts = [package_root] + target.lstrip('/').split('/')
    else:
        parts = [package_root, 'word'] + target.split('/')
    resolved = os.path.normpath(os.path.join(*parts))
    try:
        inside = os.path.commonpath([package_root, resolved]) == package_root
    except ValueError:
        # Raised e.g. for paths on different drives: treat as "outside".
        inside = False
    return resolved if inside else None


def build_image_index(unpacked_dir):
    """Return the embedded images of an unpacked DOCX in document order.

    Reads ``word/_rels/document.xml.rels`` to map relationship ids to media
    targets, then walks every ``a:blip`` in ``word/document.xml``.

    Each list item is a dict with the keys ``index`` (1-based position),
    ``rid``, ``media_file`` (the Target exactly as written in the rels part),
    ``abs_path`` (resolved file path), ``ext`` (lower-case extension without
    dot), ``docpr_name`` and ``width_cm`` / ``height_cm`` (``None`` when no
    extent element is found).

    Blips are skipped when their relationship is missing, is not an image,
    is marked ``TargetMode="External"``, or resolves outside ``unpacked_dir``.
    """
    package_root = os.path.abspath(unpacked_dir)
    word_dir = os.path.join(unpacked_dir, 'word')
    doc_xml = os.path.join(word_dir, 'document.xml')
    rels_xml = os.path.join(word_dir, '_rels', 'document.xml.rels')

    rid_to_media = {}
    for rel in etree.parse(rels_xml).getroot():
        if rel.get('Type', '') != REL_TYPE_IMAGE:
            continue
        if rel.get('TargetMode') == 'External':
            # Linked (not embedded) picture: there is no file in the package.
            continue
        target = rel.get('Target')
        if target:
            rid_to_media[rel.get('Id')] = target

    container_tags = (f'{{{WD}}}inline', f'{{{WD}}}anchor')
    doc_root = etree.parse(doc_xml).getroot()
    results = []
    for blip in doc_root.iter(f'{{{A}}}blip'):
        rid = blip.get(f'{{{R}}}embed')
        if not rid or rid not in rid_to_media:
            continue
        media_rel = rid_to_media[rid]
        media_abs = _resolve_media_path(package_root, media_rel)
        if media_abs is None:
            continue
        ext = os.path.splitext(media_rel)[1].lstrip('.').lower()

        # Climb to the enclosing wp:inline / wp:anchor, which carries the
        # display size and the picture name.
        container = blip
        while container is not None and container.tag not in container_tags:
            container = container.getparent()

        w_cm = h_cm = None
        docpr_name = ''
        if container is not None:
            extent = container.find(f'{{{WD}}}extent')
            if extent is not None:
                # Extents are divided by 360000 to express them in cm.
                w_cm = round(int(extent.get('cx', 0)) / 360000, 2)
                h_cm = round(int(extent.get('cy', 0)) / 360000, 2)
            doc_pr = container.find(f'{{{WD}}}docPr')
            if doc_pr is not None:
                docpr_name = doc_pr.get('name', '')

        results.append({
            'index': len(results) + 1, 'rid': rid,
            'media_file': media_rel, 'abs_path': media_abs,
            'ext': ext, 'docpr_name': docpr_name,
            'width_cm': w_cm, 'height_cm': h_cm,
        })
    return results
|
||||
|
||||
|
||||
def list_images(docx_path):
    """Print a table of the images found in ``docx_path`` to stdout.

    One row per image: position, media file name, size in cm (or a
    placeholder when the width is missing/zero) and the Word-internal name.
    Prints a single notice instead when the document has no images.
    """
    images = get_images_info(docx_path)
    if not images:
        print("文档中没有找到图片。")
        return

    print(f"共找到 {len(images)} 张图片:\n")
    print(f" {'#':<4} {'文件名':<20} {'尺寸':<18} Word内部名称")
    print(" " + "-" * 62)
    for entry in images:
        if entry['width_cm']:
            size = f"{entry['width_cm']}×{entry['height_cm']}cm"
        else:
            size = "未知"
        file_name = os.path.basename(entry['media_file'])
        print(f" {entry['index']:<4} {file_name:<20} {size:<18} {entry['docpr_name']}")
|
||||
|
||||
|
||||
def get_images_info(docx_path):
    """Return structured information about every image in a DOCX file.

    Intended for reuse by other modules (for example the MCP server): it
    yields the same data ``list_images`` prints, without printing anything.

    The archive is unpacked into a temporary directory that is removed
    before this function returns, so the ``abs_path`` values in the result
    point at files that no longer exist.
    """
    with tempfile.TemporaryDirectory() as workdir:
        unpack(docx_path, workdir)
        images = build_image_index(workdir)
    return images
|
||||
|
||||
|
||||
def replace_image(unpacked_dir, index, new_image_path):
    """Replace the ``index``-th image (1-based) of an unpacked DOCX.

    When the new file has the same format as the existing media file
    (``jpg`` and ``jpeg`` count as the same format) it is copied over the old
    file unchanged. Otherwise the new image is re-saved with Pillow next to
    the old one under the new extension, the relationship Target is pointed
    at it, and ``[Content_Types].xml`` gains a ``Default`` entry for the new
    extension if it has none.

    Raises:
        ValueError: if ``index`` is outside ``1..number_of_images``.
    """
    imgs = build_image_index(unpacked_dir)
    if index < 1 or index > len(imgs):
        raise ValueError(f"图片序号 {index} 超出范围(共 {len(imgs)} 张)")

    info = imgs[index - 1]
    old_abs = info['abs_path']
    old_ext = info['ext']
    new_ext = os.path.splitext(new_image_path)[1].lstrip('.').lower()
    if new_ext == 'jpg':
        new_ext = 'jpeg'

    print(f" 图片#{index} {os.path.basename(info['media_file'])}({old_ext.upper()})"
          f" ← {os.path.basename(new_image_path)}({new_ext.upper()})")

    # Normalise the old extension the same way as the new one; otherwise a
    # .jpg -> .jpg replacement is mistaken for a format change and needlessly
    # re-encoded (lossy) and renamed.
    old_ext_norm = 'jpeg' if old_ext == 'jpg' else old_ext
    if old_ext_norm == new_ext:
        # Same format: overwrite the media file in place.
        import shutil
        shutil.copy2(new_image_path, old_abs)
        return

    # Different format: re-save via Pillow, then update rels + content types.
    new_abs = os.path.splitext(old_abs)[0] + '.' + new_ext
    fmt = {'jpeg': 'JPEG', 'png': 'PNG', 'gif': 'GIF',
           'bmp': 'BMP', 'tiff': 'TIFF', 'webp': 'WEBP'}.get(new_ext, new_ext.upper())
    with Image.open(new_image_path) as source:
        # JPEG has no alpha channel or palette; flatten those modes first.
        if fmt == 'JPEG' and source.mode in ('RGBA', 'LA', 'P'):
            converted = source.convert('RGB')
        else:
            converted = source
        converted.save(new_abs, format=fmt)

    # Point the relationship at the new media file.
    old_media = info['media_file']
    new_media = os.path.splitext(old_media)[0] + '.' + new_ext
    rels_path = os.path.join(unpacked_dir, 'word', '_rels', 'document.xml.rels')
    rels_tree = etree.parse(rels_path)
    still_referenced = False
    for rel in rels_tree.getroot():
        if rel.get('Id') == info['rid']:
            rel.set('Target', new_media)
        elif rel.get('Target') == old_media:
            # Another relationship of this part uses the old file: keep it.
            still_referenced = True
    rels_tree.write(rels_path, xml_declaration=True, encoding='UTF-8', standalone=True)

    if not still_referenced and os.path.abspath(new_abs) != os.path.abspath(old_abs):
        os.remove(old_abs)

    # Register the new extension in [Content_Types].xml when it is missing.
    ct_path = os.path.join(unpacked_dir, '[Content_Types].xml')
    ct_tree = etree.parse(ct_path)
    ct_root = ct_tree.getroot()
    existing = {(el.get('Extension') or '').lower() for el in ct_root}
    if new_ext not in existing:
        # Create the element in the same namespace as the root element.
        ct_ns = etree.QName(ct_root).namespace
        default_tag = f'{{{ct_ns}}}Default' if ct_ns else 'Default'
        etree.SubElement(ct_root, default_tag, Extension=new_ext,
                         ContentType=EXT_TO_MIME.get(new_ext, f'image/{new_ext}'))
    ct_tree.write(ct_path, xml_declaration=True, encoding='UTF-8', standalone=True)
    print(f" 格式转换 {old_ext}→{new_ext},rels 和 ContentTypes 已更新")
|
||||
|
||||
|
||||
def paragraph_replace(para_el, replacements):
    """Apply text replacements inside each ``w:t`` element below ``para_el``.

    Only the text of individual ``w:t`` nodes is rewritten; no other element
    is touched. A search string is therefore only found when it lies entirely
    within one ``w:t`` node.

    Args:
        para_el: element whose descendant ``w:t`` nodes are rewritten.
        replacements: iterable of ``(old, new)`` string pairs, applied in
            order. Pairs with an empty ``old`` are ignored, because
            ``str.replace('', new)`` would insert ``new`` between every
            character.
    """
    pairs = [(old, new) for old, new in replacements if old]
    if not pairs:
        return

    for t_el in para_el.iter(f'{{{W}}}t'):
        original = t_el.text
        if not original:
            continue
        updated = original
        for old, new in pairs:
            updated = updated.replace(old, new)
        if updated == original:
            continue
        t_el.text = updated
        # Mark text with a leading/trailing space as xml:space="preserve".
        if updated and (updated[0] == ' ' or updated[-1] == ' '):
            t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
|
||||
|
||||
|
||||
def ensure_rpr(run_el):
    """Return the ``w:rPr`` child of ``run_el``, creating it if absent.

    A newly created ``w:rPr`` is inserted as the first child of the run.
    """
    tag = f'{{{W}}}rPr'
    existing = run_el.find(tag)
    if existing is not None:
        return existing
    created = etree.Element(tag)
    run_el.insert(0, created)
    return created
|
||||
|
||||
def set_color_on_rpr(rpr_el, hex_color):
    """Set the ``w:color`` value of a ``w:rPr`` element.

    Reuses an existing ``w:color`` child or appends a new one, then stores
    ``hex_color`` with any leading ``#`` characters removed in ``w:val``.
    """
    color_tag = f'{{{W}}}color'
    node = rpr_el.find(color_tag)
    node = etree.SubElement(rpr_el, color_tag) if node is None else node
    node.set(f'{{{W}}}val', hex_color.lstrip('#'))
|
||||
|
||||
def _split_on_keyword(text, keyword):
    """Split ``text`` into ``(segment, is_keyword)`` pairs, in order.

    ``keyword`` must be non-empty. Empty segments are never produced.
    """
    pieces = []
    pos = 0
    while True:
        hit = text.find(keyword, pos)
        if hit == -1:
            break
        if hit > pos:
            pieces.append((text[pos:hit], False))
        pieces.append((keyword, True))
        pos = hit + len(keyword)
    if pos < len(text):
        pieces.append((text[pos:], False))
    return pieces


def _new_run_like(orig_rpr, hex_color=None):
    """Create an empty ``w:r`` carrying a copy of ``orig_rpr`` (if any).

    When ``hex_color`` is given, the run's properties also get that color.
    """
    from copy import deepcopy

    new_r = etree.Element(f'{{{W}}}r')
    if orig_rpr is not None:
        new_r.append(deepcopy(orig_rpr))
    if hex_color is not None:
        set_color_on_rpr(ensure_rpr(new_r), hex_color)
    return new_r


def _append_text(run_el, text):
    """Append a ``w:t`` holding ``text`` to ``run_el``."""
    t_el = etree.SubElement(run_el, f'{{{W}}}t')
    t_el.text = text
    if text and (text[0] == ' ' or text[-1] == ' '):
        t_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')


def apply_color_to_keyword(doc_el, keyword, hex_color):
    """Color only the occurrences of ``keyword``, not the whole run.

    Each run whose text contains ``keyword`` is replaced by several runs:
    the keyword occurrences become their own runs with the color applied,
    everything else keeps the original run properties. Adjacent ``w:t``
    children of a run are matched as one string.

    Non-text children of the run (anything other than ``w:rPr`` / ``w:t``)
    are moved into the rebuilt runs at their original position instead of
    being discarded. An empty ``keyword`` is ignored.
    """
    if not keyword:
        # ''.find('') never advances, so an empty keyword would loop forever.
        return

    r_tag = f'{{{W}}}r'
    rpr_tag = f'{{{W}}}rPr'
    t_tag = f'{{{W}}}t'

    # Snapshot the runs first: the tree is modified while we go.
    for run in list(doc_el.iter(r_tag)):
        parent = run.getparent()
        if parent is None:
            continue

        # Ordered content of the run: a str for each stretch of adjacent
        # w:t children (merged), the element itself for anything else.
        groups = []
        for child in run:
            if child.tag == rpr_tag:
                continue
            if child.tag == t_tag:
                if groups and isinstance(groups[-1], str):
                    groups[-1] += child.text or ''
                else:
                    groups.append(child.text or '')
            else:
                groups.append(child)

        if not any(isinstance(g, str) and keyword in g for g in groups):
            continue

        orig_rpr = run.find(rpr_tag)
        new_runs = []
        plain = None  # uncolored run currently receiving content

        for group in groups:
            if not isinstance(group, str):
                if plain is None:
                    plain = _new_run_like(orig_rpr)
                    new_runs.append(plain)
                # append() moves the element out of the old run, so nested
                # content stays part of the live tree.
                plain.append(group)
                continue
            for seg_text, colored in _split_on_keyword(group, keyword):
                if colored:
                    target = _new_run_like(orig_rpr, hex_color)
                    new_runs.append(target)
                    plain = None  # following plain content starts a new run
                else:
                    if plain is None:
                        plain = _new_run_like(orig_rpr)
                        new_runs.append(plain)
                    target = plain
                _append_text(target, seg_text)

        # Swap the original run for the rebuilt ones at the same position.
        insert_pos = parent.index(run)
        parent.remove(run)
        for offset, new_r in enumerate(new_runs):
            parent.insert(insert_pos + offset, new_r)
|
||||
|
||||
def process(input_docx, output_docx, replacements, image_replacements,
            color_keywords):
    """Run the full edit pipeline on a DOCX file and write the result.

    Steps, in order: unpack ``input_docx`` into a temporary directory,
    replace images, apply text replacements to every paragraph, color the
    given keywords, write ``word/document.xml`` back and pack everything
    into ``output_docx``. Progress messages are printed to stdout.

    Args:
        input_docx: path of the source .docx.
        output_docx: path of the .docx to create.
        replacements: list of ``(old, new)`` text pairs.
        image_replacements: list of ``(index, image_path)`` pairs (1-based).
        color_keywords: iterable of ``(keyword, hex_color)`` pairs.
    """
    with tempfile.TemporaryDirectory() as workdir:
        print(f"📂 解包 {input_docx} ...")
        unpack(input_docx, workdir)

        # Images first: this step rewrites files on disk, not document.xml.
        if image_replacements:
            print(f"🖼️ 替换 {len(image_replacements)} 张图片...")
            for position, image_file in image_replacements:
                replace_image(workdir, position, image_file)

        document_part = os.path.join(workdir, 'word', 'document.xml')
        document = etree.parse(document_part)
        body = document.getroot()

        if replacements:
            print(f"✏️ 替换 {len(replacements)} 条文本...")
            paragraph_tag = f'{{{W}}}p'
            for paragraph in body.iter(paragraph_tag):
                paragraph_replace(paragraph, replacements)

        # Color the keywords that were extracted from <span> markup.
        for word, shade in color_keywords:
            print(f"🎨 关键词「{word}」→ #{shade}")
            apply_color_to_keyword(body, word, shade)

        document.write(document_part, xml_declaration=True,
                       encoding='UTF-8', standalone=True)
        print(f"📦 打包 → {output_docx} ...")
        pack(workdir, output_docx, input_docx)
        print(f"✅ 完成!输出: {output_docx}")
|
||||
|
||||
|
||||
def _parse_span_replacement(new_text):
|
||||
"""
|
||||
解析 NEW 文本中的 span 标签,用于决定颜色。
|
||||
|
||||
约定格式(不区分大小写):
|
||||
<span color="FF0000">待补充</span>
|
||||
<span color="#FF0000">待补充</span>
|
||||
|
||||
返回: (纯文本, [(keyword, hex_color), ...])
|
||||
"""
|
||||
import re
|
||||
|
||||
span_pattern = re.compile(
|
||||
r'<span\s+[^>]*?color=["\']?(#?[0-9a-fA-F]{6})["\']?[^>]*>(.*?)</span>',
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
color_keywords = []
|
||||
|
||||
def _repl(m):
|
||||
hex_color = m.group(1).lstrip('#')
|
||||
keyword = m.group(2)
|
||||
color_keywords.append((keyword, hex_color))
|
||||
return keyword
|
||||
|
||||
plain_text = span_pattern.sub(_repl, new_text)
|
||||
return plain_text, color_keywords
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    ``--list-images`` prints the images of the input file and exits. In all
    other cases an output path is required and the requested text
    replacements (``--replace``), image replacements (``--image``) and
    keyword colors (``--color``, or ``<span color=...>`` inside a
    ``--replace`` NEW value) are applied via ``process``.
    """
    parser = argparse.ArgumentParser(description='DOCX 格式保留:替换文本/图片/颜色')
    parser.add_argument('input', help='输入 .docx')
    parser.add_argument('output', nargs='?', help='输出 .docx')
    parser.add_argument('--list-images', action='store_true', help='列出所有图片')
    parser.add_argument('--replace', nargs=2, metavar=('OLD', 'NEW'),
                        action='append', default=[])
    parser.add_argument('--image', nargs=2, metavar=('INDEX', 'FILE'),
                        action='append', default=[], help='图片替换')
    # Advertised in the module usage text but previously never registered.
    parser.add_argument('--color', nargs=2, metavar=('KEYWORD', 'HEX'),
                        action='append', default=[], help='关键词上色')
    args = parser.parse_args()

    if args.list_images:
        list_images(args.input)
        return
    if not args.output:
        parser.error("需要指定输出文件")

    # Pull <span color="...">text</span> out of each NEW value.
    replacements = []
    color_keywords = []
    for old, new_raw in args.replace:
        new_plain, spans = _parse_span_replacement(new_raw)
        replacements.append((old, new_plain))
        color_keywords.extend(spans)

    # Explicit --color KEYWORD HEX pairs.
    for keyword, hex_raw in args.color:
        hex_color = hex_raw.lstrip('#')
        try:
            int(hex_color, 16)
        except ValueError:
            hex_color = ''
        if len(hex_color) != 6:
            parser.error(f"无效的颜色值: {hex_raw}")
        color_keywords.append((keyword, hex_color))

    # Report a bad image index as a usage error instead of a traceback.
    image_replacements = []
    for index_raw, image_file in args.image:
        try:
            image_replacements.append((int(index_raw), image_file))
        except ValueError:
            parser.error(f"图片序号必须是整数: {index_raw}")

    process(
        input_docx=args.input,
        output_docx=args.output,
        replacements=replacements,
        image_replacements=image_replacements,
        color_keywords=color_keywords,
    )


if __name__ == '__main__':
    main()
|
||||
136
mcp_docx_server.py
Normal file
136
mcp_docx_server.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
基于 mcp_docx.py 封装的 MCP 服务器。
|
||||
|
||||
暴露两个主要工具:
|
||||
- list_docx_images:列出 DOCX 中的图片信息
|
||||
- edit_docx: 进行文本替换 / 关键字上色 / 图片替换
|
||||
|
||||
注意:底层仍然完全复用 mcp_docx.py 中的逻辑,只是通过 MCP SDK 对外提供。
|
||||
"""
|
||||
|
||||
import os
from typing import Any, Dict, List, Optional

# The MCP Python SDK exposes the high-level server class as ``FastMCP``;
# ``mcp.server.fastmcp`` has no ``FastMCPServer`` name to import.
from mcp.server.fastmcp import FastMCP

from mcp_docx import get_images_info, process, _parse_span_replacement


# Server instance on which the tools below are registered. The descriptive
# text is passed as ``instructions``; ``version`` / ``description`` are not
# constructor parameters documented by the SDK, so they are not passed.
server = FastMCP(
    "docx-editor",
    instructions="DOCX 文本和图片编辑工具(基于 mcp_docx.py 封装)",
)
|
||||
|
||||
|
||||
@server.tool()
async def list_docx_images(docx_path: str) -> List[Dict[str, Any]]:
    """
    List every image contained in the given DOCX file.

    Args:
        docx_path: path of the DOCX file (relative or absolute).

    Returns:
        A list with one dict per image, holding:
        - index: position of the image in the document (starting at 1)
        - rid: relationship id of the image
        - media_file: resource path inside the DOCX
        - ext: image file extension
        - docpr_name: Word-internal picture name
        - width_cm / height_cm: approximate size in cm, may be None

        The ``abs_path`` entry produced internally is removed so that
        server-side paths are not exposed.

    Raises:
        FileNotFoundError: if ``docx_path`` does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"DOCX 文件不存在: {docx_path}")

    return [
        {key: value for key, value in entry.items() if key != "abs_path"}
        for entry in get_images_info(docx_path)
    ]
|
||||
|
||||
|
||||
@server.tool()
async def edit_docx(
    input_docx: str,
    output_docx: str,
    replacements: Optional[List[Dict[str, str]]] = None,
    image_replacements: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """
    Edit a DOCX file using the mcp_docx processing logic.

    Supports:
    - plain text replacement
    - coloring keywords via <span color="FF0000">keyword</span> markup in
      the replacement text
    - replacing images by their position in the document

    Args:
        input_docx: path of the DOCX file to read.
        output_docx: path of the DOCX file to write.
        replacements: list of text replacement rules, for example
            [
              {"old": "Old title", "new": "<span color='#FF0000'>New title</span>"},
              {"old": "before", "new": "after"}
            ]
            Rules with a missing or empty "old" are skipped; a missing "new"
            is treated as an empty string.
        image_replacements: list of image replacement rules, for example
            [
              {"index": 1, "file": "new_chart.png"},
              {"index": 2, "file": "new_photo.jpg"}
            ]
            Rules whose "index" is not an integer or whose "file" is empty
            are skipped.

    Returns:
        {"output_path": absolute path of the generated DOCX}

    Raises:
        FileNotFoundError: if ``input_docx`` or a replacement image file
            does not exist.
    """
    import contextlib
    import sys

    if not os.path.exists(input_docx):
        raise FileNotFoundError(f"输入 DOCX 文件不存在: {input_docx}")

    # Split each rule into the plain replacement text and the color keywords
    # carried by <span> markup (same logic as the CLI).
    rep_pairs = []
    color_keywords = []
    for item in replacements or []:
        old = item.get("old")
        if not old:
            continue
        new_raw = item.get("new")
        new_plain, spans = _parse_span_replacement("" if new_raw is None else new_raw)
        rep_pairs.append((old, new_plain))
        color_keywords.extend(spans)

    # Normalise the image replacement rules.
    img_pairs = []
    for item in image_replacements or []:
        try:
            idx = int(item.get("index"))
        except (TypeError, ValueError):
            continue
        path = item.get("file")
        if not path:
            continue
        if not os.path.exists(path):
            raise FileNotFoundError(f"图片文件不存在: {path}")
        img_pairs.append((idx, path))

    # process() reports progress with print(). When this server runs over
    # stdio, stdout carries the protocol messages, so send that output to
    # stderr for the duration of the call.
    with contextlib.redirect_stdout(sys.stderr):
        process(
            input_docx=input_docx,
            output_docx=output_docx,
            replacements=rep_pairs,
            image_replacements=img_pairs,
            color_keywords=color_keywords,
        )

    return {"output_path": os.path.abspath(output_docx)}


if __name__ == "__main__":
    # Start the MCP server (communicates over stdio).
    server.run()
|
||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
mcp>=1.0.0
|
||||
python-docx>=1.1.0
|
||||
lxml>=5.0.0
|
||||
Pillow>=10.0.0
|
||||
|
||||
Reference in New Issue
Block a user