zcbot/skills/paper/scripts/render_docx.py

503 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""把 sections/*.md 渲染成期刊投稿稿 .docx (manuscript draft)。
与 proposal/render_docx.py 同源, 差异:
- No fund-type option; --lang {zh,en} (default: en) declares the manuscript language instead. It only affects the figure-caption prefix and the printed summary.
- 目录 (TOC) 默认**不生成** (期刊投稿稿无需目录); 要草稿带目录加 --toc
- 字体规范保持: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符
(eastAsia=宋体 只对 CJK 字符生效, 纯英文论文正文走 Times New Roman, 同一套 style 通吃)
支持: **加粗** / *斜体* / `等宽`; 列表 / 表格 / ![caption](png) 居中插图 + 图题自增;
```mermaid``` 块按 caption 查 figures/fig_<caption>.png (由 render_diagrams.py 预生成)。
用法:
python render_docx.py <sections_dir> --lang en -o <out.docx>
python render_docx.py <sections_dir> --lang zh --toc -o <out.docx>
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor
# ───────────────────────── 字体辅助 ─────────────────────────
def _set_run_fonts(run, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Assign the East-Asian and Latin typefaces of a single run.

    Writes the ``w:eastAsia``, ``w:ascii`` and ``w:hAnsi`` attributes on the
    run's ``w:rFonts`` element, creating that element when the run has none.

    Args:
        run: The python-docx run to modify.
        cn_font: Typeface name stored in ``w:eastAsia``.
        en_font: Typeface name stored in ``w:ascii`` and ``w:hAnsi``.
    """
    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.find(qn("w:rFonts"))
    if fonts_el is None:
        fonts_el = OxmlElement("w:rFonts")
        run_props.append(fonts_el)
    for attr, face in (("w:eastAsia", cn_font), ("w:ascii", en_font), ("w:hAnsi", en_font)):
        fonts_el.set(qn(attr), face)
def _set_style_fonts(style, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Assign the East-Asian and Latin typefaces of a document style.

    Writes the ``w:eastAsia``, ``w:ascii`` and ``w:hAnsi`` attributes on the
    style's ``w:rFonts`` element, creating ``w:rPr`` / ``w:rFonts`` as needed.

    Args:
        style: The python-docx style object to modify.
        cn_font: Typeface name stored in ``w:eastAsia``.
        en_font: Typeface name stored in ``w:ascii`` and ``w:hAnsi``.
    """
    # get_or_add_rPr() inserts a missing w:rPr at its schema position inside
    # w:style; inserting it at index 0 would place it before w:name, which
    # violates the required child order. This also mirrors _set_run_fonts.
    rPr = style.element.get_or_add_rPr()
    rFonts = rPr.find(qn("w:rFonts"))
    if rFonts is None:
        rFonts = OxmlElement("w:rFonts")
        rPr.append(rFonts)
    rFonts.set(qn("w:eastAsia"), cn_font)
    rFonts.set(qn("w:ascii"), en_font)
    rFonts.set(qn("w:hAnsi"), en_font)
# ───────────────────────── 文档初始化 ─────────────────────────
def init_doc() -> Document:
    """Create an empty document with the manuscript page and style setup.

    The first section is set to 21 cm x 29.7 cm with 2.5 cm margins on every
    side. ``Normal`` becomes 12 pt Times New Roman / 宋体 at 1.5 line spacing
    with no paragraph spacing; ``Heading 1``-``Heading 3`` become bold black
    headings with their own sizes and CJK typefaces.

    Returns:
        The configured python-docx document.
    """
    doc = Document()

    page = doc.sections[0]
    page.page_width = Cm(21)
    page.page_height = Cm(29.7)
    for side in ("top_margin", "bottom_margin", "left_margin", "right_margin"):
        setattr(page, side, Cm(2.5))

    body = doc.styles["Normal"]
    body.font.name = "Times New Roman"
    body.font.size = Pt(12)
    _set_style_fonts(body, cn_font="宋体")
    body_fmt = body.paragraph_format
    body_fmt.line_spacing = 1.5
    body_fmt.space_before = Pt(0)
    body_fmt.space_after = Pt(0)

    # level -> (font size, CJK typeface)
    heading_specs = {1: (Pt(14), "黑体"), 2: (Pt(12), "黑体"), 3: (Pt(12), "宋体")}
    for level, (size, cjk_face) in heading_specs.items():
        heading = doc.styles[f"Heading {level}"]
        heading.font.name = "Times New Roman"
        heading.font.size = size
        heading.font.bold = True
        heading.font.color.rgb = RGBColor(0, 0, 0)
        _set_style_fonts(heading, cn_font=cjk_face)
        heading_fmt = heading.paragraph_format
        heading_fmt.line_spacing = 1.5
        heading_fmt.space_before = Pt(6)
        heading_fmt.space_after = Pt(3)
        heading_fmt.first_line_indent = None
    return doc
# ───────────────────────── TOC (opt-in) ─────────────────────────
def add_toc(doc: Document, depth: int = 3) -> None:
    """Append a "Contents" title, a TOC field and a page break to *doc*.

    The field is written as raw ``w:fldChar`` / ``w:instrText`` elements with
    a placeholder text as its current result, telling the reader to refresh
    the field in Word.

    Args:
        doc: Document to append to.
        depth: Deepest heading level listed by the TOC field (``\\o "1-depth"``).
    """
    title_par = doc.add_paragraph()
    title_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_fmt = title_par.paragraph_format
    title_fmt.first_line_indent = None
    title_fmt.space_before = Pt(12)
    title_fmt.space_after = Pt(6)
    title_run = title_par.add_run("Contents")
    title_run.font.size = Pt(16)
    title_run.font.bold = True
    _set_run_fonts(title_run, cn_font="黑体")

    field_par = doc.add_paragraph()
    field_par.paragraph_format.first_line_indent = None
    field_run = field_par.add_run()

    def _fld_char(kind: str):
        # Build one <w:fldChar w:fldCharType="kind"/> marker.
        marker = OxmlElement("w:fldChar")
        marker.set(qn("w:fldCharType"), kind)
        return marker

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = f' TOC \\o "1-{depth}" \\h \\z \\u '

    placeholder = OxmlElement("w:t")
    placeholder.set(qn("xml:space"), "preserve")
    placeholder.text = "[Press F9 in Word to generate the table of contents]"

    # Field layout: begin, instruction, separate, current result, end.
    field_parts = (
        _fld_char("begin"),
        instruction,
        _fld_char("separate"),
        placeholder,
        _fld_char("end"),
    )
    for part in field_parts:
        field_run._element.append(part)

    doc.add_page_break()
# ───────────────────────── 内联 markdown ─────────────────────────
# Inline markup: **bold**, *italic* (not preceded by '*' or a word character,
# not followed by '*') and `code`. None of the spans may cross a newline.
_INLINE_RE = re.compile(
    r"(?P<bold>\*\*(?P<bold_t>[^*\n]+?)\*\*)"
    r"|(?P<italic>(?<![\*\w])\*(?P<italic_t>[^*\n]+?)\*(?!\*))"
    r"|(?P<code>`(?P<code_t>[^`\n]+?)`)"
)


def parse_inline(text: str) -> list[tuple[str, str]]:
    """Split *text* into ``(style, segment)`` pairs.

    ``style`` is one of ``"plain"``, ``"bold"``, ``"italic"`` or ``"code"``;
    marked-up segments are returned without their delimiters.

    Returns:
        The segments in document order. Text without any markup (including
        the empty string) yields a single ``("plain", text)`` pair.
    """
    segments: list[tuple[str, str]] = []
    cursor = 0
    for match in _INLINE_RE.finditer(text):
        begin, end = match.span()
        if begin > cursor:
            segments.append(("plain", text[cursor:begin]))
        for kind in ("bold", "italic", "code"):
            if match.group(kind):
                segments.append((kind, match.group(f"{kind}_t")))
                break
        cursor = end
    if cursor < len(text):
        segments.append(("plain", text[cursor:]))
    if not segments:
        return [("plain", text)]
    return segments
def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None:
    """Append *text* to *paragraph* as runs, honouring inline markup.

    Each segment returned by ``parse_inline`` becomes one run of the given
    size. Bold and italic segments set the matching run flag; code segments
    use Consolas as the Latin typeface, all others Times New Roman.

    Args:
        paragraph: Paragraph receiving the runs.
        text: Source text, possibly containing **bold** / *italic* / `code`.
        size: Font size applied to every run.
        cn_font: East-Asian typeface applied to every run.
    """
    for kind, chunk in parse_inline(text):
        run = paragraph.add_run(chunk)
        run.font.size = size
        if kind == "bold":
            run.bold = True
        elif kind == "italic":
            run.italic = True
        latin_face = "Consolas" if kind == "code" else "Times New Roman"
        _set_run_fonts(run, cn_font=cn_font, en_font=latin_face)
# ───────────────────────── 段落 / 标题 / 列表 ─────────────────────────
def add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading paragraph using the ``Heading <level>`` style.

    Inline markup in *text* is rendered, then every run is forced to bold.

    Args:
        doc: Document to append to.
        text: Heading text.
        level: Heading level; only 1, 2 and 3 have a size/typeface entry.

    Raises:
        KeyError: If *level* is not 1, 2 or 3.
    """
    # level -> (font size, CJK typeface)
    specs = {1: (Pt(14), "黑体"), 2: (Pt(12), "黑体"), 3: (Pt(12), "宋体")}
    heading = doc.add_paragraph(style=f"Heading {level}")
    heading.paragraph_format.first_line_indent = None
    font_size, cjk_face = specs[level]
    add_inline(heading, text, size=font_size, cn_font=cjk_face)
    for run in heading.runs:
        run.bold = True
def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Append a justified body paragraph at 1.5 line spacing.

    Args:
        doc: Document to append to.
        text: Paragraph text; inline markup is rendered.
        indent: When true the first line is indented by 24 pt, otherwise
            the first-line indent is cleared.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format
    fmt.line_spacing = 1.5
    fmt.first_line_indent = Pt(24) if indent else None
    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(para, text)
# ───────────────────────── 行类型识别 ─────────────────────────
# ATX heading: one to six '#' characters, whitespace, then the title (group 2).
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
# Table row: the line starts and ends with '|' (surrounding whitespace allowed).
_TABLE_LINE_RE = re.compile(r"^\s*\|.*\|\s*$")
# Blockquote line: optional leading whitespace, '>', at most one whitespace after it.
_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?")
# Horizontal rule: a line made only of three or more '-', '=' or '_' characters.
_HR_RE = re.compile(r"^\s*-{3,}\s*$|^\s*={3,}\s*$|^\s*_{3,}\s*$")
# Code-fence line: three or more backticks or tildes (group 1), optionally
# followed by a single whitespace-free token (group 2).
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")
# Prefixes that mark a line as a list item: "[1] ", "- "/"* "/"+ ", "1." style
# numbering, "(1)", a bare leading number, Chinese numerals followed by a
# separator, parenthesised Chinese numerals, and circled digits.
_LIST_PATTERNS = [
    re.compile(r"^\[\d+\]\s"),
    re.compile(r"^[-*+]\s"),
    re.compile(r"^\d+[\.、.]\s*"),
    re.compile(r"^\(\d+\)\s*"),
    re.compile(r"^\d+\s*"),
    re.compile(r"^[一二三四五六七八九十百千]+[、.\.]"),
    re.compile(r"^[(][一二三四五六七八九十百千]+[)]"),
    re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"),
]


def is_list_item(line: str) -> bool:
    """Return True when *line* starts with one of the list-item prefixes.

    Leading whitespace is ignored, so indented (nested) list items are
    recognised as well. The other line-type checks in this file already
    tolerate leading whitespace.
    """
    candidate = line.lstrip()
    return any(pattern.match(candidate) for pattern in _LIST_PATTERNS)
def is_table_line(line: str) -> bool:
    """Return True when *line* starts and ends with a pipe character."""
    return _TABLE_LINE_RE.match(line) is not None
def is_heading(line: str) -> bool:
    """Return True when *line* is a '#'-prefixed heading."""
    return _HEADING_RE.match(line) is not None
def is_blockquote(line: str) -> bool:
    """Return True when *line* begins with '>' after optional whitespace."""
    return _BLOCKQUOTE_RE.match(line) is not None
def is_hr(line: str) -> bool:
    """Return True when *line* is a rule of three or more '-', '=' or '_'."""
    return _HR_RE.match(line) is not None
# ───────────────────────── 代码块 / ASCII 图 ─────────────────────────
def add_code_block(doc: Document, lines: list[str], lang: str = "") -> None:
    """Append *lines* as single-spaced monospace paragraphs, one per line.

    Args:
        doc: Document to append to.
        lines: Code lines; an empty line is written as a single space.
        lang: Fence language tag. Not used by the rendering.
    """
    for raw in lines:
        para = doc.add_paragraph()
        fmt = para.paragraph_format
        fmt.first_line_indent = None
        fmt.line_spacing = 1.0
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)
        run = para.add_run(raw or " ")
        run.font.size = Pt(10.5)
        _set_run_fonts(run, cn_font="新宋体", en_font="Consolas")
        # Mark every text node so leading/trailing spaces are preserved.
        for text_el in run._element.iter(qn("w:t")):
            text_el.set(qn("xml:space"), "preserve")
# ───────────────────────── 表格 ─────────────────────────
def _split_md_row(line: str) -> list[str]:
return [c.strip() for c in line.strip().strip("|").split("|")]
def _is_separator_row(cells: list[str]) -> bool:
return all(re.match(r"^[-:\s]+$", c) for c in cells if c != "")
def render_table(doc: Document, table_lines: list[str]) -> None:
    """Append a table built from consecutive Markdown table lines.

    Separator rows are skipped, short rows are padded with empty cells to the
    widest row, and the first remaining row is rendered in bold. Nothing is
    added when no data row remains.

    Args:
        doc: Document to append to.
        table_lines: The raw lines of one Markdown table.
    """
    parsed = (_split_md_row(raw) for raw in table_lines)
    rows = [cells for cells in parsed if cells and not _is_separator_row(cells)]
    if not rows:
        return

    n_cols = max(map(len, rows))
    rows = [cells + [""] * (n_cols - len(cells)) for cells in rows]

    table = doc.add_table(rows=len(rows), cols=n_cols)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # Best effort: keep the default table style when this one is unavailable.
        pass

    for ri, cells in enumerate(rows):
        is_header = ri == 0
        row_cells = table.rows[ri].cells
        for ci, value in enumerate(cells):
            cell = row_cells[ci]
            cell.text = ""
            para = cell.paragraphs[0]
            para.paragraph_format.first_line_indent = None
            para.paragraph_format.line_spacing = 1.2
            add_inline(para, value, size=Pt(10.5), cn_font="宋体")
            if is_header:
                for run in para.runs:
                    run.bold = True
# ───────────────────────── 图片 + 图题 ─────────────────────────
# A line holding only a Markdown image: ![cap](src); src contains no ')' or whitespace.
_IMAGE_LINE_RE = re.compile(r"^\s*!\[(?P<cap>[^\]]*)\]\((?P<src>[^)\s]+)\)\s*$")
# A "%% caption: <text>" comment line inside a mermaid block (case-insensitive).
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
# Runs of characters other than the range 一-鿿, ASCII letters and ASCII digits.
_FILENAME_INVALID_RE = re.compile(r"[^一-鿿A-Za-z0-9]+")
# Width passed to add_picture for every inserted image.
_MAX_IMG_WIDTH = Cm(15)
def caption_to_stem(caption: str) -> str:
    """Derive a ``fig_<slug>`` file stem from a figure caption.

    Every run of characters matched by ``_FILENAME_INVALID_RE`` becomes one
    underscore, edge underscores are removed and the slug is cut to 40
    characters.

    Returns:
        ``"fig_<slug>"``, or an empty string when the slug is empty.
    """
    slug = _FILENAME_INVALID_RE.sub("_", caption)
    slug = slug.strip("_")[:40]
    return f"fig_{slug}" if slug else ""
def extract_mermaid_caption(source: str) -> str | None:
    """Return the text of the first ``%% caption: ...`` line in *source*.

    Returns:
        The stripped caption text, or ``None`` when no line matches.
    """
    hits = (_MERMAID_CAPTION_RE.match(row) for row in source.splitlines())
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()
def _resolve_image_path(src: str, base_dir: Path) -> Path | None:
p = Path(src)
if not p.is_absolute():
p = (base_dir / p).resolve()
return p if p.is_file() else None
def add_image(doc: Document, png_path: Path, caption: str | None, ctx: dict) -> None:
    """Append a centred picture followed by a numbered, bold caption line.

    When the picture cannot be inserted, an ``[image failed: ...]`` note is
    written in its place and neither the caption nor the figure counter is
    touched.

    Args:
        doc: Document to append to.
        png_path: Image file to insert; it is given a width of ``_MAX_IMG_WIDTH``.
        caption: Caption text, or ``None`` for a caption with label and number only.
        ctx: Render context. ``ctx["fig_no"]`` is incremented; ``ctx["fig_label"]``
            supplies the caption prefix (``"Fig."`` when absent).
    """
    pic_par = doc.add_paragraph()
    pic_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    pic_fmt = pic_par.paragraph_format
    pic_fmt.first_line_indent = None
    pic_fmt.space_before = Pt(6)
    pic_fmt.space_after = Pt(3)
    pic_run = pic_par.add_run()
    try:
        pic_run.add_picture(str(png_path), width=_MAX_IMG_WIDTH)
    except Exception as e:
        # Best effort: leave a visible marker instead of aborting the render.
        pic_run.add_text(f"[image failed: {png_path.name}: {e}]")
        return

    fig_no = ctx.get("fig_no", 0) + 1
    ctx["fig_no"] = fig_no

    caption_par = doc.add_paragraph()
    caption_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption_fmt = caption_par.paragraph_format
    caption_fmt.first_line_indent = None
    caption_fmt.space_before = Pt(0)
    caption_fmt.space_after = Pt(6)

    label = ctx.get("fig_label", "Fig.")
    caption_text = f"{label} {fig_no}"
    if caption:
        caption_text = f"{caption_text} {caption}"
    caption_run = caption_par.add_run(caption_text)
    caption_run.font.size = Pt(10.5)
    caption_run.bold = True
    _set_run_fonts(caption_run, cn_font="宋体", en_font="Times New Roman")
# ───────────────────────── 主渲染 ─────────────────────────
def render_md_block(doc: Document, md_text: str, ctx: dict) -> None:
    """Render the Markdown text of one section file into *doc*.

    Lines are classified in this order: blank lines and horizontal rules are
    skipped; a stand-alone image line becomes a figure; a fenced block becomes
    a figure (mermaid with a pre-rendered PNG) or a code block; table lines
    become a table; '#' headings become headings (levels above 3 are rendered
    as level 3); blockquote lines are dropped; list items become unindented
    paragraphs; everything else is gathered into an indented body paragraph.

    Args:
        doc: Document to append to.
        md_text: Markdown source of the section.
        ctx: Render context providing ``sections_dir`` (base directory for
            image paths), ``figures_dir`` (pre-rendered mermaid PNGs) and the
            figure counter/label used by ``add_image``.
    """
    lines = md_text.splitlines()
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].rstrip()

        # Blank lines and horizontal rules produce no output.
        if not line.strip() or is_hr(line):
            i += 1
            continue

        # Stand-alone image line: ![caption](path)
        m_img = _IMAGE_LINE_RE.match(line)
        if m_img:
            src = m_img.group("src")
            cap = m_img.group("cap").strip() or None
            png = _resolve_image_path(src, ctx["sections_dir"])
            if png is not None:
                add_image(doc, png, cap, ctx)
            else:
                add_body_paragraph(doc, f"[image missing: {src}]", indent=False)
            i += 1
            continue

        # Fenced block (``` or ~~~), optionally tagged with a language.
        m_fence = _FENCE_RE.match(line)
        if m_fence:
            fence = m_fence.group(1)
            lang = m_fence.group(2) or ""
            code: list[str] = []
            i += 1
            while i < n:
                m_close = _FENCE_RE.match(lines[i])
                # A closing fence uses the same character, is at least as long
                # as the opening one and carries no info string; a line such
                # as "```python" inside the block is therefore content.
                if (
                    m_close
                    and not m_close.group(2)
                    and m_close.group(1)[0] == fence[0]
                    and len(m_close.group(1)) >= len(fence)
                ):
                    i += 1
                    break
                code.append(lines[i])
                i += 1
            if lang.lower() == "mermaid":
                cap = extract_mermaid_caption("\n".join(code))
                stem = caption_to_stem(cap) if cap else ""
                if stem:
                    png = ctx["figures_dir"] / f"{stem}.png"
                    if png.is_file():
                        add_image(doc, png, cap, ctx)
                        continue
            # Non-mermaid blocks, and mermaid blocks without a rendered PNG,
            # are shown as source code.
            add_code_block(doc, code, lang)
            continue

        # Consecutive table lines form one table.
        if is_table_line(line):
            block: list[str] = []
            while i < n and is_table_line(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block)
            continue

        m = _HEADING_RE.match(line)
        if m:
            level = min(len(m.group(1)), 3)
            add_heading(doc, m.group(2).strip(), level)
            i += 1
            continue

        # Blockquote lines are not rendered.
        if is_blockquote(line):
            i += 1
            continue

        if is_list_item(line):
            add_body_paragraph(doc, line.strip(), indent=False)
            i += 1
            continue

        # Body paragraph: join following lines until a blank line or the
        # start of any other block type.
        buf = [line.strip()]
        j = i + 1
        while j < n:
            nxt = lines[j].rstrip()
            if not nxt.strip():
                break
            if (
                is_heading(nxt)
                or is_blockquote(nxt)
                or is_table_line(nxt)
                or is_list_item(nxt)
                or is_hr(nxt)
                # A fence or image line directly below text starts its own
                # block; without these checks it was merged into the paragraph.
                or _FENCE_RE.match(nxt)
                or _IMAGE_LINE_RE.match(nxt)
            ):
                break
            buf.append(nxt.strip())
            j += 1
        add_body_paragraph(doc, " ".join(buf), indent=True)
        i = j
# ───────────────────────── 入口 ─────────────────────────
def render_sections(sections_dir: Path, out: Path, lang: str, toc: bool) -> None:
    """Render every ``*.md`` file in *sections_dir* into one .docx at *out*.

    Files are processed in sorted name order with a page break between
    consecutive sections. Pre-rendered mermaid figures are looked up in the
    ``figures`` directory next to *sections_dir*. A short summary is printed
    after saving.

    Args:
        sections_dir: Directory containing the section Markdown files.
        out: Output .docx path; missing parent directories are created.
        lang: ``"zh"`` or ``"en"``; selects the figure-caption prefix.
        toc: When true, a table-of-contents page is inserted first.

    Raises:
        SystemExit: With code 2 when *sections_dir* is missing or holds no
            ``.md`` file.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    ctx: dict = {
        "sections_dir": sections_dir,
        "figures_dir": sections_dir.parent / "figures",
        "fig_no": 0,
        # Chinese manuscripts label figures "图 N"; an empty label produced
        # captions with a stray leading space and no prefix at all.
        "fig_label": "图" if lang == "zh" else "Fig.",
    }

    doc = init_doc()
    if toc:
        add_toc(doc)

    last_idx = len(md_files) - 1
    for idx, md_file in enumerate(md_files):
        # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which
        # would otherwise stop the first line from matching as a heading.
        text = md_file.read_text(encoding="utf-8-sig")
        render_md_block(doc, text, ctx)
        if idx != last_idx:
            doc.add_page_break()

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" paragraphs: {paras} | tables: {tbls} | figures: {ctx['fig_no']} | total chars: {chars}")
    print(f" lang: {lang} | toc: {toc}")
    print(" font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")
def main() -> None:
    """Parse the command line and render the manuscript.

    Positional ``sections_dir``; ``--lang`` (``zh``/``en``, default ``en``);
    ``--toc`` flag; required ``-o/--output`` path.
    """
    parser = argparse.ArgumentParser(description="渲染章节 md → 论文投稿稿 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    parser.add_argument(
        "--lang",
        choices=["zh", "en"],
        default="en",
        help="论文语言 (影响图题前缀 图/Fig. 与信息打印); 默认 en",
    )
    parser.add_argument(
        "--toc",
        action="store_true",
        help="生成目录页 (期刊投稿稿通常不需要; 内部草稿评阅时可加)",
    )
    parser.add_argument("-o", "--output", type=Path, required=True, help="输出 .docx 路径")
    opts = parser.parse_args()
    render_sections(opts.sections_dir, opts.output, opts.lang, opts.toc)


if __name__ == "__main__":
    main()