# zcbot/skills/proposal/scripts/render_docx.py
#
# Web code-viewer header captured together with the file:
#   415 lines, 15 KiB, Python (viewer links: Raw / Blame / History).
#
# Viewer warning: this file contains ambiguous Unicode characters, i.e.
# characters that might be confused with other characters. If this is
# intentional, the warning can safely be ignored; the viewer's Escape button
# reveals them.


"""把 sections/*.md 渲染成符合中国基金申报书排版规范的 .docx。
字体规范 (来自 typography.md):
- 标题黑体 (一二级) / 三级标题宋体 / 正文中文宋体 / 英文 Times New Roman
- 行距 1.5 倍 / 首行缩进 2 字符
- A4 纸 / 上下 2.5cm / 左 3.0cm / 右 2.0cm
特性:
- 自动插入"目录"页 (Word 内右键更新域 / F9 即生成 TOC)
- 内联 markdown 解析: **加粗** / *斜体* / `等宽`
- 列表/引用文献项 ([N], 1., (1), 一、, -, *) 各自独立成段
- markdown 表格自动识别, 包含分隔行 |---|---|
用法:
python render_docx.py <sections_dir> --fund-type key_rd -o <out.docx>
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor
# ───────────────────────── 字体辅助 ─────────────────────────
def _set_run_fonts(run, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Set both the East Asian and the Western typeface of a single run.

    Args:
        run: python-docx run whose ``w:rPr/w:rFonts`` element is updated;
            both elements are created when they do not exist yet.
        cn_font: Typeface written to the ``w:eastAsia`` attribute.
        en_font: Typeface written to the ``w:ascii`` and ``w:hAnsi`` attributes.
    """
    rPr = run._element.get_or_add_rPr()
    # get_or_add_rFonts() inserts a missing <w:rFonts> at the position the
    # schema expects. A plain append() would place it after run properties
    # that were set earlier (size, bold, ...), producing out-of-order XML.
    rFonts = rPr.get_or_add_rFonts()
    rFonts.set(qn("w:eastAsia"), cn_font)
    rFonts.set(qn("w:ascii"), en_font)
    rFonts.set(qn("w:hAnsi"), en_font)
def _set_style_fonts(style, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Write the typefaces directly into a style definition.

    Paragraphs based on the style inherit these fonts, so they do not have to
    be repeated on every run.

    Args:
        style: python-docx style object; its ``w:rPr/w:rFonts`` element is
            updated, and created when missing.
        cn_font: Typeface written to the ``w:eastAsia`` attribute.
        en_font: Typeface written to the ``w:ascii`` and ``w:hAnsi`` attributes.
    """
    # Let python-docx create missing elements: it inserts them at their schema
    # position, whereas insert(0, ...) / append() would put <w:rPr> ahead of
    # <w:name> and <w:rFonts> behind properties that already exist.
    rPr = style.element.get_or_add_rPr()
    rFonts = rPr.get_or_add_rFonts()
    rFonts.set(qn("w:eastAsia"), cn_font)
    rFonts.set(qn("w:ascii"), en_font)
    rFonts.set(qn("w:hAnsi"), en_font)
# ───────────────────────── 文档初始化 ─────────────────────────
def init_doc() -> Document:
    """Create an empty document with page geometry and base styles configured.

    Page: 21 cm x 29.7 cm, margins 2.5 cm top/bottom, 3.0 cm left, 2.0 cm right.
    "Normal" style: 12 pt, Times New Roman / 宋体, 1.5 line spacing, no spacing
    before or after paragraphs.
    "Heading 1"-"Heading 3": bold, black, 14/12/12 pt, CJK face 黑体/黑体/宋体,
    1.5 line spacing, 6 pt before / 3 pt after, no first-line indent.

    Returns:
        The configured python-docx document.
    """
    document = Document()

    # Page setup.
    page = document.sections[0]
    page.page_height = Cm(29.7)
    page.page_width = Cm(21)
    page.top_margin = Cm(2.5)
    page.bottom_margin = Cm(2.5)
    page.left_margin = Cm(3.0)
    page.right_margin = Cm(2.0)

    # Body text defaults live on the "Normal" style.
    body_style = document.styles["Normal"]
    body_style.font.name = "Times New Roman"
    body_style.font.size = Pt(12)  # "小四" in Chinese font-size naming
    _set_style_fonts(body_style, cn_font="宋体")
    body_format = body_style.paragraph_format
    body_format.line_spacing = 1.5
    body_format.space_before = Pt(0)
    body_format.space_after = Pt(0)

    # Reuse the built-in Heading styles so the TOC field can pick them up.
    heading_specs = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    for level, (point_size, cjk_face) in heading_specs.items():
        heading_style = document.styles[f"Heading {level}"]
        heading_font = heading_style.font
        heading_font.name = "Times New Roman"
        heading_font.size = point_size
        heading_font.bold = True
        # Replace the colour of the built-in heading style with black.
        heading_font.color.rgb = RGBColor(0, 0, 0)
        _set_style_fonts(heading_style, cn_font=cjk_face)
        heading_format = heading_style.paragraph_format
        heading_format.line_spacing = 1.5
        heading_format.space_before = Pt(6)
        heading_format.space_after = Pt(3)
        heading_format.first_line_indent = None

    return document
# ───────────────────────── TOC ─────────────────────────
def add_toc(doc: Document, depth: int = 3) -> None:
    """Append a table-of-contents page to *doc*.

    Adds a centred "目 录" title, a paragraph holding a Word ``TOC`` field with
    placeholder text, and a page break. The content is appended at the current
    end of the document, so call this before any body content is added.

    Per the original author's note, Word does not expand the field when the
    file is opened; the user has to update it (F9 or right-click -> update
    field).

    Args:
        doc: Document to append to.
        depth: Deepest heading level listed, used as ``\\o "1-<depth>"``.
    """

    def make_fld_char(kind: str):
        # One <w:fldChar w:fldCharType="..."/> marker of a complex field.
        node = OxmlElement("w:fldChar")
        node.set(qn("w:fldCharType"), kind)
        return node

    # Title paragraph: centred and deliberately not a Heading style, so the
    # title does not show up inside the generated table of contents.
    title = doc.add_paragraph()
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_format = title.paragraph_format
    title_format.first_line_indent = None
    title_format.space_before = Pt(12)
    title_format.space_after = Pt(6)
    title_run = title.add_run("目 录")
    title_run.font.size = Pt(16)  # "三号" in Chinese font-size naming
    title_run.font.bold = True
    _set_run_fonts(title_run, cn_font="黑体")

    # Field paragraph: begin / instruction / separate / placeholder / end,
    # all inside a single run.
    field_paragraph = doc.add_paragraph()
    field_paragraph.paragraph_format.first_line_indent = None
    field_run = field_paragraph.add_run()

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = f' TOC \\o "1-{depth}" \\h \\z \\u '

    # Placeholder text shown until the field is updated.
    placeholder = OxmlElement("w:t")
    placeholder.set(qn("xml:space"), "preserve")
    placeholder.text = "[在 Word 中按 F9 或右键此处选择 “更新域” 即可生成完整目录]"

    field_nodes = (
        make_fld_char("begin"),
        instruction,
        make_fld_char("separate"),
        placeholder,
        make_fld_char("end"),
    )
    for node in field_nodes:
        field_run._element.append(node)

    doc.add_page_break()
# ───────────────────────── 内联 markdown ─────────────────────────
# Alternation order matters: **bold** must be tried before *italic*, otherwise
# the italic branch would claim the asterisks of a bold span.
_INLINE_RE = re.compile(
    r"(?P<bold>\*\*(?P<bold_t>[^*\n]+?)\*\*)"
    r"|(?P<italic>(?<![\*\w])\*(?P<italic_t>[^*\n]+?)\*(?!\*))"
    r"|(?P<code>`(?P<code_t>[^`\n]+?)`)"
)


def parse_inline(text: str) -> list[tuple[str, str]]:
    """Split *text* into ``(style, segment)`` pairs.

    ``style`` is one of ``"plain"``, ``"bold"``, ``"italic"`` or ``"code"``;
    for the three marked-up styles the segment is the text without its
    delimiters. Text that contains no inline markup (including the empty
    string) comes back as a single ``("plain", text)`` pair.
    """
    segments: list[tuple[str, str]] = []
    cursor = 0
    for hit in _INLINE_RE.finditer(text):
        start, end = hit.span()
        # Plain text between the previous match and this one.
        if start > cursor:
            segments.append(("plain", text[cursor:start]))
        for kind in ("bold", "italic", "code"):
            if hit.group(kind):
                segments.append((kind, hit.group(kind + "_t")))
                break
        cursor = end
    # Trailing plain text after the last match.
    if cursor < len(text):
        segments.append(("plain", text[cursor:]))
    return segments if segments else [("plain", text)]
def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None:
    """Append *text* to *paragraph* as runs, honouring inline markdown.

    Each segment returned by ``parse_inline`` becomes one run of the given
    size. Bold and italic segments get the matching run flag; code segments
    use Consolas as Western typeface, everything else Times New Roman.

    Args:
        paragraph: python-docx paragraph that receives the runs.
        text: Source text, possibly containing ``**bold**``, ``*italic*`` or
            `` `code` `` markup.
        size: Font size applied to every run.
        cn_font: East Asian typeface applied to every run.
    """
    for kind, fragment in parse_inline(text):
        piece = paragraph.add_run(fragment)
        piece.font.size = size
        if kind == "bold":
            piece.bold = True
        elif kind == "italic":
            piece.italic = True
        western_face = "Consolas" if kind == "code" else "Times New Roman"
        _set_run_fonts(piece, cn_font=cn_font, en_font=western_face)
# ───────────────────────── 段落 / 标题 / 列表 ─────────────────────────
def add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading paragraph using the built-in ``Heading <level>`` style.

    Args:
        doc: Document to append to.
        text: Heading text; inline markdown is parsed even though headings
            rarely contain any.
        level: Heading level, 1 to 3.

    Raises:
        KeyError: If *level* has no size/font entry (anything other than 1-3).
    """
    heading = doc.add_paragraph(style=f"Heading {level}")
    heading.paragraph_format.first_line_indent = None

    # Size and CJK typeface per level, mirroring the style definitions.
    per_level = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    point_size, cjk_face = per_level[level]
    add_inline(heading, text, size=point_size, cn_font=cjk_face)

    # Every run of a heading is bold, whatever inline markup it carried.
    for piece in heading.runs:
        piece.bold = True
def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Append a justified body paragraph with 1.5 line spacing.

    Args:
        doc: Document to append to.
        text: Paragraph text; inline markdown is rendered via ``add_inline``.
        indent: When true, the first line is indented by 24 pt (two characters
            at the 12 pt body size); otherwise no first-line indent is set.
    """
    paragraph = doc.add_paragraph()
    layout = paragraph.paragraph_format
    layout.line_spacing = 1.5
    layout.first_line_indent = Pt(24) if indent else None
    paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(paragraph, text)
# ───────────────────────── 行类型识别 ─────────────────────────
# Heading: one to six '#' at the start of the line, whitespace, then the title.
# Group 1 is the run of '#', group 2 the title text.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
# Table row: optional indentation, then content that starts and ends with '|'.
_TABLE_LINE_RE = re.compile(r"^\s*\|.*\|\s*$")
# Blockquote: optional indentation followed by '>'.
_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?")
# Horizontal rule: a line consisting only of three or more '-', '=' or '_'.
_HR_RE = re.compile(r"^\s*-{3,}\s*$|^\s*={3,}\s*$|^\s*_{3,}\s*$")
# 列表项 (各自独立成段, 不跟相邻行合并, 不缩进首行)
_LIST_PATTERNS = [
re.compile(r"^\[\d+\]\s"), # [1]
re.compile(r"^[-*+]\s"), # - / * / +
re.compile(r"^\d+[\.、.]\s*"), # 1. / 1、 / 1
re.compile(r"^\(\d+\)\s*"), # (1)
re.compile(r"^\d+\s*"), # 1
re.compile(r"^[一二三四五六七八九十百千]+[、.\.]"), # 一、
re.compile(r"^[(][一二三四五六七八九十百千]+[)]"), # (一)
re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"), # ①
re.compile(r"^第[一二三四五六七八九十百]+[条章节]"), # 第一条
]
def is_list_item(line: str) -> bool:
return any(p.match(line) for p in _LIST_PATTERNS)
def is_table_line(line: str) -> bool:
    """Return True when *line* looks like a Markdown table row (``| ... |``)."""
    return _TABLE_LINE_RE.match(line) is not None
def is_heading(line: str) -> bool:
    """Return True when *line* is a ``#``-style Markdown heading."""
    return _HEADING_RE.match(line) is not None
def is_blockquote(line: str) -> bool:
    """Return True when *line* starts with a blockquote marker (``>``)."""
    return _BLOCKQUOTE_RE.match(line) is not None
def is_hr(line: str) -> bool:
    """Return True when *line* is a horizontal rule of ``-``, ``=`` or ``_``."""
    return _HR_RE.match(line) is not None
# ───────────────────────── 表格 ─────────────────────────
def _split_md_row(line: str) -> list[str]:
return [c.strip() for c in line.strip().strip("|").split("|")]
def _is_separator_row(cells: list[str]) -> bool:
return all(re.match(r"^[-:\s]+$", c) for c in cells if c != "")
def render_table(doc: Document, table_lines: list[str]) -> None:
    """Append a Word table built from consecutive Markdown table lines.

    Delimiter rows and rows without cells are dropped, shorter rows are padded
    with empty cells to the widest row, and the first remaining row is
    rendered bold as the header. Nothing is added when no row remains.

    Args:
        doc: Document to append to.
        table_lines: Raw ``| ... |`` lines belonging to one table.
    """
    parsed = (_split_md_row(raw) for raw in table_lines)
    grid = [cells for cells in parsed if cells and not _is_separator_row(cells)]
    if not grid:
        return

    # Pad ragged rows so that every row has the same number of columns.
    width = max(len(cells) for cells in grid)
    for cells in grid:
        cells.extend([""] * (width - len(cells)))

    table = doc.add_table(rows=len(grid), cols=width)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # The style is missing from the template: keep the default table style.
        pass

    for row_idx, cells in enumerate(grid):
        is_header = row_idx == 0
        row_cells = table.rows[row_idx].cells
        for col_idx, value in enumerate(cells):
            cell = row_cells[col_idx]
            # Reset the cell so that it holds a single empty paragraph.
            cell.text = ""
            paragraph = cell.paragraphs[0]
            paragraph.paragraph_format.first_line_indent = None
            paragraph.paragraph_format.line_spacing = 1.2
            add_inline(paragraph, value, size=Pt(10.5), cn_font="宋体")
            if is_header:
                for piece in paragraph.runs:
                    piece.bold = True
# ───────────────────────── 主渲染 ─────────────────────────
def render_md_block(doc: Document, md_text: str) -> None:
    """Render one Markdown text into *doc*, line by line.

    Handling per line type, checked in this order:
      * blank lines and horizontal rules: skipped;
      * table lines: the whole consecutive run is rendered as one table;
      * headings: rendered with levels deeper than 3 clamped to 3;
      * blockquotes: skipped (the templates use them for writing hints that
        must not reach the final document);
      * list items (including ``[N]`` references): one unindented paragraph
        each;
      * anything else: consecutive lines are joined with a space into one
        indented paragraph, up to the next blank or special line.

    Args:
        doc: Document to append to.
        md_text: Markdown source of one section.
    """
    lines = md_text.splitlines()
    total = len(lines)
    # Line types that end a running prose paragraph.
    paragraph_breakers = (is_heading, is_blockquote, is_table_line, is_list_item, is_hr)

    idx = 0
    while idx < total:
        current = lines[idx].rstrip()

        # Blank lines and horizontal rules emit nothing.
        if not current.strip() or is_hr(current):
            idx += 1
            continue

        # Table: consume the whole run of consecutive "| ... |" lines.
        if is_table_line(current):
            stop = idx
            while stop < total and is_table_line(lines[stop]):
                stop += 1
            render_table(doc, lines[idx:stop])
            idx = stop
            continue

        # Heading.
        heading = _HEADING_RE.match(current)
        if heading is not None:
            hashes, title = heading.groups()
            add_heading(doc, title.strip(), min(len(hashes), 3))
            idx += 1
            continue

        # Blockquote: writing hint, left out of the document.
        if is_blockquote(current):
            idx += 1
            continue

        # List item: its own paragraph, no first-line indent.
        if is_list_item(current):
            add_body_paragraph(doc, current.strip(), indent=False)
            idx += 1
            continue

        # Prose: gather the following lines of the same paragraph.
        pieces = [current.strip()]
        idx += 1
        while idx < total:
            follower = lines[idx].rstrip()
            if not follower.strip():
                break
            if any(check(follower) for check in paragraph_breakers):
                break
            pieces.append(follower.strip())
            idx += 1
        add_body_paragraph(doc, " ".join(pieces), indent=True)
# ───────────────────────── 入口 ─────────────────────────
def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None:
    """Render every ``*.md`` file of *sections_dir* into a single .docx file.

    The files are processed in sorted name order after a table-of-contents
    page. Sections are separated by page breaks, and a short summary is
    printed to stdout once the file has been saved.

    Args:
        sections_dir: Directory holding the section Markdown files.
        out: Destination .docx path; missing parent directories are created.
        fund_type: Fund type label; only echoed in the summary output.

    Raises:
        SystemExit: With exit code 2 when *sections_dir* is not a directory
            or contains no ``.md`` file (a message is written to stderr).
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    doc = init_doc()
    add_toc(doc)
    last_index = len(md_files) - 1
    for index, md_file in enumerate(md_files):
        render_md_block(doc, md_file.read_text(encoding="utf-8"))
        # Break only between sections: a break after the final section would
        # leave an empty page at the end of the document.
        if index != last_index:
            doc.add_page_break()

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Read the paragraph list once and reuse it for both statistics.
    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" paragraphs: {paras} | tables: {tbls} | total chars: {chars}")
    print(f" fund_type: {fund_type}")
    print(" font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")
    print(" 提示: 在 Word 中打开后按 F9 (或右键目录 -> 更新域) 生成实际目录。")
def main() -> None:
    """Command-line entry point: parse the arguments and render the document."""
    parser = argparse.ArgumentParser(description="渲染章节 md → 申报书 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")

    fund_types = [
        "key_rd",
        "major_project",
        "nsfc_joint_fund",
        "nsfc_general",
        "nsfc_youth",
        "provincial",
        "enterprise",
    ]
    parser.add_argument("--fund-type", required=True, choices=fund_types)
    parser.add_argument("-o", "--output", type=Path, required=True, help="输出 .docx 路径")

    options = parser.parse_args()
    render_sections(options.sections_dir, options.output, options.fund_type)
# Run the command-line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()