zcbot/skills/proposal/scripts/render_docx.py

193 lines
6.7 KiB
Python

"""把 sections/*.md 渲染成符合中国基金申报书排版规范的 .docx。
字体规范 (来自 typography.md):
- 标题黑体 / 正文中文宋体 / 英文 Times New Roman
- 行距 1.5 倍 / 首行缩进 2 字符
- A4 纸 / 上下 2.5cm / 左 3.0cm / 右 2.0cm
用法:
python render_docx.py <sections_dir> --fund-type key_rd -o <out.docx>
支持的基金类型: key_rd / major_project / nsfc_joint_fund / nsfc_general / nsfc_youth / provincial / enterprise
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Cm, Pt
def _set_east_asia_font(run, font_name: str = "宋体") -> None:
    """Make a run's East Asian (Chinese) typeface take effect.

    Works on the run's ``w:rFonts`` element directly (the original author
    notes that python-docx does not support East Asian fonts directly).
    The element is created and appended to the run properties when it is
    missing.

    Args:
        run: Object exposing ``_element.get_or_add_rPr()``; the callers in
            this file pass a run created by ``paragraph.add_run``.
        font_name: Typeface written to the ``w:eastAsia`` attribute.

    ``w:ascii`` and ``w:hAnsi`` are always set to Times New Roman.
    """
    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.find(qn("w:rFonts"))
    if fonts_el is None:
        # Imported lazily: only needed when the run has no w:rFonts yet.
        from docx.oxml import OxmlElement

        fonts_el = OxmlElement("w:rFonts")
        run_props.append(fonts_el)

    typefaces = (
        ("w:eastAsia", font_name),
        ("w:ascii", "Times New Roman"),
        ("w:hAnsi", "Times New Roman"),
    )
    for attr_name, typeface in typefaces:
        fonts_el.set(qn(attr_name), typeface)
def init_doc() -> Document:
    """Create an empty document whose first section uses the A4 page setup.

    Page size is 21 cm x 29.7 cm; margins are 2.5 cm top and bottom,
    3.0 cm left and 2.0 cm right.

    Returns:
        The new document returned by ``Document()``.
    """
    doc = Document()
    first_section = doc.sections[0]

    page_setup = (
        ("page_height", Cm(29.7)),
        ("page_width", Cm(21)),
        ("top_margin", Cm(2.5)),
        ("bottom_margin", Cm(2.5)),
        ("left_margin", Cm(3.0)),
        ("right_margin", Cm(2.0)),
    )
    for attr_name, length in page_setup:
        setattr(first_section, attr_name, length)

    # The default style is deliberately left alone here: setting fonts on
    # every paragraph individually is the most reliable approach.
    return doc
def add_paragraph(doc: Document, text: str, *, level: int = 0) -> None:
    """Append one formatted paragraph holding ``text`` to ``doc``.

    Args:
        doc: Document that receives the paragraph.
        text: Paragraph text, placed in a single run.
        level: 0 for body text, 1/2/3 for first/second/third-level headings.

    Every paragraph uses 1.5 line spacing and Times New Roman as the Latin
    font. Body text is justified, 12 pt (xiao-si), Songti, with a 24 pt
    first-line indent (two characters). Headings get 6 pt before / 3 pt
    after: level 1 is 14 pt Heiti non-bold, level 2 is 12 pt Heiti bold,
    level 3 is 12 pt Songti bold.

    Raises:
        KeyError: If ``level`` is neither 0 nor one of 1, 2, 3.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format
    fmt.line_spacing = 1.5

    is_body = level == 0
    if is_body:
        fmt.first_line_indent = Pt(24)  # two characters at 12 pt
        para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    else:
        fmt.space_before = Pt(6)
        fmt.space_after = Pt(3)

    run = para.add_run(text)
    run.font.name = "Times New Roman"

    if is_body:
        run.font.size = Pt(12)  # xiao-si
        _set_east_asia_font(run, "宋体")
        return

    # level -> (font size, bold flag, Chinese typeface)
    heading_styles = {
        1: (Pt(14), False, "黑体"),
        2: (Pt(12), True, "黑体"),
        3: (Pt(12), True, "宋体"),
    }
    size, is_bold, cjk_face = heading_styles[level]
    run.font.size = size
    run.bold = is_bold
    _set_east_asia_font(run, cjk_face)
# ATX heading: one or more '#', whitespace, then the title text.
_HEADING_RE = re.compile(r"^(#+)\s+(.+)$")
# A table line: starts and ends with '|' (surrounding whitespace allowed).
_TABLE_LINE_RE = re.compile(r"^\s*\|.+\|\s*$")
# Start of a Markdown list item: "- x", "* x", "+ x", "1. x" or "1) x".
_LIST_ITEM_RE = re.compile(r"^\s*(?:[-*+]|\d+[.)])\s+\S")
# One CJK character: ideographs, kana, CJK punctuation, full-width forms.
_CJK_CHAR_RE = re.compile(r"[\u2e80-\u9fff\uf900-\ufaff\ufe30-\ufe4f\uff00-\uffef]")


def _join_wrapped_lines(parts: list[str]) -> str:
    """Merge soft-wrapped source lines into one paragraph string.

    A single space is inserted between two lines only when neither
    character at the seam is CJK, so wrapped Chinese text is not broken up
    by stray spaces while wrapped Latin text keeps its word separation.
    """
    pieces: list[str] = []
    previous = ""
    for raw in parts:
        piece = raw.strip()
        if not piece:
            continue
        seam_is_cjk = bool(previous) and (
            _CJK_CHAR_RE.match(previous[-1]) is not None
            or _CJK_CHAR_RE.match(piece[0]) is not None
        )
        if previous and not seam_is_cjk:
            pieces.append(" ")
        pieces.append(piece)
        previous = piece
    return "".join(pieces)


def _ends_paragraph(line: str) -> bool:
    """Return True if ``line`` cannot continue the current body paragraph."""
    return (
        not line.strip()
        or _HEADING_RE.match(line) is not None
        or line.startswith(">")
        or _TABLE_LINE_RE.match(line) is not None
        or _LIST_ITEM_RE.match(line) is not None
    )


def render_md_block(doc: Document, md_text: str) -> None:
    """Render a subset of Markdown from ``md_text`` into ``doc``.

    Handled constructs:

    * Runs of table lines are passed to ``_render_md_table``.
    * Blank lines are skipped.
    * ``#`` headings become heading paragraphs; depth is capped at 3.
    * Lines starting with ``>`` (blockquotes) are dropped, because in these
      proposals they hold drafting hints that must not reach the final text.
    * Everything else is body text: consecutive non-blank lines are merged
      into one paragraph. A list item always starts a new paragraph, so
      each item is rendered on its own; its marker stays in the text.

    Args:
        doc: Target document, forwarded to ``add_paragraph`` and
            ``_render_md_table``.
        md_text: Markdown source of one section.
    """
    lines = md_text.splitlines()
    total = len(lines)
    i = 0
    while i < total:
        line = lines[i].rstrip()

        # Markdown table: hand over the whole run of table lines at once.
        if _TABLE_LINE_RE.match(line):
            start = i
            while i < total and _TABLE_LINE_RE.match(lines[i]):
                i += 1
            _render_md_table(doc, lines[start:i])
            continue

        if not line.strip():
            i += 1
            continue

        heading = _HEADING_RE.match(line)
        if heading:
            level = min(len(heading.group(1)), 3)
            add_paragraph(doc, heading.group(2).strip(), level=level)
            i += 1
            continue

        if line.startswith(">"):
            # Blockquote: drafting hint, intentionally not rendered.
            i += 1
            continue

        # Body paragraph: absorb following lines until a block boundary.
        end = i + 1
        while end < total and not _ends_paragraph(lines[end]):
            end += 1
        text = _join_wrapped_lines([line, *lines[i + 1:end]])
        add_paragraph(doc, text, level=0)
        i = end
# Content of one delimiter-row cell (dashes, optional colons / spaces).
_TABLE_SEPARATOR_CELL_RE = re.compile(r"^[-: ]+$")


def _split_table_row(line: str) -> list[str]:
    """Split one Markdown table line into stripped cell texts.

    Exactly one outer pipe is removed on each side, so empty edge cells
    written without spaces (``||a|``) keep their column. A pipe escaped as
    ``\\|`` is not a cell boundary and is unescaped to ``|`` in the text.
    """
    body = line.strip()
    if body.startswith("|"):
        body = body[1:]
    if body.endswith("|") and not body.endswith("\\|"):
        body = body[:-1]
    return [
        cell.strip().replace("\\|", "|")
        for cell in re.split(r"(?<!\\)\|", body)
    ]


def _render_md_table(doc: Document, table_lines: list[str]) -> None:
    """Convert a run of Markdown table lines into a docx table.

    Args:
        doc: Document that receives the table.
        table_lines: Consecutive source lines of a single table.

    The second line is skipped when it is the delimiter row (``---``).
    Only that position is treated as a delimiter, so a data row filled
    with ``-`` placeholders is kept. Short rows are padded with empty
    cells to the widest row. Cell text is 10.5 pt (wu-hao) Songti / Times
    New Roman, and the first rendered row is bold. Nothing is added when
    no rows remain.
    """
    rows: list[list[str]] = []
    for line_no, raw_line in enumerate(table_lines):
        cells = _split_table_row(raw_line)
        is_delimiter = line_no == 1 and all(
            _TABLE_SEPARATOR_CELL_RE.match(c) for c in cells
        )
        if is_delimiter:
            continue
        rows.append(cells)
    if not rows:
        return

    n_cols = max(len(r) for r in rows)
    table = doc.add_table(rows=len(rows), cols=n_cols)
    table.style = "Light Grid Accent 1"

    for ri, row in enumerate(rows):
        # Look the row's cells up once instead of once per column.
        docx_cells = table.rows[ri].cells
        padded = row + [""] * (n_cols - len(row))
        for ci, val in enumerate(padded):
            cell = docx_cells[ci]
            cell.text = ""  # clear default
            run = cell.paragraphs[0].add_run(val)
            run.font.name = "Times New Roman"
            run.font.size = Pt(10.5)  # wu-hao
            _set_east_asia_font(run, "宋体")
            if ri == 0:
                run.bold = True
def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None:
    """Render every ``*.md`` file in ``sections_dir`` into one .docx file.

    Files are processed in sorted path order. Each section after the first
    starts on a new page; no page break follows the last section, so the
    document does not end with a blank page. Missing parent directories of
    ``out`` are created. A short summary is printed to stdout afterwards.

    Args:
        sections_dir: Directory holding the section Markdown files.
        out: Destination path of the .docx file.
        fund_type: Fund type label; only echoed in the summary.

    Exits the process with status 2 (message on stderr) when
    ``sections_dir`` is not a directory or contains no ``.md`` file.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    doc = init_doc()
    for index, md_file in enumerate(md_files):
        if index:
            # Page break between sections only, never after the last one.
            doc.add_page_break()
        # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which
        # would otherwise stop a first-line heading from being recognised.
        render_md_block(doc, md_file.read_text(encoding="utf-8-sig"))

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Summary statistics
    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    print(f"[OK] rendered {len(md_files)} sections → {out}")
    print(f" paragraphs: {paras} | total chars: {chars}")
    print(f" fund_type: {fund_type}")
    print(" font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")
def main() -> None:
    """Parse the command line and render the sections directory.

    Arguments: a positional ``sections_dir``, a required ``--fund-type``
    limited to the supported fund types, and a required ``-o/--output``
    path. The parsed values are passed to ``render_sections``.
    """
    fund_types = [
        "key_rd",
        "major_project",
        "nsfc_joint_fund",
        "nsfc_general",
        "nsfc_youth",
        "provincial",
        "enterprise",
    ]
    parser = argparse.ArgumentParser(description="渲染章节 md → 申报书 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    parser.add_argument("--fund-type", required=True, choices=fund_types)
    parser.add_argument(
        "-o", "--output", type=Path, required=True, help="输出 .docx 路径"
    )
    cli = parser.parse_args()
    render_sections(cli.sections_dir, cli.output, cli.fund_type)


if __name__ == "__main__":
    main()