"""Render sections/*.md into a .docx that follows Chinese grant-proposal typography.

Typography rules (from typography.md):
  - Level 1/2 headings use 黑体, level 3 headings use 宋体; body text uses 宋体
    for Chinese and Times New Roman for Latin text
  - 1.5x line spacing, first-line indent of 2 characters
  - A4 paper; margins: top/bottom 2.5 cm, left 3.0 cm, right 2.0 cm

Features:
  - Inserts a table-of-contents page (in Word, right-click the field and
    update it, or press F9, to generate the actual TOC)
  - Inline markdown: **bold** / *italic* / `monospace`
  - List and reference items ([N], 1., (1), 一、, -, *) each become their own
    paragraph
  - Markdown tables are detected automatically, including the |---|---|
    separator row

Usage:
    python render_docx.py SECTIONS_DIR --fund-type key_rd -o OUTPUT.docx
"""
from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor


# ───────────────────────── Font helpers ─────────────────────────

# Theme-font attributes that may sit on a w:rFonts element. Per the OOXML
# spec a theme attribute takes precedence over its explicit counterpart
# (w:asciiTheme over w:ascii, ...), so they must go before explicit fonts
# can take effect.
_THEME_FONT_ATTRS = ("w:asciiTheme", "w:hAnsiTheme", "w:eastAsiaTheme")


def _apply_rfonts(rPr, cn_font: str, en_font: str) -> None:
    """Write the East-Asian and Latin font names onto *rPr*'s w:rFonts child.

    ``get_or_add_rFonts`` inserts the element at its schema-defined position
    instead of appending it after siblings such as w:sz or w:b.
    """
    rFonts = rPr.get_or_add_rFonts()
    for attr in _THEME_FONT_ATTRS:
        rFonts.attrib.pop(qn(attr), None)
    rFonts.set(qn("w:eastAsia"), cn_font)
    rFonts.set(qn("w:ascii"), en_font)
    rFonts.set(qn("w:hAnsi"), en_font)


def _set_run_fonts(run, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Set both the Chinese (eastAsia) and Latin (ascii/hAnsi) font of a run."""
    _apply_rfonts(run._element.get_or_add_rPr(), cn_font, en_font)


def _set_style_fonts(style, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Write rFonts directly onto *style* so paragraphs based on it inherit the fonts.

    ``get_or_add_rPr`` keeps w:rPr in its schema position inside w:style; a
    plain ``insert(0, ...)`` would place it ahead of w:name.
    """
    _apply_rfonts(style.element.get_or_add_rPr(), cn_font, en_font)


# ───────────────────────── Document setup ─────────────────────────

def init_doc() -> Document:
    """Create an empty document with page geometry and base styles configured.

    Returns:
        A new python-docx document: A4 page, 2.5/2.5/3.0/2.0 cm margins
        (top/bottom/left/right), a 12 pt ``Normal`` style at 1.5 line
        spacing, and black bold ``Heading 1``-``Heading 3`` styles.
    """
    doc = Document()

    # Page geometry.
    section = doc.sections[0]
    section.page_height = Cm(29.7)
    section.page_width = Cm(21)
    section.top_margin = Cm(2.5)
    section.bottom_margin = Cm(2.5)
    section.left_margin = Cm(3.0)
    section.right_margin = Cm(2.0)

    # Normal style (body text).
    normal = doc.styles["Normal"]
    normal.font.name = "Times New Roman"
    normal.font.size = Pt(12)  # 小四
    _set_style_fonts(normal, cn_font="宋体")
    body_format = normal.paragraph_format
    body_format.line_spacing = 1.5
    body_format.space_before = Pt(0)
    body_format.space_after = Pt(0)

    # Heading styles are kept (rather than ad-hoc formatting) so that the
    # Word TOC field can pick the headings up.
    heading_specs = [(1, Pt(14), "黑体"), (2, Pt(12), "黑体"), (3, Pt(12), "宋体")]
    for level, font_size, cn_font in heading_specs:
        heading = doc.styles[f"Heading {level}"]
        heading.font.name = "Times New Roman"
        heading.font.size = font_size
        heading.font.bold = True
        heading.font.color.rgb = RGBColor(0, 0, 0)  # override the built-in blue
        _set_style_fonts(heading, cn_font=cn_font)
        heading.paragraph_format.line_spacing = 1.5
        heading.paragraph_format.space_before = Pt(6)
        heading.paragraph_format.space_after = Pt(3)
        heading.paragraph_format.first_line_indent = None

    return doc


# ───────────────────────── TOC ─────────────────────────

def add_toc(doc: Document, depth: int = 3) -> None:
    """Append a "目 录" title, a Word TOC field and a page break to *doc*.

    The field is written with placeholder text only; the original author's
    note is that Word does not expand it on open (right-click the field and
    choose "update field", or press F9), while LibreOffice shows it more
    directly.

    Args:
        doc: Document to append to. It is expected to be empty so that the
            TOC ends up at the start of the document.
        depth: Deepest heading level included; used in the ``\\o "1-N"`` switch.
    """
    # Title: centred and deliberately NOT a Heading style, so the TOC does
    # not list itself.
    title = doc.add_paragraph()
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title.paragraph_format.first_line_indent = None
    title.paragraph_format.space_before = Pt(12)
    title.paragraph_format.space_after = Pt(6)
    title_run = title.add_run("目 录")
    title_run.font.size = Pt(16)  # 三号
    title_run.font.bold = True
    _set_run_fonts(title_run, cn_font="黑体")

    # TOC field: begin / instruction / separate / placeholder / end.
    field_para = doc.add_paragraph()
    field_para.paragraph_format.first_line_indent = None
    field_run = field_para.add_run()

    begin = OxmlElement("w:fldChar")
    begin.set(qn("w:fldCharType"), "begin")

    instr = OxmlElement("w:instrText")
    instr.set(qn("xml:space"), "preserve")
    instr.text = f' TOC \\o "1-{depth}" \\h \\z \\u '

    separate = OxmlElement("w:fldChar")
    separate.set(qn("w:fldCharType"), "separate")

    # Placeholder shown until the field is updated.
    placeholder = OxmlElement("w:t")
    placeholder.set(qn("xml:space"), "preserve")
    placeholder.text = "[在 Word 中按 F9 或右键此处选择 “更新域” 即可生成完整目录]"

    end = OxmlElement("w:fldChar")
    end.set(qn("w:fldCharType"), "end")

    for element in (begin, instr, separate, placeholder, end):
        field_run._element.append(element)

    doc.add_page_break()
# ───────────────────────── Inline markdown ─────────────────────────

# Order matters: **bold** must be tried before *italic*, otherwise the italic
# alternative would claim the first asterisk of a bold marker.
_INLINE_RE = re.compile(
    r"(?P<bold>\*\*(?P<bold_t>[^*\n]+?)\*\*)"
    r"|(?P<italic>(?<!\*)\*(?!\*)(?P<italic_t>[^*\n]+?)\*(?!\*))"
    r"|(?P<code>`(?P<code_t>[^`\n]+?)`)"
)


def parse_inline(text: str) -> list[tuple[str, str]]:
    """Split *text* into ``(style, segment)`` pairs.

    ``style`` is one of ``plain`` / ``bold`` / ``italic`` / ``code``; the
    markdown markers themselves are dropped from the segments.

    Returns:
        The pairs in source order. Text with no markup (including the empty
        string) yields a single ``("plain", text)`` pair.
    """
    segments: list[tuple[str, str]] = []
    cursor = 0
    for match in _INLINE_RE.finditer(text):
        if match.start() > cursor:
            segments.append(("plain", text[cursor:match.start()]))
        # The outer named group closes last, so lastgroup is the style name
        # and "<style>_t" is the inner text group.
        style = match.lastgroup
        segments.append((style, match.group(f"{style}_t")))
        cursor = match.end()
    if cursor < len(text):
        segments.append(("plain", text[cursor:]))
    return segments or [("plain", text)]


def add_inline(paragraph, text: str, *, size: Pt | None = None, cn_font: str = "宋体") -> None:
    """Append *text* to *paragraph* as runs, honouring inline markdown.

    Args:
        paragraph: python-docx paragraph to append runs to.
        text: Source text, possibly containing **bold**, *italic* or `code`.
        size: Font size for every run; ``None`` means 12 pt.
        cn_font: East-Asian font for every run.
    """
    if size is None:
        size = Pt(12)
    for style, segment in parse_inline(text):
        run = paragraph.add_run(segment)
        run.font.size = size
        if style == "bold":
            run.bold = True
        elif style == "italic":
            run.italic = True
        # Only code spans switch the Latin font; the Chinese font is shared.
        en_font = "Consolas" if style == "code" else "Times New Roman"
        _set_run_fonts(run, cn_font=cn_font, en_font=en_font)


# ───────────────────────── Paragraphs / headings / lists ─────────────────────────

def add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading paragraph using the ``Heading N`` style.

    Args:
        doc: Target document.
        text: Heading text; inline markdown is parsed if present.
        level: Heading level; values outside 1-3 are clamped into that range
            because only three heading levels are styled.
    """
    level = max(1, min(level, 3))
    para = doc.add_paragraph(style=f"Heading {level}")
    para.paragraph_format.first_line_indent = None
    # Run-level size and font mirror the values set on the heading styles.
    sizes = {1: Pt(14), 2: Pt(12), 3: Pt(12)}
    cn_fonts = {1: "黑体", 2: "黑体", 3: "宋体"}
    add_inline(para, text, size=sizes[level], cn_font=cn_fonts[level])
    for run in para.runs:
        run.bold = True


def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Append a justified body paragraph at 1.5 line spacing.

    Args:
        doc: Target document.
        text: Paragraph text; inline markdown is parsed.
        indent: When true, apply a 24 pt first-line indent (2 characters at
            12 pt); otherwise no first-line indent.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format
    fmt.line_spacing = 1.5
    fmt.first_line_indent = Pt(24) if indent else None
    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(para, text)
# ───────────────────────── Line classification ─────────────────────────

_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
_TABLE_LINE_RE = re.compile(r"^\s*\|.*\|\s*$")
_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?")
_HR_RE = re.compile(r"^\s*-{3,}\s*$|^\s*={3,}\s*$|^\s*_{3,}\s*$")

# List items: each becomes its own paragraph, is never merged with adjacent
# lines, and gets no first-line indent.
_LIST_PATTERNS = [
    re.compile(r"^\[\d+\]\s"),  # [1]
    re.compile(r"^[-*+]\s"),  # - / * / +
    re.compile(r"^\d+[\.、．]\s*"),  # 1. / 1、 / 1．
    re.compile(r"^\(\d+\)\s*"),  # (1)
    re.compile(r"^（\d+）\s*"),  # （1） full-width parentheses
    re.compile(r"^[一二三四五六七八九十百千]+[、．\.]"),  # 一、
    re.compile(r"^[(（][一二三四五六七八九十百千]+[)）]"),  # (一) / （一）
    re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"),  # ①
    re.compile(r"^第[一二三四五六七八九十百]+[条章节]"),  # 第一条
]


def is_list_item(line: str) -> bool:
    """Return True if *line* starts with one of the recognised list markers."""
    return any(pattern.match(line) for pattern in _LIST_PATTERNS)


def is_table_line(line: str) -> bool:
    """Return True if *line* both starts and ends with ``|`` (ignoring whitespace)."""
    return _TABLE_LINE_RE.match(line) is not None


def is_heading(line: str) -> bool:
    """Return True if *line* is an ATX heading (1-6 ``#`` followed by text)."""
    return _HEADING_RE.match(line) is not None


def is_blockquote(line: str) -> bool:
    """Return True if *line* starts with ``>`` (after optional whitespace)."""
    return _BLOCKQUOTE_RE.match(line) is not None


def is_hr(line: str) -> bool:
    """Return True if *line* is a rule made of 3+ ``-``, ``=`` or ``_``."""
    return _HR_RE.match(line) is not None


# ───────────────────────── Tables ─────────────────────────

def _split_md_row(line: str) -> list[str]:
    """Split a markdown table line into stripped cell strings."""
    return [cell.strip() for cell in line.strip().strip("|").split("|")]


def _is_separator_row(cells: list[str]) -> bool:
    """Return True if *cells* form a ``|---|:--:|`` style separator row.

    Empty cells are ignored, but at least one non-empty cell is required so
    that a deliberately blank row (all cells empty) is kept as data.
    """
    filled = [cell for cell in cells if cell != ""]
    return bool(filled) and all(re.match(r"^[-:\s]+$", cell) for cell in filled)


def render_table(doc: Document, table_lines: list[str]) -> None:
    """Append a Word table built from consecutive markdown table lines.

    Separator rows are dropped, short rows are padded with empty cells to
    the widest row, and the first remaining row is rendered bold. Nothing is
    added when no data rows remain.
    """
    rows: list[list[str]] = []
    for raw in table_lines:
        cells = _split_md_row(raw)
        if cells and not _is_separator_row(cells):
            rows.append(cells)
    if not rows:
        return

    n_cols = max(len(row) for row in rows)
    for row in rows:
        row.extend([""] * (n_cols - len(row)))

    table = doc.add_table(rows=len(rows), cols=n_cols)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        pass  # best effort: keep the default table style if this one is missing

    for row_idx, row in enumerate(rows):
        for col_idx, value in enumerate(row):
            cell = table.rows[row_idx].cells[col_idx]
            cell.text = ""  # reset the cell to a single empty paragraph
            para = cell.paragraphs[0]
            para.paragraph_format.first_line_indent = None
            para.paragraph_format.line_spacing = 1.2
            add_inline(para, value, size=Pt(10.5), cn_font="宋体")
            if row_idx == 0:
                for run in para.runs:
                    run.bold = True


# ───────────────────────── Main rendering ─────────────────────────

def _is_cjk(ch: str) -> bool:
    """Return True for CJK ideographs, CJK punctuation and full-width forms."""
    return (
        "\u2e80" <= ch <= "\u9fff"
        or "\uf900" <= ch <= "\ufaff"
        or "\uff00" <= ch <= "\uffef"
    )


def _join_wrapped_lines(parts: list[str]) -> str:
    """Join soft-wrapped source lines into one paragraph string.

    A space is inserted at a line break only when neither neighbouring
    character is CJK, so wrapped Chinese text gets no stray spaces.
    """
    pieces = [parts[0]]
    for prev, cur in zip(parts, parts[1:]):
        if prev and cur and (_is_cjk(prev[-1]) or _is_cjk(cur[0])):
            pieces.append(cur)
        else:
            pieces.append(" " + cur)
    return "".join(pieces)


def render_md_block(doc: Document, md_text: str) -> None:
    """Render one markdown document into *doc*, line by line.

    Blank lines, horizontal rules and blockquotes are skipped; consecutive
    table lines become one table; headings deeper than level 3 are rendered
    as level 3; list items become unindented paragraphs; remaining
    consecutive lines are merged into one indented prose paragraph.
    """
    lines = md_text.splitlines()
    total = len(lines)
    i = 0
    while i < total:
        line = lines[i].rstrip()

        # Blank line or horizontal rule: nothing to render.
        if not line.strip() or is_hr(line):
            i += 1
            continue

        # Table: a run of consecutive | ... | lines is one table.
        if is_table_line(line):
            block: list[str] = []
            while i < total and is_table_line(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block)
            continue

        # Heading.
        heading = _HEADING_RE.match(line)
        if heading:
            level = min(len(heading.group(1)), 3)
            add_heading(doc, heading.group(2).strip(), level)
            i += 1
            continue

        # Blockquotes are used in the templates as writing hints and are
        # intentionally left out of the final document.
        if is_blockquote(line):
            i += 1
            continue

        # List item (including [N] references): own paragraph, no indent.
        if is_list_item(line):
            add_body_paragraph(doc, line.strip(), indent=False)
            i += 1
            continue

        # Prose: merge following lines until a blank or special line.
        buf = [line.strip()]
        j = i + 1
        while j < total:
            nxt = lines[j].rstrip()
            if not nxt.strip():
                break
            if (
                is_heading(nxt)
                or is_blockquote(nxt)
                or is_table_line(nxt)
                or is_list_item(nxt)
                or is_hr(nxt)
            ):
                break
            buf.append(nxt.strip())
            j += 1
        add_body_paragraph(doc, _join_wrapped_lines(buf), indent=True)
        i = j


# ───────────────────────── Entry point ─────────────────────────

def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None:
    """Render every ``*.md`` file in *sections_dir* (sorted by name) into *out*.

    Args:
        sections_dir: Directory containing the section markdown files.
        out: Destination .docx path; parent directories are created.
        fund_type: Only echoed in the summary output.

    Exits the process with status 2 if the directory is missing or holds no
    ``.md`` files.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)

    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    doc = init_doc()
    add_toc(doc)

    for index, md_file in enumerate(md_files):
        # Break between sections only; a break after the last one would
        # leave a blank final page.
        if index:
            doc.add_page_break()
        # utf-8-sig also accepts plain UTF-8 and strips a leading BOM, which
        # would otherwise hide a "#" heading on the first line.
        render_md_block(doc, md_file.read_text(encoding="utf-8-sig"))

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    paras = len(doc.paragraphs)
    chars = sum(len(p.text) for p in doc.paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" paragraphs: {paras} | tables: {tbls} | total chars: {chars}")
    print(f" fund_type: {fund_type}")
    print(" font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")
    print(" 提示: 在 Word 中打开后按 F9 (或右键目录 -> 更新域) 生成实际目录。")


def main() -> None:
    """Parse command-line arguments and render the sections directory."""
    ap = argparse.ArgumentParser(description="渲染章节 md → 申报书 docx")
    ap.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    ap.add_argument(
        "--fund-type",
        required=True,
        choices=[
            "key_rd",
            "major_project",
            "nsfc_joint_fund",
            "nsfc_general",
            "nsfc_youth",
            "provincial",
            "enterprise",
        ],
    )
    ap.add_argument("-o", "--output", type=Path, required=True, help="输出 .docx 路径")
    args = ap.parse_args()
    render_sections(args.sections_dir, args.output, args.fund_type)


if __name__ == "__main__":
    main()