diff --git a/skills/proposal/scripts/render_docx.py b/skills/proposal/scripts/render_docx.py index 828e214..da698c5 100644 --- a/skills/proposal/scripts/render_docx.py +++ b/skills/proposal/scripts/render_docx.py @@ -1,14 +1,18 @@ """把 sections/*.md 渲染成符合中国基金申报书排版规范的 .docx。 字体规范 (来自 typography.md): -- 标题黑体 / 正文中文宋体 / 英文 Times New Roman +- 标题黑体 (一二级) / 三级标题宋体 / 正文中文宋体 / 英文 Times New Roman - 行距 1.5 倍 / 首行缩进 2 字符 - A4 纸 / 上下 2.5cm / 左 3.0cm / 右 2.0cm +特性: +- 自动插入"目录"页 (Word 内右键更新域 / F9 即生成 TOC) +- 内联 markdown 解析: **加粗** / *斜体* / `等宽` +- 列表/引用文献项 ([N], 1., (1), 一、, -, *) 各自独立成段 +- markdown 表格自动识别, 包含分隔行 |---|---| + 用法: python render_docx.py --fund-type key_rd -o - -支持的基金类型: key_rd / major_project / nsfc_joint_fund / nsfc_general / nsfc_youth / provincial / enterprise """ from __future__ import annotations import argparse @@ -18,25 +22,47 @@ from pathlib import Path from docx import Document from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml import OxmlElement from docx.oxml.ns import qn -from docx.shared import Cm, Pt +from docx.shared import Cm, Pt, RGBColor -def _set_east_asia_font(run, font_name: str = "宋体") -> None: - """让 run 的中文字体生效 (python-docx 不直接支持东亚字体)。""" +# ───────────────────────── 字体辅助 ───────────────────────── + +def _set_run_fonts(run, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None: + """同时设置 run 的中文 (eastAsia) 和西文 (ascii/hAnsi) 字体。""" rPr = run._element.get_or_add_rPr() rFonts = rPr.find(qn("w:rFonts")) if rFonts is None: - from docx.oxml import OxmlElement rFonts = OxmlElement("w:rFonts") rPr.append(rFonts) - rFonts.set(qn("w:eastAsia"), font_name) - rFonts.set(qn("w:ascii"), "Times New Roman") - rFonts.set(qn("w:hAnsi"), "Times New Roman") + rFonts.set(qn("w:eastAsia"), cn_font) + rFonts.set(qn("w:ascii"), en_font) + rFonts.set(qn("w:hAnsi"), en_font) +def _set_style_fonts(style, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None: + """直接给 style 写 rFonts, 这样基于该 style 的所有段落都继承字体。""" + el = style.element + rPr = el.find(qn("w:rPr")) + if rPr is None: + rPr = OxmlElement("w:rPr") + el.insert(0, rPr) + rFonts = rPr.find(qn("w:rFonts")) + if rFonts is None: + rFonts = OxmlElement("w:rFonts") + rPr.append(rFonts) + rFonts.set(qn("w:eastAsia"), cn_font) + rFonts.set(qn("w:ascii"), en_font) + rFonts.set(qn("w:hAnsi"), en_font) + + +# ───────────────────────── 文档初始化 ───────────────────────── + def init_doc() -> Document: doc = Document() + + # 页面 section = doc.sections[0] section.page_height = Cm(29.7) section.page_width = Cm(21) @@ -44,88 +70,210 @@ def init_doc() -> Document: section.bottom_margin = Cm(2.5) section.left_margin = Cm(3.0) section.right_margin = Cm(2.0) - # 不在这里改 default style — 直接每段自己设字体最稳 + + # Normal 样式 (正文) + normal = doc.styles["Normal"] + normal.font.name = "Times New Roman" + normal.font.size = Pt(12) # 小四 + _set_style_fonts(normal, cn_font="宋体") + pf = normal.paragraph_format + pf.line_spacing = 1.5 + pf.space_before = Pt(0) + pf.space_after = Pt(0) + + # Heading 样式 — 让 Word TOC 域识别 + for lvl, sz, cn in [(1, Pt(14), "黑体"), (2, Pt(12), "黑体"), (3, Pt(12), "宋体")]: + h = doc.styles[f"Heading {lvl}"] + h.font.name = "Times New Roman" + h.font.size = sz + h.font.bold = True + h.font.color.rgb = RGBColor(0, 0, 0) # 覆盖 builtin 蓝色 + _set_style_fonts(h, cn_font=cn) + h.paragraph_format.line_spacing = 1.5 + h.paragraph_format.space_before = Pt(6) + h.paragraph_format.space_after = Pt(3) + h.paragraph_format.first_line_indent = None + return doc -def add_paragraph(doc: Document, text: str, *, level: int = 0) -> None: - """level: 0 正文 / 1 一级标题 / 2 二级标题 / 3 三级标题""" - if level == 0: - p = doc.add_paragraph() - p.paragraph_format.line_spacing = 1.5 - p.paragraph_format.first_line_indent = Pt(24) # 2 字符 - p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY - run = p.add_run(text) - run.font.name = "Times New Roman" - run.font.size = Pt(12) # 小四 - _set_east_asia_font(run, "宋体") - else: - sizes = {1: Pt(14), 2: Pt(12), 3: Pt(12)} - bolds = {1: False, 2: True, 3: True} - font_cn = {1: "黑体", 2: "黑体", 3: "宋体"} - p = doc.add_paragraph() - p.paragraph_format.line_spacing = 1.5 - p.paragraph_format.space_before = Pt(6) - p.paragraph_format.space_after = Pt(3) - run = p.add_run(text) - run.font.name = "Times New Roman" - run.font.size = sizes[level] - run.bold = bolds[level] - _set_east_asia_font(run, font_cn[level]) +# ───────────────────────── TOC ───────────────────────── + +def add_toc(doc: Document, depth: int = 3) -> None: + """在文档开头插入 '目录' 标题 + Word 域 TOC。 + + Word 打开时不会自动展开;用户右键域 → '更新域' 或按 F9。 + LibreOffice 打开会更直接显示。 + """ + # "目 录" 标题 (居中, 不用 Heading 样式以免自我包含) + p = doc.add_paragraph() + p.alignment = WD_ALIGN_PARAGRAPH.CENTER + p.paragraph_format.first_line_indent = None + p.paragraph_format.space_before = Pt(12) + p.paragraph_format.space_after = Pt(6) + run = p.add_run("目 录") + run.font.size = Pt(16) # 三号 + run.font.bold = True + _set_run_fonts(run, cn_font="黑体") + + # TOC 域 + p = doc.add_paragraph() + p.paragraph_format.first_line_indent = None + run = p.add_run() + + fldChar1 = OxmlElement("w:fldChar") + fldChar1.set(qn("w:fldCharType"), "begin") + + instrText = OxmlElement("w:instrText") + instrText.set(qn("xml:space"), "preserve") + instrText.text = f' TOC \\o "1-{depth}" \\h \\z \\u ' + + fldChar2 = OxmlElement("w:fldChar") + fldChar2.set(qn("w:fldCharType"), "separate") + + fldChar3 = OxmlElement("w:fldChar") + fldChar3.set(qn("w:fldCharType"), "end") + + # 占位文字 — Word 更新域时会被实际目录替换 + placeholder_t = OxmlElement("w:t") + placeholder_t.set(qn("xml:space"), "preserve") + placeholder_t.text = "[在 Word 中按 F9 或右键此处选择 “更新域” 即可生成完整目录]" + + run._element.append(fldChar1) + run._element.append(instrText) + run._element.append(fldChar2) + run._element.append(placeholder_t) + run._element.append(fldChar3) + + doc.add_page_break() -_HEADING_RE = re.compile(r"^(#+)\s+(.+)$") -_TABLE_LINE_RE = re.compile(r"^\s*\|.+\|\s*$") +# ───────────────────────── 内联 markdown ───────────────────────── + +# 顺序敏感:**bold** 必须先于 *italic* 匹配, 否则会被 italic 抢 +_INLINE_RE = re.compile( + r"(?P\*\*(?P[^*\n]+?)\*\*)" + r"|(?P(?[^*\n]+?)\*(?!\*))" + r"|(?P`(?P[^`\n]+?)`)" +) -def render_md_block(doc: Document, md_text: str) -> None: - lines = md_text.splitlines() - i = 0 - while i < len(lines): - line = lines[i].rstrip() +def parse_inline(text: str) -> list[tuple[str, str]]: + """切成 (style, segment) 列表; style ∈ plain/bold/italic/code。""" + out: list[tuple[str, str]] = [] + pos = 0 + for m in _INLINE_RE.finditer(text): + if m.start() > pos: + out.append(("plain", text[pos:m.start()])) + if m.group("bold"): + out.append(("bold", m.group("bold_t"))) + elif m.group("italic"): + out.append(("italic", m.group("italic_t"))) + elif m.group("code"): + out.append(("code", m.group("code_t"))) + pos = m.end() + if pos < len(text): + out.append(("plain", text[pos:])) + return out or [("plain", text)] - # markdown table - if _TABLE_LINE_RE.match(line): - tbl_lines = [] - while i < len(lines) and _TABLE_LINE_RE.match(lines[i]): - tbl_lines.append(lines[i]) - i += 1 - _render_md_table(doc, tbl_lines) - continue - if not line.strip(): - i += 1 - continue - - m = _HEADING_RE.match(line) - if m: - hashes, title = m.group(1), m.group(2) - level = min(len(hashes), 3) - add_paragraph(doc, title.strip(), level=level) - elif line.startswith(">"): - # blockquote — 申报书里通常是写作提示, 渲染时跳过 (撰稿提示不入正稿) - pass +def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None: + for style, seg in parse_inline(text): + run = paragraph.add_run(seg) + run.font.size = size + if style == "bold": + run.bold = True + _set_run_fonts(run, cn_font=cn_font, en_font="Times New Roman") + elif style == "italic": + run.italic = True + _set_run_fonts(run, cn_font=cn_font, en_font="Times New Roman") + elif style == "code": + _set_run_fonts(run, cn_font=cn_font, en_font="Consolas") else: - # 正文段落: 把连续非空行合并成一段 - buf = [line] - j = i + 1 - while j < len(lines) and lines[j].strip() and not _HEADING_RE.match(lines[j]) and not lines[j].startswith(">") and not _TABLE_LINE_RE.match(lines[j]): - buf.append(lines[j].rstrip()) - j += 1 - text = " ".join(s.strip() for s in buf) - add_paragraph(doc, text, level=0) - i = j - continue - i += 1 + _set_run_fonts(run, cn_font=cn_font, en_font="Times New Roman") -def _render_md_table(doc: Document, table_lines: list[str]) -> None: - """把一段 markdown 表格转换成 docx 表格。第二行如果是分隔符 (---) 跳过。""" +# ───────────────────────── 段落 / 标题 / 列表 ───────────────────────── + +def add_heading(doc: Document, text: str, level: int) -> None: + p = doc.add_paragraph(style=f"Heading {level}") + p.paragraph_format.first_line_indent = None + # 标题里通常无内联 markdown, 但万一有也按内联解析 (黑体大小由 style 已设) + sizes = {1: Pt(14), 2: Pt(12), 3: Pt(12)} + cn = {1: "黑体", 2: "黑体", 3: "宋体"} + add_inline(p, text, size=sizes[level], cn_font=cn[level]) + for run in p.runs: + run.bold = True + + +def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None: + p = doc.add_paragraph() + pf = p.paragraph_format + pf.line_spacing = 1.5 + if indent: + pf.first_line_indent = Pt(24) # 2 字符 + else: + pf.first_line_indent = None + p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + add_inline(p, text) + + +# ───────────────────────── 行类型识别 ───────────────────────── + +_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$") +_TABLE_LINE_RE = re.compile(r"^\s*\|.*\|\s*$") +_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?") +_HR_RE = re.compile(r"^\s*-{3,}\s*$|^\s*={3,}\s*$|^\s*_{3,}\s*$") + +# 列表项 (各自独立成段, 不跟相邻行合并, 不缩进首行) +_LIST_PATTERNS = [ + re.compile(r"^\[\d+\]\s"), # [1] + re.compile(r"^[-*+]\s"), # - / * / + + re.compile(r"^\d+[\.、.]\s*"), # 1. / 1、 / 1. + re.compile(r"^\(\d+\)\s*"), # (1) + re.compile(r"^(\d+)\s*"), # (1) + re.compile(r"^[一二三四五六七八九十百千]+[、.\.]"), # 一、 + re.compile(r"^[((][一二三四五六七八九十百千]+[))]"), # (一) + re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"), # ① + re.compile(r"^第[一二三四五六七八九十百]+[条章节]"), # 第一条 +] + + +def is_list_item(line: str) -> bool: + return any(p.match(line) for p in _LIST_PATTERNS) + + +def is_table_line(line: str) -> bool: + return bool(_TABLE_LINE_RE.match(line)) + + +def is_heading(line: str) -> bool: + return bool(_HEADING_RE.match(line)) + + +def is_blockquote(line: str) -> bool: + return bool(_BLOCKQUOTE_RE.match(line)) + + +def is_hr(line: str) -> bool: + return bool(_HR_RE.match(line)) + + +# ───────────────────────── 表格 ───────────────────────── + +def _split_md_row(line: str) -> list[str]: + return [c.strip() for c in line.strip().strip("|").split("|")] + + +def _is_separator_row(cells: list[str]) -> bool: + return all(re.match(r"^[-:\s]+$", c) for c in cells if c != "") + + +def render_table(doc: Document, table_lines: list[str]) -> None: rows: list[list[str]] = [] for ln in table_lines: - cells = [c.strip() for c in ln.strip().strip("|").split("|")] - # skip pure separator row - if all(re.match(r"^[-: ]+$", c) for c in cells): + cells = _split_md_row(ln) + if not cells or _is_separator_row(cells): continue rows.append(cells) if not rows: @@ -134,21 +282,91 @@ def _render_md_table(doc: Document, table_lines: list[str]) -> None: for r in rows: while len(r) < n_cols: r.append("") + table = doc.add_table(rows=len(rows), cols=n_cols) - table.style = "Light Grid Accent 1" + try: + table.style = "Light Grid Accent 1" + except KeyError: + pass # style 不存在就用默认 + for ri, row in enumerate(rows): for ci, val in enumerate(row): cell = table.rows[ri].cells[ci] - cell.text = "" # clear default + # 清掉 cell 默认空段落 + cell.text = "" p = cell.paragraphs[0] - run = p.add_run(val) - run.font.name = "Times New Roman" - run.font.size = Pt(10.5) # 五号 - _set_east_asia_font(run, "宋体") + p.paragraph_format.first_line_indent = None + p.paragraph_format.line_spacing = 1.2 + add_inline(p, val, size=Pt(10.5), cn_font="宋体") if ri == 0: - run.bold = True + for run in p.runs: + run.bold = True +# ───────────────────────── 主渲染 ───────────────────────── + +def render_md_block(doc: Document, md_text: str) -> None: + lines = md_text.splitlines() + i = 0 + n = len(lines) + while i < n: + line = lines[i].rstrip() + + # 空行 + if not line.strip(): + i += 1 + continue + + # 横线 + if is_hr(line): + i += 1 + continue + + # 表格 (连续若干行 | ... | 视为一张表) + if is_table_line(line): + block: list[str] = [] + while i < n and is_table_line(lines[i]): + block.append(lines[i]) + i += 1 + render_table(doc, block) + continue + + # 标题 + m = _HEADING_RE.match(line) + if m: + level = min(len(m.group(1)), 3) + add_heading(doc, m.group(2).strip(), level) + i += 1 + continue + + # 引用块 — 模板里多用作"写作提示", 不入正稿 + if is_blockquote(line): + i += 1 + continue + + # 列表项 (含引文 [N]) — 各自独立成段, 不缩进首行 + if is_list_item(line): + add_body_paragraph(doc, line.strip(), indent=False) + i += 1 + continue + + # 散文段落 — 合并下一空行 / 特殊行前的连续行 + buf = [line.strip()] + j = i + 1 + while j < n: + nxt = lines[j].rstrip() + if not nxt.strip(): + break + if is_heading(nxt) or is_blockquote(nxt) or is_table_line(nxt) or is_list_item(nxt) or is_hr(nxt): + break + buf.append(nxt.strip()) + j += 1 + add_body_paragraph(doc, " ".join(buf), indent=True) + i = j + + +# ───────────────────────── 入口 ───────────────────────── + def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None: if not sections_dir.is_dir(): print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr) @@ -159,30 +377,34 @@ def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None: sys.exit(2) doc = init_doc() + add_toc(doc) for f in md_files: text = f.read_text(encoding="utf-8") render_md_block(doc, text) - # 每个 section 后加分页, 让结构更清晰 doc.add_page_break() out.parent.mkdir(parents=True, exist_ok=True) doc.save(str(out)) - # 统计 - paras = sum(1 for p in doc.paragraphs) + paras = sum(1 for _ in doc.paragraphs) chars = sum(len(p.text) for p in doc.paragraphs) - print(f"[OK] rendered {len(md_files)} sections → {out}") - print(f" paragraphs: {paras} | total chars: {chars}") + tbls = len(doc.tables) + print(f"[OK] rendered {len(md_files)} sections -> {out}") + print(f" paragraphs: {paras} | tables: {tbls} | total chars: {chars}") print(f" fund_type: {fund_type}") print(f" font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符") + print(f" 提示: 在 Word 中打开后按 F9 (或右键目录 -> 更新域) 生成实际目录。") def main() -> None: ap = argparse.ArgumentParser(description="渲染章节 md → 申报书 docx") ap.add_argument("sections_dir", type=Path, help="sections/*.md 目录") - ap.add_argument("--fund-type", required=True, - choices=["key_rd", "major_project", "nsfc_joint_fund", - "nsfc_general", "nsfc_youth", "provincial", "enterprise"]) + ap.add_argument( + "--fund-type", + required=True, + choices=["key_rd", "major_project", "nsfc_joint_fund", + "nsfc_general", "nsfc_youth", "provincial", "enterprise"], + ) ap.add_argument("-o", "--output", type=Path, required=True, help="输出 .docx 路径") args = ap.parse_args() render_sections(args.sections_dir, args.output, args.fund_type)