# zcbot/skills/proposal/scripts/render_docx.py

"""把 sections/*.md 渲染成符合中国基金申报书排版规范的 .docx。

字体规范 (来自 typography.md):
- 标题黑体 / 正文中文宋体 / 英文 Times New Roman
- 行距 1.5 倍 / 首行缩进 2 字符
- A4 纸 / 上下 2.5cm / 左 3.0cm / 右 2.0cm

用法:
  python render_docx.py <sections_dir> --fund-type key_rd -o <out.docx>

支持的基金类型: key_rd / major_project / nsfc_joint_fund / nsfc_general / nsfc_youth / provincial / enterprise
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Cm, Pt


def _set_east_asia_font(run, font_name: str = "宋体") -> None:
    """Set the East Asian (CJK) font of ``run`` and pin Latin text to Times New Roman.

    python-docx's ``run.font.name`` only writes the ``w:ascii`` / ``w:hAnsi``
    attributes, so the ``w:eastAsia`` attribute is written on the underlying
    ``w:rFonts`` element directly.

    Args:
        run: The python-docx run whose fonts are configured.
        font_name: Typeface used for East Asian characters.
    """
    rPr = run._element.get_or_add_rPr()
    # get_or_add_rFonts() reuses an existing <w:rFonts> or inserts a new one at
    # its schema-defined position. A plain rPr.append() would put the element
    # after any existing children (w:b, w:sz, ...) and break the child order.
    rFonts = rPr.get_or_add_rFonts()
    rFonts.set(qn("w:eastAsia"), font_name)
    rFonts.set(qn("w:ascii"), "Times New Roman")
    rFonts.set(qn("w:hAnsi"), "Times New Roman")


def init_doc() -> Document:
    """Create an empty document whose first section is A4 with proposal margins.

    Page: 21 cm x 29.7 cm. Margins: top/bottom 2.5 cm, left 3.0 cm, right 2.0 cm.
    """
    document = Document()
    page_setup = {
        "page_height": Cm(29.7),
        "page_width": Cm(21),
        "top_margin": Cm(2.5),
        "bottom_margin": Cm(2.5),
        "left_margin": Cm(3.0),
        "right_margin": Cm(2.0),
    }
    first_section = document.sections[0]
    for attribute, length in page_setup.items():
        setattr(first_section, attribute, length)
    # The default style is deliberately left untouched: setting the font on
    # every paragraph individually is the most reliable approach.
    return document


def add_paragraph(doc: Document, text: str, *, level: int = 0) -> None:
    """Append ``text`` to ``doc`` as a body paragraph or a heading.

    ``level`` selects the formatting: 0 is body text, 1-3 are first- to
    third-level headings. All paragraphs use 1.5 line spacing and
    Times New Roman for Latin text.
    """
    para = doc.add_paragraph()
    layout = para.paragraph_format
    layout.line_spacing = 1.5

    is_body = level == 0
    if is_body:
        layout.first_line_indent = Pt(24)  # two characters at 12 pt
        para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    else:
        layout.space_before = Pt(6)
        layout.space_after = Pt(3)

    piece = para.add_run(text)
    piece.font.name = "Times New Roman"

    if is_body:
        piece.font.size = Pt(12)  # "xiao si" size
        _set_east_asia_font(piece, "宋体")
        return

    # level -> (font size, bold flag, East Asian typeface)
    heading_looks = {
        1: (Pt(14), False, "黑体"),
        2: (Pt(12), True, "黑体"),
        3: (Pt(12), True, "宋体"),
    }
    size, is_bold, cjk_face = heading_looks[level]
    piece.font.size = size
    piece.bold = is_bold
    _set_east_asia_font(piece, cjk_face)

# Heading line: one or more '#', then whitespace, then the title text.
# Group 1 is the run of '#' characters, group 2 is the title.
_HEADING_RE = re.compile(r"^(#+)\s+(.+)$")
# Table line: optional leading whitespace, then text that starts and ends
# with '|' (trailing whitespace allowed).
_TABLE_LINE_RE = re.compile(r"^\s*\|.+\|\s*$")


def _join_wrapped_lines(parts: list[str]) -> str:
    """Merge the soft-wrapped source lines of one paragraph into one string.

    Lines are joined with a single space, except when the characters on both
    sides of the line break are East Asian wide/fullwidth characters: Chinese
    text has no inter-word spaces, so a space there would be a visible gap.
    """
    import unicodedata  # function-scope import keeps this block self-contained

    def is_wide(ch: str) -> bool:
        return unicodedata.east_asian_width(ch) in ("W", "F")

    pieces: list[str] = []
    for part in parts:
        piece = part.strip()
        if not piece:
            continue
        # pieces[-1] is always a text piece here: a separator is only ever
        # appended immediately before the piece that follows it.
        if pieces and not (is_wide(pieces[-1][-1]) and is_wide(piece[0])):
            pieces.append(" ")
        pieces.append(piece)
    return "".join(pieces)


def _continues_paragraph(raw_line: str) -> bool:
    """Return True if ``raw_line`` belongs to the body paragraph being collected."""
    if not raw_line.strip():
        return False
    if _HEADING_RE.match(raw_line) or _TABLE_LINE_RE.match(raw_line):
        return False
    return not raw_line.startswith(">")


def render_md_block(doc: Document, md_text: str) -> None:
    """Render the Markdown text of one section into ``doc``.

    Only a small Markdown subset is recognised:

    - consecutive table lines (``| a | b |``) become one docx table;
    - ``#`` headings become heading paragraphs, with depth capped at level 3;
    - lines starting with ``>`` are dropped (they carry writing hints that
      must not appear in the final document);
    - any other run of consecutive non-blank lines becomes one body paragraph.

    Args:
        doc: Target document; paragraphs and tables are appended to it.
        md_text: Markdown source of the section.
    """
    lines = md_text.splitlines()
    i = 0
    while i < len(lines):
        line = lines[i].rstrip()

        # Markdown table: consume every consecutive table line in one go.
        if _TABLE_LINE_RE.match(line):
            tbl_lines = []
            while i < len(lines) and _TABLE_LINE_RE.match(lines[i]):
                tbl_lines.append(lines[i])
                i += 1
            _render_md_table(doc, tbl_lines)
            continue

        if not line.strip():
            i += 1
            continue

        m = _HEADING_RE.match(line)
        if m:
            hashes, title = m.group(1), m.group(2)
            # Only three heading styles exist; deeper headings reuse level 3.
            add_paragraph(doc, title.strip(), level=min(len(hashes), 3))
            i += 1
            continue

        if line.startswith(">"):
            # Blockquote: a writing hint, not part of the final document.
            i += 1
            continue

        # Body text: merge consecutive non-blank lines into one paragraph.
        buf = [line]
        j = i + 1
        while j < len(lines) and _continues_paragraph(lines[j]):
            buf.append(lines[j].rstrip())
            j += 1
        add_paragraph(doc, _join_wrapped_lines(buf), level=0)
        i = j


def _split_table_row(line: str) -> list[str]:
    """Split one Markdown table line into its stripped cell texts."""
    body = line.strip()
    # Remove exactly one bounding pipe per side. str.strip("|") would also
    # swallow the pipes that delimit empty edge cells (e.g. "||a|b|"), which
    # drops those cells and shifts every other cell into the wrong column.
    if body.startswith("|"):
        body = body[1:]
    if body.endswith("|"):
        body = body[:-1]
    return [cell.strip() for cell in body.split("|")]


def _render_md_table(doc: Document, table_lines: list[str]) -> None:
    """Convert a run of Markdown table lines into a docx table appended to ``doc``.

    Separator rows (every cell made up only of ``-``, ``:`` and spaces) are
    skipped. Short rows are padded with empty cells so that all rows have the
    same number of columns. The first remaining row is rendered in bold.
    Nothing is added when no data rows remain.

    Args:
        doc: Target document.
        table_lines: Raw Markdown lines of one table, in order.
    """
    separator_cell = re.compile(r"^[-: ]+$")

    rows: list[list[str]] = []
    for ln in table_lines:
        cells = _split_table_row(ln)
        if all(separator_cell.match(c) for c in cells):
            continue  # pure separator row such as |---|:---:|
        rows.append(cells)
    if not rows:
        return

    n_cols = max(len(r) for r in rows)
    for r in rows:
        r.extend([""] * (n_cols - len(r)))

    table = doc.add_table(rows=len(rows), cols=n_cols)
    table.style = "Light Grid Accent 1"
    for ri, row in enumerate(rows):
        # Fetch the row's cells once: python-docx rebuilds the cell sequence
        # on every access, so indexing it per column repeats that work.
        row_cells = table.rows[ri].cells
        for ci, val in enumerate(row):
            cell = row_cells[ci]
            cell.text = ""  # clear the default content
            run = cell.paragraphs[0].add_run(val)
            run.font.name = "Times New Roman"
            run.font.size = Pt(10.5)  # "wu hao" size
            _set_east_asia_font(run, "宋体")
            if ri == 0:
                run.bold = True


def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None:
    """Render every ``*.md`` file in ``sections_dir`` into a single .docx at ``out``.

    Files are processed in sorted path order, one section per file, with a
    page break between consecutive sections. A short summary is printed to
    stdout after saving.

    Args:
        sections_dir: Directory containing the section Markdown files.
        out: Output .docx path; missing parent directories are created.
        fund_type: Fund type label; only echoed in the summary.

    Exits the process with status 2 (message on stderr) when ``sections_dir``
    is not a directory or contains no ``.md`` file.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    doc = init_doc()
    for index, md_file in enumerate(md_files):
        # Break only BETWEEN sections; a break after the last one would leave
        # an empty trailing page in the document.
        if index:
            doc.add_page_break()
        # utf-8-sig decodes plain UTF-8 as well and drops a leading BOM, which
        # would otherwise prevent the first heading line from being matched.
        render_md_block(doc, md_file.read_text(encoding="utf-8-sig"))

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Summary statistics.
    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    print(f"[OK] rendered {len(md_files)} sections → {out}")
    print(f"   paragraphs: {paras} | total chars: {chars}")
    print(f"   fund_type:  {fund_type}")
    print("   font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")


def main() -> None:
    """Command-line entry point: parse the arguments and render the document."""
    fund_types = [
        "key_rd",
        "major_project",
        "nsfc_joint_fund",
        "nsfc_general",
        "nsfc_youth",
        "provincial",
        "enterprise",
    ]
    parser = argparse.ArgumentParser(description="渲染章节 md → 申报书 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    parser.add_argument("--fund-type", required=True, choices=fund_types)
    parser.add_argument(
        "-o", "--output", type=Path, required=True, help="输出 .docx 路径"
    )
    options = parser.parse_args()
    render_sections(options.sections_dir, options.output, options.fund_type)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()