# zcbot/skills/proposal/scripts/render_docx.py

"""把 sections/*.md 渲染成符合中国基金申报书排版规范的 .docx。

字体规范 (来自 typography.md):
- 标题黑体 (一二级) / 三级标题宋体 / 正文中文宋体 / 英文 Times New Roman
- 行距 1.5 倍 / 首行缩进 2 字符
- A4 纸 / 上下 2.5cm / 左 3.0cm / 右 2.0cm

特性:
- 自动插入"目录"页 (Word 内右键更新域 / F9 即生成 TOC)
- 内联 markdown 解析: **加粗** / *斜体* / `等宽`
- 列表/引用文献项 ([N], 1., (1), 一、, -, *) 各自独立成段
- markdown 表格自动识别, 包含分隔行 |---|---|

用法:
  python render_docx.py <sections_dir> --fund-type key_rd -o <out.docx>
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor


# ───────────────────────── 字体辅助 ─────────────────────────

def _set_run_fonts(run, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Write the CJK and Latin typefaces onto one run's ``w:rFonts`` element.

    ``cn_font`` is stored in the ``w:eastAsia`` attribute; ``en_font`` is
    stored in both ``w:ascii`` and ``w:hAnsi``. The ``w:rFonts`` element is
    created and appended to the run properties when it does not exist yet.
    """
    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.find(qn("w:rFonts"))
    if fonts_el is None:
        # No font element yet: create one and attach it to the run properties.
        fonts_el = OxmlElement("w:rFonts")
        run_props.append(fonts_el)
    assignments = (
        ("w:eastAsia", cn_font),
        ("w:ascii", en_font),
        ("w:hAnsi", en_font),
    )
    for attr_name, typeface in assignments:
        fonts_el.set(qn(attr_name), typeface)


def _set_style_fonts(style, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Write ``w:rFonts`` directly on a style's element.

    The fonts are stored on the style itself rather than on individual runs,
    with the intent that paragraphs based on the style pick them up.

    Args:
        style: python-docx style object; ``style.element`` is modified in place.
        cn_font: Typeface stored in ``w:eastAsia``.
        en_font: Typeface stored in ``w:ascii`` and ``w:hAnsi``.
    """
    # Let python-docx create <w:rPr> when it is missing (the same call
    # _set_run_fonts uses). Inserting it by hand at index 0 would place it
    # ahead of <w:name>, out of sequence for a <w:style> element.
    rPr = style.element.get_or_add_rPr()
    rFonts = rPr.find(qn("w:rFonts"))
    if rFonts is None:
        rFonts = OxmlElement("w:rFonts")
        rPr.append(rFonts)
    rFonts.set(qn("w:eastAsia"), cn_font)
    rFonts.set(qn("w:ascii"), en_font)
    rFonts.set(qn("w:hAnsi"), en_font)


# ───────────────────────── 文档初始化 ─────────────────────────

def init_doc() -> Document:
    """Create a blank document with page geometry and base styles configured.

    Page: 21 cm x 29.7 cm with margins of 2.5 cm (top/bottom), 3.0 cm (left)
    and 2.0 cm (right).
    ``Normal`` style: 12 pt, Times New Roman / 宋体, 1.5 line spacing, no
    space before or after.
    ``Heading 1``..``Heading 3``: bold, black, 1.5 line spacing, 6 pt before,
    3 pt after, no first-line indent; sizes 14/12/12 pt and CJK faces
    黑体/黑体/宋体 respectively.

    Returns:
        The configured document object.
    """
    document = Document()

    # Page geometry lives on the first (only) section of a new document.
    page = document.sections[0]
    geometry = (
        ("page_height", Cm(29.7)),
        ("page_width", Cm(21)),
        ("top_margin", Cm(2.5)),
        ("bottom_margin", Cm(2.5)),
        ("left_margin", Cm(3.0)),
        ("right_margin", Cm(2.0)),
    )
    for attr_name, length in geometry:
        setattr(page, attr_name, length)

    # Body text style.
    body_style = document.styles["Normal"]
    body_style.font.name = "Times New Roman"
    body_style.font.size = Pt(12)
    _set_style_fonts(body_style, cn_font="宋体")
    body_fmt = body_style.paragraph_format
    body_fmt.line_spacing = 1.5
    body_fmt.space_before = Pt(0)
    body_fmt.space_after = Pt(0)

    # Built-in heading styles are reused so that a TOC field can find them.
    heading_specs = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    for level, (point_size, cjk_face) in heading_specs.items():
        heading_style = document.styles[f"Heading {level}"]
        heading_font = heading_style.font
        heading_font.name = "Times New Roman"
        heading_font.size = point_size
        heading_font.bold = True
        # Explicit black replaces the colour of the built-in heading style.
        heading_font.color.rgb = RGBColor(0, 0, 0)
        _set_style_fonts(heading_style, cn_font=cjk_face)
        heading_fmt = heading_style.paragraph_format
        heading_fmt.line_spacing = 1.5
        heading_fmt.space_before = Pt(6)
        heading_fmt.space_after = Pt(3)
        heading_fmt.first_line_indent = None

    return document


# ───────────────────────── TOC ─────────────────────────

def add_toc(doc: Document, depth: int = 3) -> None:
    """Append a centred table-of-contents title, a TOC field and a page break.

    The field is written with a static placeholder as its result text, so the
    real table of contents only appears once the field is updated in the word
    processor (the placeholder tells the user to press F9 or use the
    context-menu update command).

    Args:
        doc: Document to append to.
        depth: Deepest heading level included; used as the upper bound of the
            field's ``\\o "1-N"`` switch.
    """

    def _field_char(char_type: str):
        """Build one ``w:fldChar`` element of the given type."""
        element = OxmlElement("w:fldChar")
        element.set(qn("w:fldCharType"), char_type)
        return element

    # Title paragraph: plain paragraph (not a Heading style) so the title is
    # not itself listed by the TOC field.
    title_par = doc.add_paragraph()
    title_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_fmt = title_par.paragraph_format
    title_fmt.first_line_indent = None
    title_fmt.space_before = Pt(12)
    title_fmt.space_after = Pt(6)
    title_run = title_par.add_run("目  录")
    title_run.font.size = Pt(16)
    title_run.font.bold = True
    _set_run_fonts(title_run, cn_font="黑体")

    # Field paragraph: begin / instruction / separate / placeholder / end,
    # all placed inside a single run in that order.
    field_par = doc.add_paragraph()
    field_par.paragraph_format.first_line_indent = None
    field_run_el = field_par.add_run()._element

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = f' TOC \\o "1-{depth}" \\h \\z \\u '

    # Placeholder result text shown until the field is updated.
    placeholder = OxmlElement("w:t")
    placeholder.set(qn("xml:space"), "preserve")
    placeholder.text = "[在 Word 中按 F9 或右键此处选择 “更新域” 即可生成完整目录]"

    field_children = (
        _field_char("begin"),
        instruction,
        _field_char("separate"),
        placeholder,
        _field_char("end"),
    )
    for child in field_children:
        field_run_el.append(child)

    doc.add_page_break()


# ───────────────────────── 内联 markdown ─────────────────────────

# Alternation order matters: the **bold** branch is listed before *italic*
# so the double-asterisk form is tried first at each position.
_INLINE_RE = re.compile(
    r"(?P<bold>\*\*(?P<bold_t>[^*\n]+?)\*\*)"
    r"|(?P<italic>(?<![\*\w])\*(?P<italic_t>[^*\n]+?)\*(?!\*))"
    r"|(?P<code>`(?P<code_t>[^`\n]+?)`)"
)


def parse_inline(text: str) -> list[tuple[str, str]]:
    """Split ``text`` into ``(style, segment)`` pairs.

    ``style`` is one of ``"plain"``, ``"bold"``, ``"italic"`` or ``"code"``;
    for the marked-up styles ``segment`` is the inner text without the
    delimiters. Text with no markup (including the empty string) yields a
    single ``("plain", text)`` pair.
    """
    segments: list[tuple[str, str]] = []
    cursor = 0
    for hit in _INLINE_RE.finditer(text):
        begin, finish = hit.span()
        if begin > cursor:
            # Literal text between the previous match and this one.
            segments.append(("plain", text[cursor:begin]))
        # The outer named group closes last, so lastgroup names the style;
        # its inner text lives in the matching "<style>_t" group.
        kind = hit.lastgroup
        segments.append((kind, hit.group(f"{kind}_t")))
        cursor = finish
    remainder = text[cursor:]
    if remainder:
        segments.append(("plain", remainder))
    if not segments:
        return [("plain", text)]
    return segments


def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None:
    """Append ``text`` to ``paragraph`` as one run per inline-markdown segment.

    Every run gets font size ``size`` and CJK face ``cn_font``. Bold and
    italic segments set the corresponding run flag; code segments use
    Consolas as the Latin face, all other segments use Times New Roman.
    """
    for kind, chunk in parse_inline(text):
        piece = paragraph.add_run(chunk)
        piece.font.size = size
        if kind == "bold":
            piece.bold = True
        elif kind == "italic":
            piece.italic = True
        latin_face = "Consolas" if kind == "code" else "Times New Roman"
        _set_run_fonts(piece, cn_font=cn_font, en_font=latin_face)

# ───────────────────────── 段落 / 标题 / 列表 ─────────────────────────

def add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading paragraph using the ``Heading <level>`` style.

    ``level`` must be 1, 2 or 3; any other value has no entry in the
    size/font table and raises ``KeyError``. Inline markdown in ``text`` is
    parsed, and every resulting run is forced to bold.
    """
    heading = doc.add_paragraph(style=f"Heading {level}")
    heading.paragraph_format.first_line_indent = None
    # Per-level run formatting: (font size, CJK typeface).
    per_level = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    point_size, cjk_face = per_level[level]
    add_inline(heading, text, size=point_size, cn_font=cjk_face)
    for piece in heading.runs:
        piece.bold = True


def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Append a justified body paragraph with 1.5 line spacing.

    With ``indent`` true the first line is indented by 24 pt (two 12 pt
    characters); otherwise the first-line indent is cleared. ``text`` is
    rendered through the inline-markdown parser.
    """
    body = doc.add_paragraph()
    fmt = body.paragraph_format
    fmt.line_spacing = 1.5
    fmt.first_line_indent = Pt(24) if indent else None
    body.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(body, text)


# ───────────────────────── 行类型识别 ─────────────────────────

# ATX heading: 1-6 '#' at column 0, whitespace, then the title (group 2).
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
# Table row: optional indent, then a line that starts and ends with '|'.
_TABLE_LINE_RE = re.compile(r"^\s*\|.*\|\s*$")
# Blockquote: optional indent, '>', and at most one following whitespace char.
_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?")
# Horizontal rule: a line made only of three or more '-', '=' or '_'.
_HR_RE = re.compile(r"^\s*-{3,}\s*$|^\s*={3,}\s*$|^\s*_{3,}\s*$")

# List-item markers. A matching line is rendered as its own paragraph: it is
# not merged with neighbouring lines and gets no first-line indent.
_LIST_PATTERNS = [
    re.compile(r"^\[\d+\]\s"),                                  # [1]
    re.compile(r"^[-*+]\s"),                                    # - / * / +
    re.compile(r"^\d+[\.、．]\s*"),                             # 1.  / 1、 / 1．
    re.compile(r"^\(\d+\)\s*"),                                 # (1)
    re.compile(r"^（\d+）\s*"),                                 # （1）
    re.compile(r"^[一二三四五六七八九十百千]+[、．\.]"),       # 一、
    re.compile(r"^[（(][一二三四五六七八九十百千]+[)）]"),      # (一)
    re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"),                  # ①
    re.compile(r"^第[一二三四五六七八九十百]+[条章节]"),        # 第一条
]


def is_list_item(line: str) -> bool:
    """Return True when ``line`` starts with one of the list/citation markers.

    Leading whitespace is ignored so that indented (nested) items such as
    ``"  - sub item"`` are recognised too; without this they would be treated
    as prose and merged into a neighbouring paragraph.
    """
    candidate = line.lstrip()
    return any(pattern.match(candidate) for pattern in _LIST_PATTERNS)


def is_table_line(line: str) -> bool:
    """Return True when ``line`` looks like a markdown table row."""
    return _TABLE_LINE_RE.match(line) is not None


def is_heading(line: str) -> bool:
    """Return True when ``line`` is a '#'-prefixed heading."""
    return _HEADING_RE.match(line) is not None


def is_blockquote(line: str) -> bool:
    """Return True when ``line`` starts (after optional indent) with '>'."""
    return _BLOCKQUOTE_RE.match(line) is not None


def is_hr(line: str) -> bool:
    """Return True when ``line`` is a horizontal-rule line."""
    return _HR_RE.match(line) is not None


# ───────────────────────── 表格 ─────────────────────────

def _split_md_row(line: str) -> list[str]:
    return [c.strip() for c in line.strip().strip("|").split("|")]


def _is_separator_row(cells: list[str]) -> bool:
    return all(re.match(r"^[-:\s]+$", c) for c in cells if c != "")


def render_table(doc: Document, table_lines: list[str]) -> None:
    """Append a table built from consecutive markdown table lines.

    Delimiter rows and rows that split into no cells are skipped; short rows
    are padded with empty cells to the widest row. Nothing is added when no
    data row remains. Cell text is rendered at 10.5 pt with 1.2 line spacing
    and no first-line indent, and every run in the first row is made bold.
    """
    parsed_rows = (_split_md_row(raw) for raw in table_lines)
    grid: list[list[str]] = [
        cells for cells in parsed_rows if cells and not _is_separator_row(cells)
    ]
    if not grid:
        return

    # Pad ragged rows so every row has the same number of columns.
    width = max(map(len, grid))
    for cells in grid:
        cells.extend([""] * (width - len(cells)))

    table = doc.add_table(rows=len(grid), cols=width)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # The style is not available: keep the table's default style.
        pass

    for row_idx, (docx_row, cells) in enumerate(zip(table.rows, grid)):
        header_row = row_idx == 0
        docx_cells = docx_row.cells
        for col_idx, value in enumerate(cells):
            target = docx_cells[col_idx]
            # Reset the cell so it holds a single empty paragraph.
            target.text = ""
            par = target.paragraphs[0]
            par.paragraph_format.first_line_indent = None
            par.paragraph_format.line_spacing = 1.2
            add_inline(par, value, size=Pt(10.5), cn_font="宋体")
            if header_row:
                for piece in par.runs:
                    piece.bold = True


# ───────────────────────── 主渲染 ─────────────────────────

def _join_wrapped_lines(parts: list[str]) -> str:
    """Join the soft-wrapped source lines of one paragraph into one string.

    A single space is inserted at each line boundary, except when the
    characters on both sides of the boundary are East Asian wide or fullwidth
    characters: a source line break inside Chinese text must not turn into a
    visible space in the rendered paragraph.
    """
    # Function-scope import keeps this helper self-contained without touching
    # the file-level import block.
    from unicodedata import east_asian_width

    def _is_wide(ch: str) -> bool:
        return east_asian_width(ch) in ("W", "F")

    pieces: list[str] = []
    for part in parts:
        if not part:
            continue
        # pieces[-1] is always the previous non-empty line here, because a
        # separator is only ever appended immediately before a line.
        if pieces and not (_is_wide(pieces[-1][-1]) and _is_wide(part[0])):
            pieces.append(" ")
        pieces.append(part)
    return "".join(pieces)


def render_md_block(doc: Document, md_text: str) -> None:
    """Render one markdown text into ``doc``, line by line.

    Each line is classified in this order:

    1. blank line or horizontal rule: skipped;
    2. table row: consecutive table rows are rendered as one table;
    3. heading: levels deeper than 3 are rendered as level 3;
    4. blockquote: skipped (the templates use them as writing hints that are
       not part of the final text);
    5. list/citation item: its own paragraph, no first-line indent;
    6. anything else: prose; following lines are merged into the same
       paragraph until a blank line or any of the special line types above.

    Args:
        doc: Document to append to.
        md_text: Markdown source of one section.
    """
    lines = md_text.splitlines()
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].rstrip()

        # Blank line
        if not line.strip():
            i += 1
            continue

        # Horizontal rule
        if is_hr(line):
            i += 1
            continue

        # Table: a run of consecutive "| ... |" lines forms one table
        if is_table_line(line):
            block: list[str] = []
            while i < n and is_table_line(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block)
            continue

        # Heading
        m = _HEADING_RE.match(line)
        if m:
            level = min(len(m.group(1)), 3)
            add_heading(doc, m.group(2).strip(), level)
            i += 1
            continue

        # Blockquote: dropped from the rendered document
        if is_blockquote(line):
            i += 1
            continue

        # List item (including "[N]" citations): standalone, no indent
        if is_list_item(line):
            add_body_paragraph(doc, line.strip(), indent=False)
            i += 1
            continue

        # Prose paragraph: gather the following lines up to the next blank
        # line or special line.
        buf = [line.strip()]
        j = i + 1
        while j < n:
            nxt = lines[j].rstrip()
            if not nxt.strip():
                break
            if is_heading(nxt) or is_blockquote(nxt) or is_table_line(nxt) or is_list_item(nxt) or is_hr(nxt):
                break
            buf.append(nxt.strip())
            j += 1
        add_body_paragraph(doc, _join_wrapped_lines(buf), indent=True)
        i = j


# ───────────────────────── 入口 ─────────────────────────

def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None:
    """Render every ``*.md`` file in ``sections_dir`` into one .docx at ``out``.

    Files are processed in sorted path order. The document starts with the
    table-of-contents page; consecutive sections are separated by a page
    break. No break is added after the last section, so the document does not
    end with an empty page. A short summary is printed after saving.

    Args:
        sections_dir: Directory holding the section markdown files.
        out: Output path; missing parent directories are created.
        fund_type: Only echoed in the printed summary.

    Raises:
        SystemExit: With code 2 when ``sections_dir`` is not a directory or
            contains no ``*.md`` file.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    doc = init_doc()
    add_toc(doc)
    for index, md_path in enumerate(md_files):
        if index:
            # Break before every section except the first; add_toc() has
            # already ended the contents page with its own page break.
            doc.add_page_break()
        render_md_block(doc, md_path.read_text(encoding="utf-8"))

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Summary statistics; the paragraph list is fetched once and reused.
    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f"   paragraphs: {paras} | tables: {tbls} | total chars: {chars}")
    print(f"   fund_type:  {fund_type}")
    print("   font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")
    print("   提示: 在 Word 中打开后按 F9 (或右键目录 -> 更新域) 生成实际目录。")


def main() -> None:
    """Command-line entry point: parse the arguments and render the document.

    Takes the sections directory as a positional argument, plus the required
    ``--fund-type`` (restricted to the listed choices) and ``-o/--output``.
    """
    fund_types = [
        "key_rd",
        "major_project",
        "nsfc_joint_fund",
        "nsfc_general",
        "nsfc_youth",
        "provincial",
        "enterprise",
    ]
    parser = argparse.ArgumentParser(description="渲染章节 md → 申报书 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    parser.add_argument("--fund-type", required=True, choices=fund_types)
    parser.add_argument("-o", "--output", type=Path, required=True, help="输出 .docx 路径")
    options = parser.parse_args()
    render_sections(options.sections_dir, options.output, options.fund_type)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()