193 lines
6.7 KiB
Python
193 lines
6.7 KiB
Python
"""把 sections/*.md 渲染成符合中国基金申报书排版规范的 .docx。
|
|
|
|
字体规范 (来自 typography.md):
|
|
- 标题黑体 / 正文中文宋体 / 英文 Times New Roman
|
|
- 行距 1.5 倍 / 首行缩进 2 字符
|
|
- A4 纸 / 上下 2.5cm / 左 3.0cm / 右 2.0cm
|
|
|
|
用法:
|
|
python render_docx.py <sections_dir> --fund-type key_rd -o <out.docx>
|
|
|
|
支持的基金类型: key_rd / major_project / nsfc_joint_fund / nsfc_general / nsfc_youth / provincial / enterprise
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from docx import Document
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx.oxml.ns import qn
|
|
from docx.shared import Cm, Pt
|
|
|
|
|
|
def _set_east_asia_font(run, font_name: str = "宋体") -> None:
    """Set the East Asian typeface of *run*, with Times New Roman for Latin text.

    python-docx offers no direct setter for the East Asian font slot, so the
    ``w:eastAsia`` attribute is written on the run's ``w:rFonts`` element.
    The ``w:ascii`` and ``w:hAnsi`` slots are always set to Times New Roman.

    Args:
        run: python-docx run whose ``w:rPr/w:rFonts`` element is updated.
        font_name: Typeface applied to East Asian characters.
    """
    rPr = run._element.get_or_add_rPr()
    # get_or_add_rFonts() creates a missing <w:rFonts> at its schema position
    # inside <w:rPr>. Appending it after existing children (e.g. <w:b>, <w:sz>)
    # would yield an out-of-order element sequence.
    rFonts = rPr.get_or_add_rFonts()
    rFonts.set(qn("w:eastAsia"), font_name)
    rFonts.set(qn("w:ascii"), "Times New Roman")
    rFonts.set(qn("w:hAnsi"), "Times New Roman")
|
|
|
|
|
|
def init_doc() -> Document:
    """Create an empty document whose first section uses the proposal page layout.

    The page is 21 cm x 29.7 cm with margins of 2.5 cm (top and bottom),
    3.0 cm (left) and 2.0 cm (right).

    Returns:
        The new document. Default styles are deliberately left untouched;
        fonts are set on each paragraph when it is added.
    """
    document = Document()
    first_section = document.sections[0]
    geometry = {
        "page_height": Cm(29.7),
        "page_width": Cm(21),
        "top_margin": Cm(2.5),
        "bottom_margin": Cm(2.5),
        "left_margin": Cm(3.0),
        "right_margin": Cm(2.0),
    }
    for attribute, length in geometry.items():
        setattr(first_section, attribute, length)
    return document
|
|
|
|
|
|
def add_paragraph(doc: Document, text: str, *, level: int = 0) -> None:
    """Append *text* to *doc* as a body paragraph or as a heading.

    Args:
        doc: Document that receives the new paragraph.
        text: Paragraph text, written as a single run.
        level: 0 for body text; 1, 2 or 3 for first-, second- and third-level
            headings.

    Raises:
        KeyError: If *level* is not one of 0, 1, 2 or 3.
    """
    paragraph = doc.add_paragraph()
    layout = paragraph.paragraph_format
    layout.line_spacing = 1.5

    if level != 0:
        layout.space_before = Pt(6)
        layout.space_after = Pt(3)
        heading_run = paragraph.add_run(text)
        heading_run.font.name = "Times New Roman"
        # Per heading level: (font size, bold flag, East Asian typeface).
        size, is_bold, cjk_font = {
            1: (Pt(14), False, "黑体"),
            2: (Pt(12), True, "黑体"),
            3: (Pt(12), True, "宋体"),
        }[level]
        heading_run.font.size = size
        heading_run.bold = is_bold
        _set_east_asia_font(heading_run, cjk_font)
        return

    layout.first_line_indent = Pt(24)  # two characters at 12 pt
    paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    body_run = paragraph.add_run(text)
    body_run.font.name = "Times New Roman"
    body_run.font.size = Pt(12)  # "small four" (小四) size
    _set_east_asia_font(body_run, "宋体")
|
|
|
|
|
|
# Markdown "#" heading: group 1 is the run of "#" marks, group 2 the title text.
_HEADING_RE = re.compile(r"^(#+)\s+(.+)$")
# Markdown table line: first and last non-whitespace characters are "|".
_TABLE_LINE_RE = re.compile(r"^\s*\|.+\|\s*$")
|
|
|
|
|
|
def render_md_block(doc: Document, md_text: str) -> None:
    """Render a markdown string into *doc*, line by line.

    Handles four kinds of input:
      * runs of table lines, which are passed to ``_render_md_table``;
      * ``#`` headings, whose level is capped at 3;
      * lines starting with ``>``, which are skipped (writing hints that do
        not belong in the final manuscript);
      * everything else, where consecutive non-blank lines are merged into
        one body paragraph joined by single spaces.

    Args:
        doc: Document that receives the rendered content.
        md_text: Markdown source of one section.
    """
    lines = md_text.splitlines()
    total = len(lines)

    def continues_paragraph(raw: str) -> bool:
        """Return True when *raw* extends the body paragraph being collected."""
        if not raw.strip():
            return False
        if _HEADING_RE.match(raw) or raw.startswith(">"):
            return False
        return not _TABLE_LINE_RE.match(raw)

    pos = 0
    while pos < total:
        current = lines[pos].rstrip()

        # Markdown table: consume the whole run of table lines at once.
        if _TABLE_LINE_RE.match(current):
            start = pos
            while pos < total and _TABLE_LINE_RE.match(lines[pos]):
                pos += 1
            _render_md_table(doc, lines[start:pos])
            continue

        if not current.strip():
            pos += 1
            continue

        heading = _HEADING_RE.match(current)
        if heading is not None:
            marks, title = heading.groups()
            add_paragraph(doc, title.strip(), level=min(len(marks), 3))
            pos += 1
            continue

        if current.startswith(">"):
            # Blockquote line: a writing hint, not rendered.
            pos += 1
            continue

        # Body text: gather the following lines that belong to this paragraph.
        end = pos + 1
        while end < total and continues_paragraph(lines[end]):
            end += 1
        merged = " ".join(piece.strip() for piece in lines[pos:end])
        add_paragraph(doc, merged, level=0)
        pos = end
|
|
|
|
|
|
def _render_md_table(doc: Document, table_lines: list[str]) -> None:
    """Convert a run of markdown table lines into a docx table appended to *doc*.

    The header/body delimiter row (e.g. ``|---|:---:|``) is dropped only when
    it is the second line, which is where markdown places it. Other rows that
    contain nothing but dashes (for example ``| - | - |`` placeholders) are
    kept as data. Rows shorter than the widest row are padded with empty
    cells, and the first row is rendered in bold.

    Args:
        doc: Document that receives the table.
        table_lines: Raw markdown lines of the form ``| a | b |``.
    """
    rows: list[list[str]] = []
    for index, raw in enumerate(table_lines):
        cells = [c.strip() for c in raw.strip().strip("|").split("|")]
        # Only the second line can be the delimiter row.
        if index == 1 and all(re.match(r"^[-: ]+$", c) for c in cells):
            continue
        rows.append(cells)
    if not rows:
        return

    n_cols = max(len(r) for r in rows)
    for r in rows:
        r.extend([""] * (n_cols - len(r)))

    table = doc.add_table(rows=len(rows), cols=n_cols)
    table.style = "Light Grid Accent 1"
    for row_index, (doc_row, values) in enumerate(zip(table.rows, rows)):
        # Resolve the row's cells once instead of once per cell.
        doc_cells = doc_row.cells
        for cell, value in zip(doc_cells, values):
            cell.text = ""  # clear the default content
            run = cell.paragraphs[0].add_run(value)
            run.font.name = "Times New Roman"
            run.font.size = Pt(10.5)  # "five" (五号) size
            _set_east_asia_font(run, "宋体")
            if row_index == 0:
                run.bold = True
|
|
|
|
|
|
def render_sections(sections_dir: Path, out: Path, fund_type: str) -> None:
    """Render every ``*.md`` file in *sections_dir* into a single .docx at *out*.

    Files are processed in sorted filename order with a page break between
    consecutive sections. A short summary is printed to stdout after saving.

    Args:
        sections_dir: Directory holding the section markdown files.
        out: Destination .docx path; missing parent directories are created.
        fund_type: Fund type label; only echoed in the summary output.

    Raises:
        SystemExit: With status 2 (message on stderr) when *sections_dir* is
            not a directory or contains no ``.md`` file.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    doc = init_doc()
    for index, md_file in enumerate(md_files):
        if index:
            # Break between sections only; a break after the last section
            # would leave an empty trailing page.
            doc.add_page_break()
        # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which
        # would otherwise stop a heading on the first line from matching.
        render_md_block(doc, md_file.read_text(encoding="utf-8-sig"))

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Summary statistics (body paragraphs only).
    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    print(f"[OK] rendered {len(md_files)} sections → {out}")
    print(f" paragraphs: {paras} | total chars: {chars}")
    print(f" fund_type: {fund_type}")
    print(" font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")
|
|
|
|
|
|
def main() -> None:
    """Parse the command line and render the sections directory to a .docx."""
    fund_types = [
        "key_rd",
        "major_project",
        "nsfc_joint_fund",
        "nsfc_general",
        "nsfc_youth",
        "provincial",
        "enterprise",
    ]
    parser = argparse.ArgumentParser(description="渲染章节 md → 申报书 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    parser.add_argument("--fund-type", required=True, choices=fund_types)
    parser.add_argument(
        "-o", "--output", type=Path, required=True, help="输出 .docx 路径"
    )
    options = parser.parse_args()
    render_sections(options.sections_dir, options.output, options.fund_type)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|