"""Render manuscript-style Markdown sections into a .docx file.

One renderer serves two profiles, selected by name through ``PROFILES``:

* ``paper``    -- journal submission draft.  The figure-caption label follows
  ``lang`` ("图" for ``zh``, "Fig." otherwise); the table of contents is
  optional and off by default.
* ``proposal`` -- grant application.  Always has a table of contents, uses the
  "图" caption label, recognises one extra list pattern ("第X条"), and gets a
  page break after every section, including the last one.

The two renderers used to be near-duplicates (about 80% identical verbatim).
Their differences are now data: page margins, TOC title/placeholder, list
patterns, whether the TOC is always emitted, and whether the last section is
followed by a page break.  Function bodies were ported from the old
paper/proposal ``render_docx.py``; leaf primitives come from
``rendering.common``.

``render_sections`` accepts ``fund_type`` for the proposal profile, but the
code in this module does not read it.
"""
from __future__ import annotations

import re
import sys
from pathlib import Path

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor

from . import common
from .common import set_run_fonts, set_style_fonts, set_subscript, CHEM_RE, parse_inline


# ───────────────────────── profile configuration ─────────────────────────

# Line prefixes that mark a list item.  Full-width punctuation is written as
# \uXXXX escapes on purpose: if the literal characters are ever width-folded to
# ASCII, "(\d+)" silently becomes a capture group that matches every line
# starting with a digit.
_BASE_LIST_PATTERNS = [
    re.compile(r"^\[\d+\]\s"),                                      # [1]
    re.compile(r"^[-*+]\s"),                                        # - / * / +
    re.compile(r"^\d+[\.、\uFF0E]\s*"),                              # 1. / 1、 / 1 + full-width stop
    re.compile(r"^\(\d+\)\s*"),                                     # (1)
    re.compile(r"^\uFF08\d+\uFF09\s*"),                             # (1) in full-width parentheses
    re.compile(r"^[一二三四五六七八九十百千]+[、\uFF0E\.]"),           # Chinese numeral + 、 or a stop
    re.compile(r"^[(\uFF08][一二三四五六七八九十百千]+[)\uFF09]"),    # Chinese numeral in (full-width) parens
    re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"),                         # circled digits
]

# Everything that differs between the two document kinds.
PROFILES = {
    "paper": {
        "left_margin": Cm(2.5),
        "right_margin": Cm(2.5),
        "list_patterns": _BASE_LIST_PATTERNS,
        "toc_title": "Contents",
        "toc_placeholder": "[Press F9 in Word to generate the table of contents]",
        "always_toc": False,
        "trailing_page_break": False,
    },
    "proposal": {
        "left_margin": Cm(3.0),
        "right_margin": Cm(2.0),
        "list_patterns": _BASE_LIST_PATTERNS + [
            re.compile(r"^第[一二三四五六七八九十百]+[条章节]"),  # "第一条"-style clause / chapter / section
        ],
        "toc_title": "目 录",
        "toc_placeholder": "[在 Word 中按 F9 或右键此处选择 “更新域” 即可生成完整目录]",
        "always_toc": True,
        "trailing_page_break": True,
    },
}

# Heading level -> (font size, CJK font).  Shared by init_doc (style set-up) and
# add_heading (per-run formatting) so the two cannot drift apart.
_HEADING_STYLES = {
    1: (Pt(14), "黑体"),
    2: (Pt(12), "黑体"),
    3: (Pt(12), "宋体"),
}


# ───────────────────────── document initialisation ─────────────────────────

def init_doc(prof: dict) -> Document:
    """Create an empty A4 document with the base styles configured.

    Args:
        prof: A ``PROFILES`` entry; only ``left_margin`` and ``right_margin``
            are read here.

    Returns:
        A new document whose "Normal" and "Heading 1".."Heading 3" styles have
        fonts, sizes and spacing set.
    """
    doc = Document()

    section = doc.sections[0]
    section.page_height = Cm(29.7)
    section.page_width = Cm(21)
    section.top_margin = Cm(2.5)
    section.bottom_margin = Cm(2.5)
    section.left_margin = prof["left_margin"]
    section.right_margin = prof["right_margin"]

    normal = doc.styles["Normal"]
    normal.font.name = "Times New Roman"
    normal.font.size = Pt(12)
    set_style_fonts(normal, cn_font="宋体")
    pf = normal.paragraph_format
    pf.line_spacing = 1.5
    pf.space_before = Pt(0)
    pf.space_after = Pt(0)

    for lvl, (sz, cn) in _HEADING_STYLES.items():
        h = doc.styles[f"Heading {lvl}"]
        h.font.name = "Times New Roman"
        h.font.size = sz
        h.font.bold = True
        h.font.color.rgb = RGBColor(0, 0, 0)
        set_style_fonts(h, cn_font=cn)
        h.paragraph_format.line_spacing = 1.5
        h.paragraph_format.space_before = Pt(6)
        h.paragraph_format.space_after = Pt(3)
        h.paragraph_format.first_line_indent = None
    return doc


def _fld_char(kind: str):
    """Build a ``w:fldChar`` element with the given ``w:fldCharType``."""
    el = OxmlElement("w:fldChar")
    el.set(qn("w:fldCharType"), kind)
    return el


def add_toc(doc: Document, prof: dict, depth: int = 3) -> None:
    """Append a TOC title, a TOC field with placeholder text, and a page break.

    The field is written as raw XML (begin / instruction / separate /
    placeholder text / end) inside a single run.

    Args:
        doc: Target document.
        prof: A ``PROFILES`` entry; ``toc_title`` and ``toc_placeholder`` are read.
        depth: Deepest heading level listed, used in the ``\\o "1-N"`` switch.
    """
    title_p = doc.add_paragraph()
    title_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_p.paragraph_format.first_line_indent = None
    title_p.paragraph_format.space_before = Pt(12)
    title_p.paragraph_format.space_after = Pt(6)
    title_run = title_p.add_run(prof["toc_title"])
    title_run.font.size = Pt(16)
    title_run.font.bold = True
    set_run_fonts(title_run, cn_font="黑体")

    field_p = doc.add_paragraph()
    field_p.paragraph_format.first_line_indent = None
    run = field_p.add_run()

    instr_text = OxmlElement("w:instrText")
    instr_text.set(qn("xml:space"), "preserve")
    instr_text.text = f' TOC \\o "1-{depth}" \\h \\z \\u '

    placeholder_t = OxmlElement("w:t")
    placeholder_t.set(qn("xml:space"), "preserve")
    placeholder_t.text = prof["toc_placeholder"]

    # Element order matters: begin, instruction, separate, result text, end.
    for el in (
        _fld_char("begin"),
        instr_text,
        _fld_char("separate"),
        placeholder_t,
        _fld_char("end"),
    ):
        run._element.append(el)

    doc.add_page_break()


# ───────────────────────── inline text (chemical-formula subscripts) ─────────────────────────

def _emit_plain_with_chem(paragraph, text: str, *, size, cn_font: str) -> None:
    """Emit a plain text segment, subscripting digits inside ``CHEM_RE`` matches.

    Text outside the matches is emitted unchanged.  Within a match every digit
    becomes its own subscript run and the characters between digits are emitted
    as normal runs.  With no match the whole text is a single run.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Plain text to emit.
        size: Font size applied to every run.
        cn_font: CJK font name passed to ``set_run_fonts``.
    """

    def _run(seg: str, sub: bool = False) -> None:
        # Empty segments are skipped so no zero-length runs are created.
        if not seg:
            return
        r = paragraph.add_run(seg)
        r.font.size = size
        set_run_fonts(r, cn_font=cn_font, en_font="Times New Roman")
        if sub:
            set_subscript(r)

    pos = 0
    for m in CHEM_RE.finditer(text):
        _run(text[pos:m.start()])
        buf = ""
        for ch in m.group(0):
            if ch.isdigit():
                _run(buf)
                buf = ""
                _run(ch, sub=True)
            else:
                buf += ch
        _run(buf)
        pos = m.end()
    _run(text[pos:])


def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None:
    """Append ``text`` to ``paragraph``, applying the inline styles from ``parse_inline``.

    ``plain`` segments go through chemical-formula subscripting; ``bold`` and
    ``italic`` segments use Times New Roman and ``code`` segments use Consolas
    as the Latin font.  A segment with any other style is added with only the
    font size set.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Source text containing inline markup.
        size: Font size for every run.
        cn_font: CJK font name.
    """
    for style, seg in parse_inline(text):
        if style == "plain":
            _emit_plain_with_chem(paragraph, seg, size=size, cn_font=cn_font)
            continue
        run = paragraph.add_run(seg)
        run.font.size = size
        if style == "bold":
            run.bold = True
            set_run_fonts(run, cn_font=cn_font, en_font="Times New Roman")
        elif style == "italic":
            run.italic = True
            set_run_fonts(run, cn_font=cn_font, en_font="Times New Roman")
        elif style == "code":
            set_run_fonts(run, cn_font=cn_font, en_font="Consolas")


# ───────────────────────── paragraphs / headings / lists ─────────────────────────

def add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading paragraph whose runs are all bold.

    Args:
        doc: Target document.
        text: Heading text; inline markup is honoured.
        level: Heading level.  Values outside 1..3 are clamped to that range,
            because only those three levels are styled.
    """
    level = max(1, min(level, 3))
    size, cn_font = _HEADING_STYLES[level]
    p = doc.add_paragraph(style=f"Heading {level}")
    p.paragraph_format.first_line_indent = None
    add_inline(p, text, size=size, cn_font=cn_font)
    for run in p.runs:
        run.bold = True


def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Append a justified body paragraph with 1.5 line spacing.

    Args:
        doc: Target document.
        text: Paragraph text; inline markup is honoured.
        indent: When true the first line is indented by 24 pt, otherwise the
            first-line indent is cleared.
    """
    p = doc.add_paragraph()
    pf = p.paragraph_format
    pf.line_spacing = 1.5
    pf.first_line_indent = Pt(24) if indent else None
    p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(p, text)


def is_list_item(line: str, prof: dict) -> bool:
    """Return whether ``line`` starts with one of the profile's list prefixes."""
    return any(p.match(line) for p in prof["list_patterns"])


def add_code_block(doc: Document, lines: list[str], lang: str = "") -> None:
    """Append one single-spaced, monospaced paragraph per code line.

    Args:
        doc: Target document.
        lines: Code lines, without the fence lines.
        lang: Fence language tag.  Accepted for callers; not used here.
    """
    for ln in lines:
        p = doc.add_paragraph()
        pf = p.paragraph_format
        pf.first_line_indent = None
        pf.line_spacing = 1.0
        pf.space_before = Pt(0)
        pf.space_after = Pt(0)
        # An empty line is written as a single space so the paragraph still has text.
        run = p.add_run(ln or " ")
        run.font.size = Pt(10.5)
        set_run_fonts(run, cn_font="新宋体", en_font="Consolas")
        # Keep leading/trailing whitespace (code indentation) in the saved XML.
        for t in run._element.iter(qn("w:t")):
            t.set(qn("xml:space"), "preserve")


# ───────────────────────── tables ─────────────────────────

def render_table(doc: Document, table_lines: list[str]) -> None:
    """Render consecutive Markdown table lines as a Word table.

    Rows that split to nothing and rows that ``common.is_separator_row``
    reports as separators are dropped.  Short rows are padded with empty cells
    to the widest row.  The first remaining row is rendered bold.  Nothing is
    added when no rows remain.

    Args:
        doc: Target document.
        table_lines: Raw Markdown lines of one table.
    """
    rows: list[list[str]] = []
    for ln in table_lines:
        cells = common.split_md_row(ln)
        if not cells or common.is_separator_row(cells):
            continue
        rows.append(cells)
    if not rows:
        return

    n_cols = max(len(r) for r in rows)
    for r in rows:
        r.extend([""] * (n_cols - len(r)))

    table = doc.add_table(rows=len(rows), cols=n_cols)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # The style is missing from the template; keep the default table style.
        pass

    for ri, row in enumerate(rows):
        for ci, val in enumerate(row):
            cell = table.rows[ri].cells[ci]
            cell.text = ""
            p = cell.paragraphs[0]
            p.paragraph_format.first_line_indent = None
            p.paragraph_format.line_spacing = 1.2
            add_inline(p, val, size=Pt(10.5), cn_font="宋体")
            if ri == 0:
                for run in p.runs:
                    run.bold = True


# ───────────────────────── images + captions ─────────────────────────

_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
_FILENAME_INVALID_RE = re.compile(r"[^一-鿿A-Za-z0-9]+")


def caption_to_stem(caption: str) -> str:
    """Turn a caption into a file stem of the form ``fig_<cleaned>``.

    Every run of characters outside CJK ideographs / ASCII letters / digits
    becomes one underscore; leading and trailing underscores are stripped and
    the result is then cut to 40 characters.  Returns "" when nothing is left.
    """
    cleaned = _FILENAME_INVALID_RE.sub("_", caption).strip("_")[:40]
    if not cleaned:
        return ""
    return f"fig_{cleaned}"


def extract_mermaid_caption(source: str) -> str | None:
    """Return the text of the first ``%% caption: ...`` line, or None if absent."""
    for ln in source.splitlines():
        m = _MERMAID_CAPTION_RE.match(ln)
        if m:
            return m.group(1).strip()
    return None


def add_image(doc: Document, png_path: Path, caption: str | None, ctx: dict) -> None:
    """Append a centred picture followed by a numbered, bold caption paragraph.

    If the picture cannot be inserted, an "[image failed: ...]" note is written
    in its place, no caption is added and the figure counter is left unchanged.

    Args:
        doc: Target document.
        png_path: Image file to embed.
        caption: Caption text, or None for a number-only caption.
        ctx: Render context; ``fig_no`` is incremented and ``fig_label``
            (default "Fig.") is read.
    """
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.first_line_indent = None
    p.paragraph_format.space_before = Pt(6)
    p.paragraph_format.space_after = Pt(3)
    run = p.add_run()
    try:
        run.add_picture(str(png_path), width=common.MAX_IMG_WIDTH)
    except Exception as e:
        # Deliberately best-effort: one bad image must not abort the whole render.
        run.add_text(f"[image failed: {png_path.name}: {e}]")
        return

    ctx["fig_no"] = ctx.get("fig_no", 0) + 1

    cap_p = doc.add_paragraph()
    cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    cap_p.paragraph_format.first_line_indent = None
    cap_p.paragraph_format.space_before = Pt(0)
    cap_p.paragraph_format.space_after = Pt(6)
    label = ctx.get("fig_label", "Fig.")
    cap_text = f"{label} {ctx['fig_no']} {caption}" if caption else f"{label} {ctx['fig_no']}"
    cap_run = cap_p.add_run(cap_text)
    cap_run.font.size = Pt(10.5)
    cap_run.bold = True
    set_run_fonts(cap_run, cn_font="宋体", en_font="Times New Roman")


# ───────────────────────── main rendering ─────────────────────────

def _read_fenced_block(lines: list[str], start: int, fence: str) -> tuple[list[str], int]:
    """Collect the body of a fenced code block.

    Reading starts at ``lines[start]`` (the line after the opener) and stops at
    the first fence line that uses the same fence character and is at least as
    long as ``fence``.  An unclosed fence runs to the end of the input.

    Returns:
        ``(code_lines, next_index)`` where ``next_index`` is the line after the
        closing fence, or ``len(lines)`` when there is none.
    """
    code: list[str] = []
    i = start
    n = len(lines)
    while i < n:
        m_close = common.FENCE_RE.match(lines[i])
        if m_close and m_close.group(1)[0] == fence[0] and len(m_close.group(1)) >= len(fence):
            return code, i + 1
        code.append(lines[i])
        i += 1
    return code, i


def _find_mermaid_figure(code: list[str], ctx: dict) -> tuple[Path, str] | None:
    """Return ``(png_path, caption)`` for a mermaid block that has a rendered PNG.

    The PNG is looked up as ``<figures_dir>/<caption_to_stem(caption)>.png``.
    Returns None when the block has no caption line, the caption yields an
    empty stem, or the file does not exist.
    """
    cap = extract_mermaid_caption("\n".join(code))
    if not cap:
        return None
    stem = caption_to_stem(cap)
    if not stem:
        return None
    png = ctx["figures_dir"] / f"{stem}.png"
    if not png.is_file():
        return None
    return png, cap


def _interrupts_paragraph(line: str, prof: dict) -> bool:
    """Return whether ``line`` begins a construct that must end a running paragraph.

    Covers every non-paragraph construct that ``render_md_block`` dispatches
    on, so that a fence opener or an image line directly after text is not
    swallowed into the paragraph as literal text.
    """
    return bool(
        common.is_heading(line)
        or common.is_blockquote(line)
        or common.is_table_line(line)
        or is_list_item(line, prof)
        or common.is_hr(line)
        or common.FENCE_RE.match(line)
        or common.IMAGE_LINE_RE.match(line)
    )


def render_md_block(doc: Document, md_text: str, ctx: dict) -> None:
    """Render one Markdown section into ``doc``.

    Lines are dispatched in this order: blank lines and horizontal rules
    (skipped), image lines, fenced code blocks (a mermaid block with a rendered
    PNG becomes a figure), tables, headings (levels deeper than 3 are rendered
    as level 3), block quotes (skipped), list items (one unindented paragraph
    each), and finally ordinary paragraphs, whose consecutive lines are joined
    with a single space.

    Args:
        doc: Target document.
        md_text: Markdown source of the section.
        ctx: Render context; reads ``prof``, ``sections_dir`` and
            ``figures_dir`` and is passed on to ``add_image``.
    """
    prof = ctx["prof"]
    lines = md_text.splitlines()
    n = len(lines)
    i = 0
    while i < n:
        line = lines[i].rstrip()

        if not line.strip() or common.is_hr(line):
            i += 1
            continue

        m_img = common.IMAGE_LINE_RE.match(line)
        if m_img:
            src = m_img.group("src")
            cap = m_img.group("cap").strip() or None
            png = common.resolve_image_path(src, ctx["sections_dir"])
            if png is not None:
                add_image(doc, png, cap, ctx)
            else:
                add_body_paragraph(doc, f"[image missing: {src}]", indent=False)
            i += 1
            continue

        m_fence = common.FENCE_RE.match(line)
        if m_fence:
            lang = m_fence.group(2) or ""
            code, i = _read_fenced_block(lines, i + 1, m_fence.group(1))
            figure = _find_mermaid_figure(code, ctx) if lang.lower() == "mermaid" else None
            if figure is not None:
                png, cap = figure
                add_image(doc, png, cap, ctx)
            else:
                # Non-mermaid code, or mermaid without a rendered PNG: show the source.
                add_code_block(doc, code, lang)
            continue

        if common.is_table_line(line):
            block: list[str] = []
            while i < n and common.is_table_line(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block)
            continue

        m = common.HEADING_RE.match(line)
        if m:
            level = min(len(m.group(1)), 3)
            add_heading(doc, m.group(2).strip(), level)
            i += 1
            continue

        if common.is_blockquote(line):
            # Block quotes are dropped from the rendered document.
            i += 1
            continue

        if is_list_item(line, prof):
            add_body_paragraph(doc, line.strip(), indent=False)
            i += 1
            continue

        # Ordinary paragraph: absorb following lines until a blank line or a
        # line that starts another construct.
        buf = [line.strip()]
        j = i + 1
        while j < n:
            nxt = lines[j].rstrip()
            if not nxt.strip() or _interrupts_paragraph(nxt, prof):
                break
            buf.append(nxt.strip())
            j += 1
        add_body_paragraph(doc, " ".join(buf), indent=True)
        i = j


# ───────────────────────── entry point ─────────────────────────

def render_sections(profile: str, sections_dir: Path, out: Path, *,
                    lang: str = "en", toc: bool = False, fund_type: str = "") -> None:
    """Render every ``*.md`` file in ``sections_dir`` (sorted by path) into one .docx.

    A page break follows every section except the last; the last section gets
    one too when the profile sets ``trailing_page_break``.  A summary is
    printed to stdout after saving.

    Args:
        profile: Key into ``PROFILES`` ("paper" or "proposal").
        sections_dir: Directory holding the Markdown sections.  Figures for
            mermaid blocks are looked up in the sibling ``figures`` directory.
        out: Output .docx path; missing parent directories are created.
        lang: For the ``paper`` profile, "zh" selects the "图" caption label and
            anything else selects "Fig.".  Other profiles always use "图".
        toc: Emit a table of contents even when the profile does not force one.
        fund_type: Accepted for the proposal CLI; not used by this function.

    Raises:
        KeyError: ``profile`` is not a key of ``PROFILES``.
        SystemExit: With code 2 when ``sections_dir`` is not a directory or
            contains no ``.md`` file (a message is written to stderr first).
    """
    prof = PROFILES[profile]

    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    figures_dir = sections_dir.parent / "figures"
    ctx: dict = {
        "prof": prof,
        "sections_dir": sections_dir,
        "figures_dir": figures_dir,
        "fig_no": 0,
        "fig_label": ("图" if lang == "zh" else "Fig.") if profile == "paper" else "图",
    }

    doc = init_doc(prof)
    if prof["always_toc"] or toc:
        add_toc(doc, prof)

    last_idx = len(md_files) - 1
    for idx, f in enumerate(md_files):
        text = f.read_text(encoding="utf-8")
        render_md_block(doc, text, ctx)
        if prof["trailing_page_break"] or idx != last_idx:
            doc.add_page_break()

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Fetch the paragraph list once and reuse it for both summary figures.
    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" profile: {profile} | paragraphs: {paras} | tables: {tbls} | "
          f"figures: {ctx['fig_no']} | chars: {chars}")