"""把 sections/*.md 渲染成期刊投稿稿 .docx (manuscript draft)。 与 proposal/render_docx.py 同源, 差异: - 无 fund-type; 改用 --lang {zh,en} (默认 en) 标注语言, 仅影响信息打印与首行缩进策略 - 目录 (TOC) 默认**不生成** (期刊投稿稿无需目录); 要草稿带目录加 --toc - 字体规范保持: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符 (eastAsia=宋体 只对 CJK 字符生效, 纯英文论文正文走 Times New Roman, 同一套 style 通吃) 支持: **加粗** / *斜体* / `等宽`; 列表 / 表格 /  居中插图 + 图题自增; ```mermaid``` 块按 caption 查 figures/fig_
`(?P[^`\n]+?)`)"
)
def parse_inline(text: str) -> list[tuple[str, str]]:
    """Split *text* into ``(style, segment)`` pairs for inline markup.

    Every match of ``_INLINE_RE`` contributes one pair whose style is
    ``"bold"``, ``"italic"`` or ``"code"`` and whose segment is the matching
    ``*_t`` group; the text between matches is emitted as ``"plain"``.

    Returns:
        The pairs in document order. When nothing was produced (for example
        for an empty string) a single ``("plain", text)`` pair is returned so
        that callers always get at least one segment.
    """
    segments: list[tuple[str, str]] = []
    cursor = 0
    for match in _INLINE_RE.finditer(text):
        begin = match.start()
        if begin > cursor:
            segments.append(("plain", text[cursor:begin]))
        # The alternatives are tested in the same priority order as before:
        # bold, then italic, then code.
        for kind in ("bold", "italic", "code"):
            if match.group(kind):
                segments.append((kind, match.group(f"{kind}_t")))
                break
        cursor = match.end()
    if cursor < len(text):
        segments.append(("plain", text[cursor:]))
    if not segments:
        return [("plain", text)]
    return segments
# ── Chemical-formula subscript whitelist (the same list is shared by the three renderers, including proposal/brief) ──
# Longer formulas come first; \b keeps strings such as LC3 / C595 / 2026 from being hit.
# Charged species such as Ca2+ are deliberately not listed: their digits are superscripts,
# and leaving them out of the whitelist is what keeps them from being subscripted.
_CHEM_RE = re.compile(
r"Ca\(OH\)2|Mg\(OH\)2"
r"|\b(?:Al2O3|Fe2O3|Fe3O4|Mn2O3|Cr2O3|P2O5|Na2SO4|K2SO4|CaSO4|CaCO3|MgCO3|"
r"CaCl2|MgCl2|Na2O|K2O|SiO2|TiO2|ZrO2|SO4|SO3|SO2|CO3|CO2|NO3|NO2|PO4|"
r"H2O|NH3|CH4|C4AF|C3S2|C2AS|C3S|C2S|C3A|O2|N2|H2)\b"
)
def _set_subscript(run) -> None:
    """Mark *run* as subscript.

    Appends a ``w:vertAlign`` element with ``w:val="subscript"`` to the run's
    ``rPr`` element, creating ``rPr`` first if the run does not have one.
    """
    vert_align = OxmlElement("w:vertAlign")
    vert_align.set(qn("w:val"), "subscript")
    run._element.get_or_add_rPr().append(vert_align)
def _emit_plain_with_chem(paragraph, text: str, *, size, cn_font: str) -> None:
    """Write a plain segment, subscripting digits of whitelisted formulas.

    Each match of ``_CHEM_RE`` is split character by character: every digit
    becomes its own subscript run and the letters between digits become normal
    runs. Text outside the matches is written unchanged, so a segment without
    any whitelisted formula ends up as one ordinary run.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Plain text to write.
        size: Font size applied to every run.
        cn_font: East-Asian font name passed to ``_set_run_fonts``.
    """

    def emit(chunk: str, subscript: bool = False) -> None:
        # Empty chunks would only create empty runs, so they are dropped.
        if chunk:
            new_run = paragraph.add_run(chunk)
            new_run.font.size = size
            _set_run_fonts(new_run, cn_font=cn_font, en_font="Times New Roman")
            if subscript:
                _set_subscript(new_run)

    cursor = 0
    for hit in _CHEM_RE.finditer(text):
        emit(text[cursor:hit.start()])
        pending: list[str] = []
        for char in hit.group(0):
            if char.isdigit():
                # Flush the letters collected so far, then the digit itself.
                emit("".join(pending))
                pending.clear()
                emit(char, subscript=True)
            else:
                pending.append(char)
        emit("".join(pending))
        cursor = hit.end()
    emit(text[cursor:])
def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None:
    """Append *text* to *paragraph*, honouring inline bold / italic / code.

    Plain segments go through ``_emit_plain_with_chem`` (which subscripts
    whitelisted chemical formulas). Bold and italic segments use Times New
    Roman as the Latin font, code segments use Consolas.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Source text with inline Markdown markup.
        size: Font size for every run.
        cn_font: East-Asian font name for every run.
    """
    for kind, chunk in parse_inline(text):
        if kind == "plain":
            _emit_plain_with_chem(paragraph, chunk, size=size, cn_font=cn_font)
            continue

        styled = paragraph.add_run(chunk)
        styled.font.size = size
        if kind == "bold":
            styled.bold = True
        elif kind == "italic":
            styled.italic = True
        elif kind != "code":
            # Unknown styles get a sized run but no explicit fonts.
            continue
        latin_font = "Consolas" if kind == "code" else "Times New Roman"
        _set_run_fonts(styled, cn_font=cn_font, en_font=latin_font)
# ───────────────────────── 段落 / 标题 / 列表 ─────────────────────────
def add_heading(doc: Document, text: str, level: int) -> None:
    """Add a heading paragraph using the ``Heading {level}`` style.

    Args:
        doc: Target document.
        text: Heading text; inline markup is rendered via ``add_inline``.
        level: Heading level; only 1, 2 and 3 have a font entry.

    Raises:
        KeyError: If *level* is not 1, 2 or 3.
    """
    heading = doc.add_paragraph(style=f"Heading {level}")
    heading.paragraph_format.first_line_indent = None
    # level -> (font size, East-Asian font)
    look = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    font_size, cjk_font = look[level]
    add_inline(heading, text, size=font_size, cn_font=cjk_font)
    # Headings are bold throughout, whatever the inline markup said.
    for piece in heading.runs:
        piece.bold = True
def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Add a justified body paragraph with 1.5 line spacing.

    Args:
        doc: Target document.
        text: Paragraph text; inline markup is rendered via ``add_inline``.
        indent: When true the first line is indented by 24 pt, otherwise the
            first-line indent is cleared.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format
    fmt.line_spacing = 1.5
    fmt.first_line_indent = Pt(24) if indent else None
    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(para, text)
# ───────────────────────── 行类型识别 ─────────────────────────
# ATX heading: one to six "#" characters, whitespace, then the title (group 2).
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
# Table row: ignoring surrounding whitespace, the line starts and ends with "|".
_TABLE_LINE_RE = re.compile(r"^\s*\|.*\|\s*$")
# Blockquote line: optional leading whitespace followed by ">".
_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?")
# Horizontal rule: a line made only of three or more "-", "=" or "_" characters.
_HR_RE = re.compile(r"^\s*-{3,}\s*$|^\s*={3,}\s*$|^\s*_{3,}\s*$")
# Code-fence line: group 1 is the run of >= 3 backticks or tildes, group 2 the optional language tag.
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")
# Markers that make a line a list item. Full-width punctuation is written with
# \uXXXX escapes so the ASCII and full-width forms cannot be confused with each
# other (a bare "(\d+)" is a capture group and would match every line that
# merely starts with a digit, e.g. "2026 ..." or "28 d strength ...").
_LIST_PATTERNS = [
    re.compile(r"^\[\d+\]\s"),  # "[1] " reference-style entry
    re.compile(r"^[-*+]\s"),  # "- ", "* " or "+ " bullet
    re.compile(r"^\d+[.、\uFF0E]\s*"),  # "1." / "1、" / full-width full stop
    re.compile(r"^\(\d+\)\s*"),  # "(1)" with ASCII parentheses
    re.compile(r"^\uFF08\d+\uFF09\s*"),  # "(1)" with full-width parentheses
    re.compile(r"^[一二三四五六七八九十百千]+[、.\uFF0E]"),  # Chinese numeral + "、" or full stop
    re.compile(r"^[(\uFF08][一二三四五六七八九十百千]+[)\uFF09]"),  # parenthesised Chinese numeral
    re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"),  # circled number
]


def is_list_item(line: str) -> bool:
    """Return True when *line* starts with one of the list markers above.

    The match is anchored at the very first character, so callers pass a line
    without leading whitespace when indented items should be recognised.
    """
    return any(pattern.match(line) for pattern in _LIST_PATTERNS)
def is_table_line(line: str) -> bool:
    """Return True when *line* matches ``_TABLE_LINE_RE`` (a Markdown table row)."""
    return _TABLE_LINE_RE.match(line) is not None
def is_heading(line: str) -> bool:
    """Return True when *line* matches ``_HEADING_RE`` (a ``#`` heading)."""
    return _HEADING_RE.match(line) is not None
def is_blockquote(line: str) -> bool:
    """Return True when *line* matches ``_BLOCKQUOTE_RE`` (starts with ``>``)."""
    return _BLOCKQUOTE_RE.match(line) is not None
def is_hr(line: str) -> bool:
    """Return True when *line* matches ``_HR_RE`` (a horizontal rule)."""
    return _HR_RE.match(line) is not None
# ───────────────────────── 代码块 / ASCII 图 ─────────────────────────
def add_code_block(doc: Document, lines: list[str], lang: str = "") -> None:
    """Write *lines* as a monospaced block, one paragraph per source line.

    Args:
        doc: Target document.
        lines: Code lines without their trailing newline.
        lang: Language tag of the fence; accepted but not used here.
    """
    for source_line in lines:
        para = doc.add_paragraph()
        fmt = para.paragraph_format
        fmt.first_line_indent = None
        fmt.line_spacing = 1.0
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)
        # An empty line is written as a single space so the run is not empty.
        mono = para.add_run(source_line or " ")
        mono.font.size = Pt(10.5)
        _set_run_fonts(mono, cn_font="新宋体", en_font="Consolas")
        # xml:space="preserve" keeps leading and repeated spaces intact.
        for text_node in mono._element.iter(qn("w:t")):
            text_node.set(qn("xml:space"), "preserve")
# ───────────────────────── 表格 ─────────────────────────
def _split_md_row(line: str) -> list[str]:
    """Split one Markdown table row into stripped cell strings.

    Exactly one leading and one trailing ``|`` are treated as the row border,
    so empty first or last cells (``|| a |``) keep their column instead of
    being swallowed. A pipe escaped as ``\\|`` is cell content, not a column
    separator, and is returned as a literal ``|``.

    Args:
        line: Raw table line, with or without the outer border pipes.

    Returns:
        The cell texts in column order; always at least one element.
    """
    body = line.strip().removeprefix("|")
    # Only an unescaped trailing pipe is the closing border.
    if body.endswith("|") and not body.endswith("\\|"):
        body = body[:-1]
    cells = re.split(r"(?<!\\)\|", body)
    return [cell.replace("\\|", "|").strip() for cell in cells]
def _is_separator_row(cells: list[str]) -> bool:
    """Return True when *cells* form a table delimiter row such as ``|---|:-:|``.

    Empty cells are ignored, but at least one cell must be non-empty: a row
    whose cells are all blank is data, not a delimiter. Each remaining cell
    must be one or more hyphens with an optional colon on either side.

    Args:
        cells: Cell texts of one table row.

    Returns:
        True only for a delimiter row.
    """
    filled = [cell for cell in cells if cell != ""]
    if not filled:
        # all() over nothing would be vacuously true and drop blank rows.
        return False
    return all(re.fullmatch(r"\s*:?-+:?\s*", cell) is not None for cell in filled)
def render_table(doc: Document, table_lines: list[str]) -> None:
    """Render consecutive Markdown table lines as a Word table.

    Delimiter rows are dropped, short rows are padded with empty cells to the
    widest row, and the first remaining row is written in bold as the header.
    Nothing is added when no data row is left.

    Args:
        doc: Target document.
        table_lines: The raw lines of one Markdown table.
    """
    parsed = (_split_md_row(raw) for raw in table_lines)
    rows = [cells for cells in parsed if cells and not _is_separator_row(cells)]
    if not rows:
        return

    width = max(map(len, rows))
    for cells in rows:
        cells.extend([""] * (width - len(cells)))

    table = doc.add_table(rows=len(rows), cols=width)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # Best effort: keep the table's current style when this one is unavailable.
        pass

    for row_idx, cells in enumerate(rows):
        is_header = row_idx == 0
        for col_idx, value in enumerate(cells):
            cell = table.rows[row_idx].cells[col_idx]
            cell.text = ""
            para = cell.paragraphs[0]
            para.paragraph_format.first_line_indent = None
            para.paragraph_format.line_spacing = 1.2
            add_inline(para, value, size=Pt(10.5), cn_font="宋体")
            if is_header:
                for piece in para.runs:
                    piece.bold = True
# ───────────────────────── 图片 + 图题 ─────────────────────────
# A line that holds nothing but a Markdown image: ![cap](src).
# The groups must be named "cap" and "src" because render_md_block reads them by name.
_IMAGE_LINE_RE = re.compile(r"^\s*!\[(?P<cap>[^\]]*)\]\((?P<src>[^)\s]+)\)\s*$")
# "%% caption: ..." comment line inside a mermaid block (case-insensitive); group 1 is the caption.
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
# Any run of characters other than those in the range 一-鿿, ASCII letters and digits.
_FILENAME_INVALID_RE = re.compile(r"[^一-鿿A-Za-z0-9]+")
# Width passed to add_picture() for every figure inserted by add_image.
_MAX_IMG_WIDTH = Cm(15)
def caption_to_stem(caption: str) -> str:
    """Turn a figure caption into a ``fig_...`` file stem.

    Runs of characters matched by ``_FILENAME_INVALID_RE`` become a single
    underscore, edge underscores are removed, and the result is cut to 40
    characters before the ``fig_`` prefix is added.

    Returns:
        The stem, or an empty string when nothing usable is left.
    """
    slug = _FILENAME_INVALID_RE.sub("_", caption).strip("_")
    slug = slug[:40]
    return f"fig_{slug}" if slug else ""
def extract_mermaid_caption(source: str) -> str | None:
    """Return the caption of the first ``%% caption: ...`` line in *source*.

    Returns:
        The stripped caption text, or None when no line matches
        ``_MERMAID_CAPTION_RE``.
    """
    hits = (_MERMAID_CAPTION_RE.match(row) for row in source.splitlines())
    first = next((hit for hit in hits if hit), None)
    if first is None:
        return None
    return first.group(1).strip()
def _resolve_image_path(src: str, base_dir: Path) -> Path | None:
p = Path(src)
if not p.is_absolute():
p = (base_dir / p).resolve()
return p if p.is_file() else None
def add_image(doc: Document, png_path: Path, caption: str | None, ctx: dict) -> None:
    """Insert a centred picture followed by a numbered, bold caption line.

    If the picture cannot be embedded, an ``[image failed: ...]`` note is
    written in its place and neither the figure counter nor a caption is
    touched.

    Args:
        doc: Target document.
        png_path: Image file to embed.
        caption: Caption text, or None / empty for a number-only caption.
        ctx: Render state; ``ctx["fig_no"]`` is incremented and
            ``ctx["fig_label"]`` (default ``"Fig."``) prefixes the number.
    """
    picture_par = doc.add_paragraph()
    picture_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    picture_fmt = picture_par.paragraph_format
    picture_fmt.first_line_indent = None
    picture_fmt.space_before = Pt(6)
    picture_fmt.space_after = Pt(3)
    picture_run = picture_par.add_run()
    try:
        picture_run.add_picture(str(png_path), width=_MAX_IMG_WIDTH)
    except Exception as exc:
        # Deliberately broad: one unreadable image must not abort the render.
        picture_run.add_text(f"[image failed: {png_path.name}: {exc}]")
        return

    number = ctx.get("fig_no", 0) + 1
    ctx["fig_no"] = number

    caption_par = doc.add_paragraph()
    caption_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption_fmt = caption_par.paragraph_format
    caption_fmt.first_line_indent = None
    caption_fmt.space_before = Pt(0)
    caption_fmt.space_after = Pt(6)

    label = ctx.get("fig_label", "Fig.")
    caption_text = f"{label} {number}"
    if caption:
        caption_text = f"{caption_text} {caption}"
    caption_run = caption_par.add_run(caption_text)
    caption_run.font.size = Pt(10.5)
    caption_run.bold = True
    _set_run_fonts(caption_run, cn_font="宋体", en_font="Times New Roman")
# ───────────────────────── 主渲染 ─────────────────────────
def render_md_block(doc: Document, md_text: str, ctx: dict) -> None:
    """Render one Markdown text into *doc*, line by line.

    Every non-blank line is classified in this order:

    1. horizontal rule  -> skipped
    2. whole-line image -> ``add_image``, or an ``[image missing: ...]``
       paragraph when the file cannot be found under ``ctx["sections_dir"]``
    3. code fence       -> ``add_code_block``; a ``mermaid`` fence whose
       ``%% caption:`` maps to an existing PNG in ``ctx["figures_dir"]`` is
       inserted as a figure instead
    4. table rows       -> ``render_table``
    5. heading          -> ``add_heading`` (levels deeper than 3 use level 3)
    6. blockquote       -> skipped; quoted text is not written to the document
    7. list item        -> unindented body paragraph, marker kept
    8. anything else    -> consecutive lines joined with spaces into one
       indented body paragraph

    A running paragraph ends at a blank line or at any line that starts one of
    the constructs above, including a code fence or an image line that follows
    the text without a blank line in between.

    Args:
        doc: Target document.
        md_text: Markdown source of one section.
        ctx: Render state; reads ``sections_dir`` and ``figures_dir`` and is
            passed on to ``add_image`` (which updates the figure counter).
    """

    def interrupts_paragraph(candidate: str) -> bool:
        """Return True when *candidate* must end the paragraph being collected."""
        return bool(
            is_heading(candidate)
            or is_blockquote(candidate)
            or is_table_line(candidate)
            or is_list_item(candidate)
            or is_hr(candidate)
            # A fence opener or image line is its own block even without a
            # preceding blank line; merging it would leak raw markup into the
            # paragraph and render the code body as prose.
            or _FENCE_RE.match(candidate)
            or _IMAGE_LINE_RE.match(candidate)
        )

    lines = md_text.splitlines()
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].rstrip()
        if not line.strip():
            i += 1
            continue
        if is_hr(line):
            i += 1
            continue

        m_img = _IMAGE_LINE_RE.match(line)
        if m_img:
            src = m_img.group("src")
            cap = m_img.group("cap").strip() or None
            png = _resolve_image_path(src, ctx["sections_dir"])
            if png is not None:
                add_image(doc, png, cap, ctx)
            else:
                add_body_paragraph(doc, f"[image missing: {src}]", indent=False)
            i += 1
            continue

        m_fence = _FENCE_RE.match(line)
        if m_fence:
            fence = m_fence.group(1)
            lang = m_fence.group(2) or ""
            code: list[str] = []
            i += 1
            # Collect up to the closing fence: same fence character, at least
            # as long as the opener. An unclosed fence runs to the end.
            while i < n:
                m_close = _FENCE_RE.match(lines[i])
                if m_close and m_close.group(1)[0] == fence[0] and len(m_close.group(1)) >= len(fence):
                    i += 1
                    break
                code.append(lines[i])
                i += 1
            if lang.lower() == "mermaid":
                source = "\n".join(code)
                cap = extract_mermaid_caption(source)
                if cap:
                    stem = caption_to_stem(cap)
                    if stem:
                        png = ctx["figures_dir"] / f"{stem}.png"
                        if png.is_file():
                            add_image(doc, png, cap, ctx)
                            continue
            # Also the fallback for mermaid blocks without a matching PNG.
            add_code_block(doc, code, lang)
            continue

        if is_table_line(line):
            block: list[str] = []
            while i < n and is_table_line(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block)
            continue

        m = _HEADING_RE.match(line)
        if m:
            level = min(len(m.group(1)), 3)
            add_heading(doc, m.group(2).strip(), level)
            i += 1
            continue

        if is_blockquote(line):
            i += 1
            continue

        if is_list_item(line):
            add_body_paragraph(doc, line.strip(), indent=False)
            i += 1
            continue

        # Plain paragraph: gather following lines until a blank line or the
        # start of another construct.
        buf = [line.strip()]
        j = i + 1
        while j < n:
            nxt = lines[j].rstrip()
            if not nxt.strip():
                break
            if interrupts_paragraph(nxt):
                break
            buf.append(nxt.strip())
            j += 1
        add_body_paragraph(doc, " ".join(buf), indent=True)
        i = j
# ───────────────────────── 入口 ─────────────────────────
def render_sections(sections_dir: Path, out: Path, lang: str, toc: bool) -> None:
    """Render every ``*.md`` file in *sections_dir* into one .docx at *out*.

    Files are processed in sorted order, separated by page breaks. Figures
    referenced by mermaid captions are looked up in the ``figures`` directory
    next to *sections_dir*. A short summary is printed after saving.

    Args:
        sections_dir: Directory holding the section Markdown files.
        out: Output .docx path; missing parent directories are created.
        lang: ``"zh"`` labels figures with "图", anything else with "Fig.".
        toc: When true a table of contents is added before the sections.

    Exits with status 2 when *sections_dir* is not a directory or contains no
    ``.md`` file.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    ctx: dict = {
        "sections_dir": sections_dir,
        "figures_dir": sections_dir.parent / "figures",
        "fig_no": 0,
        "fig_label": "图" if lang == "zh" else "Fig.",
    }

    doc = init_doc()
    if toc:
        add_toc(doc)
    for position, md_file in enumerate(md_files):
        # A page break goes between sections, never after the last one.
        if position:
            doc.add_page_break()
        render_md_block(doc, md_file.read_text(encoding="utf-8"), ctx)

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(par.text) for par in paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" paragraphs: {paras} | tables: {tbls} | figures: {ctx['fig_no']} | total chars: {chars}")
    print(f" lang: {lang} | toc: {toc}")
    print(" font: 中文宋体小四 / 英文 Times New Roman 小四 / 行距 1.5 / 首行缩进 2 字符")
def main() -> None:
    """Parse the command line and render the manuscript .docx."""
    parser = argparse.ArgumentParser(description="渲染章节 md → 论文投稿稿 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    parser.add_argument(
        "--lang",
        choices=["zh", "en"],
        default="en",
        help="论文语言 (影响图题前缀 图/Fig. 与信息打印); 默认 en",
    )
    parser.add_argument(
        "--toc",
        action="store_true",
        help="生成目录页 (期刊投稿稿通常不需要; 内部草稿评阅时可加)",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        required=True,
        help="输出 .docx 路径",
    )
    opts = parser.parse_args()
    render_sections(opts.sections_dir, opts.output, opts.lang, opts.toc)


# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()