442 lines
16 KiB
Python
442 lines
16 KiB
Python
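"""Docx renderer for manuscript-style documents (paper submission drafts and
grant proposals), driven by two configurable profiles.

The two renderers were near relatives (~80% verbatim identical); their
differences are collected in PROFILES: page margins / TOC title / figure-caption
prefix / one extra "第X条" list pattern / the sections loop (whether the TOC is
on by default and whether a page break is added after the last section).
Function bodies were ported from the old paper/proposal render_docx.py; leaf
primitives come from rendering.common.

profile=paper:    --lang {zh,en} (figure-caption prefix 图/Fig.), --toc optional (off by default)
profile=proposal: --fund-type ... (print-only), always has a TOC, page break after every section
"""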
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
from docx import Document
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
from docx.oxml import OxmlElement
|
||
from docx.oxml.ns import qn
|
||
from docx.shared import Cm, Pt, RGBColor
|
||
|
||
from . import common
|
||
from .common import set_run_fonts, set_style_fonts, set_subscript, CHEM_RE, parse_inline
|
||
|
||
|
||
# ───────────────────────── profile 配置 ─────────────────────────
|
||
|
||
# Line-start markers that make a line render as its own (non-indented) list
# paragraph. Each pattern is applied with ``.match`` against the start of a line.
#
# Full-width punctuation is written with ``\uXXXX`` escapes (understood by the
# ``re`` module in str patterns) so the patterns survive tools that normalise
# full-width characters to their ASCII look-alikes. The full-width parenthesis
# pattern in particular must not degrade to ``^(\d+)``: that is a plain capture
# group and would classify every line beginning with a digit as a list item.
_BASE_LIST_PATTERNS = [
    re.compile(r"^\[\d+\]\s"),                                  # [1]
    re.compile(r"^[-*+]\s"),                                    # - / * / +
    re.compile(r"^\d+[\.、\uff0e]\s*"),                          # 1. / 1、 / 1 + full-width dot
    re.compile(r"^\(\d+\)\s*"),                                 # (1) with ASCII parentheses
    re.compile(r"^\uff08\d+\uff09\s*"),                         # (1) with full-width parentheses
    re.compile(r"^[一二三四五六七八九十百千]+[、\uff0e\.]"),       # CJK numeral + 、 or dot
    re.compile(r"^[(\uff08][一二三四五六七八九十百千]+[)\uff09]"),  # CJK numeral in either paren style
    re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"),                        # circled numbers
]
|
||
|
||
# Per-profile rendering settings, keyed by profile name.
#   left_margin / right_margin : horizontal page margins applied in init_doc
#   list_patterns              : compiled regexes consulted by is_list_item
#   toc_title / toc_placeholder: heading text and pre-update text of the TOC field
#   always_toc                 : emit the TOC even when the caller did not ask for it
#   trailing_page_break        : also add a page break after the last section
PROFILES = {
    "paper": dict(
        left_margin=Cm(2.5),
        right_margin=Cm(2.5),
        list_patterns=_BASE_LIST_PATTERNS,
        toc_title="Contents",
        toc_placeholder="[Press F9 in Word to generate the table of contents]",
        always_toc=False,
        trailing_page_break=False,
    ),
    "proposal": dict(
        left_margin=Cm(3.0),
        right_margin=Cm(2.0),
        # Proposals additionally treat "第<CJK numeral>条/章/节" as a list marker.
        list_patterns=[
            *_BASE_LIST_PATTERNS,
            re.compile(r"^第[一二三四五六七八九十百]+[条章节]"),
        ],
        toc_title="目 录",
        toc_placeholder="[在 Word 中按 F9 或右键此处选择 “更新域” 即可生成完整目录]",
        always_toc=True,
        trailing_page_break=True,
    ),
}
|
||
|
||
|
||
# ───────────────────────── 文档初始化 ─────────────────────────
|
||
|
||
def init_doc(prof: dict) -> Document:
    """Create an empty document with page geometry and base styles configured.

    Args:
        prof: Profile mapping; only ``left_margin`` and ``right_margin`` are read.

    Returns:
        The new document, with the first section sized 21 x 29.7 cm and the
        ``Normal`` and ``Heading 1``-``Heading 3`` styles adjusted.
    """
    doc = Document()

    # Page size and margins; only the horizontal margins vary by profile.
    page = doc.sections[0]
    page.page_height, page.page_width = Cm(29.7), Cm(21)
    page.top_margin = page.bottom_margin = Cm(2.5)
    page.left_margin = prof["left_margin"]
    page.right_margin = prof["right_margin"]

    # Body text: 12 pt, 1.5 line spacing, no extra spacing around paragraphs.
    body_style = doc.styles["Normal"]
    body_style.font.name = "Times New Roman"
    body_style.font.size = Pt(12)
    set_style_fonts(body_style, cn_font="宋体")
    body_fmt = body_style.paragraph_format
    body_fmt.line_spacing = 1.5
    body_fmt.space_before = Pt(0)
    body_fmt.space_after = Pt(0)

    # Heading level -> (font size, CJK font). All headings are bold and black.
    heading_specs = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    for level, (font_size, cjk_font) in heading_specs.items():
        style = doc.styles[f"Heading {level}"]
        style.font.name = "Times New Roman"
        style.font.size = font_size
        style.font.bold = True
        style.font.color.rgb = RGBColor(0, 0, 0)
        set_style_fonts(style, cn_font=cjk_font)
        fmt = style.paragraph_format
        fmt.line_spacing = 1.5
        fmt.space_before = Pt(6)
        fmt.space_after = Pt(3)
        fmt.first_line_indent = None

    return doc
|
||
|
||
|
||
def add_toc(doc: Document, prof: dict, depth: int = 3) -> None:
    """Append a TOC title, a TOC field, and a page break to *doc*.

    Args:
        doc: Target document.
        prof: Profile mapping; ``toc_title`` and ``toc_placeholder`` are read.
        depth: Upper bound of the level range written into the field
            instruction (``\\o "1-<depth>"``).
    """
    # Centered title line.
    title_par = doc.add_paragraph()
    title_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_fmt = title_par.paragraph_format
    title_fmt.first_line_indent = None
    title_fmt.space_before = Pt(12)
    title_fmt.space_after = Pt(6)
    title_run = title_par.add_run(prof["toc_title"])
    title_run.font.size = Pt(16)
    title_run.font.bold = True
    set_run_fonts(title_run, cn_font="黑体")

    # The field itself lives in a single run of a second paragraph.
    field_par = doc.add_paragraph()
    field_par.paragraph_format.first_line_indent = None
    field_run = field_par.add_run()

    def _field_char(kind: str):
        """Build a ``w:fldChar`` element of the given fldCharType."""
        element = OxmlElement("w:fldChar")
        element.set(qn("w:fldCharType"), kind)
        return element

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = f' TOC \\o "1-{depth}" \\h \\z \\u '

    # Text shown between "separate" and "end" until the field is updated.
    placeholder = OxmlElement("w:t")
    placeholder.set(qn("xml:space"), "preserve")
    placeholder.text = prof["toc_placeholder"]

    # Order matters: begin, instruction, separate, placeholder text, end.
    field_parts = (
        _field_char("begin"),
        instruction,
        _field_char("separate"),
        placeholder,
        _field_char("end"),
    )
    for part in field_parts:
        field_run._element.append(part)

    doc.add_page_break()
|
||
|
||
|
||
# ───────────────────────── 内联(化学式下标)─────────────────────────
|
||
|
||
def _emit_plain_with_chem(paragraph, text: str, *, size, cn_font: str) -> None:
    """Emit plain text as runs, subscripting digits inside CHEM_RE matches.

    Text outside matches is emitted unchanged. Inside a match every digit
    character becomes its own subscript run and the characters between digits
    are emitted as normal runs. With no match the whole text is one run.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Plain text segment to emit.
        size: Font size applied to every run.
        cn_font: CJK font name passed to ``set_run_fonts``.
    """
    def emit(chunk: str, *, subscript: bool = False) -> None:
        # Empty chunks produce no run at all.
        if chunk:
            new_run = paragraph.add_run(chunk)
            new_run.font.size = size
            set_run_fonts(new_run, cn_font=cn_font, en_font="Times New Roman")
            if subscript:
                set_subscript(new_run)

    cursor = 0
    for hit in CHEM_RE.finditer(text):
        # Text between the previous match and this one.
        emit(text[cursor:hit.start()])

        pending: list[str] = []
        for char in hit.group(0):
            if not char.isdigit():
                pending.append(char)
                continue
            # Flush the letters collected so far, then the digit as subscript.
            emit("".join(pending))
            pending.clear()
            emit(char, subscript=True)
        emit("".join(pending))

        cursor = hit.end()

    # Tail after the last match (or the whole text when nothing matched).
    emit(text[cursor:])
|
||
|
||
|
||
def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None:
    """Append *text* to *paragraph*, honouring the inline styles from parse_inline.

    ``plain`` segments go through ``_emit_plain_with_chem``. ``bold`` and
    ``italic`` segments get the matching run flag and Times New Roman;
    ``code`` segments get Consolas. Any other style only receives the font size.

    Args:
        paragraph: Paragraph that receives the runs.
        text: Source text with inline markup.
        size: Font size applied to every run.
        cn_font: CJK font name used for every run that has fonts set.
    """
    # Western font per styled segment; styles missing here get no font override.
    western_font = {
        "bold": "Times New Roman",
        "italic": "Times New Roman",
        "code": "Consolas",
    }

    for style, segment in parse_inline(text):
        if style == "plain":
            _emit_plain_with_chem(paragraph, segment, size=size, cn_font=cn_font)
            continue

        styled_run = paragraph.add_run(segment)
        styled_run.font.size = size
        if style == "bold":
            styled_run.bold = True
        elif style == "italic":
            styled_run.italic = True

        en_font = western_font.get(style)
        if en_font is not None:
            set_run_fonts(styled_run, cn_font=cn_font, en_font=en_font)
|
||
|
||
|
||
# ───────────────────────── 段落 / 标题 / 列表 ─────────────────────────
|
||
|
||
def add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading paragraph using the ``Heading <level>`` style.

    Args:
        doc: Target document.
        text: Heading text; inline markup is processed by ``add_inline``.
        level: Heading level; only 1, 2 and 3 have run formatting defined.

    Raises:
        KeyError: If *level* is not 1, 2 or 3.
    """
    heading = doc.add_paragraph(style=f"Heading {level}")
    heading.paragraph_format.first_line_indent = None

    # level -> (font size, CJK font) for the heading's runs.
    run_format = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    font_size, cjk_font = run_format[level]
    add_inline(heading, text, size=font_size, cn_font=cjk_font)

    # Force every run bold, whatever inline style produced it.
    for piece in heading.runs:
        piece.bold = True
|
||
|
||
|
||
def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Append a justified body paragraph with 1.5 line spacing.

    Args:
        doc: Target document.
        text: Paragraph text; inline markup is processed by ``add_inline``.
        indent: When true the first line is indented by 24 pt; otherwise the
            first-line indent is cleared.
    """
    body = doc.add_paragraph()
    fmt = body.paragraph_format
    fmt.line_spacing = 1.5
    fmt.first_line_indent = Pt(24) if indent else None
    body.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(body, text)
|
||
|
||
|
||
def is_list_item(line: str, prof: dict) -> bool:
    """Return True when *line* starts with any of the profile's list markers.

    Args:
        line: Text line to classify.
        prof: Profile mapping; ``list_patterns`` must hold compiled regexes.
    """
    for pattern in prof["list_patterns"]:
        if pattern.match(line):
            return True
    return False
|
||
|
||
|
||
def add_code_block(doc: Document, lines: list[str], lang: str = "") -> None:
    """Append one single-spaced monospace paragraph per code line.

    Args:
        doc: Target document.
        lines: Code lines; an empty line is written as a single space.
        lang: Fence language tag. Not used by this function.
    """
    text_tag = qn("w:t")
    space_attr = qn("xml:space")

    for code_line in lines:
        para = doc.add_paragraph()
        fmt = para.paragraph_format
        fmt.first_line_indent = None
        fmt.line_spacing = 1.0
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)

        code_run = para.add_run(code_line or " ")
        code_run.font.size = Pt(10.5)
        set_run_fonts(code_run, cn_font="新宋体", en_font="Consolas")

        # Mark every text node so leading/trailing whitespace is kept.
        for text_node in code_run._element.iter(text_tag):
            text_node.set(space_attr, "preserve")
|
||
|
||
|
||
# ───────────────────────── 表格 ─────────────────────────
|
||
|
||
def render_table(doc: Document, table_lines: list[str]) -> None:
    """Render Markdown table lines as a docx table with a bold first row.

    Lines that split into no cells, and separator rows, are dropped. Short rows
    are padded with empty cells to the widest row. Nothing is added to the
    document when no data row remains.

    Args:
        doc: Target document.
        table_lines: Raw Markdown lines belonging to one table.
    """
    split_rows = (common.split_md_row(raw) for raw in table_lines)
    rows: list[list[str]] = [
        cells for cells in split_rows
        if cells and not common.is_separator_row(cells)
    ]
    if not rows:
        return

    # Pad ragged rows in place so every row has the same number of cells.
    n_cols = max(map(len, rows))
    for cells in rows:
        cells.extend([""] * (n_cols - len(cells)))

    table = doc.add_table(rows=len(rows), cols=n_cols)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # Style not available in the template: keep the default table style.
        pass

    for row_idx, values in enumerate(rows):
        is_header = row_idx == 0
        for col_idx, value in enumerate(values):
            cell = table.rows[row_idx].cells[col_idx]
            cell.text = ""
            para = cell.paragraphs[0]
            para.paragraph_format.first_line_indent = None
            para.paragraph_format.line_spacing = 1.2
            add_inline(para, value, size=Pt(10.5), cn_font="宋体")
            if is_header:
                for piece in para.runs:
                    piece.bold = True
|
||
|
||
|
||
# ───────────────────────── 图片 + 图题 ─────────────────────────
|
||
|
||
# A Mermaid comment line of the form "%% caption: <text>" (keyword case-insensitive).
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
# Any run of characters other than CJK ideographs (U+4E00-U+9FFF), ASCII letters, digits.
_FILENAME_INVALID_RE = re.compile(r"[^一-鿿A-Za-z0-9]+")


def caption_to_stem(caption: str) -> str:
    """Turn a figure caption into a file stem of the form ``fig_<cleaned>``.

    Runs of disallowed characters collapse to a single underscore, leading and
    trailing underscores are stripped, and the result is cut to 40 characters.

    Returns:
        The stem, or an empty string when nothing usable remains.
    """
    collapsed = _FILENAME_INVALID_RE.sub("_", caption)
    trimmed = collapsed.strip("_")
    cleaned = trimmed[:40]
    return f"fig_{cleaned}" if cleaned else ""


def extract_mermaid_caption(source: str) -> str | None:
    """Return the text of the first ``%% caption:`` line in *source*, or None."""
    candidates = (_MERMAID_CAPTION_RE.match(ln) for ln in source.splitlines())
    first_hit = next((hit for hit in candidates if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()
|
||
|
||
|
||
def add_image(doc: Document, png_path: Path, caption: str | None, ctx: dict) -> None:
    """Append a centered picture followed by a numbered, bold caption line.

    If the picture cannot be inserted, a ``[image failed: ...]`` note is written
    in its place, no caption is added and the figure counter is left untouched.

    Args:
        doc: Target document.
        png_path: Image file to insert.
        caption: Caption text, or None/empty for a number-only caption.
        ctx: Render context; ``fig_no`` is incremented and ``fig_label``
            (default ``"Fig."``) is used as the caption prefix.
    """
    picture_par = doc.add_paragraph()
    picture_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    picture_fmt = picture_par.paragraph_format
    picture_fmt.first_line_indent = None
    picture_fmt.space_before = Pt(6)
    picture_fmt.space_after = Pt(3)

    picture_run = picture_par.add_run()
    try:
        picture_run.add_picture(str(png_path), width=common.MAX_IMG_WIDTH)
    except Exception as exc:
        # Best effort: leave a visible marker instead of aborting the render.
        picture_run.add_text(f"[image failed: {png_path.name}: {exc}]")
        return

    # Only successfully inserted pictures consume a figure number.
    figure_number = ctx.get("fig_no", 0) + 1
    ctx["fig_no"] = figure_number

    caption_par = doc.add_paragraph()
    caption_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption_fmt = caption_par.paragraph_format
    caption_fmt.first_line_indent = None
    caption_fmt.space_before = Pt(0)
    caption_fmt.space_after = Pt(6)

    prefix = ctx.get("fig_label", "Fig.")
    caption_text = f"{prefix} {figure_number}"
    if caption:
        caption_text = f"{caption_text} {caption}"

    caption_run = caption_par.add_run(caption_text)
    caption_run.font.size = Pt(10.5)
    caption_run.bold = True
    set_run_fonts(caption_run, cn_font="宋体", en_font="Times New Roman")
|
||
|
||
|
||
# ───────────────────────── 主渲染 ─────────────────────────
|
||
|
||
def _interrupts_paragraph(line: str, prof: dict) -> bool:
    """Return True when *line* starts a construct that ends a running paragraph."""
    return bool(
        common.is_heading(line)
        or common.is_blockquote(line)
        or common.is_table_line(line)
        or is_list_item(line, prof)
        or common.is_hr(line)
        # A code fence or a standalone image line directly below text (no blank
        # line in between) must be rendered as its own block, not merged into
        # the paragraph as literal text.
        or common.FENCE_RE.match(line)
        or common.IMAGE_LINE_RE.match(line)
    )


def render_md_block(doc: Document, md_text: str, ctx: dict) -> None:
    """Render one Markdown text into *doc*, line by line.

    Each line is classified in this order, first match wins:

    1. blank line or horizontal rule: skipped;
    2. standalone image line: picture, or an ``[image missing: ...]`` paragraph
       when the path cannot be resolved;
    3. fenced code block: a ``mermaid`` fence with a caption whose pre-rendered
       ``<figures_dir>/<stem>.png`` exists becomes a picture; every other fence
       (including an unterminated one, which runs to the end of the text) is
       rendered as code;
    4. table lines: consecutive lines are rendered as one table;
    5. heading: levels deeper than 3 are rendered as level 3;
    6. blockquote line: skipped without producing output;
    7. list item: one non-indented paragraph per line;
    8. anything else: consecutive lines are joined with spaces into one indented
       paragraph, ending at a blank line or at any line that starts another
       construct (see ``_interrupts_paragraph``).

    Args:
        doc: Target document.
        md_text: Markdown source of one section.
        ctx: Render context; reads ``prof``, ``sections_dir`` and
            ``figures_dir`` and is passed on to ``add_image``.
    """
    prof = ctx["prof"]
    lines = md_text.splitlines()
    n = len(lines)
    i = 0
    while i < n:
        line = lines[i].rstrip()

        # Blank lines and horizontal rules produce no output.
        if not line.strip() or common.is_hr(line):
            i += 1
            continue

        # Standalone image line.
        m_img = common.IMAGE_LINE_RE.match(line)
        if m_img:
            src = m_img.group("src")
            cap = m_img.group("cap").strip() or None
            png = common.resolve_image_path(src, ctx["sections_dir"])
            if png is not None:
                add_image(doc, png, cap, ctx)
            else:
                add_body_paragraph(doc, f"[image missing: {src}]", indent=False)
            i += 1
            continue

        # Fenced code block.
        m_fence = common.FENCE_RE.match(line)
        if m_fence:
            fence = m_fence.group(1)
            lang = m_fence.group(2) or ""
            code: list[str] = []
            i += 1
            while i < n:
                m_close = common.FENCE_RE.match(lines[i])
                # The closing fence uses the same character and is at least as
                # long as the opening fence.
                if (m_close and m_close.group(1)[0] == fence[0]
                        and len(m_close.group(1)) >= len(fence)):
                    i += 1
                    break
                code.append(lines[i])
                i += 1

            if lang.lower() == "mermaid":
                cap = extract_mermaid_caption("\n".join(code))
                stem = caption_to_stem(cap) if cap else ""
                if stem:
                    png = ctx["figures_dir"] / f"{stem}.png"
                    if png.is_file():
                        add_image(doc, png, cap, ctx)
                        continue
            # No usable pre-rendered figure: fall back to showing the source.
            add_code_block(doc, code, lang)
            continue

        # Table: gather the whole run of table lines.
        if common.is_table_line(line):
            block: list[str] = []
            while i < n and common.is_table_line(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block)
            continue

        # Heading, capped at level 3.
        m = common.HEADING_RE.match(line)
        if m:
            level = min(len(m.group(1)), 3)
            add_heading(doc, m.group(2).strip(), level)
            i += 1
            continue

        # Blockquote lines are dropped.
        if common.is_blockquote(line):
            i += 1
            continue

        # List item: rendered verbatim (marker included), without indent.
        if is_list_item(line, prof):
            add_body_paragraph(doc, line.strip(), indent=False)
            i += 1
            continue

        # Ordinary paragraph: merge the following continuation lines.
        buf = [line.strip()]
        j = i + 1
        while j < n:
            nxt = lines[j].rstrip()
            if not nxt.strip() or _interrupts_paragraph(nxt, prof):
                break
            buf.append(nxt.strip())
            j += 1
        add_body_paragraph(doc, " ".join(buf), indent=True)
        i = j
|
||
|
||
|
||
# ───────────────────────── 入口 ─────────────────────────
|
||
|
||
def render_sections(profile: str, sections_dir: Path, out: Path, *,
                    lang: str = "en", toc: bool = False, fund_type: str = "") -> None:
    """Render every ``*.md`` file in *sections_dir* into one docx saved at *out*.

    Files are processed in sorted order. A page break follows every section
    except the last; the last one also gets a break when the profile sets
    ``trailing_page_break``. A short summary is printed after saving.

    Args:
        profile: Key into ``PROFILES``.
        sections_dir: Directory holding the section Markdown files; figures are
            looked up in the sibling ``figures`` directory.
        out: Output path; missing parent directories are created.
        lang: For the ``paper`` profile, ``"zh"`` selects the 图 caption prefix
            and any other value selects ``Fig.``; other profiles always use 图.
        toc: Request a table of contents (profiles with ``always_toc`` add one
            regardless).
        fund_type: Accepted but not used by this function.

    Raises:
        KeyError: If *profile* is not a key of ``PROFILES``.
        SystemExit: With code 2 when *sections_dir* is missing or has no ``.md`` file.
    """
    prof = PROFILES[profile]

    def _fail(message: str) -> None:
        # Report on stderr and terminate with exit status 2.
        print(message, file=sys.stderr)
        sys.exit(2)

    if not sections_dir.is_dir():
        _fail(f"[ERR] sections dir not found: {sections_dir}")
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        _fail(f"[ERR] no .md found in {sections_dir}")

    # "Fig." is used only for non-Chinese papers; everything else uses 图.
    use_latin_label = profile == "paper" and lang != "zh"
    ctx: dict = {
        "prof": prof,
        "sections_dir": sections_dir,
        "figures_dir": sections_dir.parent / "figures",
        "fig_no": 0,
        "fig_label": "Fig." if use_latin_label else "图",
    }

    doc = init_doc(prof)
    if prof["always_toc"] or toc:
        add_toc(doc, prof)

    last_idx = len(md_files) - 1
    for idx, md_path in enumerate(md_files):
        render_md_block(doc, md_path.read_text(encoding="utf-8"), ctx)
        if prof["trailing_page_break"] or idx != last_idx:
            doc.add_page_break()

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Summary statistics for the console.
    all_paragraphs = doc.paragraphs
    paras = len(all_paragraphs)
    chars = sum(len(par.text) for par in all_paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" profile: {profile} | paragraphs: {paras} | tables: {tbls} | "
          f"figures: {ctx['fig_no']} | chars: {chars}")
|