zcbot/rendering/docx_manuscript.py

442 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""manuscript 体例 docx 渲染器(paper 投稿稿 + proposal 申报书,配置化双 profile)。
两者原是近亲(~80% 逐字相同),差异收进 PROFILES:页边距 / TOC 标题 / 图题前缀 /
列表多一条"第X条" / sections 循环(toc 是否默认 + 末段是否补分页)。函数体移植自
旧 paper/proposal render_docx.py,叶子原语走 rendering.common。
profile=paper: --lang {zh,en}(图题前缀 图/Fig.),--toc 可选(默认无)
profile=proposal: --fund-type ...(仅打印),始终带 TOC,每段后分页
"""
from __future__ import annotations
import re
import sys
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor
from . import common
from .common import set_run_fonts, set_style_fonts, set_subscript, CHEM_RE, parse_inline
# ───────────────────────── profile 配置 ─────────────────────────
# Line-start patterns shared by every profile: a line counts as a list item
# when any one of them matches at its beginning (see is_list_item).
# NOTE(review): the bare r"^\d+\s*" entry matches every line that starts with
# a digit (e.g. a sentence beginning with a year) — confirm this is intended.
_BASE_LIST_PATTERNS = [
    re.compile(pattern_source)
    for pattern_source in (
        r"^\[\d+\]\s",  # [1]
        r"^[-*+]\s",  # - / * / +
        r"^\d+[\.、.]\s*",  # 1. / 1、
        r"^\(\d+\)\s*",  # (1)
        r"^\d+\s*",  # 1
        r"^[一二三四五六七八九十百千]+[、.\.]",  # 一、
        r"^[(][一二三四五六七八九十百千]+[)]",  # (一)
        r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]",  # ①
    )
]
# Per-profile rendering settings, looked up by profile name.
#   left_margin / right_margin : page side margins
#   list_patterns              : regexes deciding whether a line is a list item
#   toc_title / toc_placeholder: TOC heading text and the text shown until the
#                                TOC field is updated in Word
#   always_toc                 : insert a TOC even when the caller did not ask
#   trailing_page_break        : also add a page break after the last section
PROFILES = {
    "paper": {
        "left_margin": Cm(2.5),
        "right_margin": Cm(2.5),
        "list_patterns": _BASE_LIST_PATTERNS,
        "toc_title": "Contents",
        "toc_placeholder": "[Press F9 in Word to generate the table of contents]",
        "always_toc": False,
        "trailing_page_break": False,
    },
    "proposal": {
        "left_margin": Cm(3.0),
        "right_margin": Cm(2.0),
        # Base patterns plus one extra for "第X条 / 第X章 / 第X节" clause headings.
        "list_patterns": [
            *_BASE_LIST_PATTERNS,
            re.compile(r"^第[一二三四五六七八九十百]+[条章节]"),  # 第一条
        ],
        "toc_title": "目 录",
        "toc_placeholder": "[在 Word 中按 F9 或右键此处选择 “更新域” 即可生成完整目录]",
        "always_toc": True,
        "trailing_page_break": True,
    },
}
# ───────────────────────── 文档初始化 ─────────────────────────
def init_doc(prof: dict) -> Document:
    """Create an empty document with page geometry and base styles applied.

    The page is 21 x 29.7 cm with 2.5 cm top and bottom margins; the left and
    right margins are read from ``prof``. The Normal style and the Heading 1-3
    styles are configured with Times New Roman, a per-style CJK font, and 1.5
    line spacing.

    Args:
        prof: Profile settings; ``left_margin`` and ``right_margin`` are read.

    Returns:
        The newly created document.
    """
    document = Document()

    # Page geometry.
    page = document.sections[0]
    page.page_height = Cm(29.7)
    page.page_width = Cm(21)
    page.top_margin = Cm(2.5)
    page.bottom_margin = Cm(2.5)
    page.left_margin = prof["left_margin"]
    page.right_margin = prof["right_margin"]

    # Body text style.
    body_style = document.styles["Normal"]
    body_style.font.name = "Times New Roman"
    body_style.font.size = Pt(12)
    set_style_fonts(body_style, cn_font="宋体")
    body_format = body_style.paragraph_format
    body_format.line_spacing = 1.5
    body_format.space_before = Pt(0)
    body_format.space_after = Pt(0)

    # Heading styles: level -> (font size, CJK font).
    heading_specs = {
        1: (Pt(14), "黑体"),
        2: (Pt(12), "黑体"),
        3: (Pt(12), "宋体"),
    }
    for level, (font_size, cjk_font) in heading_specs.items():
        heading_style = document.styles[f"Heading {level}"]
        heading_style.font.name = "Times New Roman"
        heading_style.font.size = font_size
        heading_style.font.bold = True
        heading_style.font.color.rgb = RGBColor(0, 0, 0)
        set_style_fonts(heading_style, cn_font=cjk_font)
        heading_format = heading_style.paragraph_format
        heading_format.line_spacing = 1.5
        heading_format.space_before = Pt(6)
        heading_format.space_after = Pt(3)
        heading_format.first_line_indent = None

    return document
def add_toc(doc: Document, prof: dict, depth: int = 3) -> None:
    """Append a TOC title, a TOC field, and a page break to ``doc``.

    The field is written as raw ``w:fldChar`` / ``w:instrText`` elements with
    placeholder text between the "separate" and "end" markers; the placeholder
    text comes from ``prof["toc_placeholder"]``.

    Args:
        doc: Target document.
        prof: Profile settings; ``toc_title`` and ``toc_placeholder`` are read.
        depth: Deepest heading level listed by the TOC field (``\\o "1-depth"``).
    """

    def _fld_char(kind: str):
        # Build one <w:fldChar w:fldCharType="kind"/> element.
        element = OxmlElement("w:fldChar")
        element.set(qn("w:fldCharType"), kind)
        return element

    # Centered title line.
    title_par = doc.add_paragraph()
    title_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_format = title_par.paragraph_format
    title_format.first_line_indent = None
    title_format.space_before = Pt(12)
    title_format.space_after = Pt(6)
    title_run = title_par.add_run(prof["toc_title"])
    title_run.font.size = Pt(16)
    title_run.font.bold = True
    set_run_fonts(title_run, cn_font="黑体")

    # Paragraph holding the TOC field itself.
    field_par = doc.add_paragraph()
    field_par.paragraph_format.first_line_indent = None
    field_run = field_par.add_run()

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = f' TOC \\o "1-{depth}" \\h \\z \\u '

    placeholder = OxmlElement("w:t")
    placeholder.set(qn("xml:space"), "preserve")
    placeholder.text = prof["toc_placeholder"]

    # Order matters: begin, instruction, separate, displayed text, end.
    field_children = (
        _fld_char("begin"),
        instruction,
        _fld_char("separate"),
        placeholder,
        _fld_char("end"),
    )
    for child in field_children:
        field_run._element.append(child)

    doc.add_page_break()
# ───────────────────────── 内联(化学式下标)─────────────────────────
def _emit_plain_with_chem(paragraph, text: str, *, size, cn_font: str) -> None:
    """Emit a plain-text segment, subscripting digits inside CHEM_RE matches.

    Text outside any ``CHEM_RE`` match becomes ordinary runs. Inside a match,
    every digit character gets its own subscript run and the characters in
    between are emitted as ordinary runs. With no match at all, the whole text
    is emitted as a single run (or nothing if it is empty).
    """

    def emit(fragment: str, subscript: bool = False) -> None:
        # Empty fragments produce no run at all.
        if fragment:
            run = paragraph.add_run(fragment)
            run.font.size = size
            set_run_fonts(run, cn_font=cn_font, en_font="Times New Roman")
            if subscript:
                set_subscript(run)

    cursor = 0
    for match in CHEM_RE.finditer(text):
        emit(text[cursor:match.start()])
        pending: list[str] = []
        for char in match.group(0):
            if char.isdigit():
                # Flush the letters collected so far, then the digit itself.
                emit("".join(pending))
                pending.clear()
                emit(char, subscript=True)
            else:
                pending.append(char)
        emit("".join(pending))
        cursor = match.end()
    emit(text[cursor:])
def add_inline(paragraph, text: str, *, size: Pt = Pt(12), cn_font: str = "宋体") -> None:
    """Append ``text`` to ``paragraph`` as styled runs.

    ``parse_inline`` splits the text into ``(style, segment)`` pairs. Plain
    segments go through the chemistry-subscript path; bold and italic segments
    use Times New Roman; code segments use Consolas. A segment with any other
    style is added with only its size set.
    """
    for style, segment in parse_inline(text):
        if style == "plain":
            _emit_plain_with_chem(paragraph, segment, size=size, cn_font=cn_font)
            continue

        run = paragraph.add_run(segment)
        run.font.size = size
        if style == "code":
            set_run_fonts(run, cn_font=cn_font, en_font="Consolas")
        elif style in ("bold", "italic"):
            # The style name doubles as the run attribute to switch on.
            setattr(run, style, True)
            set_run_fonts(run, cn_font=cn_font, en_font="Times New Roman")
# ───────────────────────── 段落 / 标题 / 列表 ─────────────────────────
def add_heading(doc: Document, text: str, level: int) -> None:
    """Append a bold heading paragraph using the ``Heading {level}`` style.

    Only levels 1-3 have a size and CJK font defined here; any other level
    raises ``KeyError`` after the paragraph has been added.
    """
    heading = doc.add_paragraph(style=f"Heading {level}")
    heading.paragraph_format.first_line_indent = None

    font_sizes = {1: Pt(14), 2: Pt(12), 3: Pt(12)}
    cjk_fonts = {1: "黑体", 2: "黑体", 3: "宋体"}
    add_inline(heading, text, size=font_sizes[level], cn_font=cjk_fonts[level])

    # Force bold on every run, whatever inline style it was parsed with.
    for heading_run in heading.runs:
        heading_run.bold = True
def add_body_paragraph(doc: Document, text: str, *, indent: bool = True) -> None:
    """Append a justified body paragraph with 1.5 line spacing.

    Args:
        doc: Target document.
        text: Paragraph text; inline markup is handled by ``add_inline``.
        indent: When true, the first line is indented by 24 pt; otherwise the
            first-line indent is cleared.
    """
    body = doc.add_paragraph()
    fmt = body.paragraph_format
    fmt.line_spacing = 1.5
    fmt.first_line_indent = Pt(24) if indent else None
    body.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline(body, text)
def is_list_item(line: str, prof: dict) -> bool:
    """Return True when ``line`` starts with any of the profile's list markers.

    Each entry of ``prof["list_patterns"]`` is tried with ``match`` (anchored
    at the start of the line); the first hit wins.
    """
    for marker in prof["list_patterns"]:
        if marker.match(line):
            return True
    return False
def add_code_block(doc: Document, lines: list[str], lang: str = "") -> None:
    """Append one single-spaced monospace paragraph per code line.

    Empty lines are written as a single space. Every ``w:t`` element of the
    run is marked ``xml:space="preserve"``.

    Args:
        doc: Target document.
        lines: Code lines, without trailing newlines.
        lang: Fence language tag; accepted but not used by this function.
    """
    text_tag = qn("w:t")
    space_attr = qn("xml:space")
    for source_line in lines:
        code_par = doc.add_paragraph()
        fmt = code_par.paragraph_format
        fmt.first_line_indent = None
        fmt.line_spacing = 1.0
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)

        code_run = code_par.add_run(source_line or " ")
        code_run.font.size = Pt(10.5)
        set_run_fonts(code_run, cn_font="新宋体", en_font="Consolas")
        for text_el in code_run._element.iter(text_tag):
            text_el.set(space_attr, "preserve")
# ───────────────────────── 表格 ─────────────────────────
def render_table(doc: Document, table_lines: list[str]) -> None:
    """Render Markdown table lines as a docx table.

    Lines that split into no cells, and separator rows, are dropped. Short
    rows are padded with empty cells to the widest row. The first remaining
    row is rendered bold. Nothing is added when no rows remain.
    """
    split_rows = (common.split_md_row(raw) for raw in table_lines)
    rows: list[list[str]] = [
        cells for cells in split_rows
        if cells and not common.is_separator_row(cells)
    ]
    if not rows:
        return

    # Pad ragged rows in place so every row has the same column count.
    n_cols = max(map(len, rows))
    for cells in rows:
        cells.extend([""] * (n_cols - len(cells)))

    table = doc.add_table(rows=len(rows), cols=n_cols)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # Keep the default table style when the named one is unavailable.
        pass

    for ri, values in enumerate(rows):
        is_header = ri == 0
        for ci, value in enumerate(values):
            cell = table.rows[ri].cells[ci]
            cell.text = ""
            cell_par = cell.paragraphs[0]
            cell_par.paragraph_format.first_line_indent = None
            cell_par.paragraph_format.line_spacing = 1.2
            add_inline(cell_par, value, size=Pt(10.5), cn_font="宋体")
            if is_header:
                for header_run in cell_par.runs:
                    header_run.bold = True
# ───────────────────────── 图片 + 图题 ─────────────────────────
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
_FILENAME_INVALID_RE = re.compile(r"[^一-鿿A-Za-z0-9]+")
def caption_to_stem(caption: str) -> str:
cleaned = _FILENAME_INVALID_RE.sub("_", caption).strip("_")[:40]
if not cleaned:
return ""
return f"fig_{cleaned}"
def extract_mermaid_caption(source: str) -> str | None:
for ln in source.splitlines():
m = _MERMAID_CAPTION_RE.match(ln)
if m:
return m.group(1).strip()
return None
def add_image(doc: Document, png_path: Path, caption: str | None, ctx: dict) -> None:
    """Append a centered picture followed by a numbered, bold caption line.

    On success ``ctx["fig_no"]` is incremented and the caption is written as
    ``<label> <number> <caption>``. Empty parts are skipped, so an empty label
    or a missing caption leaves no stray leading or trailing space.

    If the picture cannot be inserted, a ``[image failed: ...]`` note is
    written in its place; the figure counter is not advanced and no caption
    is added.

    Args:
        doc: Target document.
        png_path: Image file to insert.
        caption: Caption text, or None / empty for a number-only caption.
        ctx: Render context; ``fig_no`` is read and updated, ``fig_label`` is
            read (defaulting to ``"Fig."`` when the key is absent).
    """
    picture_par = doc.add_paragraph()
    picture_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    picture_par.paragraph_format.first_line_indent = None
    picture_par.paragraph_format.space_before = Pt(6)
    picture_par.paragraph_format.space_after = Pt(3)
    run = picture_par.add_run()
    try:
        run.add_picture(str(png_path), width=common.MAX_IMG_WIDTH)
    except Exception as e:
        # Deliberate best effort: one unreadable image must not abort the
        # whole render, so leave a visible marker in the document instead.
        run.add_text(f"[image failed: {png_path.name}: {e}]")
        return

    ctx["fig_no"] = ctx.get("fig_no", 0) + 1

    cap_p = doc.add_paragraph()
    cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    cap_p.paragraph_format.first_line_indent = None
    cap_p.paragraph_format.space_before = Pt(0)
    cap_p.paragraph_format.space_after = Pt(6)

    label = ctx.get("fig_label", "Fig.")
    # Join only the non-empty parts: an empty label used to yield " 1 caption".
    caption_parts = (label, str(ctx["fig_no"]), caption)
    cap_text = " ".join(part for part in caption_parts if part)

    cap_run = cap_p.add_run(cap_text)
    cap_run.font.size = Pt(10.5)
    cap_run.bold = True
    set_run_fonts(cap_run, cn_font="宋体", en_font="Times New Roman")
# ───────────────────────── 主渲染 ─────────────────────────
def render_md_block(doc: Document, md_text: str, ctx: dict) -> None:
    """Render one Markdown section into ``doc``, line by line.

    Block kinds are tried in this order for each non-blank line: horizontal
    rule (skipped), image line, fenced code block, table, heading, blockquote
    (skipped), list item, and finally a body paragraph that absorbs the
    following lines until a blank line or the start of another block.

    Headings deeper than level 3 are rendered as level 3. A ``mermaid`` fence
    is replaced by a pre-rendered PNG when it carries a ``%% caption:`` line
    and ``<figures_dir>/<stem>.png`` exists; otherwise it is rendered as code.

    Args:
        doc: Target document.
        md_text: Markdown source of the section.
        ctx: Render context; ``prof``, ``sections_dir`` and ``figures_dir``
            are read here, and the context is passed on to ``add_image``.
    """
    prof = ctx["prof"]
    lines = md_text.splitlines()
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].rstrip()

        # Blank lines and horizontal rules produce no output.
        if not line.strip() or common.is_hr(line):
            i += 1
            continue

        # Stand-alone image line.
        m_img = common.IMAGE_LINE_RE.match(line)
        if m_img:
            src = m_img.group("src")
            cap = m_img.group("cap").strip() or None
            png = common.resolve_image_path(src, ctx["sections_dir"])
            if png is not None:
                add_image(doc, png, cap, ctx)
            else:
                add_body_paragraph(doc, f"[image missing: {src}]", indent=False)
            i += 1
            continue

        # Fenced code block: collect lines up to the closing fence (same fence
        # character, at least as long as the opener) or the end of the text.
        m_fence = common.FENCE_RE.match(line)
        if m_fence:
            fence = m_fence.group(1)
            lang = m_fence.group(2) or ""
            code: list[str] = []
            i += 1
            while i < n:
                m_close = common.FENCE_RE.match(lines[i])
                if (m_close and m_close.group(1)[0] == fence[0]
                        and len(m_close.group(1)) >= len(fence)):
                    i += 1
                    break
                code.append(lines[i])
                i += 1
            if lang.lower() == "mermaid":
                source = "\n".join(code)
                cap = extract_mermaid_caption(source)
                if cap:
                    stem = caption_to_stem(cap)
                    if stem:
                        png = ctx["figures_dir"] / f"{stem}.png"
                        if png.is_file():
                            add_image(doc, png, cap, ctx)
                            continue
            # Non-mermaid fences, and mermaid without a usable PNG, fall
            # through to a plain code block.
            add_code_block(doc, code, lang)
            continue

        # Table: consume the whole run of consecutive table lines.
        if common.is_table_line(line):
            block: list[str] = []
            while i < n and common.is_table_line(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block)
            continue

        m = common.HEADING_RE.match(line)
        if m:
            level = min(len(m.group(1)), 3)
            add_heading(doc, m.group(2).strip(), level)
            i += 1
            continue

        # Blockquote lines are dropped from the output.
        if common.is_blockquote(line):
            i += 1
            continue

        if is_list_item(line, prof):
            add_body_paragraph(doc, line.strip(), indent=False)
            i += 1
            continue

        # Body paragraph: merge following lines until a blank line or the
        # start of any other block. Fence openers and image lines are part of
        # the stop set so that a code block or image placed directly after
        # text (no blank line) is not swallowed into the paragraph.
        buf = [line.strip()]
        j = i + 1
        while j < n:
            nxt = lines[j].rstrip()
            if not nxt.strip():
                break
            if (common.is_heading(nxt) or common.is_blockquote(nxt)
                    or common.is_table_line(nxt) or is_list_item(nxt, prof)
                    or common.is_hr(nxt) or common.FENCE_RE.match(nxt)
                    or common.IMAGE_LINE_RE.match(nxt)):
                break
            buf.append(nxt.strip())
            j += 1
        add_body_paragraph(doc, " ".join(buf), indent=True)
        i = j
# ───────────────────────── 入口 ─────────────────────────
def render_sections(profile: str, sections_dir: Path, out: Path, *,
                    lang: str = "en", toc: bool = False, fund_type: str = "") -> None:
    """Render every ``*.md`` file in ``sections_dir`` into one docx at ``out``.

    Files are rendered in sorted name order. A page break follows each
    section; the break after the last section is added only when the profile
    sets ``trailing_page_break``. A TOC is inserted first when the profile
    sets ``always_toc`` or ``toc`` is true. A short summary is printed to
    stdout after saving.

    Args:
        profile: Key into ``PROFILES``.
        sections_dir: Directory holding the Markdown sections; figures are
            looked up in the sibling ``figures`` directory.
        out: Output path; missing parent directories are created.
        lang: For the ``paper`` profile, ``"zh"`` selects the figure label
            "图" and anything else selects "Fig.".
        toc: Request a TOC for profiles that do not always include one.
        fund_type: Informational only; echoed in the summary when non-empty.

    Raises:
        KeyError: If ``profile`` is not a key of ``PROFILES``.
        SystemExit: With code 2 when ``sections_dir`` is missing or contains
            no ``.md`` file.
    """
    prof = PROFILES[profile]
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    # Figure-caption prefix: "Fig." only for a non-Chinese paper; Chinese
    # papers and proposals use "图". It used to be an empty string in those
    # cases, which produced captions with no prefix and a leading space.
    if profile == "paper" and lang != "zh":
        fig_label = "Fig."
    else:
        fig_label = "图"

    ctx: dict = {
        "prof": prof,
        "sections_dir": sections_dir,
        "figures_dir": sections_dir.parent / "figures",
        "fig_no": 0,
        "fig_label": fig_label,
    }

    doc = init_doc(prof)
    if prof["always_toc"] or toc:
        add_toc(doc, prof)

    last_idx = len(md_files) - 1
    for idx, md_file in enumerate(md_files):
        render_md_block(doc, md_file.read_text(encoding="utf-8"), ctx)
        if prof["trailing_page_break"] or idx != last_idx:
            doc.add_page_break()

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Read the paragraph list once; it is rebuilt on every property access.
    paragraphs = doc.paragraphs
    paras = len(paragraphs)
    chars = sum(len(p.text) for p in paragraphs)
    tbls = len(doc.tables)
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" profile: {profile} | paragraphs: {paras} | tables: {tbls} | "
          f"figures: {ctx['fig_no']} | chars: {chars}")
    if fund_type:
        # fund_type has no effect on rendering; it is only reported.
        print(f" fund-type: {fund_type}")