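"""Render sections/*.md into a research-direction briefing .docx (briefing layout, as opposed to the submission manuscript produced for papers).

Briefing-specific enhancements relative to paper/render_docx.py:

- **Business-red palette** (primary colour #C00000): headings are coloured per level with a thin
  coloured rule under them; TL;DR items and "判断" (judgement) lines become light-red shaded callouts.
- **Superscript citations with internal hyperlinks**: body-text [1] / [W3] markers are rendered as red
  superscripts that jump to the matching entry in the paper-list / references section.
- **Clickable paper list / references**: in sections whose heading contains
  "论文列表 / 文献列表 / 参考文献", each line starting with [n] becomes an anchor.
  A DOI inside the entry (the whole entry is a DOI, or it ends with "DOI: 10.xxx") becomes a blue
  https://doi.org/... hyperlink; a domain/path inside a web entry becomes an https:// hyperlink.
- **Chemical-formula subscripts (whitelist)**: CO2 / C3S2 / Na2O / SO4 ... get real subscripts.
  Matching is **exact against a whitelist**, so LC3 / EN 197-5 / 8.5 Mt / 2026 are left untouched.

Typography follows the other in-house renderers: SimSun 12 pt for Chinese, Times New Roman 12 pt
for Latin text, 1.5 line spacing, two-character first-line indent.
Supported Markdown: **bold** / *italic* / `monospace` / lists / tables / centred images
(a line consisting solely of a Markdown image reference).

Usage:
    python render_docx.py <sections_dir> -o <out.docx>
    python render_docx.py <sections_dir> --no-color -o <out.docx>   # disable the palette, pure black-and-white output
"""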
|
||
from __future__ import annotations
|
||
import argparse
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
from docx import Document
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
from docx.opc.constants import RELATIONSHIP_TYPE as RT
|
||
from docx.oxml import OxmlElement
|
||
from docx.oxml.ns import qn
|
||
from docx.shared import Cm, Pt, RGBColor
|
||
|
||
# ───────────────────────── 主题色 ─────────────────────────
|
||
|
||
PRIMARY = "C00000" # primary theme colour ("business red"), hex RRGGBB string for OOXML attributes
PRIMARY_RGB = RGBColor(0xC0, 0x00, 0x00)  # the same colour as an RGBColor, for python-docx font APIs
TLDR_FILL = "FBE9E9" # light-red shading used for TL;DR cards (and block-quote callouts)
CALLOUT_FILL = "F7DDDD" # shading used for "判断" (judgement) callouts
LINK_BLUE = "1155CC" # colour of external hyperlinks
TABLE_HEAD_FILL = "C00000"  # shading of the table header row
|
||
|
||
|
||
# ───────────────────────── 字体 / 低层 OOXML 辅助 ─────────────────────────
|
||
|
||
def _set_run_fonts(run, *, cn_font="宋体", en_font="Times New Roman") -> None:
    """Set the East-Asian and Latin typefaces of *run* through its ``w:rFonts`` element.

    ``cn_font`` is written to ``w:eastAsia``; ``en_font`` is written to both
    ``w:ascii`` and ``w:hAnsi``. An existing ``w:rFonts`` element is reused,
    otherwise one is created and appended to the run properties.
    """
    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.find(qn("w:rFonts"))
    if fonts_el is None:
        fonts_el = OxmlElement("w:rFonts")
        run_props.append(fonts_el)
    for attr, face in (("w:eastAsia", cn_font), ("w:ascii", en_font), ("w:hAnsi", en_font)):
        fonts_el.set(qn(attr), face)
|
||
|
||
|
||
def _set_style_fonts(style, *, cn_font="宋体", en_font="Times New Roman") -> None:
    """Set the East-Asian and Latin typefaces on a style definition.

    Works on ``style.element`` directly: a missing ``w:rPr`` is created and
    inserted as the first child, a missing ``w:rFonts`` is appended to it, and
    then ``w:eastAsia`` / ``w:ascii`` / ``w:hAnsi`` are overwritten.
    """
    style_el = style.element
    run_props = style_el.find(qn("w:rPr"))
    if run_props is None:
        run_props = OxmlElement("w:rPr")
        style_el.insert(0, run_props)
    fonts_el = run_props.find(qn("w:rFonts"))
    if fonts_el is None:
        fonts_el = OxmlElement("w:rFonts")
        run_props.append(fonts_el)
    for attr, face in (("w:eastAsia", cn_font), ("w:ascii", en_font), ("w:hAnsi", en_font)):
        fonts_el.set(qn(attr), face)
|
||
|
||
|
||
def _set_subscript(run) -> None:
    """Mark *run* as subscript by appending ``<w:vertAlign w:val="subscript"/>`` to its run properties."""
    run_props = run._element.get_or_add_rPr()
    vert_align = OxmlElement("w:vertAlign")
    vert_align.set(qn("w:val"), "subscript")
    run_props.append(vert_align)
|
||
|
||
|
||
def _para_shading(paragraph, fill: str) -> None:
    """Give *paragraph* a solid background fill.

    Args:
        paragraph: python-docx paragraph whose ``w:pPr`` is modified in place.
        fill: background colour as a hex ``RRGGBB`` string.

    The ``w:shd`` element is placed at its schema position inside ``w:pPr``
    (``CT_PPr`` is an ordered sequence in which ``w:shd`` comes right after
    ``w:pBdr`` and before spacing / indent / justification) instead of being
    appended at the end. An existing ``w:shd`` is updated rather than
    duplicated, so the function can safely be called more than once.
    """
    pPr = paragraph._p.get_or_add_pPr()
    shd = pPr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        # Tags that must precede w:shd according to the CT_PPr sequence.
        before_shd = {
            qn(f"w:{tag}")
            for tag in (
                "pStyle", "keepNext", "keepLines", "pageBreakBefore", "framePr",
                "widowControl", "numPr", "suppressLineNumbers", "pBdr",
            )
        }
        anchor = None
        for child in pPr:
            if child.tag in before_shd:
                anchor = child
        if anchor is not None:
            anchor.addnext(shd)
        else:
            pPr.insert(0, shd)
    shd.set(qn("w:val"), "clear")
    shd.set(qn("w:color"), "auto")
    shd.set(qn("w:fill"), fill)
|
||
|
||
|
||
def _para_border(paragraph, *, sides=("bottom",), color=PRIMARY, size=8, space=3) -> None:
    """Draw single-line borders on the given sides of *paragraph*.

    Args:
        paragraph: python-docx paragraph whose ``w:pPr`` is modified in place.
        sides: border element names to set (e.g. ``"top"``, ``"left"``,
            ``"bottom"``, ``"right"``).
        color: border colour as a hex ``RRGGBB`` string.
        size: value written to ``w:sz``.
        space: value written to ``w:space``.

    Compared with blindly appending, this keeps the XML in schema order:
    ``w:pBdr`` is inserted at its position in the ``CT_PPr`` sequence (before
    ``w:shd``, spacing, indent, ...), a side that is set twice replaces the
    earlier element, and the sides inside ``w:pBdr`` are kept in the
    top / left / bottom / right / between / bar order.
    """
    pPr = paragraph._p.get_or_add_pPr()
    pBdr = pPr.find(qn("w:pBdr"))
    if pBdr is None:
        pBdr = OxmlElement("w:pBdr")
        # Tags that must precede w:pBdr according to the CT_PPr sequence.
        before_pbdr = {
            qn(f"w:{tag}")
            for tag in (
                "pStyle", "keepNext", "keepLines", "pageBreakBefore", "framePr",
                "widowControl", "numPr", "suppressLineNumbers",
            )
        }
        anchor = None
        for child in pPr:
            if child.tag in before_pbdr:
                anchor = child
        if anchor is not None:
            anchor.addnext(pBdr)
        else:
            pPr.insert(0, pBdr)

    for side in sides:
        # Replace, rather than duplicate, a side that already has a border.
        for stale in pBdr.findall(qn(f"w:{side}")):
            pBdr.remove(stale)
        el = OxmlElement(f"w:{side}")
        el.set(qn("w:val"), "single")
        el.set(qn("w:sz"), str(size))
        el.set(qn("w:space"), str(space))
        el.set(qn("w:color"), color)
        pBdr.append(el)

    # Re-append children in schema order (appending an existing lxml child moves it).
    rank = {
        qn(f"w:{name}"): idx
        for idx, name in enumerate(("top", "left", "bottom", "right", "between", "bar"))
    }
    for el in sorted(pBdr, key=lambda e: rank.get(e.tag, len(rank))):
        pBdr.append(el)
|
||
|
||
|
||
def _add_bookmark(paragraph, name: str, bm_id: int) -> None:
    """Attach a bookmark called *name* to *paragraph* so internal links can target it.

    Args:
        paragraph: python-docx paragraph to receive the bookmark.
        name: bookmark name (the value a ``w:hyperlink`` ``w:anchor`` refers to).
        bm_id: numeric id shared by the start and end markers.

    ``w:bookmarkStart`` is inserted directly after ``w:pPr`` when the paragraph
    already has properties, because ``w:pPr`` has to stay the first child of
    ``w:p``; inserting at index 0 would push it in front of the properties.
    ``w:bookmarkEnd`` is appended after whatever content exists at call time,
    so content added later lies outside the bookmark range.
    """
    start = OxmlElement("w:bookmarkStart")
    start.set(qn("w:id"), str(bm_id))
    start.set(qn("w:name"), name)
    end = OxmlElement("w:bookmarkEnd")
    end.set(qn("w:id"), str(bm_id))

    p_el = paragraph._p
    pPr = p_el.find(qn("w:pPr"))
    if pPr is not None:
        pPr.addnext(start)
    else:
        p_el.insert(0, start)
    p_el.append(end)
|
||
|
||
|
||
def _mk_run_xml(text: str, *, size_pt: float, color=None, superscript=False,
                underline=False, bold=False, cn_font="宋体", en_font="Times New Roman"):
    """Build and return a detached ``w:r`` element carrying *text*.

    The run properties contain, in this order: fonts, optional bold, optional
    colour (hex string), optional single underline, optional superscript, and
    the size in half-points (``int(size_pt * 2)``). Whitespace in the text is
    preserved via ``xml:space="preserve"``.
    """
    def _with_val(tag: str, val: str):
        node = OxmlElement(tag)
        node.set(qn("w:val"), val)
        return node

    fonts = OxmlElement("w:rFonts")
    for attr, face in (("w:eastAsia", cn_font), ("w:ascii", en_font), ("w:hAnsi", en_font)):
        fonts.set(qn(attr), face)

    props = OxmlElement("w:rPr")
    props.append(fonts)
    if bold:
        props.append(OxmlElement("w:b"))
    if color:
        props.append(_with_val("w:color", color))
    if underline:
        props.append(_with_val("w:u", "single"))
    if superscript:
        props.append(_with_val("w:vertAlign", "superscript"))
    props.append(_with_val("w:sz", str(int(size_pt * 2))))

    text_el = OxmlElement("w:t")
    text_el.set(qn("xml:space"), "preserve")
    text_el.text = text

    run_el = OxmlElement("w:r")
    run_el.append(props)
    run_el.append(text_el)
    return run_el
|
||
|
||
|
||
def add_internal_link(paragraph, anchor: str, text: str, *, size_pt: float,
                      color=PRIMARY, superscript=False) -> None:
    """Append to *paragraph* a hyperlink that jumps to the bookmark named *anchor*.

    The visible *text* is a single run built by ``_mk_run_xml`` with the given
    size, colour and superscript setting.
    """
    link = OxmlElement("w:hyperlink")
    link.set(qn("w:anchor"), anchor)
    link_run = _mk_run_xml(text, size_pt=size_pt, color=color, superscript=superscript)
    link.append(link_run)
    paragraph._p.append(link)
|
||
|
||
|
||
def add_external_link(paragraph, url: str, text: str, *, size_pt: float) -> None:
    """Append to *paragraph* an underlined blue hyperlink pointing at *url*.

    An external hyperlink relationship is registered on the paragraph's part
    and referenced from the ``w:hyperlink`` element through ``r:id``.
    """
    rel_id = paragraph.part.relate_to(url, RT.HYPERLINK, is_external=True)
    link = OxmlElement("w:hyperlink")
    link.set(qn("r:id"), rel_id)
    link_run = _mk_run_xml(text, size_pt=size_pt, color=LINK_BLUE, underline=True)
    link.append(link_run)
    paragraph._p.append(link)
|
||
|
||
|
||
# ───────────────────────── 文档初始化 ─────────────────────────
|
||
|
||
def init_doc(color: bool) -> Document:
    """Create a blank document with the briefing's page setup and base styles.

    Page: 21 cm x 29.7 cm with 2.5 cm margins on all sides. ``Normal``:
    Times New Roman / 宋体, 12 pt, 1.5 line spacing, no spacing before or
    after. ``Heading 1``-``Heading 3``: bold 黑体 at 18 / 14 / 12 pt, coloured
    with the theme red when *color* is true and black otherwise.
    """
    doc = Document()

    page = doc.sections[0]
    page.page_height = Cm(29.7)
    page.page_width = Cm(21)
    for side in ("top", "bottom", "left", "right"):
        setattr(page, f"{side}_margin", Cm(2.5))

    base = doc.styles["Normal"]
    base.font.name = "Times New Roman"
    base.font.size = Pt(12)
    _set_style_fonts(base, cn_font="宋体")
    base_fmt = base.paragraph_format
    base_fmt.line_spacing = 1.5
    base_fmt.space_before = Pt(0)
    base_fmt.space_after = Pt(0)

    heading_rgb = PRIMARY_RGB if color else RGBColor(0, 0, 0)
    heading_points = {1: 18, 2: 14, 3: 12}
    for level, points in heading_points.items():
        style = doc.styles[f"Heading {level}"]
        font = style.font
        font.name = "Times New Roman"
        font.size = Pt(points)
        font.bold = True
        font.color.rgb = heading_rgb
        _set_style_fonts(style, cn_font="黑体")
        fmt = style.paragraph_format
        fmt.line_spacing = 1.3
        fmt.space_before = Pt(10 if level <= 2 else 6)
        fmt.space_after = Pt(4)
        fmt.first_line_indent = None
    return doc
|
||
|
||
|
||
# ───────────────────────── 内联:bold/italic/code 切分 ─────────────────────────
|
||
|
||
# Inline Markdown tokens: **bold**, *italic* (a single asterisk not preceded by
# "*" or a word character and not followed by "*"), and `code`. The *_t groups
# capture the text between the markers; none of the tokens may span a newline.
_INLINE_RE = re.compile(
    r"(?P<bold>\*\*(?P<bold_t>[^*\n]+?)\*\*)"
    r"|(?P<italic>(?<![\*\w])\*(?P<italic_t>[^*\n]+?)\*(?!\*))"
    r"|(?P<code>`(?P<code_t>[^`\n]+?)`)"
)

# Citation markers such as [12] or [W3]; group 1 is the id without brackets.
_CITE_RE = re.compile(r"\[(W?\d+)\]")

# Whitelist of chemical formulas whose digits are rendered as subscripts.
# Per the original note, the same list is shared by three renderers. Longer
# formulas are listed before their prefixes, and \b keeps tokens such as
# LC3 / C595 / 2026 from matching. Charged species such as Ca2+ are left out
# on purpose: their digit is a superscript, not a subscript.
_CHEM_RE = re.compile(
    r"Ca\(OH\)2|Mg\(OH\)2"
    r"|\b(?:Al2O3|Fe2O3|Fe3O4|Mn2O3|Cr2O3|P2O5|Na2SO4|K2SO4|CaSO4|CaCO3|MgCO3|"
    r"CaCl2|MgCl2|Na2O|K2O|SiO2|TiO2|ZrO2|SO4|SO3|SO2|CO3|CO2|NO3|NO2|PO4|"
    r"H2O|NH3|CH4|C4AF|C3S2|C2AS|C3S|C2S|C3A|O2|N2|H2)\b"
)
|
||
|
||
|
||
def _emit_chem(paragraph, text: str, *, size_pt: float, cn_font: str) -> None:
    """Append *text* to *paragraph*, subscripting the digits of whitelisted formulas.

    Text outside ``_CHEM_RE`` matches is emitted as ordinary runs. Inside a
    match every digit becomes its own subscript run and each stretch of
    non-digit characters becomes one ordinary run.
    """
    def _plain(chunk: str) -> None:
        _emit_plain_run(paragraph, chunk, size_pt=size_pt, cn_font=cn_font)

    cursor = 0
    for hit in _CHEM_RE.finditer(text):
        begin, stop = hit.span()
        if begin > cursor:
            _plain(text[cursor:begin])
        # Whitelisted formulas are pure ASCII, so splitting on [0-9] is the
        # same as testing each character with str.isdigit().
        for piece in re.findall(r"[0-9]|[^0-9]+", hit.group(0)):
            if piece.isdigit():
                digit_run = paragraph.add_run(piece)
                digit_run.font.size = Pt(size_pt)
                _set_run_fonts(digit_run, cn_font=cn_font, en_font="Times New Roman")
                _set_subscript(digit_run)
            else:
                _plain(piece)
        cursor = stop
    if cursor < len(text):
        _plain(text[cursor:])
|
||
|
||
|
||
def _emit_plain_run(paragraph, text: str, *, size_pt: float, cn_font: str) -> None:
    """Append *text* as one ordinary run (Times New Roman + *cn_font*); empty text is a no-op."""
    if text:
        plain = paragraph.add_run(text)
        plain.font.size = Pt(size_pt)
        _set_run_fonts(plain, cn_font=cn_font, en_font="Times New Roman")
|
||
|
||
|
||
def _emit_plain_with_cites(paragraph, text: str, *, size_pt: float, cn_font: str,
                           make_citations: bool) -> None:
    """Emit an unformatted text span, handling citation markers and formula subscripts.

    With *make_citations* false the whole span goes through ``_emit_chem``.
    Otherwise every ``[n]`` / ``[Wn]`` marker becomes a red superscript
    internal link (85 % of the body size) to the bookmark ``ref_<id>``, and
    the text between markers goes through ``_emit_chem``. Directly adjacent
    markers such as ``[1][3]`` are separated by a superscript comma.
    """
    def _chem(chunk: str) -> None:
        _emit_chem(paragraph, chunk, size_pt=size_pt, cn_font=cn_font)

    if not make_citations:
        _chem(text)
        return

    cite_pt = size_pt * 0.85
    cursor = 0
    last_cite_end = None
    for cite in _CITE_RE.finditer(text):
        begin, stop = cite.span()
        if begin > cursor:
            _chem(text[cursor:begin])
        # A marker that starts exactly where the previous one ended gets a
        # superscript comma in front of it.
        if last_cite_end is not None and last_cite_end == begin:
            sep = paragraph.add_run(",")
            sep.font.size = Pt(cite_pt)
            sep.font.color.rgb = PRIMARY_RGB
            _set_subscript_super(sep)
        ref_id = cite.group(1)
        add_internal_link(paragraph, f"ref_{ref_id}", ref_id, size_pt=cite_pt,
                          color=PRIMARY, superscript=True)
        last_cite_end = stop
        cursor = stop
    if cursor < len(text):
        _chem(text[cursor:])
|
||
|
||
|
||
def _set_subscript_super(run) -> None:
    """Mark *run* as superscript by appending ``<w:vertAlign w:val="superscript"/>`` to its run properties."""
    run_props = run._element.get_or_add_rPr()
    vert_align = OxmlElement("w:vertAlign")
    vert_align.set(qn("w:val"), "superscript")
    run_props.append(vert_align)
|
||
|
||
|
||
def add_inline_rich(paragraph, text: str, *, size_pt=12.0, cn_font="宋体",
                    make_citations=True) -> None:
    """Append *text* to *paragraph*, interpreting inline Markdown.

    ``**bold**``, ``*italic*`` and ```code``` spans become single runs (code
    uses Consolas as the Latin font); their inner text is emitted verbatim.
    Everything between those spans is passed to ``_emit_plain_with_cites``,
    which handles citations (when *make_citations* is true) and formula
    subscripts.
    """
    # (regex group, run attribute to switch on, Latin font)
    span_kinds = (
        ("bold", "bold", "Times New Roman"),
        ("italic", "italic", "Times New Roman"),
        ("code", None, "Consolas"),
    )

    def _plain(chunk: str) -> None:
        _emit_plain_with_cites(paragraph, chunk, size_pt=size_pt,
                               cn_font=cn_font, make_citations=make_citations)

    cursor = 0
    for tok in _INLINE_RE.finditer(text):
        if tok.start() > cursor:
            _plain(text[cursor:tok.start()])
        for group, flag, latin in span_kinds:
            if not tok.group(group):
                continue
            styled = paragraph.add_run(tok.group(f"{group}_t"))
            if flag is not None:
                setattr(styled, flag, True)
            styled.font.size = Pt(size_pt)
            _set_run_fonts(styled, cn_font=cn_font, en_font=latin)
            break
        cursor = tok.end()
    if cursor < len(text):
        _plain(text[cursor:])
|
||
|
||
|
||
# ───────────────────────── 标题 / 段落 ─────────────────────────
|
||
|
||
def add_heading(doc: Document, text: str, level: int, color: bool) -> None:
    """Append a heading paragraph of the given *level* (1-3).

    Level 1 is centred. The text is rendered at 18 / 14 / 12 pt in 黑体 without
    citation handling, and every resulting run is forced bold. When *color* is
    true, levels 1-2 get a rule underneath (thicker for level 1) and level 3
    gets a left indent with a bar on its left.
    """
    para = doc.add_paragraph(style=f"Heading {level}")
    para.paragraph_format.first_line_indent = None
    if level == 1:
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    point_size = {1: 18.0, 2: 14.0, 3: 12.0}[level]
    add_inline_rich(para, text, size_pt=point_size, cn_font="黑体", make_citations=False)
    for piece in para.runs:
        piece.bold = True

    if not color:
        return
    if level <= 2:
        rule_size = 12 if level == 1 else 6
        _para_border(para, sides=("bottom",), color=PRIMARY, size=rule_size)
    elif level == 3:
        para.paragraph_format.left_indent = Pt(8)
        _para_border(para, sides=("left",), color=PRIMARY, size=20, space=6)
|
||
|
||
|
||
def add_body_paragraph(doc: Document, text: str, *, indent=True) -> None:
    """Append a justified body paragraph with 1.5 line spacing.

    With *indent* true the first line is indented by 24 pt; otherwise the
    first-line indent is cleared. Inline Markdown, citations and formula
    subscripts are handled by ``add_inline_rich``.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format
    fmt.line_spacing = 1.5
    if indent:
        fmt.first_line_indent = Pt(24)
    else:
        fmt.first_line_indent = None
    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    add_inline_rich(para, text)
|
||
|
||
|
||
def add_callout(doc: Document, text: str, fill: str, color: bool) -> None:
    """Append an emphasis box (used for judgement lines and block quotes).

    The paragraph has 1.4 line spacing, an 8 pt left indent and 3 pt spacing
    before and after. When *color* is true it also gets the *fill* shading and
    a red bar on its left.
    """
    box = doc.add_paragraph()
    fmt = box.paragraph_format
    fmt.line_spacing = 1.4
    fmt.first_line_indent = None
    fmt.left_indent = Pt(8)
    fmt.space_before = fmt.space_after = Pt(3)
    if color:
        _para_shading(box, fill)
        _para_border(box, sides=("left",), color=PRIMARY, size=22, space=5)
    add_inline_rich(box, text)
|
||
|
||
|
||
def add_meta_band(doc: Document, text: str, color: bool) -> None:
    """Append the information band shown under the title.

    Per the original note, the band carries direction / time window / depth /
    data sources / audience. It is a centred 10.5 pt paragraph without
    citation handling; when *color* is true it is shaded light red and framed
    by thin rules above and below.
    """
    band = doc.add_paragraph()
    fmt = band.paragraph_format
    fmt.first_line_indent = None
    fmt.space_before = Pt(2)
    fmt.space_after = Pt(12)
    fmt.line_spacing = 1.35
    band.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if color:
        _para_shading(band, "F3DADA")
        _para_border(band, sides=("top", "bottom"), color=PRIMARY, size=6, space=3)
    add_inline_rich(band, text, size_pt=10.5, make_citations=False)
|
||
|
||
|
||
def add_tldr_card(doc: Document, text: str, color: bool) -> None:
    """Append one TL;DR bullet as a card; consecutive calls stack into a column of cards.

    The card is an 11 pt paragraph with a 10 pt left indent and 1.3 line
    spacing; when *color* is true it gets the TL;DR shading and a thick red
    bar on its left.
    """
    card = doc.add_paragraph()
    fmt = card.paragraph_format
    fmt.first_line_indent = None
    fmt.left_indent = Pt(10)
    fmt.space_before = Pt(1)
    fmt.space_after = Pt(3)
    fmt.line_spacing = 1.3
    if color:
        _para_shading(card, TLDR_FILL)
        _para_border(card, sides=("left",), color=PRIMARY, size=26, space=6)
    add_inline_rich(card, text, size_pt=11.0)
|
||
|
||
|
||
def _add_field(paragraph, instr: str) -> None:
    """Append a complex field with instruction *instr* (e.g. ``" PAGE "``) to *paragraph*.

    A single new run receives, in order: ``fldChar begin``, the ``instrText``
    (whitespace preserved), ``fldChar separate`` and ``fldChar end``.
    """
    def _fld_char(kind: str):
        marker = OxmlElement("w:fldChar")
        marker.set(qn("w:fldCharType"), kind)
        return marker

    host = paragraph.add_run()._r
    host.append(_fld_char("begin"))

    instr_el = OxmlElement("w:instrText")
    instr_el.set(qn("xml:space"), "preserve")
    instr_el.text = instr
    host.append(instr_el)

    host.append(_fld_char("separate"))
    host.append(_fld_char("end"))
|
||
|
||
|
||
def add_page_footer(doc: Document, color: bool) -> None:
    """Put a centred "第 <page number> 页" line into the first section's footer.

    The page number is a ``PAGE`` field. All footer runs are set to 9 pt with
    宋体 as the East-Asian font, and are coloured with the theme red when
    *color* is true.
    """
    p = doc.sections[0].footer.paragraphs[0]
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # The runs are styled in the loop below, so their handles need not be kept.
    p.add_run("第 ")
    _add_field(p, " PAGE ")
    p.add_run(" 页")
    for r in p.runs:
        r.font.size = Pt(9)
        if color:
            r.font.color.rgb = PRIMARY_RGB
        _set_run_fonts(r, cn_font="宋体")
|
||
|
||
|
||
# ───────────────────────── 参考文献条目(可点击)─────────────────────────
|
||
|
||
# A reference entry line: "[n] rest" or "[Wn] rest"; group 1 is the id, group 2 the entry text.
_REF_RE = re.compile(r"^\[(W?\d+)\]\s+(.+)$")
# An entry whose whole text is a bare DOI.
_DOI_RE = re.compile(r"^10\.\d{4,9}/\S+$")
_DOI_INLINE_RE = re.compile(r"10\.\d{4,9}/\S+") # DOI substring inside an entry (paper-list entries often end with "DOI: 10.xxx")
# The first token that looks like "host.tld" with an optional "/path"; a URL scheme is not part of the match.
_URL_TOKEN_RE = re.compile(r"([a-z0-9][\w.\-]*\.[a-z]{2,}(?:/[^\s]+)?)", re.IGNORECASE)
|
||
|
||
|
||
def add_reference_item(doc: Document, cid: str, value: str, bm_id: int, color: bool) -> None:
    """Append one reference / paper-list entry that citations can jump to.

    Args:
        doc: target document.
        cid: citation id without brackets (e.g. ``"12"`` or ``"W3"``); the
            paragraph is bookmarked as ``ref_<cid>``.
        value: entry text following the ``[cid]`` label.
        bm_id: numeric bookmark id to use.
        color: when true the bold ``[cid]`` label is coloured theme red.

    The entry text is rendered in one of four ways, tried in this order:

    1. the whole text is a DOI -> one ``https://doi.org/...`` link;
    2. the text contains a DOI -> only the DOI (minus trailing ``.,;)``) is a link;
    3. the text contains something that looks like ``host.tld[/path]`` -> that
       token is a link. An ``http://`` / ``https://`` scheme written directly
       in front of the token becomes part of the link and is used as-is;
       without one, ``https://`` is assumed. Trailing ``.,;)`` is left outside
       the link, as for DOIs;
    4. otherwise the text is emitted as plain text.
    """
    p = doc.add_paragraph()
    pf = p.paragraph_format
    pf.first_line_indent = None
    pf.left_indent = Pt(18)
    pf.line_spacing = 1.3
    _add_bookmark(p, f"ref_{cid}", bm_id)

    # Numbered label "[n] "
    lab = p.add_run(f"[{cid}] ")
    lab.bold = True
    lab.font.size = Pt(10.5)
    if color:
        lab.font.color.rgb = PRIMARY_RGB
    _set_run_fonts(lab, cn_font="宋体")

    value = value.strip()
    if _DOI_RE.match(value):
        add_external_link(p, f"https://doi.org/{value}", value, size_pt=10.5)
        return

    # Paper-list entry with an embedded DOI, e.g.
    # "<title>. <authors>, <journal>, 2026-03. DOI: 10.1016/...".
    m_doi = _DOI_INLINE_RE.search(value)
    if m_doi:
        doi = m_doi.group(0).rstrip(".,;)")
        start = m_doi.start()
        _emit_plain_run(p, value[:start], size_pt=10.5, cn_font="宋体")
        add_external_link(p, f"https://doi.org/{doi}", doi, size_pt=10.5)
        _emit_plain_run(p, value[start + len(doi):], size_pt=10.5, cn_font="宋体")
        return

    # Web entry: link the first URL-looking token. The regex already requires
    # a dot and cannot match whitespace, so no further checks are needed.
    m = _URL_TOKEN_RE.search(value)
    if m is None:
        _emit_plain_run(p, value, size_pt=10.5, cn_font="宋体")
        return

    mid = m.group(1).rstrip(".,;)")
    start = m.start()
    end = start + len(mid)
    # The regex never includes a scheme, so look for one right before the token.
    scheme = ""
    head = value[:start].lower()
    for candidate in ("https://", "http://"):
        if head.endswith(candidate):
            scheme = value[start - len(candidate):start]
            start -= len(candidate)
            break
    link_text = scheme + mid
    url = link_text if scheme else f"https://{mid}"

    _emit_plain_run(p, value[:start], size_pt=10.5, cn_font="宋体")
    add_external_link(p, url, link_text, size_pt=10.5)
    _emit_plain_run(p, value[end:], size_pt=10.5, cn_font="宋体")
|
||
|
||
|
||
# ───────────────────────── 行类型识别 ─────────────────────────
|
||
|
||
# ATX heading: 1-6 "#" characters, whitespace, then the title (group 2).
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
# A table row: the line starts and ends with "|" (surrounding whitespace allowed).
_TABLE_LINE_RE = re.compile(r"^\s*\|.*\|\s*$")
# Block-quote prefix: ">" with at most one following whitespace character.
_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?")
# Horizontal rule made of three or more "-", "=" or "_".
_HR_RE = re.compile(r"^\s*-{3,}\s*$|^\s*={3,}\s*$|^\s*_{3,}\s*$")
# Code fence: three or more backticks or tildes (group 1), optionally followed by one token (group 2).
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")
# A line consisting solely of a Markdown image reference; "cap" is the alt text, "src" the path.
_IMAGE_LINE_RE = re.compile(r"^\s*!\[(?P<cap>[^\]]*)\]\((?P<src>[^)\s]+)\)\s*$")
# Width passed to add_picture for every inserted image.
_MAX_IMG_WIDTH = Cm(15)
|
||
|
||
# Line prefixes that mark a list item. A line counts as a list item when any
# pattern matches at its start.
_LIST_PATTERNS = [
    # Bullet: "-", "*" or "+" followed by whitespace.
    re.compile(r"^[-*+]\s"),
    # Numbered: "1." / "1、" / "1．" (U+FF0E). The look-ahead keeps decimals such
    # as "8.5 Mt" from being mistaken for a list marker.
    re.compile(r"^\d+[\.、\uFF0E](?!\d)\s*"),
    # Parenthesised number with ASCII parentheses: "(1)".
    re.compile(r"^\(\d+\)\s*"),
    # Parenthesised number with full-width parentheses (U+FF08 / U+FF09).
    # Written with escapes so the pattern cannot degrade into the capture
    # group "^(\d+)", which would match every line that starts with a digit.
    re.compile(r"^\uFF08\d+\uFF09\s*"),
    # Circled numbers.
    re.compile(r"^[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮]"),
]
|
||
|
||
|
||
def is_list_item(line: str) -> bool:
    """Return True when *line* starts with one of the list markers in ``_LIST_PATTERNS``."""
    for pattern in _LIST_PATTERNS:
        if pattern.match(line):
            return True
    return False
|
||
|
||
|
||
# ───────────────────────── 表格 ─────────────────────────
|
||
|
||
def _split_md_row(line: str) -> list[str]:
    """Split one Markdown table row into stripped cell strings.

    Only the single outer pipe on each side is removed, so empty cells at the
    edges survive: ``"||a|b|"`` yields ``["", "a", "b"]``. Stripping every
    leading/trailing pipe would drop such cells and shift the columns.
    """
    row = line.strip()
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]
|
||
|
||
|
||
def _is_sep_row(cells: list[str]) -> bool:
    """Return True when *cells* form a Markdown header separator row (``---``, ``:--:``, ...).

    Empty cells are ignored, but at least one non-empty cell must exist and
    every non-empty cell must contain a dash. This keeps a row of blank cells
    (or of bare colons) from being treated as a separator and silently
    dropped from the table.
    """
    filled = [cell for cell in cells if cell != ""]
    if not filled:
        return False
    return all(re.fullmatch(r"[-:\s]*-[-:\s]*", cell) is not None for cell in filled)
|
||
|
||
|
||
def render_table(doc: Document, table_lines: list[str], color: bool) -> None:
    """Render consecutive Markdown table lines as a Word table.

    Separator rows are dropped and short rows are padded with empty cells to
    the widest row. The "Table Grid" style is applied when the document has
    it. Cells are rendered at 10.5 pt without citation handling. The first
    row is the header: its runs are bold and, when *color* is true, white on
    the theme-red shading. Nothing is added when no data rows remain.
    """
    rows = []
    for raw in table_lines:
        cells = _split_md_row(raw)
        if cells and not _is_sep_row(cells):
            rows.append(cells)
    if not rows:
        return

    width = max(map(len, rows))
    for cells in rows:
        cells.extend([""] * (width - len(cells)))

    table = doc.add_table(rows=len(rows), cols=width)
    try:
        table.style = "Table Grid"
    except KeyError:
        # The template has no such style; keep the default table look.
        pass

    for r_idx, cells in enumerate(rows):
        is_header = r_idx == 0
        for c_idx, content in enumerate(cells):
            cell = table.rows[r_idx].cells[c_idx]
            cell.text = ""
            para = cell.paragraphs[0]
            para.paragraph_format.first_line_indent = None
            para.paragraph_format.line_spacing = 1.2
            add_inline_rich(para, content, size_pt=10.5, cn_font="宋体", make_citations=False)
            if not is_header:
                continue
            if color:
                _para_shading(para, TABLE_HEAD_FILL)
            for piece in para.runs:
                piece.bold = True
                if color:
                    piece.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
|
||
|
||
|
||
# ───────────────────────── 图片 ─────────────────────────
|
||
|
||
def _resolve_image_path(src: str, base_dir: Path) -> Path | None:
    """Turn an image reference into an existing file path, or None.

    A relative *src* is joined onto *base_dir* and resolved; an absolute *src*
    is used as given. The path is returned only when it points at a regular
    file.
    """
    candidate = Path(src)
    if not candidate.is_absolute():
        candidate = (base_dir / candidate).resolve()
    if candidate.is_file():
        return candidate
    return None
|
||
|
||
|
||
def add_image(doc: Document, png_path: Path, caption: str | None, ctx: dict) -> None:
    """Insert a centred picture followed by a numbered caption.

    The picture is inserted at ``_MAX_IMG_WIDTH``. If insertion fails for any
    reason, an "[image failed: ...]" note is written in its place and neither
    the caption nor the figure counter is touched. Otherwise ``ctx["fig_no"]``
    is incremented and a bold 10.5 pt caption "图 <n> <caption>" (or just
    "图 <n>" without a caption) is added.
    """
    holder = doc.add_paragraph()
    holder.alignment = WD_ALIGN_PARAGRAPH.CENTER
    holder_fmt = holder.paragraph_format
    holder_fmt.first_line_indent = None
    holder_fmt.space_before = Pt(6)
    holder_fmt.space_after = Pt(3)

    pic_run = holder.add_run()
    try:
        pic_run.add_picture(str(png_path), width=_MAX_IMG_WIDTH)
    except Exception as e:
        # Best effort: a broken image must not abort the whole rendering.
        pic_run.add_text(f"[image failed: {png_path.name}: {e}]")
        return

    fig_no = ctx.get("fig_no", 0) + 1
    ctx["fig_no"] = fig_no

    if caption:
        label = f"图 {fig_no} {caption}"
    else:
        label = f"图 {fig_no}"

    cap_para = doc.add_paragraph()
    cap_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    cap_para.paragraph_format.first_line_indent = None
    cap_para.paragraph_format.space_after = Pt(6)
    cap_run = cap_para.add_run(label)
    cap_run.font.size = Pt(10.5)
    cap_run.bold = True
    _set_run_fonts(cap_run, cn_font="宋体")
|
||
|
||
|
||
# ───────────────────────── 主渲染 ─────────────────────────
|
||
|
||
def render_md_block(doc: Document, md_text: str, ctx: dict) -> None:
    """Render the Markdown text of one section file into *doc*.

    Args:
        doc: target document.
        md_text: Markdown source of the section.
        ctx: shared render state. Reads ``ctx["color"]`` and
            ``ctx["sections_dir"]``; increments ``ctx["bm_id"]`` for every
            reference entry; ``add_image`` updates ``ctx["fig_no"]``.

    Lines are classified in this order: blank / horizontal rule (skipped),
    image line, fenced code block, table, heading, block quote, reference
    entry (inside a references section), "**判断**" callout, list item, meta
    band, and finally an ordinary paragraph that absorbs soft-wrapped
    continuation lines.

    Continuation stops at anything that starts a block of its own, including
    a code fence, an image line, a "**判断**" line and, inside a references
    section, a ``[n]`` entry, so those are not swallowed into the preceding
    paragraph when no blank line separates them.
    """
    color = ctx["color"]
    lines = md_text.splitlines()
    i, n = 0, len(lines)
    in_refs = False      # inside a references / paper-list section: "[n] ..." lines are entries
    expect_meta = False  # an H1 was seen and its meta band has not been rendered yet
    in_tldr = False      # inside the key-points / TL;DR section: list items become cards

    while i < n:
        line = lines[i].rstrip()
        stripped = line.strip()

        # Blank lines and horizontal rules produce no output.
        if not stripped or _HR_RE.match(line):
            i += 1
            continue

        m_img = _IMAGE_LINE_RE.match(line)
        if m_img:
            src = m_img.group("src")
            png = _resolve_image_path(src, ctx["sections_dir"])
            if png is not None:
                add_image(doc, png, m_img.group("cap").strip() or None, ctx)
            else:
                add_body_paragraph(doc, f"[image missing: {src}]", indent=False)
            i += 1
            continue

        m_fence = _FENCE_RE.match(line)
        if m_fence:
            fence = m_fence.group(1)
            code = []
            i += 1
            while i < n:
                mc = _FENCE_RE.match(lines[i])
                # A closing fence uses the same character and is at least as long.
                if mc and mc.group(1)[0] == fence[0] and len(mc.group(1)) >= len(fence):
                    i += 1
                    break
                code.append(lines[i])
                i += 1
            for ln in code:
                p = doc.add_paragraph()
                p.paragraph_format.first_line_indent = None
                p.paragraph_format.line_spacing = 1.0
                run = p.add_run(ln if ln else " ")
                run.font.size = Pt(10.5)
                _set_run_fonts(run, cn_font="新宋体", en_font="Consolas")
            continue

        if _TABLE_LINE_RE.match(line):
            block = []
            while i < n and _TABLE_LINE_RE.match(lines[i]):
                block.append(lines[i])
                i += 1
            render_table(doc, block, color)
            continue

        m = _HEADING_RE.match(line)
        if m:
            title = m.group(2).strip()
            level = min(len(m.group(1)), 3)
            # Section type is re-evaluated on H1/H2 only, so "###" journal
            # sub-headings inside a paper list keep in_refs and the "[n]"
            # entries below them are still rendered as reference anchors.
            if level <= 2:
                in_refs = ("参考文献" in title) or ("论文列表" in title) or ("文献列表" in title)
                expect_meta = (level == 1)
                in_tldr = ("要点" in title) or ("TL;DR" in title.upper())
            add_heading(doc, title, level, color)
            i += 1
            continue

        if _BLOCKQUOTE_RE.match(line):
            # Merge consecutive ">" lines into one light-red callout.
            buf = [_BLOCKQUOTE_RE.sub("", line).strip()]
            i += 1
            while i < n and _BLOCKQUOTE_RE.match(lines[i]):
                buf.append(_BLOCKQUOTE_RE.sub("", lines[i]).strip())
                i += 1
            add_callout(doc, " ".join(buf), TLDR_FILL, color)
            continue

        # Reference entry
        if in_refs:
            m_ref = _REF_RE.match(stripped)
            if m_ref:
                ctx["bm_id"] += 1
                add_reference_item(doc, m_ref.group(1), m_ref.group(2), ctx["bm_id"], color)
                i += 1
                continue

        # "判断" (judgement) emphasis line -> callout
        if stripped.startswith("**判断**"):
            add_callout(doc, stripped, CALLOUT_FILL, color)
            i += 1
            continue

        if is_list_item(line):
            if in_tldr:
                add_tldr_card(doc, stripped, color)
            else:
                add_body_paragraph(doc, stripped, indent=False)
            i += 1
            continue

        # Meta band after the H1 title.
        # NOTE(review): expect_meta stays set until the next H1/H2, so any
        # later line containing "时间窗" in that span is also treated as the
        # band - confirm whether it should be cleared after the first block.
        if expect_meta and ("时间窗" in line):
            add_meta_band(doc, stripped, color)
            expect_meta = False
            i += 1
            continue

        # Ordinary paragraph: merge soft-wrapped lines until a new block starts.
        buf = [stripped]
        j = i + 1
        while j < n:
            nxt = lines[j].rstrip()
            nxt_stripped = nxt.strip()
            if (
                not nxt_stripped
                or _HEADING_RE.match(nxt)
                or _BLOCKQUOTE_RE.match(nxt)
                or _TABLE_LINE_RE.match(nxt)
                or is_list_item(nxt)
                or _HR_RE.match(nxt)
                or _FENCE_RE.match(nxt)
                or _IMAGE_LINE_RE.match(nxt)
                or nxt_stripped.startswith("**判断**")
                or (in_refs and _REF_RE.match(nxt_stripped))
            ):
                break
            buf.append(nxt_stripped)
            j += 1
        add_body_paragraph(doc, " ".join(buf), indent=True)
        i = j
|
||
|
||
|
||
def render_sections(sections_dir: Path, out: Path, color: bool) -> None:
    """Render every ``*.md`` file in *sections_dir* (sorted by name) into the .docx at *out*.

    Exits with status 2 after printing an error to stderr when the directory
    does not exist or contains no Markdown file. Sections are separated by
    page breaks, the parent directory of *out* is created when needed, and a
    short summary is printed after saving.
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        sys.exit(2)
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    ctx = {
        "sections_dir": sections_dir,
        "figures_dir": sections_dir.parent / "figures",
        "fig_no": 0,
        "bm_id": 0,
        "color": color,
    }
    doc = init_doc(color)
    add_page_footer(doc, color)

    last_idx = len(md_files) - 1
    for idx, md_path in enumerate(md_files):
        render_md_block(doc, md_path.read_text(encoding="utf-8"), ctx)
        if idx != last_idx:
            doc.add_page_break()

    out.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out))

    # Summary statistics over the body paragraphs.
    body_paras = doc.paragraphs
    para_count = len(body_paras)
    char_count = sum(len(para.text) for para in body_paras)
    theme = "商务红 #C00000" if color else "黑白"
    print(f"[OK] rendered {len(md_files)} sections -> {out}")
    print(f" paragraphs: {para_count} | tables: {len(doc.tables)} | figures: {ctx['fig_no']} | chars: {char_count}")
    print(f" theme: {theme} | 引文上标+超链接 | 化学式下标白名单")
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: parse the arguments and render the briefing."""
    parser = argparse.ArgumentParser(description="渲染章节 md → 科研方向简报 docx")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    parser.add_argument(
        "--no-color",
        dest="color",
        action="store_false",
        help="关配色,出纯黑白(默认商务红主题)",
    )
    parser.add_argument("-o", "--output", type=Path, required=True, help="输出 .docx 路径")
    opts = parser.parse_args()
    render_sections(opts.sections_dir, opts.output, opts.color)


if __name__ == "__main__":
    main()
|