215 lines
7.3 KiB
Python
215 lines
7.3 KiB
Python
"""预处理 sections/*.md 里的 mermaid 块 → 渲染为 figures/fig_<caption>.png。
|
|
|
|
与 proposal skill 的 render_diagrams.py 同源 —— 论文里的技术流程图 / 实验装置
|
|
示意 / 机理图同样用 mermaid 写, 本脚本统一渲成 PNG, render_docx.py 按 caption 查表插图。
|
|
|
|
caption 命名规则:
|
|
- 每个 mermaid 块**必须**有首行注释 `%% caption: <图题>`, 否则直接报错退出
|
|
- caption 在全 task 内必须唯一, 撞了就报错 (强制起更具体的题)
|
|
- 文件名 = caption 清洗后 (保留 CJK / 字母 / 数字, 其它字符 → '_', 截 40 字), 前缀 'fig_'
|
|
|
|
渲染后端 (按优先级): 本地 mmdc → mermaid.ink 公网 API。两种都没留警告退出 0。
|
|
|
|
用法:
|
|
python render_diagrams.py <task_dir>/sections/
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import base64
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import urllib.error
|
|
import urllib.request
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Opening fence of a mermaid block: ```mermaid on a line of its own
# (surrounding whitespace allowed).
_FENCE_OPEN_RE = re.compile(r"^\s*```\s*mermaid\s*$")
# Bare closing fence: ``` with nothing but whitespace around it.
_FENCE_CLOSE_RE = re.compile(r"^\s*```\s*$")
# A `%% caption: <text>` comment line, matched case-insensitively;
# group 1 captures the caption text.
_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
# Any run of characters that is neither a CJK ideograph (U+4E00..U+9FFF)
# nor an ASCII letter or digit.
_FILENAME_INVALID_RE = re.compile(r"[^一-鿿A-Za-z0-9]+")

# mermaid.ink image endpoint template; `{payload}` is substituted with the
# encoded diagram source, and the query string asks for a PNG on white.
MERMAID_INK_URL = "https://mermaid.ink/img/{payload}?type=png&bgColor=FFFFFF"
|
|
|
|
|
|
def caption_to_stem(caption: str) -> str:
    """Derive the PNG file stem for a figure caption.

    Each run of characters matched by ``_FILENAME_INVALID_RE`` is collapsed
    to a single ``_``; leading and trailing underscores are removed, the
    remainder is cut to at most 40 characters and prefixed with ``fig_``.

    Raises:
        ValueError: if nothing remains of the caption after sanitizing.
    """
    underscored = _FILENAME_INVALID_RE.sub("_", caption)
    trimmed = underscored.strip("_")
    stem_body = trimmed[:40]
    if stem_body:
        return f"fig_{stem_body}"
    raise ValueError(f"caption sanitizes to empty: {caption!r}")
|
|
|
|
|
|
def extract_caption(source: str) -> str | None:
    """Return the caption declared in a mermaid block, if there is one.

    The block is scanned line by line from the top; the first line matching
    ``_CAPTION_RE`` supplies the caption (its captured text, stripped of
    surrounding whitespace). Returns ``None`` when no line matches.
    """
    candidates = (_CAPTION_RE.match(line) for line in source.splitlines())
    first_hit = next((hit for hit in candidates if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()
|
|
|
|
|
|
def find_mermaid_blocks(md_text: str) -> list[str]:
    """Collect the body of every fenced mermaid block in a markdown text.

    A block starts on a line matching ``_FENCE_OPEN_RE`` and runs up to (not
    including) the next line matching ``_FENCE_CLOSE_RE``. The fence lines
    themselves are excluded and the body lines are joined with ``\\n``.
    A block whose closing fence is missing extends to the end of the text
    and is still returned.
    """
    found: list[str] = []
    # None while outside a block; the list of body lines while inside one.
    current: list[str] | None = None
    for line in md_text.splitlines():
        if current is None:
            if _FENCE_OPEN_RE.match(line):
                current = []
        elif _FENCE_CLOSE_RE.match(line):
            found.append("\n".join(current))
            current = None
        else:
            current.append(line)
    if current is not None:
        # Unterminated fence: keep whatever was gathered up to end of text.
        found.append("\n".join(current))
    return found
|
|
|
|
|
|
def render_via_mmdc(source: str, out_png: Path) -> bool:
    """Render mermaid *source* to *out_png* using a local ``mmdc`` executable.

    The source is written to a temporary ``.mmd`` file which is removed again
    whatever the outcome. If the ``MERMAID_PUPPETEER_CONFIG`` environment
    variable names an existing file, it is passed to ``mmdc`` via ``-p``.

    Returns:
        ``True`` when ``mmdc`` exits with status 0 and *out_png* exists
        afterwards; ``False`` when ``mmdc`` is not on ``PATH``, exits
        non-zero, times out (60 s) or cannot be started.
    """
    import os

    executable = shutil.which("mmdc")
    if not executable:
        return False

    with tempfile.NamedTemporaryFile(
        "w", suffix=".mmd", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(source)
        mmd_file = Path(handle.name)

    command = [
        executable,
        "-i", str(mmd_file),
        "-o", str(out_png),
        "-b", "white",
        "--quiet",
    ]
    try:
        config_path = os.environ.get("MERMAID_PUPPETEER_CONFIG", "").strip()
        if config_path and Path(config_path).is_file():
            command.extend(["-p", config_path])
        result = subprocess.run(command, capture_output=True, text=True, timeout=60)
        if result.returncode == 0:
            return out_png.exists()
        print(f" [mmdc] returncode={result.returncode}: {result.stderr.strip()[:200]}", file=sys.stderr)
        return False
    except (subprocess.TimeoutExpired, OSError) as exc:
        print(f" [mmdc] error: {exc}", file=sys.stderr)
        return False
    finally:
        # Best-effort cleanup of the temporary input file.
        try:
            mmd_file.unlink()
        except OSError:
            pass
|
|
|
|
|
|
def render_via_mermaid_ink(source: str, out_png: Path) -> bool:
    """Render mermaid *source* to *out_png* through the public mermaid.ink API.

    The stripped source is URL-safe base64 encoded (padding removed),
    substituted into ``MERMAID_INK_URL`` and fetched with a 30 s timeout.
    The response body is written to *out_png* only when the status is 200
    and the body is at least 100 bytes long.

    Returns:
        ``True`` when the image was downloaded and written; ``False`` on any
        network, HTTP-protocol or file-system error (the reason is printed
        to stderr). This function does not raise for transport failures, so
        one bad diagram cannot abort a whole batch.
    """
    # Local import: only needed for the exception base class caught below.
    import http.client

    encoded = base64.urlsafe_b64encode(source.strip().encode("utf-8"))
    payload = encoded.decode("ascii").rstrip("=")
    url = MERMAID_INK_URL.format(payload=payload)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "zcbot-paper/1.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            if resp.status != 200:
                print(f" [mermaid.ink] HTTP {resp.status}", file=sys.stderr)
                return False
            data = resp.read()
            # A tiny body cannot be a real image; treat it as a failed render.
            if not data or len(data) < 100:
                print(f" [mermaid.ink] payload too small ({len(data)} bytes)", file=sys.stderr)
                return False
            out_png.write_bytes(data)
            return True
    except (
        urllib.error.URLError,
        TimeoutError,
        OSError,
        # Protocol-level failures (IncompleteRead, BadStatusLine, ...) are
        # not OSError subclasses and would otherwise escape this handler.
        http.client.HTTPException,
    ) as e:
        print(f" [mermaid.ink] error: {e}", file=sys.stderr)
        return False
|
|
|
|
|
|
def render_one(source: str, out_png: Path) -> str:
    """Render one diagram, trying each backend in priority order.

    Returns the label of the first backend that succeeds (``"mmdc"`` or
    ``"mermaid.ink"``), or ``"fail"`` when neither does.
    """
    backends = (
        ("mmdc", render_via_mmdc),
        ("mermaid.ink", render_via_mermaid_ink),
    )
    for label, renderer in backends:
        if renderer(source, out_png):
            return label
    return "fail"
|
|
|
|
|
|
def render_sections(sections_dir: Path) -> int:
    """Render every captioned mermaid block in ``sections_dir/*.md`` to a PNG.

    Images are written to a ``figures`` directory next to *sections_dir*,
    named after the sanitized caption. All validation happens before the
    first render, so a validation error never leaves a partially rendered
    figure set behind.

    Args:
        sections_dir: directory holding the markdown section files.

    Returns:
        ``2`` when the directory is missing, contains no ``.md`` file, a
        block lacks a caption, a caption is repeated, a caption sanitizes to
        an empty file name, or two different captions sanitize to the same
        file name. ``0`` otherwise, including when individual blocks fail to
        render (those are listed in a warning).
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        return 2

    figures_dir = sections_dir.parent / "figures"
    figures_dir.mkdir(parents=True, exist_ok=True)

    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        return 2

    # (markdown file, caption, mermaid source) for every captioned block.
    blocks_meta: list[tuple[Path, str, str]] = []
    missing_cap: list[Path] = []
    for md in md_files:
        text = md.read_text(encoding="utf-8")
        for src in find_mermaid_blocks(text):
            cap = extract_caption(src)
            if not cap:
                missing_cap.append(md)
                continue
            blocks_meta.append((md, cap, src))

    fatal = False
    if missing_cap:
        print("[ERR] 以下 md 里有 mermaid 块缺首行 '%% caption: <图题>':", file=sys.stderr)
        for md in missing_cap:
            print(f" - {md.name}", file=sys.stderr)
        fatal = True

    by_cap: dict[str, list[str]] = defaultdict(list)
    for md, cap, _ in blocks_meta:
        by_cap[cap].append(md.name)
    dups = [(c, mds) for c, mds in by_cap.items() if len(mds) > 1]
    if dups:
        print("[ERR] caption 在全 task 内必须唯一, 以下撞名:", file=sys.stderr)
        for c, mds in dups:
            print(f" - {c!r} 出现在: {', '.join(mds)}", file=sys.stderr)
        fatal = True

    # Resolve every file stem up front. Sanitizing and truncating can map two
    # different captions onto one file name, which would silently overwrite
    # one figure with the other, so such clashes are rejected here.
    stem_of: dict[str, str] = {}
    owners_by_stem: dict[str, list[str]] = defaultdict(list)
    unusable: set[str] = set()
    for md, cap, _ in blocks_meta:
        if cap in stem_of or cap in unusable:
            continue  # repeated caption, already reported above
        try:
            stem = caption_to_stem(cap)
        except ValueError as e:
            print(f"[ERR] {md.name}: {e}", file=sys.stderr)
            unusable.add(cap)
            fatal = True
            continue
        stem_of[cap] = stem
        owners_by_stem[stem].append(f"{cap!r} ({md.name})")
    clashes = [(s, owners) for s, owners in owners_by_stem.items() if len(owners) > 1]
    if clashes:
        print("[ERR] 不同 caption 清洗后文件名相同 (会互相覆盖), 请改成更有区分度的图题:", file=sys.stderr)
        for s, owners in clashes:
            print(f" - {s}.png <- {'; '.join(owners)}", file=sys.stderr)
        fatal = True

    if fatal:
        return 2

    if not blocks_meta:
        print(f"[OK] no mermaid block found in {sections_dir} (nothing to render)")
        return 0

    by_backend: dict[str, int] = {}
    fail_blocks: list[tuple[Path, str]] = []
    for md, cap, src in blocks_meta:
        png = figures_dir / f"{stem_of[cap]}.png"
        backend = render_one(src, png)
        by_backend[backend] = by_backend.get(backend, 0) + 1
        mark = "x" if backend == "fail" else "+"
        print(f" {mark} [{backend:11s}] {md.name} :: {png.name} :: {cap}")
        if backend == "fail":
            fail_blocks.append((md, cap))

    print()
    print(f"[OK] processed {len(blocks_meta)} mermaid block(s) -> {figures_dir}")
    for b, c in sorted(by_backend.items()):
        print(f" {b}: {c}")

    if fail_blocks:
        # Render failures are deliberately non-fatal: the exit code stays 0.
        print()
        print(f"[WARN] {len(fail_blocks)} block(s) failed to render. render_docx.py 会走 ASCII fallback.")
        for md, cap in fail_blocks:
            print(f" - {md.name} :: {cap}")

    return 0
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: parse the sections directory and exit with the render status."""
    parser = argparse.ArgumentParser(
        description="预处理 sections/*.md 的 mermaid 块 → figures/*.png"
    )
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    namespace = parser.parse_args()
    status = render_sections(namespace.sections_dir)
    sys.exit(status)


if __name__ == "__main__":
    main()
|