# zcbot/skills/proposal/scripts/render_diagrams.py

"""预处理 sections/*.md 里的 mermaid 块 → 渲染为 figures/fig_<caption>.png。

不改动 sections/*.md, 只往 <sections_dir>/../figures/ 下落 PNG。
render_docx.py 在遇到 ```mermaid 块时按相同 caption 查 figures/, 有就插图 + 图题,
没有就 ASCII 兜底。

caption 命名规则:
- 每个 mermaid 块**必须**有首行注释 `%% caption: <图题>`, 否则直接报错退出
- caption 在全 task 内必须唯一, 撞了就报错 (强制起更具体的题, e.g.
  "训练阶段架构" / "推理阶段架构")
- 文件名 = caption 清洗后 (保留 CJK / 字母 / 数字, 其它字符 → '_', 截 40 字)
  前缀加 'fig_', e.g. caption "关键技术映射" → figures/fig_关键技术映射.png

渲染后端选择 (按优先级):
  1. 本地 mmdc (mermaid-cli) —— 最高质量,要 Node.js + npm i -g @mermaid-js/mermaid-cli
  2. mermaid.ink 公网 API —— 不装东西,要联网

两种都没, 留警告退出 0 (让流水线继续), render_docx.py 走 ASCII fallback。
缺 caption / caption 撞名直接退出 2 (硬错, 必须改 md)。

用法:
    python render_diagrams.py <task_dir>/sections/
"""
from __future__ import annotations

import argparse
import base64
import re
import shutil
import subprocess
import sys
import tempfile
import urllib.error
import urllib.request
from collections import defaultdict
from pathlib import Path

# Opening fence of a mermaid block: a line holding only ```mermaid
# (whitespace allowed around it and between the backticks and the word).
_FENCE_OPEN_RE = re.compile(r"^\s*```\s*mermaid\s*$")
# Closing fence: a line holding only ``` plus optional whitespace.
_FENCE_CLOSE_RE = re.compile(r"^\s*```\s*$")
# Mermaid comment line carrying the figure caption: `%% caption: <text>`.
# The keyword is matched case-insensitively; group 1 is the caption text.
_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
# Runs of characters outside U+4E00-U+9FFF, ASCII letters and ASCII digits;
# caption_to_stem() replaces each run with '_' when building a file name.
_FILENAME_INVALID_RE = re.compile(r"[^一-鿿A-Za-z0-9]+")

# mermaid.ink PNG endpoint (white background); {payload} is filled with the
# base64-encoded diagram source by render_via_mermaid_ink().
MERMAID_INK_URL = "https://mermaid.ink/img/{payload}?type=png&bgColor=FFFFFF"


def caption_to_stem(caption: str) -> str:
    """Turn a figure caption into a PNG file stem of the form ``fig_<sanitized>``.

    Every run of characters matched by ``_FILENAME_INVALID_RE`` is replaced by a
    single '_', leading and trailing '_' are stripped, and the result is then
    cut to at most 40 characters. No file extension is appended.

    Raises:
        ValueError: if nothing is left of the caption after sanitizing.
    """
    collapsed = _FILENAME_INVALID_RE.sub("_", caption)
    trimmed = collapsed.strip("_")
    stem_body = trimmed[:40]
    if stem_body == "":
        raise ValueError(f"caption sanitizes to empty: {caption!r}")
    return "fig_" + stem_body


def extract_caption(source: str) -> str | None:
    """Return the caption declared inside a mermaid block, or None if absent.

    The lines of ``source`` are scanned in order and the first one matching
    ``_CAPTION_RE`` wins; the captured text is returned with surrounding
    whitespace stripped.
    """
    hits = (_CAPTION_RE.match(line) for line in source.splitlines())
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()


def find_mermaid_blocks(md_text: str) -> list[str]:
    """Return the source of every mermaid block in a markdown text.

    A block starts at a line matching ``_FENCE_OPEN_RE`` and runs up to (not
    including) the next line matching ``_FENCE_CLOSE_RE``; the fence lines
    themselves are not part of the result. A block whose closing fence is
    missing extends to the end of the text.
    """
    found: list[str] = []
    # One shared iterator: the inner loop consumes the block body, so the outer
    # loop resumes right after the closing fence.
    line_iter = iter(md_text.splitlines())
    for line in line_iter:
        if not _FENCE_OPEN_RE.match(line):
            continue
        body: list[str] = []
        for inner in line_iter:
            if _FENCE_CLOSE_RE.match(inner):
                break
            body.append(inner)
        found.append("\n".join(body))
    return found


def render_via_mmdc(source: str, out_png: Path) -> bool:
    """Render ``source`` to ``out_png`` with a locally installed ``mmdc``.

    The diagram source is written to a temporary ``.mmd`` file (always removed
    afterwards) and ``mmdc`` is run on it with a white background and a
    60-second timeout.

    Args:
        source: mermaid diagram source.
        out_png: path mmdc is asked to write the PNG to.

    Returns:
        True when mmdc exits with code 0 and ``out_png`` exists afterwards.
        False when ``mmdc`` is not on PATH, exits non-zero, times out, or an
        OSError occurs while preparing the input file or launching the process;
        these failures are reported on stderr instead of being raised.
    """
    mmdc = shutil.which("mmdc")
    if not mmdc:
        return False

    tmp_path: Path | None = None
    try:
        # Created inside the try so that a failed write still reaches the
        # cleanup below and an OSError here is reported like any other failure.
        with tempfile.NamedTemporaryFile(
            "w", suffix=".mmd", delete=False, encoding="utf-8"
        ) as tf:
            tmp_path = Path(tf.name)
            tf.write(source)
        proc = subprocess.run(
            [mmdc, "-i", str(tmp_path), "-o", str(out_png), "-b", "white", "--quiet"],
            capture_output=True,
            text=True,
            # Decode leniently: the output is only used for a diagnostic
            # message, and a strict locale decode could raise
            # UnicodeDecodeError, which is not handled below.
            encoding="utf-8",
            errors="replace",
            timeout=60,
        )
        if proc.returncode != 0:
            print(f"   [mmdc] returncode={proc.returncode}: {proc.stderr.strip()[:200]}", file=sys.stderr)
            return False
        return out_png.exists()
    except (subprocess.TimeoutExpired, OSError) as e:
        print(f"   [mmdc] error: {e}", file=sys.stderr)
        return False
    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Best effort: a leftover temp file must not mask the result.
                pass


def render_via_mermaid_ink(source: str, out_png: Path) -> bool:
    """Render ``source`` to ``out_png`` through the public mermaid.ink HTTP API.

    The stripped diagram source is URL-safe base64 encoded (padding removed)
    and substituted into ``MERMAID_INK_URL``. Requires network access; the
    request uses a 30-second timeout.

    Args:
        source: mermaid diagram source.
        out_png: path the downloaded image bytes are written to.

    Returns:
        True when an HTTP 200 response with a body of at least 100 bytes was
        received and written to ``out_png``. False on a non-200 status, a body
        that is too small, or any network / HTTP protocol / file-write error;
        these are reported on stderr instead of being raised.
    """
    # Function-scope import: only this handler needs http.client.
    import http.client

    payload = base64.urlsafe_b64encode(source.strip().encode("utf-8")).decode("ascii").rstrip("=")
    url = MERMAID_INK_URL.format(payload=payload)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "zcbot-proposal/1.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            if resp.status != 200:
                print(f"   [mermaid.ink] HTTP {resp.status}", file=sys.stderr)
                return False
            data = resp.read()
        # A real PNG is never this small; treat it as a failed render.
        if not data or len(data) < 100:
            print(f"   [mermaid.ink] payload too small ({len(data)} bytes)", file=sys.stderr)
            return False
        out_png.write_bytes(data)
        return True
    except (
        urllib.error.URLError,
        urllib.error.HTTPError,
        TimeoutError,
        OSError,
        # Protocol-level failures such as IncompleteRead from resp.read() are
        # not OSError subclasses and would otherwise escape as a crash.
        http.client.HTTPException,
    ) as e:
        print(f"   [mermaid.ink] error: {e}", file=sys.stderr)
        return False


def render_one(source: str, out_png: Path) -> str:
    """Render one mermaid block to ``out_png`` and report which backend did it.

    Backends are tried in priority order (local mmdc first, then mermaid.ink)
    and the first one that succeeds wins. Rendering is always attempted, with
    no "already exists" shortcut, so an edited diagram that kept its caption
    still gets re-rendered.

    Returns:
        "mmdc" or "mermaid.ink" for the backend that succeeded, or "fail" when
        none did.
    """
    backends = (
        ("mmdc", render_via_mmdc),
        ("mermaid.ink", render_via_mermaid_ink),
    )
    for label, renderer in backends:
        if renderer(source, out_png):
            return label
    return "fail"


def render_sections(sections_dir: Path) -> int:
    """Render every mermaid block found in ``sections_dir/*.md`` to a PNG.

    PNGs go to ``<sections_dir>/../figures/<stem>.png`` where ``<stem>`` is
    ``caption_to_stem(caption)``. The work is done in two passes: pass 1
    collects and validates all blocks, pass 2 (only if validation passed)
    renders each block with ``render_one``.

    Validation failures, all reported on stderr before anything is rendered:
    - a block without a ``%% caption:`` line;
    - a caption that sanitizes to an empty file name;
    - the same caption used by more than one block;
    - different captions that sanitize to the same file name (their PNGs
      would silently overwrite each other).

    Args:
        sections_dir: directory holding the section ``*.md`` files.

    Returns:
        2 on hard errors (directory missing, no ``.md`` file, any validation
        failure above); 0 otherwise, including when individual blocks could
        not be rendered (those are only listed in a warning).
    """
    if not sections_dir.is_dir():
        print(f"[ERR] sections dir not found: {sections_dir}", file=sys.stderr)
        return 2

    figures_dir = sections_dir.parent / "figures"
    figures_dir.mkdir(parents=True, exist_ok=True)

    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        return 2

    # Pass 1: collect every block and validate caption presence, file-name
    # validity and uniqueness before any (possibly slow/networked) rendering.
    blocks_meta: list[tuple[Path, str, str, str]] = []  # (md, caption, stem, source)
    missing_cap: list[Path] = []
    fatal = False
    for md in md_files:
        text = md.read_text(encoding="utf-8")
        for src in find_mermaid_blocks(text):
            cap = extract_caption(src)
            if not cap:
                missing_cap.append(md)
                continue
            try:
                stem = caption_to_stem(cap)
            except ValueError as e:
                print(f"[ERR] {md.name}: {e}", file=sys.stderr)
                fatal = True
                continue
            blocks_meta.append((md, cap, stem, src))

    if missing_cap:
        print("[ERR] 以下 md 里有 mermaid 块缺首行 '%% caption: <图题>':", file=sys.stderr)
        for md in missing_cap:
            print(f"       - {md.name}", file=sys.stderr)
        fatal = True

    by_cap: dict[str, list[str]] = defaultdict(list)
    # stem -> distinct captions mapping to it, in first-seen order
    caps_by_stem: dict[str, list[str]] = defaultdict(list)
    for md, cap, stem, _ in blocks_meta:
        by_cap[cap].append(md.name)
        if cap not in caps_by_stem[stem]:
            caps_by_stem[stem].append(cap)

    dups = [(c, mds) for c, mds in by_cap.items() if len(mds) > 1]
    if dups:
        print("[ERR] caption 在全 task 内必须唯一, 以下撞名:", file=sys.stderr)
        for c, mds in dups:
            print(f"       - {c!r} 出现在: {', '.join(mds)}", file=sys.stderr)
        print("       改成更具体的题文 (e.g. '训练阶段架构' / '推理阶段架构')", file=sys.stderr)
        fatal = True

    # Captions that differ only in stripped characters, or only beyond the
    # truncation point, would share one PNG; refuse rather than overwrite.
    clashes = [(s, caps) for s, caps in caps_by_stem.items() if len(caps) > 1]
    if clashes:
        print("[ERR] 以下不同的 caption 清洗后得到同一个文件名, PNG 会互相覆盖:", file=sys.stderr)
        for s, caps in clashes:
            print(f"       - {s}.png <- {', '.join(repr(c) for c in caps)}", file=sys.stderr)
        print("       改成清洗/截断后仍不相同的题文", file=sys.stderr)
        fatal = True

    if fatal:
        return 2

    # Pass 2: render
    if not blocks_meta:
        print(f"[OK] no mermaid block found in {sections_dir} (nothing to render)")
        return 0

    by_backend: dict[str, int] = {}
    fail_blocks: list[tuple[Path, str]] = []
    for md, cap, stem, src in blocks_meta:
        png = figures_dir / f"{stem}.png"
        backend = render_one(src, png)
        by_backend[backend] = by_backend.get(backend, 0) + 1
        mark = {"mmdc": "+", "mermaid.ink": "+", "fail": "x"}[backend]
        print(f"  {mark} [{backend:11s}] {md.name} :: {png.name} :: {cap}")
        if backend == "fail":
            fail_blocks.append((md, cap))

    print()
    print(f"[OK] processed {len(blocks_meta)} mermaid block(s) -> {figures_dir}")
    for b, c in sorted(by_backend.items()):
        print(f"   {b}: {c}")

    if fail_blocks:
        # Render failures are soft: exit code stays 0 so the pipeline goes on.
        print()
        print(f"[WARN] {len(fail_blocks)} block(s) failed to render. render_docx.py 会走 ASCII fallback.")
        print("       要画真图: 装 mmdc (npm i -g @mermaid-js/mermaid-cli) 或保证联网走 mermaid.ink。")
        for md, cap in fail_blocks:
            print(f"       - {md.name} :: {cap}")

    return 0


def main() -> None:
    """CLI entry point: parse the sections directory and exit with the render result.

    The process exit code is whatever ``render_sections`` returns.
    """
    parser = argparse.ArgumentParser(description="预处理 sections/*.md 的 mermaid 块 → figures/*.png")
    parser.add_argument("sections_dir", type=Path, help="sections/*.md 目录")
    sections_dir = parser.parse_args().sections_dir
    sys.exit(render_sections(sections_dir))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()