# zcbot/scripts/_smoke_proposal_diagrams.py

"""Smoke: render_docx.py 图片 + mermaid 缓存路径。

构造一个临时 sections/, figures/ 结构, 跑 render_docx, 验证:
- mermaid 块 hash 在 figures/ 有对应 png → 走插图路径
- mermaid 块 hash 在 figures/ 没 png → 走 ASCII fallback (不崩, 文本保留)
- ![](path) 直接图片 → 走插图路径
- 图编号自增
- inline_shapes 数 = 命中插图的次数
"""
from __future__ import annotations

import hashlib
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

# Project root: the directory one level above the one containing this script.
ROOT = Path(__file__).resolve().parent.parent
# The renderer script exercised by the subprocess-based smoke cases.
RENDER_DOCX = ROOT.joinpath("skills", "proposal", "scripts", "render_docx.py")
# Prefer the Windows-layout project venv interpreter; when it is absent
# (CI / unix), reuse the interpreter that is running this script.
PYTHON = ROOT.joinpath(".venv", "Scripts", "python.exe")
PYTHON = PYTHON if PYTHON.exists() else Path(sys.executable)


def _run_render(sections: Path, out: Path) -> subprocess.CompletedProcess:
    """Run render_docx.py in a child process and return the completed process.

    The child is started with ``PYTHONIOENCODING=utf-8`` and its captured
    output is decoded as UTF-8, as a safeguard against a GBK stdout on Windows.

    Args:
        sections: Sections directory, passed as the positional argument.
        out: Output path, passed via ``-o``.

    Returns:
        The ``CompletedProcess`` with ``stdout`` / ``stderr`` captured as text.
        The return code is not checked here.
    """
    command = [
        str(PYTHON),
        str(RENDER_DOCX),
        str(sections),
        "--fund-type",
        "key_rd",
        "-o",
        str(out),
    ]
    # Copy of the current environment with the I/O encoding forced to UTF-8.
    child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        encoding="utf-8",
        env=child_env,
    )


def mermaid_hash(source: str) -> str:
    """Return a 10-character hex key for a mermaid source string.

    The source is stripped of leading/trailing whitespace, encoded as UTF-8
    and hashed with SHA-1; the first 10 hex digits of the digest are returned.
    """
    normalized = source.strip().encode("utf-8")
    digest = hashlib.sha1(normalized).hexdigest()
    return digest[:10]


def make_tiny_png(out: Path) -> None:
    """Write a small, real PNG bar chart to *out* using matplotlib.

    A genuine image file is produced so that python-docx can embed it with
    ``add_picture``.

    Args:
        out: Destination file path for the PNG.
    """
    import matplotlib

    # Select the non-interactive Agg backend before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(4, 2), dpi=100)
    try:
        ax.bar(["A", "B", "C"], [1, 3, 2], color="#c00000")
        ax.set_title("smoke")
        fig.savefig(str(out), bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails; otherwise it
        # stays registered with pyplot.
        plt.close(fig)


def case(name: str):
    """Decorator factory that reports a smoke case's outcome on stdout.

    The decorated callable prints ``[case] <name> ...`` before running, then
    ``OK`` on success or ``FAIL: <error>`` when it raises. The exception is
    re-raised after being reported, so a failing case still aborts the run.

    Args:
        name: Human-readable label printed for the case.

    Returns:
        A decorator; the wrapped callable keeps the original's metadata and
        passes its return value through.
    """
    import functools  # function-scope import keeps this block self-contained

    def deco(fn):
        @functools.wraps(fn)  # preserve __name__ / __doc__ of the case
        def wrapped(*a, **kw):
            # Flush so the label is visible while a slow case is still running.
            print(f"[case] {name} ...", end=" ", flush=True)
            try:
                result = fn(*a, **kw)
            except Exception as e:
                print(f"FAIL: {e}")
                raise
            print("OK")
            return result
        return wrapped
    return deco


@case("happy: cached mermaid + direct image + ASCII fallback")
def smoke_happy(tmp: Path) -> None:
    """Render a section mixing a cached mermaid block, an uncached one and a direct image.

    Builds ``sections/`` and ``figures/`` under *tmp*, runs render_docx and
    checks that exactly two pictures are embedded (cached mermaid + direct
    image), that they are captioned as figures 1 and 2, that the uncached
    mermaid source text is kept, and that no third figure number appears.
    """
    sections_dir = tmp / "sections"
    figures_dir = tmp / "figures"
    for folder in (sections_dir, figures_dir):
        folder.mkdir(parents=True)

    # Mermaid block #1: seed a PNG named after its hash so the cache is hit.
    cached_src = "%% caption: 关键问题与技术映射\nflowchart LR\n  A --> B\n"
    make_tiny_png(figures_dir / f"fig_{mermaid_hash(cached_src)}.png")

    # Mermaid block #2: nothing is seeded, so the ASCII fallback is expected.
    uncached_src = "%% caption: 缺缓存的图\nflowchart TB\n  X --> Y\n"

    # Directly referenced image, generated here.
    make_tiny_png(figures_dir / "direct.png")

    # Assemble the markdown section line by line.
    markdown = "\n".join([
        "# 测试章节",
        "",
        "这是一段散文。**加粗** 与 *斜体* 应当正确解析。",
        "",
        "```mermaid",
        cached_src.rstrip(),
        "```",
        "",
        "正文继续。下面是一张缺缓存的 mermaid:",
        "",
        "```mermaid",
        uncached_src.rstrip(),
        "```",
        "",
        "再看一张直接引用的图:",
        "",
        "![已有 PNG: 一柱形示例](../figures/direct.png)",
        "",
        "末尾段。",
        "",
    ])
    (sections_dir / "01_test.md").write_text(markdown, encoding="utf-8")

    docx_path = tmp / "test.docx"
    result = _run_render(sections_dir, docx_path)
    assert result.returncode == 0, f"render_docx exited {result.returncode}\nSTDERR: {result.stderr}\nSTDOUT: {result.stdout}"
    assert docx_path.is_file() and docx_path.stat().st_size > 1000, f"output docx not produced: {docx_path}"

    # The report should state figures: 2 (mermaid #1 + direct image).
    assert "figures: 2" in result.stdout, f"expected 'figures: 2' in stdout, got:\n{result.stdout}"

    # Open the docx and verify its content.
    from docx import Document
    document = Document(str(docx_path))
    # Pictures actually embedded (inline_shapes counts add_picture) must be 2.
    shape_count = len(document.inline_shapes)
    assert shape_count == 2, f"expected 2 inline shapes, got {shape_count}"

    body_text = "\n".join([para.text for para in document.paragraphs])
    # The cached mermaid block becomes a picture plus caption.
    assert all(token in body_text for token in ("图 1", "关键问题与技术映射")), "missing fig 1 caption"
    # The direct image is captioned too.
    assert all(token in body_text for token in ("图 2", "已有 PNG")), "missing fig 2 caption"
    # The uncached mermaid block falls back to text, keeping its source.
    assert all(token in body_text for token in ("flowchart TB", "X --> Y")), "ASCII fallback didn't preserve mermaid source"
    # A cache miss must not consume a figure number.
    assert "图 3" not in body_text, "ghost figure number for missed cache"


@case("happy: no diagrams at all (regression: existing flows unchanged)")
def smoke_no_diagrams(tmp: Path) -> None:
    """Render a section with prose and one markdown table but no diagrams.

    Checks that render_docx succeeds, reports ``figures: 0``, embeds no
    pictures and produces exactly one table.
    """
    sections_dir = tmp / "sections"
    sections_dir.mkdir(parents=True)
    markdown = (
        "# 标题\n\n"
        "散文段落。**加粗**。\n\n"
        "| 列 1 | 列 2 |\n"
        "|---|---|\n"
        "| a | b |\n"
    )
    (sections_dir / "01.md").write_text(markdown, encoding="utf-8")

    docx_path = tmp / "test.docx"
    result = _run_render(sections_dir, docx_path)
    assert result.returncode == 0, f"render_docx exited {result.returncode}\nSTDERR: {result.stderr}"
    assert "figures: 0" in result.stdout, f"expected 'figures: 0' in stdout, got:\n{result.stdout}"

    from docx import Document
    document = Document(str(docx_path))
    assert len(document.inline_shapes) == 0
    # The markdown table should be the only table in the document.
    assert len(document.tables) == 1


@case("render_diagrams: scans + hashes mermaid blocks, cache hit short-circuit")
def smoke_render_diagrams(tmp: Path) -> None:
    """Exercise render_diagrams through its Python API with pre-seeded cache PNGs.

    Does not depend on mmdc / mermaid.ink: a PNG is placed in ``figures/`` for
    every mermaid block beforehand, so render_diagrams is expected to take the
    cache path for all of them.
    """
    scripts_dir = ROOT / "skills" / "proposal" / "scripts"
    # Make the scripts directory importable only for the duration of the import.
    sys.path.insert(0, str(scripts_dir))
    try:
        import render_diagrams as rd
    finally:
        sys.path.pop(0)

    sections_dir = tmp / "sections"
    figures_dir = tmp / "figures"
    for folder in (sections_dir, figures_dir):
        folder.mkdir(parents=True)

    captioned_src = "%% caption: 图甲\nflowchart LR\n  A --> B\n"
    plain_src = "flowchart TB\n  X --> Y\n  Y --> Z\n"

    section_a = "\n".join([
        "# A",
        "",
        "```mermaid",
        captioned_src.rstrip(),
        "```",
        "",
        "散文。",
        "",
        "```mermaid",
        plain_src.rstrip(),
        "```",
        "",
    ])
    (sections_dir / "a.md").write_text(section_a, encoding="utf-8")
    (sections_dir / "b.md").write_text("# B\n\n仅文本,无 mermaid。\n", encoding="utf-8")

    # Pre-fill one PNG per block so rendering takes the cache branch (no network).
    placeholder_png = b"\x89PNG\r\n\x1a\nfake"
    for mermaid_src in (captioned_src, plain_src):
        cache_name = f"fig_{rd.mermaid_hash(mermaid_src)}.png"
        (figures_dir / cache_name).write_bytes(placeholder_png)

    # Call the API directly instead of a subprocess to avoid stdout-encoding issues.
    rc = rd.render_sections(sections_dir)
    assert rc == 0, f"render_sections rc={rc}"

    # Caption extraction.
    assert rd.extract_caption(captioned_src) == "图甲"
    assert rd.extract_caption(plain_src) is None

    # find_mermaid_blocks behaviour.
    section_text = (sections_dir / "a.md").read_text(encoding="utf-8")
    found_blocks = rd.find_mermaid_blocks(section_text)
    assert len(found_blocks) == 2, f"expected 2 blocks, got {len(found_blocks)}"


@case("missing image src → 占位文字, 不崩")
def smoke_missing_image(tmp: Path) -> None:
    """Render a section whose image reference points at a non-existent file.

    Checks that render_docx still succeeds, embeds no picture, emits the
    missing-image placeholder text and keeps the paragraph after the image.
    """
    sections_dir = tmp / "sections"
    sections_dir.mkdir(parents=True)
    markdown = (
        "# 测试\n\n"
        "![不存在](figures/ghost.png)\n\n"
        "后面一段。\n"
    )
    (sections_dir / "01.md").write_text(markdown, encoding="utf-8")

    docx_path = tmp / "test.docx"
    result = _run_render(sections_dir, docx_path)
    assert result.returncode == 0, f"render_docx exited {result.returncode}\nSTDERR: {result.stderr}"

    from docx import Document
    document = Document(str(docx_path))
    assert len(document.inline_shapes) == 0
    body_text = "\n".join([para.text for para in document.paragraphs])
    assert "图片缺失" in body_text, "missing-image placeholder not rendered"
    assert "后面一段" in body_text, "後续段落丢了"


def main() -> None:
    """Run every smoke case, each in its own subdirectory of one temp directory.

    Exits with status 2, after a message on stderr, when render_docx.py is not
    found. An exception raised by a case propagates out of this function.
    """
    if not RENDER_DOCX.exists():
        print(f"[ERR] render_docx.py not found: {RENDER_DOCX}", file=sys.stderr)
        raise SystemExit(2)

    # (case function, working subdirectory) pairs, run in this order.
    plan = (
        (smoke_happy, "happy"),
        (smoke_no_diagrams, "nodia"),
        (smoke_render_diagrams, "diag"),
        (smoke_missing_image, "ghost"),
    )
    with tempfile.TemporaryDirectory(prefix="zcbot_smoke_") as scratch:
        scratch_root = Path(scratch)
        for run_case, subdir in plan:
            run_case(scratch_root / subdir)

    print("\n[OK] all smoke cases passed")


# Run the smoke suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()