# zcbot/tests/test_rendering.py

"""平台渲染层 rendering/ 守护测试。

防回归 + 防漂移:
- 三 profile docx 渲染端到端跑通(段落>0、表格==1)
- 化学式白名单单一事实源(pdf 与 docx 共用 common.CHEM_RE,且能正确下标 + 不误伤 LC3)
- pdf HTML 生成链(md→HTML→_enrich_html)在不依赖 chromium 下可验

不验 pdf 的 chromium 那步(需沙盒 chromium);那条走 deploy/sandbox/probe_chromium_pdf.sh。
"""
from __future__ import annotations

import sys
import tempfile
import unittest
import zipfile
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from rendering import common, docx_brief, docx_manuscript, pdf  # noqa: E402

# Markdown fixture written to sections/00.md by _write_sections(). It packs
# the constructs the tests care about into one small document: H1/H2
# headings, bold text, chemical formulas (CO2 / C3S / Na2O) next to
# look-alikes (LC3, EN 197-5, 2026), a bullet item, an ordered item, a
# reference line carrying a DOI, a single two-column table, and a blockquote.
_SAMPLE = """# 方向标题

## 背景

面向**低碳水泥**,化学式 CO2 / C3S / Na2O,不误伤 LC3、EN 197-5、2026。

- 列表项一
1. 有序一

[1] 文献. 作者, 刊, 2026. DOI: 10.1016/j.cemconres.2026.107891

| 期刊 | 入选 |
|---|---|
| Cement and Concrete Research | 11 |

> 引用块提示。
"""


def _write_sections(d: Path) -> Path:
    """Create ``d/sections`` with a single ``00.md`` fixture and return that directory.

    The directory must not exist yet: ``mkdir`` is called without
    ``exist_ok``, so a second call on the same ``d`` raises ``FileExistsError``.
    """
    sections_dir = d.joinpath("sections")
    sections_dir.mkdir()
    fixture_file = sections_dir.joinpath("00.md")
    fixture_file.write_text(_SAMPLE, encoding="utf-8")
    return sections_dir


def _docx_paragraphs(path: Path) -> int:
    """Return the number of paragraphs the ``docx`` package reports for *path*."""
    # Imported lazily so the module can be imported without the docx package
    # as long as this helper is never called.
    from docx import Document

    document = Document(str(path))
    return len(document.paragraphs)


class TestDocxProfiles(unittest.TestCase):
    """End-to-end smoke test for the three docx profiles: brief, paper, proposal.

    Each profile renders the same markdown fixture and must produce a
    non-empty, well-formed docx with at least one paragraph and exactly one
    table (the fixture contains a single markdown table).
    """

    def _assert_rendered(self, path: Path) -> None:
        # Verify one rendered docx. Assertions are kept separate (rather than
        # combined with `and`) so a failure names the condition that broke.
        from docx import Document  # lazy import, same as _docx_paragraphs

        self.assertTrue(path.exists(), f"{path.name} was not written")
        self.assertGreater(path.stat().st_size, 0, f"{path.name} is empty")
        # A docx is a zip package; the main body part must be present.
        with zipfile.ZipFile(path) as z:
            self.assertIn("word/document.xml", z.namelist())
        self.assertGreater(_docx_paragraphs(path), 0)
        # The fixture has exactly one markdown table, so the rendered body is
        # expected to contain exactly one table as well.
        self.assertEqual(len(Document(str(path)).tables), 1)

    def test_three_profiles_render(self):
        with tempfile.TemporaryDirectory() as td:
            d = Path(td)
            sec = _write_sections(d)
            renderers = (
                (
                    "brief",
                    lambda out: docx_brief.render_sections(sec, out, color=True),
                ),
                (
                    "paper",
                    lambda out: docx_manuscript.render_sections(
                        "paper", sec, out, lang="zh"
                    ),
                ),
                (
                    "proposal",
                    lambda out: docx_manuscript.render_sections(
                        "proposal", sec, out, fund_type="key_rd"
                    ),
                ),
            )
            for profile, render in renderers:
                # subTest keeps the remaining profiles running (and reported)
                # even when one of them fails.
                with self.subTest(profile=profile):
                    out = d / f"{profile}.docx"
                    render(out)
                    self._assert_rendered(out)


class TestChemSingleSource(unittest.TestCase):
    """Guards that the chemical-formula whitelist has a single source of truth."""

    def test_pdf_uses_common_chem(self):
        # Identity, not equality: pdf must not carry its own whitelist, it has
        # to reuse the very same common.CHEM_RE object.
        shared_pattern = common.CHEM_RE
        self.assertIs(pdf.CHEM_RE, shared_pattern)

    def test_chem_whitelist_hits_and_misses(self):
        chem = common.CHEM_RE

        # Hits: real formulas are recognised.
        co2_hits = chem.findall("CO2")
        self.assertEqual(co2_hits, ["CO2"])
        sulfate_match = chem.search("Na2SO4")
        self.assertTrue(sulfate_match)

        # Misses: look-alikes must not match as a whole ("LC3"), and a bare
        # year must not show up among the matches.
        self.assertIsNone(chem.fullmatch("LC3"))
        year_hits = chem.findall("the year 2026")
        self.assertNotIn("2026", year_hits)


class TestPdfHtmlPipeline(unittest.TestCase):
    """Checks pdf._enrich_html on plain HTML strings, without the chromium step."""

    def test_enrich_subscript_and_links(self):
        source = "<p>CO2 见 10.1016/j.x 与 example.com/a</p>"
        enriched = pdf._enrich_html(source)
        # Expect a subscripted formula, a doi.org link for the DOI, and an
        # https link for the bare domain.
        expected_fragments = (
            "CO<sub>2</sub>",
            'href="https://doi.org/10.1016/j.x"',
            'href="https://example.com/a"',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, enriched)

    def test_enrich_skips_code_and_links(self):
        protected = '<code>CO2</code> <a href="x">CO2</a>'
        enriched = pdf._enrich_html(protected)
        # CO2 inside <code> or <a> must be left without a subscript.
        self.assertNotIn("CO<sub>2</sub>", enriched)


# Allow running this file directly as a script; unittest.main() discovers and
# runs the TestCase classes defined in this module.
if __name__ == "__main__":
    unittest.main()