# zcbot/tests/test_rendering.py
# (file-viewer header captured with the paste: 105 lines, 3.5 KiB, Python)

"""平台渲染层 rendering/ 守护测试。
防回归 + 防漂移:
- 三 profile docx 渲染端到端跑通(段落>0、表格==1)
- 化学式白名单单一事实源(pdf 与 docx 共用 common.CHEM_RE,且能正确下标 + 不误伤 LC3)
- pdf HTML 生成链(md→HTML→_enrich_html)在不依赖 chromium 下可验
不验 pdf 的 chromium 那步(需沙盒 chromium);那条走 deploy/sandbox/probe_chromium_pdf.sh。
"""
from __future__ import annotations
import sys
import tempfile
import unittest
import zipfile
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from rendering import common, docx_brief, docx_manuscript, pdf # noqa: E402
# Markdown fixture that _write_sections() stores as sections/00.md.
# It contains headings, bold text, chemical formulas (CO2 / C3S / Na2O) next
# to look-alike tokens the text says must not be touched (LC3, EN 197-5,
# 2026), a bullet item, an ordered item, a numbered reference with a DOI,
# a two-column table and a blockquote.
_SAMPLE = """# 方向标题
## 背景
面向**低碳水泥**,化学式 CO2 / C3S / Na2O,不误伤 LC3、EN 197-5、2026。
- 列表项一
1. 有序一
[1] 文献. 作者, 刊, 2026. DOI: 10.1016/j.cemconres.2026.107891
| 期刊 | 入选 |
|---|---|
| Cement and Concrete Research | 11 |
> 引用块提示。
"""
def _write_sections(d: Path) -> Path:
    """Create ``d/sections`` with one markdown file and return that directory.

    The single file, ``00.md``, receives ``_SAMPLE`` encoded as UTF-8.
    ``mkdir`` is called without ``exist_ok``, so the directory must not
    exist beforehand.
    """
    sections_dir = d / "sections"
    sections_dir.mkdir()
    sample_file = sections_dir / "00.md"
    sample_file.write_text(_SAMPLE, encoding="utf-8")
    return sections_dir
def _docx_paragraphs(path: Path) -> int:
    """Return the number of paragraphs python-docx reports for *path*."""
    # Function-local import: python-docx is loaded only when this helper runs.
    from docx import Document

    document = Document(str(path))
    return len(document.paragraphs)
class TestDocxProfiles(unittest.TestCase):
    """End-to-end docx rendering for the brief, paper and proposal profiles."""

    def test_three_profiles_render(self):
        """Render the sample with each profile and inspect the output file.

        For every profile the output must exist, be non-empty, be a zip
        archive containing ``word/document.xml``, hold at least one paragraph
        and hold exactly one table (the sample markdown has a single table).
        Each profile runs in its own ``subTest`` so one failing profile does
        not hide the results of the others.
        """
        # Function-local import, matching how _docx_paragraphs loads python-docx.
        from docx import Document

        with tempfile.TemporaryDirectory() as td:
            d = Path(td)
            sec = _write_sections(d)

            # Profile name -> callable that renders the sections into `out`.
            renderers = (
                (
                    "brief",
                    lambda out: docx_brief.render_sections(sec, out, color=True),
                ),
                (
                    "paper",
                    lambda out: docx_manuscript.render_sections(
                        "paper", sec, out, lang="zh"
                    ),
                ),
                (
                    "proposal",
                    lambda out: docx_manuscript.render_sections(
                        "proposal", sec, out, fund_type="key_rd"
                    ),
                ),
            )

            for profile, render in renderers:
                with self.subTest(profile=profile):
                    out = d / f"{profile}.docx"
                    render(out)

                    self.assertTrue(out.exists(), f"{out.name} was not written")
                    self.assertGreater(out.stat().st_size, 0)

                    # A .docx is a zip archive; the main part must be present.
                    with zipfile.ZipFile(out) as z:
                        self.assertIn("word/document.xml", z.namelist())

                    self.assertGreater(_docx_paragraphs(out), 0)
                    # Every rendered document should contain exactly one table.
                    self.assertEqual(len(Document(str(out)).tables), 1)
class TestChemSingleSource(unittest.TestCase):
    """The chemical-formula pattern has one source, ``common.CHEM_RE``."""

    def test_pdf_uses_common_chem(self):
        """``pdf.CHEM_RE`` must be the very same object as ``common.CHEM_RE``."""
        # Single source of truth: pdf must not carry its own whitelist, so the
        # check is identity rather than equality.
        self.assertIs(pdf.CHEM_RE, common.CHEM_RE)

    def test_chem_whitelist_hits_and_misses(self):
        """Real formulas match; look-alike tokens are left alone."""
        self.assertEqual(common.CHEM_RE.findall("CO2"), ["CO2"])
        self.assertIsNotNone(common.CHEM_RE.search("Na2SO4"))
        # No false positives: LC3 must not match as a whole, and neither the
        # standard number "EN 197-5" nor the year 2026 may yield a formula.
        self.assertIsNone(common.CHEM_RE.fullmatch("LC3"))
        self.assertEqual(common.CHEM_RE.findall("EN 197-5"), [])
        self.assertNotIn("2026", common.CHEM_RE.findall("the year 2026"))
class TestPdfHtmlPipeline(unittest.TestCase):
    """Checks on ``pdf._enrich_html``, called directly on HTML strings."""

    def test_enrich_subscript_and_links(self):
        """The formula digit is subscripted; the DOI and bare URL become links."""
        enriched = pdf._enrich_html("<p>CO2 见 10.1016/j.x 与 example.com/a</p>")
        expected_fragments = (
            "CO<sub>2</sub>",
            'href="https://doi.org/10.1016/j.x"',
            'href="https://example.com/a"',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, enriched)

    def test_enrich_skips_code_and_links(self):
        """CO2 inside ``<code>`` or ``<a>`` must not be subscripted."""
        source = '<code>CO2</code> <a href="x">CO2</a>'
        enriched = pdf._enrich_html(source)
        self.assertNotIn("CO<sub>2</sub>", enriched)
# Run the suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()