"""Guard tests for the platform rendering layer (``rendering/``).

Regression and drift protection:

- all three docx profiles render end to end (paragraphs > 0, tables == 1)
- the chemical-formula whitelist has a single source of truth (pdf and docx
  share ``common.CHEM_RE``; it must subscript real formulas and must not
  touch LC3)
- the pdf HTML chain (md -> HTML -> ``_enrich_html``) can be checked without
  chromium

The chromium step of pdf rendering is not covered here (it needs the sandboxed
chromium); that path goes through deploy/sandbox/probe_chromium_pdf.sh.
"""
from __future__ import annotations

import sys
import tempfile
import unittest
import zipfile
from pathlib import Path

# Make the directory one level above this file's folder importable so that
# the ``rendering`` package resolves when the file is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from rendering import common, docx_brief, docx_manuscript, pdf  # noqa: E402

# Markdown fixture: headings, bold text, chemical formulas next to look-alike
# tokens (LC3, EN 197-5, 2026), both list kinds, a reference line with a DOI,
# exactly one table and a blockquote.
# NOTE(review): the line breaks were reconstructed from a whitespace-collapsed
# copy of this file; confirm against version control.
_SAMPLE = """# 方向标题

## 背景

面向**低碳水泥**,化学式 CO2 / C3S / Na2O,不误伤 LC3、EN 197-5、2026。

- 列表项一

1. 有序一

[1] 文献. 作者, 刊, 2026. DOI: 10.1016/j.cemconres.2026.107891

| 期刊 | 入选 |
|---|---|
| Cement and Concrete Research | 11 |

> 引用块提示。
"""


def _write_sections(d: Path) -> Path:
    """Create ``d/sections`` containing the sample as ``00.md``; return it."""
    sec = d / "sections"
    sec.mkdir()
    (sec / "00.md").write_text(_SAMPLE, encoding="utf-8")
    return sec


def _docx_paragraphs(path: Path) -> int:
    """Return the number of paragraphs python-docx reports for *path*."""
    # Imported lazily so python-docx is only needed when a docx is inspected.
    from docx import Document

    return sum(1 for _ in Document(str(path)).paragraphs)


def _docx_tables(path: Path) -> int:
    """Return the number of tables python-docx reports for *path*."""
    from docx import Document

    return len(Document(str(path)).tables)


class TestDocxProfiles(unittest.TestCase):
    """End-to-end rendering of the brief, paper and proposal docx profiles."""

    def _assert_rendered(self, path: Path) -> None:
        """Check that *path* is a non-empty docx holding the sample content."""
        self.assertTrue(path.exists() and path.stat().st_size > 0)
        with zipfile.ZipFile(path) as z:
            self.assertIn("word/document.xml", z.namelist())
        self.assertGreater(_docx_paragraphs(path), 0)
        # The sample contains exactly one table; every profile should keep it.
        self.assertEqual(
            _docx_tables(path), 1, f"{path.name}: expected exactly one table"
        )

    def test_three_profiles_render(self):
        with tempfile.TemporaryDirectory() as td:
            d = Path(td)
            sec = _write_sections(d)

            # brief
            out_b = d / "brief.docx"
            docx_brief.render_sections(sec, out_b, color=True)
            self._assert_rendered(out_b)

            # paper
            out_p = d / "paper.docx"
            docx_manuscript.render_sections("paper", sec, out_p, lang="zh")
            self._assert_rendered(out_p)

            # proposal
            out_r = d / "proposal.docx"
            docx_manuscript.render_sections(
                "proposal", sec, out_r, fund_type="key_rd"
            )
            self._assert_rendered(out_r)


class TestChemSingleSource(unittest.TestCase):
    """The chemical-formula whitelist must live in ``common`` only."""

    def test_pdf_uses_common_chem(self):
        # Single source of truth: pdf must not carry its own whitelist, it
        # has to reuse the very same ``common.CHEM_RE`` object.
        self.assertIs(pdf.CHEM_RE, common.CHEM_RE)

    def test_chem_whitelist_hits_and_misses(self):
        self.assertEqual(common.CHEM_RE.findall("CO2"), ["CO2"])
        self.assertTrue(common.CHEM_RE.search("Na2SO4"))
        # No false positives: LC3 / 2026 must not match as a whole.
        self.assertIsNone(common.CHEM_RE.fullmatch("LC3"))
        self.assertNotIn("2026", common.CHEM_RE.findall("the year 2026"))


class TestPdfHtmlPipeline(unittest.TestCase):
    """Checks on ``pdf._enrich_html`` that need no chromium."""

    def test_enrich_subscript_and_links(self):
        # NOTE(review): the <p> wrapper and the <sub> markup below were
        # reconstructed (the tags had been stripped from this file); confirm
        # they match what ``_enrich_html`` receives and emits.
        html = pdf._enrich_html("<p>CO2 见 10.1016/j.x 与 example.com/a</p>")
        self.assertIn("CO<sub>2</sub>", html)
        self.assertIn('href="https://doi.org/10.1016/j.x"', html)
        self.assertIn('href="https://example.com/a"', html)

    def test_enrich_skips_code_and_links(self):
        # NOTE(review): <code>/<a> wrappers reconstructed from the test name
        # and comment; the original href value is unknown.
        html = pdf._enrich_html('<code>CO2</code> <a href="#">CO2</a>')
        # CO2 inside code / a elements must not be subscripted.
        self.assertNotIn("CO<sub>2</sub>", html)


if __name__ == "__main__":
    unittest.main()