105 lines
3.5 KiB
Python
105 lines
3.5 KiB
Python
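"""Guard tests for the platform rendering layer (``rendering/``).

Regression and drift protection:
- all three docx profiles render end to end (paragraphs > 0, tables == 1)
- the chemical-formula whitelist has a single source of truth (pdf and docx
  share common.CHEM_RE, which subscripts correctly and does not misfire on LC3)
- the pdf HTML pipeline (md -> HTML -> _enrich_html) can be verified without
  chromium

The chromium step of pdf rendering is not tested here (it needs a sandboxed
chromium); that path is covered by deploy/sandbox/probe_chromium_pdf.sh.
"""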
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
import tempfile
|
|
import unittest
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
|
|
|
from rendering import common, docx_brief, docx_manuscript, pdf # noqa: E402
|
|
|
|
# Markdown fixture fed to every renderer under test. It contains headings,
# bold text, chemical formulas (CO2 / C3S / Na2O) next to look-alikes that must
# not be treated as formulas (LC3, EN 197-5, 2026), a bullet item, an ordered
# item, a numbered reference with a DOI, exactly one table, and a blockquote.
# The three table rows must stay on consecutive lines so they form one table.
_SAMPLE = """# 方向标题

## 背景

面向**低碳水泥**,化学式 CO2 / C3S / Na2O,不误伤 LC3、EN 197-5、2026。

- 列表项一
1. 有序一

[1] 文献. 作者, 刊, 2026. DOI: 10.1016/j.cemconres.2026.107891

| 期刊 | 入选 |
|---|---|
| Cement and Concrete Research | 11 |

> 引用块提示。
"""
|
|
|
|
|
|
def _write_sections(d: Path) -> Path:
    """Create ``d/sections`` holding the sample Markdown and return that directory.

    The sample text is written as UTF-8 to ``00.md``. ``mkdir`` is called
    without ``exist_ok``, so the ``sections`` directory must not exist yet.
    """
    sections_dir = d.joinpath("sections")
    sections_dir.mkdir()
    sample_file = sections_dir.joinpath("00.md")
    sample_file.write_text(_SAMPLE, encoding="utf-8")
    return sections_dir
|
|
|
|
|
|
def _docx_paragraphs(path: Path) -> int:
    """Return how many paragraphs python-docx reports for the file at *path*."""
    # Imported here rather than at module top, as in the original helper.
    from docx import Document

    document = Document(str(path))
    count = 0
    for _paragraph in document.paragraphs:
        count += 1
    return count
|
|
|
|
|
|
class TestDocxProfiles(unittest.TestCase):
    """End-to-end smoke test for the three docx rendering profiles."""

    def _assert_rendered(self, path: Path) -> None:
        """Check one rendered ``.docx``: non-empty, well-formed, 1 table.

        Asserts that *path* exists with a non-zero size, is a zip archive
        containing ``word/document.xml``, has at least one paragraph, and
        holds exactly one top-level table (the sample Markdown has one table).
        """
        # Local import so python-docx is only needed when this test runs.
        from docx import Document

        self.assertTrue(path.exists() and path.stat().st_size > 0)
        with zipfile.ZipFile(path) as z:
            self.assertIn("word/document.xml", z.namelist())
        self.assertGreater(_docx_paragraphs(path), 0)
        # Document.tables lists only top-level tables, which is what the
        # "one table per output" expectation refers to.
        self.assertEqual(len(Document(str(path)).tables), 1)

    def test_three_profiles_render(self):
        """Render brief / paper / proposal and validate every output file."""
        with tempfile.TemporaryDirectory() as td:
            d = Path(td)
            sec = _write_sections(d)

            # brief
            out_b = d / "brief.docx"
            docx_brief.render_sections(sec, out_b, color=True)

            # paper
            out_p = d / "paper.docx"
            docx_manuscript.render_sections("paper", sec, out_p, lang="zh")

            # proposal
            out_r = d / "proposal.docx"
            docx_manuscript.render_sections("proposal", sec, out_r, fund_type="key_rd")

            # Each output must pass the same checks; subTest reports the
            # failing profile by name instead of stopping at the first one.
            for f in (out_b, out_p, out_r):
                with self.subTest(profile=f.stem):
                    self._assert_rendered(f)
|
|
|
|
|
|
class TestChemSingleSource(unittest.TestCase):
    """Guards that the chemical-formula pattern is defined in one place only."""

    def test_pdf_uses_common_chem(self):
        """``pdf.CHEM_RE`` must be the very same object as ``common.CHEM_RE``."""
        # Single source of truth: pdf may not carry its own whitelist and has
        # to reuse common.CHEM_RE, hence an identity check, not equality.
        shared_pattern = common.CHEM_RE
        self.assertIs(pdf.CHEM_RE, shared_pattern)

    def test_chem_whitelist_hits_and_misses(self):
        """The shared pattern matches real formulas and skips look-alikes."""
        chem_re = common.CHEM_RE

        # Hits: a formula is found, and "CO2" is returned as one whole match.
        self.assertListEqual(chem_re.findall("CO2"), ["CO2"])
        self.assertIsNotNone(chem_re.search("Na2SO4"))

        # Misses: "LC3" must not match as a whole, and the year 2026 must not
        # appear among the matches.
        self.assertIsNone(chem_re.fullmatch("LC3"))
        year_hits = chem_re.findall("the year 2026")
        self.assertNotIn("2026", year_hits)
|
|
|
|
|
|
class TestPdfHtmlPipeline(unittest.TestCase):
    """Exercises ``pdf._enrich_html`` directly on hand-written HTML snippets."""

    def test_enrich_subscript_and_links(self):
        """Formulas get ``<sub>`` digits; a bare DOI and URL become links."""
        source = "<p>CO2 见 10.1016/j.x 与 example.com/a</p>"
        enriched = pdf._enrich_html(source)

        expected_fragments = (
            "CO<sub>2</sub>",
            'href="https://doi.org/10.1016/j.x"',
            'href="https://example.com/a"',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, enriched)

    def test_enrich_skips_code_and_links(self):
        """Text inside ``<code>`` and ``<a>`` elements is left untouched."""
        source = '<code>CO2</code> <a href="x">CO2</a>'
        enriched = pdf._enrich_html(source)

        # CO2 inside code / a must not be subscripted.
        self.assertNotIn("CO<sub>2</sub>", enriched)
|
|
|
|
|
|
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|