# zcbot/rendering/pdf.py
# (178 lines, 7.2 KiB, Python)

"""md(sections 目录或单 .md)→ PDF,沙盒自带 chromium 渲染。
渲染链(全程沙盒内,不进 weasyprint、不装额外包):
md --(python `markdown` 库)--> HTML --(chromium --headless --print-to-pdf)--> PDF
chromium 是镜像里已装的(给 mermaid 用),fonts-noto-cjk 也已装;chromium 是完整浏览器
内核,CSS 保真度比 weasyprint 高。冒烟见 deploy/sandbox/probe_chromium_pdf.sh。
视觉与 docx 一致:复用 common.CHEM_RE(化学式下标白名单,单一事实源)+ 商务红配色 +
DOI/URL 超链。引文 [n] 上标回链这版按字面渲染(后续与 docx 一起 DRY 再补)。
ASCII-only stdout(Windows GBK 控制台安全)。
"""
from __future__ import annotations
import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
from .common import CHEM_RE
# ───────────────────────── Theme colors (same "business red" palette as the docx output) ─────────────────────────
# Heading text, the h1 underline and the blockquote border.
PRIMARY = "#C00000"
# Blockquote background.
TLDR_FILL = "#FBE9E9"
# Hyperlink text.
LINK_BLUE = "#1155CC"
# Table header-cell background.
TABLE_HEAD_FILL = "#C00000"
# Background of even table rows.
TABLE_ZEBRA = "#F8F0F0"
# Inline DOI substring: "10.", 4-9 digits, "/", then everything up to the next
# whitespace, "<", ">" or '"' (boundaries that are safe inside HTML text).
_DOI_INLINE_RE = re.compile(r"10\.\d{4,9}/[^\s<>\"]+")
# Bare URL / domain token: optional http(s) scheme, a host ending in a TLD of
# at least two letters, and an optional path.  The lookbehind refuses a match
# that starts right after a word character, "/", "@" or ".", i.e. in the
# middle of a larger token.
_URL_TOKEN_RE = re.compile(
r"(?<![\w/@.])((?:https?://)?[a-z0-9][\w.\-]*\.[a-z]{2,}(?:/[^\s<>\"]*)?)",
re.IGNORECASE,
)
# Splits HTML into [text, tag, text, ...]; the capturing group keeps the tags
# in the result.  Only the text tokens get subscript / hyperlink rewriting.
_TAG_SPLIT = re.compile(r"(<[^>]+>)")
# Elements whose text content _enrich_html leaves untouched.
_SKIP_TAGS = {"a", "code", "pre", "script", "style", "head"}
# Group 1 is "/" for a closing tag (empty otherwise); group 2 is the tag name.
_TAG_NAME_RE = re.compile(r"<\s*(/?)\s*([a-zA-Z0-9]+)")
def _log(msg: str) -> None:
    """Print *msg* on stdout, prefixed with the ``[render_pdf]`` tag."""
    line = "[render_pdf] {}".format(msg)
    print(line)
def _emit_chem(text: str) -> str:
    """Wrap every digit run inside each ``CHEM_RE`` match in ``<sub>`` tags.

    Text that ``CHEM_RE`` does not match is returned unchanged.
    """
    return CHEM_RE.sub(
        lambda hit: re.sub(r"(\d+)", r"<sub>\1</sub>", hit.group(0)),
        text,
    )
# Escaped HTML delimiters and full-width CJK punctuation end a link token even
# though the whitespace-delimited token regexes run straight through them.
_LINK_BREAK_RE = re.compile(r"&(?:lt|gt|quot);|[，。；：、！？（）【】《》「」“”‘’]")
# ASCII punctuation that closes a sentence or clause rather than a link.
_LINK_TRAILING_PUNCT = ".,;:!?'"
# Closing brackets are stripped only when the token holds no matching opener.
_LINK_CLOSERS = {")": "(", "]": "[", "}": "{"}
# A token ending in an HTML entity keeps the ';' that terminates the entity.
_LINK_ENTITY_TAIL_RE = re.compile(r"&#?\w+;$")


def _trim_link_token(raw: str) -> str:
    """Return *raw* without the punctuation the greedy token regexes swallowed."""
    cut = _LINK_BREAK_RE.search(raw)
    if cut:
        raw = raw[:cut.start()]
    while raw:
        last = raw[-1]
        if last == ";" and _LINK_ENTITY_TAIL_RE.search(raw):
            break  # this ';' terminates an entity such as '&amp;'
        opener = _LINK_CLOSERS.get(last)
        unbalanced = opener is not None and raw.count(last) > raw.count(opener)
        if last in _LINK_TRAILING_PUNCT or unbalanced:
            raw = raw[:-1]
            continue
        break
    return raw


def _link_candidates(piece: str) -> list[tuple[int, str, str]]:
    """Return ``(start, label, href)`` for every DOI / URL token in *piece*."""
    found = []
    for m in _DOI_INLINE_RE.finditer(piece):
        doi = _trim_link_token(m.group(0))
        # Trimming may leave a bare "10.1234/" prefix, which is not a DOI.
        if _DOI_INLINE_RE.fullmatch(doi):
            found.append((m.start(), doi, f"https://doi.org/{doi}"))
    for m in _URL_TOKEN_RE.finditer(piece):
        raw = _trim_link_token(m.group(1))
        if not raw:
            continue
        # Test for the full scheme: a bare domain such as "httpbin.org" also
        # starts with "http" but still needs "https://" prepended.
        has_scheme = raw.lower().startswith(("http://", "https://"))
        found.append((m.start(1), raw, raw if has_scheme else f"https://{raw}"))
    return found


def _emit_links(text: str) -> str:
    """Turn DOIs and bare URLs / domains in *text* into ``<a>`` hyperlinks.

    Tags already present in *text* are copied through untouched; only the text
    between them is scanned.  DOI and URL tokens are resolved in one pass: when
    two candidates overlap, the one starting first wins (the longer one on a
    tie).  A DOI embedded in a URL therefore becomes a single link to that URL,
    and a URL-like fragment inside a DOI never produces a nested anchor.
    Sentence punctuation trailing a token is left outside the link.

    Args:
        text: HTML text fragment (already HTML-escaped).

    Returns:
        The fragment with each detected token wrapped as
        ``<a href="...">token</a>``.  Bare DOIs point at ``https://doi.org/``;
        URLs without a scheme get ``https://`` in the href only.
    """
    out_parts = []
    for piece in _TAG_SPLIT.split(text):
        if piece.startswith("<"):
            out_parts.append(piece)
            continue
        cursor = 0
        ordered = sorted(_link_candidates(piece), key=lambda c: (c[0], -len(c[1])))
        for start, label, href in ordered:
            if start < cursor:
                continue  # overlaps a link that was already emitted
            out_parts.append(piece[cursor:start])
            out_parts.append(f'<a href="{href}">{label}</a>')
            cursor = start + len(label)
        out_parts.append(piece[cursor:])
    return "".join(out_parts)
def _enrich_html(html: str) -> str:
    """Add chemical-formula subscripts and DOI/URL links to the text of *html*.

    The markup is split into tag and text chunks.  Tags are copied verbatim;
    text chunks go through ``_emit_chem`` and then ``_emit_links`` unless they
    sit inside an element listed in ``_SKIP_TAGS``, in which case they are
    copied verbatim as well.
    """
    # NOTE(review): subscripts are inserted before link detection, so a formula
    # matched inside a URL/DOI would end that link at the inserted <sub> tag --
    # confirm CHEM_RE cannot match there.
    pieces = []
    protected = 0  # how many skip-tag elements are currently open
    for chunk in filter(None, _TAG_SPLIT.split(html)):
        if not chunk.startswith("<"):
            pieces.append(chunk if protected else _emit_links(_emit_chem(chunk)))
            continue
        pieces.append(chunk)
        tag = _TAG_NAME_RE.match(chunk)
        if tag is None:
            continue
        if tag.group(2).lower() not in _SKIP_TAGS:
            continue
        if chunk.rstrip().endswith("/>"):
            continue  # self-closing: opens nothing
        step = -1 if tag.group(1) else 1
        # A stray closing tag must not drive the depth below zero.
        protected = max(0, protected + step)
    return "".join(pieces)
def _read_sections(src: Path) -> str:
    """Return the markdown text found at *src*.

    Args:
        src: A single markdown file, or a directory whose ``*.md`` files are
            concatenated in sorted (lexicographic) path order, separated by a
            blank line.

    Returns:
        The markdown source as one string.

    Raises:
        SystemExit: *src* is a directory that contains no ``*.md`` file.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM.
    # Left in place, the BOM would precede the first character of a section
    # (typically a heading marker) and end up mid-document after joining.
    if not src.is_dir():
        return src.read_text(encoding="utf-8-sig")
    parts = [
        md.read_text(encoding="utf-8-sig")
        for md in sorted(src.glob("*.md"))
        if md.is_file()  # a sub-directory named "x.md" is not a section
    ]
    if not parts:
        raise SystemExit(f"[render_pdf] no *.md under {src}")
    return "\n\n".join(parts)
def _css(color: bool) -> str:
    """Return the print stylesheet for the generated HTML.

    With *color* true the theme constants are used; otherwise headings, table
    headers and links are black and the zebra / blockquote fills are white.
    """
    if color:
        primary, head_fill, zebra = PRIMARY, TABLE_HEAD_FILL, TABLE_ZEBRA
        tldr, link = TLDR_FILL, LINK_BLUE
    else:
        primary = head_fill = link = "#000000"
        zebra = tldr = "#FFFFFF"
    # The stylesheet text is kept flush-left: its whitespace is part of the output.
    return f"""
@page {{ size: A4; margin: 2.2cm 2cm; }}
* {{ -webkit-print-color-adjust: exact; print-color-adjust: exact; }}
body {{ font-family: 'Times New Roman','Noto Serif CJK SC','Noto Sans CJK SC',serif;
font-size: 12pt; line-height: 1.6; color: #000; }}
h1 {{ font-family: 'Noto Sans CJK SC',sans-serif; font-size: 19pt; color: {primary};
border-bottom: 2px solid {primary}; padding-bottom: 4pt; margin: 22pt 0 12pt; }}
h2 {{ font-family: 'Noto Sans CJK SC',sans-serif; font-size: 15pt; color: {primary}; margin: 20pt 0 8pt; }}
h3 {{ font-family: 'Noto Sans CJK SC',sans-serif; font-size: 13pt; color: {primary}; margin: 16pt 0 6pt; }}
p {{ text-align: justify; margin: 6pt 0; }}
a {{ color: {link}; text-decoration: underline; word-break: break-all; }}
sub {{ font-size: 0.72em; }}
table {{ border-collapse: collapse; width: 100%; margin: 12pt 0; font-size: 10.5pt; }}
th {{ background: {head_fill}; color: #fff; padding: 6pt 8pt; border: 1px solid #999; text-align: center; }}
td {{ padding: 5pt 8pt; border: 1px solid #999; }}
tr:nth-child(even) td {{ background: {zebra}; }}
blockquote {{ border-left: 4px solid {primary}; background: {tldr}; margin: 12pt 0;
padding: 8pt 12pt; font-size: 11pt; }}
blockquote p {{ margin: 3pt 0; }}
code {{ font-family: Consolas,monospace; font-size: 10pt; background: #f5f5f5; padding: 1pt 3pt; }}
ul,ol {{ margin: 6pt 0; padding-left: 22pt; }}
li {{ margin: 3pt 0; }}
"""
def _find_chromium() -> str:
    """Locate a chromium / chrome executable and return its path.

    The ``CHROMIUM`` (then ``CHROME``) environment variable is tried first,
    followed by the usual binary names and ``/usr/bin`` locations.

    Returns:
        Path of the first candidate that resolves to an executable file.

    Raises:
        SystemExit: No candidate resolves to an executable.
    """
    override = os.environ.get("CHROMIUM") or os.environ.get("CHROME")
    candidates = [override] if override else []
    candidates += ["chromium", "chromium-browser", "google-chrome",
                   "/usr/bin/chromium", "/usr/bin/chromium-browser"]
    for cand in candidates:
        # shutil.which covers both PATH lookups and explicit paths, and only
        # accepts executable files.
        resolved = shutil.which(cand)
        if resolved:
            return resolved
        # A bare relative name in the current directory is not searched by
        # which() on POSIX.  Accept it only if it is an executable file (a
        # directory of the same name must not match) and return an absolute
        # path so that subprocess can launch it.
        local = Path(cand)
        if local.is_file() and os.access(local, os.X_OK):
            return str(local.resolve())
    raise SystemExit("[render_pdf] chromium 不在沙盒里(镜像应已装,给 mermaid 用)。"
                     "确认 `which chromium` 或设 CHROMIUM 环境变量。")
def md_to_pdf(src: Path, out: Path, *, color: bool = True, profile: str = "") -> Path:
    """Render the markdown at *src* to a PDF file at *out* with headless chromium.

    The markdown is converted to HTML with the ``markdown`` package, enriched
    with subscripts and hyperlinks, wrapped in a styled HTML document and
    printed to PDF by chromium.

    Args:
        src: A markdown file or a directory of ``*.md`` sections.
        out: Destination PDF path; missing parent directories are created.
        color: Use the theme colors when true, black/white styling otherwise.
        profile: Accepted for interface compatibility; not used by this function.

    Returns:
        *out*.

    Raises:
        SystemExit: The ``markdown`` package or chromium is missing, *src* has
            no sections, or chromium fails to produce a non-empty PDF.
        subprocess.TimeoutExpired: chromium runs longer than 120 seconds.
    """
    try:
        import markdown
    except ImportError:
        raise SystemExit("[render_pdf] 缺 `markdown` 包。基础镜像应已装(requirements.txt);"
                         "本地补:.venv/Scripts/python.exe -m pip install markdown")
    md_text = _read_sections(src)
    body = markdown.markdown(
        md_text, extensions=["tables", "fenced_code", "sane_lists", "attr_list"]
    )
    body = _enrich_html(body)
    html = (f'<!DOCTYPE html><html lang="zh-CN"><head><meta charset="utf-8">'
            f"<style>{_css(color)}</style></head><body>{body}</body></html>")
    chromium = _find_chromium()
    out.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="render-pdf-") as tmp:
        work = Path(tmp)
        html_path = work / "doc.html"
        # Render into the scratch directory first.  Checking *out* directly
        # would accept a stale PDF left by an earlier run when chromium exits
        # with rc=0 without writing anything, and a failed run would clobber
        # a previously good file.
        pdf_path = work / "doc.pdf"
        html_path.write_text(html, encoding="utf-8")
        cmd = [
            chromium, "--headless", "--disable-gpu", "--no-sandbox",
            "--disable-dev-shm-usage", f"--user-data-dir={tmp}/cr",
            "--no-pdf-header-footer",
            f"--print-to-pdf={pdf_path}", html_path.as_uri(),
        ]
        proc = subprocess.run(cmd, capture_output=True, timeout=120, check=False)
        if proc.returncode != 0 or not pdf_path.exists() or pdf_path.stat().st_size == 0:
            tail = (proc.stderr or proc.stdout or b"").decode("utf-8", "replace")[-600:]
            raise SystemExit(f"[render_pdf] chromium 转 PDF 失败(rc={proc.returncode}):\n{tail}")
        # shutil.move also works when the scratch directory and *out* live on
        # different filesystems.
        shutil.move(str(pdf_path), str(out))
    return out