178 lines
7.2 KiB
Python
178 lines
7.2 KiB
Python
"""md(sections 目录或单 .md)→ PDF,沙盒自带 chromium 渲染。
|
|
|
|
渲染链(全程沙盒内,不进 weasyprint、不装额外包):
|
|
md --(python `markdown` 库)--> HTML --(chromium --headless --print-to-pdf)--> PDF
|
|
|
|
chromium 是镜像里已装的(给 mermaid 用),fonts-noto-cjk 也已装;chromium 是完整浏览器
|
|
内核,CSS 保真度比 weasyprint 高。冒烟见 deploy/sandbox/probe_chromium_pdf.sh。
|
|
|
|
视觉与 docx 一致:复用 common.CHEM_RE(化学式下标白名单,单一事实源)+ 商务红配色 +
|
|
DOI/URL 超链。引文 [n] 上标回链这版按字面渲染(后续与 docx 一起 DRY 再补)。
|
|
ASCII-only stdout(Windows GBK 控制台安全)。
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
from .common import CHEM_RE
|
|
|
|
# ───────────────────────── Theme colors (match the docx business-red palette) ─────────────────────────
# Accent color: _css() uses it for h1-h3 text, the h1 underline and the blockquote left border.
PRIMARY = "#C00000"
# Background fill applied to blockquotes by _css().
TLDR_FILL = "#FBE9E9"
# Hyperlink color applied to <a> by _css().
LINK_BLUE = "#1155CC"
# Background of table header cells (<th>) in _css().
TABLE_HEAD_FILL = "#C00000"
# Background of even table rows (zebra striping) in _css().
TABLE_ZEBRA = "#F8F0F0"
|
|
|
|
# Inline DOI substring (HTML-safe boundaries): "10.", 4-9 digits, "/", then a
# run of characters that stops at whitespace, "<", ">" or a double quote.
_DOI_INLINE_RE = re.compile(r"10\.\d{4,9}/[^\s<>\"]+")
# Bare URL / domain token: optional http(s):// scheme, a dotted host ending in
# a 2+ letter label, and an optional path. The lookbehind rejects a start that
# directly follows a word character, "/", "@" or ".".
_URL_TOKEN_RE = re.compile(
    r"(?<![\w/@.])((?:https?://)?[a-z0-9][\w.\-]*\.[a-z]{2,}(?:/[^\s<>\"]*)?)",
    re.IGNORECASE,
)
# Splits HTML into [text, tag, ...]; subscript / hyperlink substitution is
# applied to the text tokens only.
_TAG_SPLIT = re.compile(r"(<[^>]+>)")
# Elements whose text content _enrich_html() leaves untouched.
_SKIP_TAGS = {"a", "code", "pre", "script", "style", "head"}
# Captures the optional closing "/" (group 1) and the tag name (group 2).
_TAG_NAME_RE = re.compile(r"<\s*(/?)\s*([a-zA-Z0-9]+)")
|
|
|
|
|
|
def _log(msg: str) -> None:
    """Print *msg* to stdout, prefixed with the ``[render_pdf]`` tag."""
    print("[render_pdf]", msg)
|
|
|
|
|
|
def _emit_chem(text: str) -> str:
    """Wrap every digit run inside a ``CHEM_RE`` match in ``<sub>`` tags.

    Text that ``CHEM_RE`` does not match is returned unchanged.
    """
    def subscript_digits(formula: re.Match) -> str:
        # Only the digits of the matched formula are wrapped; letters stay as-is.
        return re.sub(
            r"\d+",
            lambda run: f"<sub>{run.group(0)}</sub>",
            formula.group(0),
        )

    return CHEM_RE.sub(subscript_digits, text)
|
|
|
|
|
|
# Combined scanner built from the two module-level patterns. At each position
# the DOI alternative is tried first, then the URL token; the leftmost match
# wins, so a URL that embeds a DOI (https://doi.org/10.x/y) is consumed whole
# and a bare DOI is never re-scanned for URL-looking fragments.
_LINK_TOKEN_RE = re.compile(
    f"(?P<doi>{_DOI_INLINE_RE.pattern})|(?P<url>{_URL_TOKEN_RE.pattern})",
    re.IGNORECASE,
)


def _emit_links(text: str) -> str:
    """Turn bare DOIs and URL / domain tokens in *text* into ``<a>`` links.

    *text* may contain HTML tags (for example ``<sub>`` inserted by
    ``_emit_chem``); tag tokens are passed through untouched and only the
    text between them is scanned.

    Each text piece is scanned once, left to right:

    * a DOI becomes a link to ``https://doi.org/<doi>``;
    * a URL / domain token becomes a link to itself, with ``https://``
      prepended when it carries no ``http://`` / ``https://`` scheme.

    The single pass guarantees one anchor per token: a URL containing a DOI
    yields one link (not a link to the host followed by a DOI link), and
    generated anchors are never linkified again (no nested ``<a>``).

    Returns the HTML string with the anchors inserted.
    """
    def link_repl(m: re.Match) -> str:
        doi = m.group("doi")
        if doi is not None:
            return f'<a href="https://doi.org/{doi}">{doi}</a>'
        raw = m.group("url")
        # Require the full scheme: a host such as "httpbin.org" starts with
        # "http" but still needs a scheme to be an absolute link.
        has_scheme = raw.lower().startswith(("http://", "https://"))
        href = raw if has_scheme else f"https://{raw}"
        return f'<a href="{href}">{raw}</a>'

    return "".join(
        piece if piece.startswith("<") else _LINK_TOKEN_RE.sub(link_repl, piece)
        for piece in _TAG_SPLIT.split(text)
    )
|
|
|
|
|
|
def _enrich_html(html: str) -> str:
    """Apply chemical subscripts and DOI/URL links to the text parts of *html*.

    The markup is split into tag and text tokens. Tags are copied through
    unchanged; text is rewritten with ``_emit_chem`` then ``_emit_links``
    unless it sits inside an element listed in ``_SKIP_TAGS``, in which case
    it is copied verbatim.
    """
    pieces = []
    protected = 0  # nesting depth of currently open _SKIP_TAGS elements
    for chunk in filter(None, _TAG_SPLIT.split(html)):
        if not chunk.startswith("<"):
            pieces.append(chunk if protected else _emit_links(_emit_chem(chunk)))
            continue

        pieces.append(chunk)
        tag = _TAG_NAME_RE.match(chunk)
        if tag is None:
            continue
        slash, name = tag.groups()
        # Self-closing tags neither open nor close a protected region.
        if name.lower() not in _SKIP_TAGS or chunk.rstrip().endswith("/>"):
            continue
        if slash:
            # A stray closing tag must not drive the depth below zero.
            protected = max(0, protected - 1)
        else:
            protected += 1
    return "".join(pieces)
|
|
|
|
|
|
def _read_sections(src: Path) -> str:
    """Return the Markdown text found at *src*.

    If *src* is a directory, its ``*.md`` files are read as UTF-8 in sorted
    path order and joined with a blank line between them. Otherwise *src* is
    read as a single UTF-8 file.

    Raises:
        SystemExit: *src* is a directory that contains no ``*.md`` file.
    """
    if not src.is_dir():
        return src.read_text(encoding="utf-8")

    md_files = sorted(src.glob("*.md"))
    if not md_files:
        raise SystemExit(f"[render_pdf] no *.md under {src}")
    return "\n\n".join(path.read_text(encoding="utf-8") for path in md_files)
|
|
|
|
|
|
def _css(color: bool) -> str:
    """Return the print stylesheet embedded in the generated HTML.

    With ``color=True`` the module-level theme colors are used. With
    ``color=False`` the accent, table-header and link colors become black
    and the zebra / blockquote fills become white.
    """
    if color:
        primary, head_fill, link = PRIMARY, TABLE_HEAD_FILL, LINK_BLUE
        zebra, tldr = TABLE_ZEBRA, TLDR_FILL
    else:
        primary = head_fill = link = "#000000"
        zebra = tldr = "#FFFFFF"

    stylesheet = f"""
@page {{ size: A4; margin: 2.2cm 2cm; }}
* {{ -webkit-print-color-adjust: exact; print-color-adjust: exact; }}
body {{ font-family: 'Times New Roman','Noto Serif CJK SC','Noto Sans CJK SC',serif;
font-size: 12pt; line-height: 1.6; color: #000; }}
h1 {{ font-family: 'Noto Sans CJK SC',sans-serif; font-size: 19pt; color: {primary};
border-bottom: 2px solid {primary}; padding-bottom: 4pt; margin: 22pt 0 12pt; }}
h2 {{ font-family: 'Noto Sans CJK SC',sans-serif; font-size: 15pt; color: {primary}; margin: 20pt 0 8pt; }}
h3 {{ font-family: 'Noto Sans CJK SC',sans-serif; font-size: 13pt; color: {primary}; margin: 16pt 0 6pt; }}
p {{ text-align: justify; margin: 6pt 0; }}
a {{ color: {link}; text-decoration: underline; word-break: break-all; }}
sub {{ font-size: 0.72em; }}
table {{ border-collapse: collapse; width: 100%; margin: 12pt 0; font-size: 10.5pt; }}
th {{ background: {head_fill}; color: #fff; padding: 6pt 8pt; border: 1px solid #999; text-align: center; }}
td {{ padding: 5pt 8pt; border: 1px solid #999; }}
tr:nth-child(even) td {{ background: {zebra}; }}
blockquote {{ border-left: 4px solid {primary}; background: {tldr}; margin: 12pt 0;
padding: 8pt 12pt; font-size: 11pt; }}
blockquote p {{ margin: 3pt 0; }}
code {{ font-family: Consolas,monospace; font-size: 10pt; background: #f5f5f5; padding: 1pt 3pt; }}
ul,ol {{ margin: 6pt 0; padding-left: 22pt; }}
li {{ margin: 3pt 0; }}
"""
    return stylesheet
|
|
|
|
|
|
def _find_chromium() -> str:
    """Locate a chromium / chrome executable and return its path.

    The ``CHROMIUM`` environment variable (or ``CHROME`` when ``CHROMIUM`` is
    unset or empty) is tried first, followed by a fixed list of common command
    names and absolute paths. A candidate is accepted when ``shutil.which``
    resolves it (the resolved path is returned) or when it exists as a path
    (the candidate itself is returned).

    Raises:
        SystemExit: no candidate could be found.
    """
    candidates = [
        "chromium", "chromium-browser", "google-chrome",
        "/usr/bin/chromium", "/usr/bin/chromium-browser",
    ]
    override = os.environ.get("CHROMIUM") or os.environ.get("CHROME")
    if override:
        # An explicit override takes precedence over the built-in names.
        candidates.insert(0, override)

    for name in candidates:
        resolved = shutil.which(name)
        if resolved:
            return resolved
        if Path(name).exists():
            return name

    raise SystemExit("[render_pdf] chromium 不在沙盒里(镜像应已装,给 mermaid 用)。"
                     "确认 `which chromium` 或设 CHROMIUM 环境变量。")
|
|
|
|
|
|
def md_to_pdf(src: Path, out: Path, *, color: bool = True, profile: str = "") -> Path:
    """Render the Markdown at *src* into a PDF written to *out*.

    Pipeline: read the Markdown (``_read_sections``), convert it to HTML with
    the ``markdown`` package, post-process the HTML (``_enrich_html``), wrap
    it in a page carrying the ``_css`` stylesheet, and print that page to PDF
    with headless chromium.

    Args:
        src: A Markdown file, or a directory of ``*.md`` section files.
        out: Destination PDF path; missing parent directories are created.
        color: Passed to ``_css``; ``False`` selects the black/white palette.
        profile: Not read by this function; kept for interface compatibility.

    Returns:
        *out*, once the PDF has been written.

    Raises:
        SystemExit: the ``markdown`` package is missing, chromium cannot be
            found, chromium times out, exits non-zero, or produces no
            (or an empty) PDF.
    """
    try:
        import markdown
    except ImportError:
        raise SystemExit("[render_pdf] 缺 `markdown` 包。基础镜像应已装(requirements.txt);"
                         "本地补:.venv/Scripts/python.exe -m pip install markdown")

    md_text = _read_sections(src)
    body = markdown.markdown(
        md_text, extensions=["tables", "fenced_code", "sane_lists", "attr_list"]
    )
    body = _enrich_html(body)
    html = (f'<!DOCTYPE html><html lang="zh-CN"><head><meta charset="utf-8">'
            f"<style>{_css(color)}</style></head><body>{body}</body></html>")

    chromium = _find_chromium()
    out.parent.mkdir(parents=True, exist_ok=True)
    timeout_s = 120
    with tempfile.TemporaryDirectory(prefix="render-pdf-") as tmp:
        html_path = Path(tmp) / "doc.html"
        html_path.write_text(html, encoding="utf-8")
        # Render into the scratch directory, not straight to `out`: a PDF left
        # at `out` by an earlier run would otherwise satisfy the exists/size
        # check below and hide a chromium run that wrote nothing.
        pdf_path = Path(tmp) / "doc.pdf"
        cmd = [
            chromium, "--headless", "--disable-gpu", "--no-sandbox",
            "--disable-dev-shm-usage", f"--user-data-dir={tmp}/cr",
            "--no-pdf-header-footer",
            f"--print-to-pdf={pdf_path}", html_path.as_uri(),
        ]
        try:
            proc = subprocess.run(cmd, capture_output=True, timeout=timeout_s, check=False)
        except subprocess.TimeoutExpired as exc:
            # Report a hang the same way as every other failure (SystemExit)
            # instead of leaking a traceback.
            tail = (exc.stderr or exc.stdout or b"").decode("utf-8", "replace")[-600:]
            raise SystemExit(
                f"[render_pdf] chromium timed out after {timeout_s}s:\n{tail}"
            ) from exc
        if proc.returncode != 0 or not pdf_path.exists() or pdf_path.stat().st_size == 0:
            tail = (proc.stderr or proc.stdout or b"").decode("utf-8", "replace")[-600:]
            raise SystemExit(f"[render_pdf] chromium 转 PDF 失败(rc={proc.returncode}):\n{tail}")
        # Copy (not rename): the scratch dir may be on another filesystem, and
        # the TemporaryDirectory cleanup removes the scratch copy anyway.
        shutil.copyfile(pdf_path, out)
    return out
|