# paper_server/apps/resm/pdf_utils.py

"""PDF 解析/分类工具(纯 stdlib + pypdf, 不依赖 Django)。

独立成模块, 以便 ProcessPoolExecutor 的子进程能安全导入(fork/spawn 均可),
不会牵连 Django 模型与配置。tasks.py 从这里复用这些函数。
"""
import os
import re


def _pdf_page_count(content: bytes):
    """Return the number of pages in a PDF, or None when it cannot be determined.

    pypdf (non-strict mode) is tried first. Only when an ImportError is
    raised (typically because pypdf is not installed) does the function fall
    back to a regex scan of the raw bytes: the largest ``/Count N`` value if
    any is present, otherwise the number of ``/Type /Page`` markers. The scan
    only sees objects stored uncompressed (per the original author's note,
    Elsevier's abstract-preview PDFs are of that kind).

    Note: when pypdf is importable but raises any other exception, None is
    returned right away; the byte scan is NOT attempted in that case.
    """
    try:
        import logging
        from io import BytesIO

        # Broken PDFs make pypdf emit a flood of recovery messages; only the
        # page count matters here, so its logger is silenced.
        logging.getLogger("pypdf").setLevel(logging.CRITICAL)
        from pypdf import PdfReader

        reader = PdfReader(BytesIO(content), strict=False)
        return len(reader.pages)
    except ImportError:
        pass  # continue with the byte scan below
    except Exception:
        return None

    try:
        declared = re.findall(rb"/Count\s+(\d+)", content)
        if declared:
            return max(map(int, declared))
        # The negative lookahead skips "/Pages" (and names continuing with "R").
        page_markers = re.findall(rb"/Type\s*/Page(?![sR])", content)
        return len(page_markers) or None
    except Exception:
        return None


def _is_elsevier_preview_pdf(content: bytes) -> bool:
    """Tell whether a PDF returned by Elsevier is only an abstract preview.

    Per the original author's note, the Elsevier Article API answers the
    application/pdf endpoint for unentitled / in-press articles with a
    one-page, abstract-only PDF (valid %PDF magic, non-trivial size) even
    though the full-text XML may be fine.

    Returns True only when the page count can be determined and is <= 1.
    An undeterminable page count yields False, so that a genuine full text
    is never rejected by mistake.
    """
    page_total = _pdf_page_count(content)
    if page_total is None:
        return False
    return page_total <= 1


def _inspect_pdf(content: bytes):
    """Classify the bytes of a previously stored PDF; return ``(kind, pages)``.

    kind:
      'broken'  - empty content or no ``%PDF`` marker in the first 1024 bytes
                  (pages is 0), pypdf raised while parsing (pages is None), or
                  pypdf reported a non-positive page count. Per the original
                  author's note such files are safe to delete and re-fetch.
      'preview' - one page (abstract preview); in the no-pypdf fallback, any
                  scanned count <= 1.
      'ok'      - more than one page; treated as real full text.
      'unknown' - pypdf could not be imported and the byte scan could not
                  determine a page count. Never reported as broken.
    pages: the page count, or None when it could not be determined.
    """
    # Magic-number gate: anything without "%PDF" near the start is not a PDF.
    if not (content and b"%PDF" in content[:1024]):
        return "broken", 0

    try:
        import logging
        from io import BytesIO

        logging.getLogger("pypdf").setLevel(logging.CRITICAL)
        from pypdf import PdfReader
    except ImportError:
        # Without pypdf only the byte scan is available; an inconclusive
        # scan is reported as 'unknown' rather than 'broken'.
        scanned = _pdf_page_count(content)
        if scanned is None:
            return "unknown", None
        label = "ok" if scanned > 1 else "preview"
        return label, scanned

    try:
        reader = PdfReader(BytesIO(content), strict=False)
        total = len(reader.pages)
    except Exception:
        return "broken", None

    if total <= 0:
        return "broken", total
    if total == 1:
        return "preview", total
    return "ok", total


def classify_pdf_file(path: str):
    """Concurrent-worker entry point: read one PDF file and classify it.

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        ``(path, kind, pages)``. ``kind`` is whatever ``_inspect_pdf``
        reports for the file's bytes, or one of two I/O outcomes (``pages``
        is None for both):
          'missing'    - the file does not exist, including when it
                         disappears between the existence check and the read
          'unreadable' - any other OSError while opening or reading
                         (permissions, path is a directory, ...)

    Uses only the standard library plus pypdf, so a process pool can import
    and pickle it safely.
    """
    try:
        # Pre-check retained so every path os.path.exists() rejects keeps
        # being reported as 'missing', exactly as before.
        if not os.path.exists(path):
            return path, "missing", None
        with open(path, "rb") as f:
            content = f.read()
    except FileNotFoundError:
        # The file vanished after the check above (e.g. deleted by a
        # concurrent process): that is 'missing', not 'unreadable'.
        return path, "missing", None
    except OSError:
        return path, "unreadable", None
    kind, pages = _inspect_pdf(content)
    return path, kind, pages