From 97b23a2b068e03d21ee8f610b9f0e5fa25209729 Mon Sep 17 00:00:00 2001 From: caoqianming Date: Mon, 29 Jun 2026 09:14:23 +0800 Subject: [PATCH] =?UTF-8?q?fix(resm):=20=E9=9D=99=E9=9F=B3=20pypdf=20?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E5=9D=8F=20PDF=20=E6=97=B6=E7=9A=84=E6=81=A2?= =?UTF-8?q?=E5=A4=8D=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _pdf_page_count 读到损坏 PDF 时 pypdf 会刷大量 incorrect header / Cannot find /Root 等恢复日志, 污染 fix_preview_pdf 等批处理输出。将 pypdf logger 调到 CRITICAL 静音; 解析失败仍按 None 处理(跳过该条)。 Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/resm/tasks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index 3da4c96..b1409e7 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -607,6 +607,10 @@ def _pdf_page_count(content: bytes): (对未压缩对象树有效, Elsevier 的摘要预览页正属此类)。""" try: from io import BytesIO + import logging + # 坏 PDF 会让 pypdf 刷大量恢复日志(incorrect header / Cannot find /Root 等), + # 这里只关心页数, 静音其 logger 避免污染输出。 + logging.getLogger("pypdf").setLevel(logging.CRITICAL) from pypdf import PdfReader return len(PdfReader(BytesIO(content), strict=False).pages) except ImportError: