From 2d6df6813566820be0b8c4fac26432613dcd933d Mon Sep 17 00:00:00 2001 From: caoqianming Date: Mon, 29 Jun 2026 09:38:05 +0800 Subject: [PATCH] =?UTF-8?q?feat(resm):=20fix=5Fpreview=5Fpdf=20=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E5=9D=8F=20PDF=20=E8=AF=86=E5=88=AB=E4=B8=8E=E6=B8=85?= =?UTF-8?q?=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 历史记录里除 Elsevier 1 页摘要预览页外, 还有把 HTML 错误页 / 截断垃圾当 PDF 存下的损坏文件, 同样被误标 has_fulltext_pdf=True。 - tasks.py: 新增 _inspect_pdf 分类器 (broken/preview/ok/unknown)。broken 仅在铁证 下判定(非 %PDF 魔数, 或装了 pypdf 且解析失败); 未装 pypdf 又判不出页数归 unknown, 绝不误删。 - fix_preview_pdf: 预览页文件仅 --delete-file 时删; 坏文件总是删(dry-run 除外), 坏文件打 fail_reason=pdf_broken; 无 XML 全文者一并回退 has_fulltext。 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../management/commands/fix_preview_pdf.py | 80 +++++++++++-------- apps/resm/tasks.py | 31 +++++++ 2 files changed, 78 insertions(+), 33 deletions(-) diff --git a/apps/resm/management/commands/fix_preview_pdf.py b/apps/resm/management/commands/fix_preview_pdf.py index 2056814..89eb0f8 100644 --- a/apps/resm/management/commands/fix_preview_pdf.py +++ b/apps/resm/management/commands/fix_preview_pdf.py @@ -1,34 +1,37 @@ -"""一次性修复: 把误标为全文 PDF 的 Elsevier "摘要预览页"(1 页)纠正回未下载状态。 +"""一次性修复: 纠正被误标为全文 PDF 的历史记录(Elsevier 摘要预览页 / 损坏文件)。 背景: Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含 - 摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小), 而全文 XML 却能正常拿到。旧抓取 - 逻辑只校验魔数 + 体积, 误将预览页落库并置 has_fulltext_pdf=True。 + 摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小); 另有部分历史记录把 HTML 错误页 / + 被截断的垃圾当 PDF 存了。旧抓取逻辑只校验魔数 + 体积, 都会误标 has_fulltext_pdf=True。 -本命令重新核对本地 PDF 的页数, 对 <= 1 页者: - - has_fulltext_pdf 置回 False - - 若该论文有 XML 全文(has_fulltext_xml=True), 保留 has_fulltext=True; - 否则(此前只有这张假预览页冒充全文)一并把 has_fulltext 回退为 False, - 让它能重新进入下载链路去找真正的全文。 - - 追加 fail_reason 'elsevier_pdf_preview_only' (供 Elsevier 补抓队列排除, 避免无限重试) - - 可选: 删除本地预览 PDF 文件 (--delete-file) +本命令核对本地 PDF, 分两类处理: + - 预览页(1 页): has_fulltext_pdf 置回 False; 文件**仅在 --delete-file 时**删除。 + - 损坏文件(非 PDF / pypdf 解析失败): has_fulltext_pdf 置回 False; 文件**总是删除** + (dry-run 除外), 因为它根本不是有效全文, 留着无用且会污染下游解析。 + 两类在缺少 XML 全文(has_fulltext_xml=False)时, 一并把 has_fulltext 回退 False, + 让其重新进入下载链路去找真正的全文; 并追加 fail_reason 标记供抓取任务排除。 -文件读取依赖本地存在 PDF (在跑抓取的服务器上执行)。建议先 --dry-run 看统计。 +安全前提: + "损坏"只在铁证下判定 —— 文件不以 %PDF 开头, 或已装 pypdf 且解析直接失败。 + 若未装 pypdf 且魔数正常但页数判不出, 归为 unknown, **不处理、绝不删除**。 + 强烈建议先 `pip install pypdf` 再跑, 否则只能处理魔数明显不符的坏文件。 用法: python manage.py fix_preview_pdf --dry-run - python manage.py fix_preview_pdf --delete-file + python manage.py fix_preview_pdf # 纠正标记 + 删除坏文件, 保留预览页文件 + python manage.py fix_preview_pdf --delete-file # 并删除预览页文件 """ import os from django.core.management.base import BaseCommand from apps.resm.models import Paper -from apps.resm.tasks import _pdf_page_count +from apps.resm.tasks import _inspect_pdf class Command(BaseCommand): - help = "纠正被误标为全文的 Elsevier 摘要预览 PDF(1 页)" + help = "纠正被误标为全文的 Elsevier 预览页 / 损坏 PDF" def add_arguments(self, parser): parser.add_argument("--dry-run", action="store_true", @@ -36,12 +39,12 @@ class Command(BaseCommand): parser.add_argument("--limit", type=int, default=0, help="最多处理多少条 (0=不限)") parser.add_argument("--delete-file", action="store_true", - help="同时删除本地预览 PDF 文件") + help="同时删除预览页文件(坏文件无论该开关都会删)") def handle(self, *args, **opts): dry = opts["dry_run"] limit = opts["limit"] - del_file = opts["delete_file"] + del_preview = opts["delete_file"] qs = Paper.objects.filter( has_fulltext_pdf=True, doi__startswith="10.1016" @@ -50,7 +53,8 @@ class Command(BaseCommand): self.stdout.write( f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}") - checked = fixed = only_pdf = missing = unreadable = 0 + checked = preview = broken = only_pdf = deleted = 0 + missing = unknown = 0 for paper in qs.iterator(): if limit and checked >= limit: break @@ -64,43 +68,53 @@ class Command(BaseCommand): with open(path, "rb") as f: content = f.read() except OSError: - unreadable += 1 + unknown += 1 continue - pages = _pdf_page_count(content) - if pages is None: - unreadable += 1 + kind, pages = _inspect_pdf(content) + if kind == "ok": + continue + if kind == "unknown": + unknown += 1 continue - if pages > 1: - continue # 真全文, 跳过 - fixed += 1 + # kind in ('preview', 'broken'): 都要纠正标记 + do_delete = (kind == "broken") or del_preview only_pdf_case = not paper.has_fulltext_xml + if kind == "preview": + preview += 1 + tag = f"preview {pages}p" + else: + broken += 1 + tag = "broken" if only_pdf_case: only_pdf += 1 self.stdout.write( - f"[preview {pages}p]{' (only-pdf)' if only_pdf_case else ''} " - f"{paper.doi} {path}") + f"[{tag}]{' (only-pdf)' if only_pdf_case else ''}" + f"{' +rm' if do_delete else ''} {paper.doi} {path}") if dry: continue paper.has_fulltext_pdf = False update_fields = ["has_fulltext_pdf", "update_time"] - # 没有 XML 全文时, 之前的 has_fulltext 只是被这张假预览页置上的, 一并回退 - if not paper.has_fulltext_xml: + # 没有 XML 全文时, has_fulltext 之前只是被这张假/坏 PDF 置上的, 一并回退 + if only_pdf_case: paper.has_fulltext = False update_fields.insert(0, "has_fulltext") paper.save(update_fields=update_fields) - if "elsevier_pdf_preview_only" not in (paper.fail_reason or ""): - paper.save_fail_reason("elsevier_pdf_preview_only") - if del_file: + reason = "elsevier_pdf_preview_only" if kind == "preview" else "pdf_broken" + if reason not in (paper.fail_reason or ""): + paper.save_fail_reason(reason) + if do_delete: try: os.remove(path) + deleted += 1 except OSError: pass self.stdout.write(self.style.SUCCESS( - f"检查={checked} 预览页修复={fixed} (其中无XML全文/一并回退has_fulltext={only_pdf}) " - f"文件缺失={missing} 无法解析={unreadable}" + f"检查={checked} 预览页={preview} 坏文件={broken} " + f"(无XML全文一并回退has_fulltext={only_pdf}) 删除文件={deleted} " + f"文件缺失={missing} 未知/跳过={unknown}" + (" (dry-run, 未写库)" if dry else "") )) diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index b1409e7..6308551 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -639,6 +639,37 @@ def _is_elsevier_preview_pdf(content: bytes) -> bool: return pages is not None and pages <= 1 +def _inspect_pdf(content: bytes): + """对历史落库的 PDF 文件分类, 返回 (kind, pages)。 + + kind: + 'broken' - 非 PDF(魔数不符)或 pypdf 解析直接失败 -> 可安全删除重抓 + 'preview' - 1 页摘要预览页 + 'ok' - 多页, 视为真全文, 不处理 + 'unknown' - 魔数正常但页数判不出(通常因未装 pypdf) -> 不处理, 绝不当坏文件 + pages: 页数; 无法确定为 None。""" + if not content or b"%PDF" not in content[:1024]: + return "broken", 0 + try: + from io import BytesIO + import logging + logging.getLogger("pypdf").setLevel(logging.CRITICAL) + from pypdf import PdfReader + except ImportError: + # 没装 pypdf: 只能靠字节扫描, 判不出就 unknown(从宽, 不误判为坏) + pages = _pdf_page_count(content) + if pages is None: + return "unknown", None + return ("preview" if pages <= 1 else "ok"), pages + try: + pages = len(PdfReader(BytesIO(content), strict=False).pages) + except Exception: + return "broken", None + if pages <= 0: + return "broken", pages + return ("preview" if pages == 1 else "ok"), pages + + def _elsevier_fetch_pdf(req, paper): """同一 DOI 取 PDF, 成功落库返回 True。""" try: