feat(resm): fix_preview_pdf 增加坏 PDF 识别与清理

历史记录里除 Elsevier 1 页摘要预览页外, 还有把 HTML 错误页 / 截断垃圾当 PDF
存下的损坏文件, 同样被误标 has_fulltext_pdf=True。

- tasks.py: 新增 _inspect_pdf 分类器 (broken/preview/ok/unknown)。broken 仅在铁证
  下判定(非 %PDF 魔数, 或装了 pypdf 且解析失败); 未装 pypdf 又判不出页数归 unknown,
  绝不误删。
- fix_preview_pdf: 预览页文件仅 --delete-file 时删; 坏文件总是删(dry-run 除外),
  坏文件打 fail_reason=pdf_broken; 无 XML 全文者一并回退 has_fulltext。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
caoqianming 2026-06-29 09:38:05 +08:00
parent 97b23a2b06
commit 2d6df68135
2 changed files with 78 additions and 33 deletions

View File

@ -1,34 +1,37 @@
"""一次性修复: 把误标为全文 PDF 的 Elsevier "摘要预览页"(1 页)纠正回未下载状态 """一次性修复: 纠正被误标为全文 PDF 的历史记录(Elsevier 摘要预览页 / 损坏文件)
背景: 背景:
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含 Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含
摘要的 1 页预览 PDF(魔数仍是 %PDF体积也不小), 而全文 XML 却能正常拿到旧抓取 摘要的 1 页预览 PDF(魔数仍是 %PDF体积也不小); 另有部分历史记录把 HTML 错误页 /
逻辑只校验魔数 + 体积, 误将预览页落库并置 has_fulltext_pdf=True 被截断的垃圾当 PDF 存了旧抓取逻辑只校验魔数 + 体积, 都会误标 has_fulltext_pdf=True
本命令重新核对本地 PDF 的页数, <= 1 页者: 本命令核对本地 PDF, 分两类处理:
- has_fulltext_pdf 置回 False - 预览页(1 ): has_fulltext_pdf 置回 False; 文件**仅在 --delete-file **删除
- 若该论文有 XML 全文(has_fulltext_xml=True), 保留 has_fulltext=True; - 损坏文件( PDF / pypdf 解析失败): has_fulltext_pdf 置回 False; 文件**总是删除**
否则(此前只有这张假预览页冒充全文)一并把 has_fulltext 回退为 False, (dry-run 除外), 因为它根本不是有效全文, 留着无用且会污染下游解析
让它能重新进入下载链路去找真正的全文 两类在缺少 XML 全文(has_fulltext_xml=False), 一并把 has_fulltext 回退 False,
- 追加 fail_reason 'elsevier_pdf_preview_only' ( Elsevier 补抓队列排除, 避免无限重试) 让其重新进入下载链路去找真正的全文; 并追加 fail_reason 标记供抓取任务排除
- 可选: 删除本地预览 PDF 文件 (--delete-file)
文件读取依赖本地存在 PDF (在跑抓取的服务器上执行)建议先 --dry-run 看统计 安全前提:
"损坏"只在铁证下判定 文件不以 %PDF 开头, 或已装 pypdf 且解析直接失败
若未装 pypdf 且魔数正常但页数判不出, 归为 unknown, **不处理绝不删除**
强烈建议先 `pip install pypdf` 再跑, 否则只能处理魔数明显不符的坏文件
用法: 用法:
python manage.py fix_preview_pdf --dry-run python manage.py fix_preview_pdf --dry-run
python manage.py fix_preview_pdf --delete-file python manage.py fix_preview_pdf # 纠正标记 + 删除坏文件, 保留预览页文件
python manage.py fix_preview_pdf --delete-file # 并删除预览页文件
""" """
import os import os
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from apps.resm.models import Paper from apps.resm.models import Paper
from apps.resm.tasks import _pdf_page_count from apps.resm.tasks import _inspect_pdf
class Command(BaseCommand): class Command(BaseCommand):
help = "纠正被误标为全文的 Elsevier 摘要预览 PDF(1 页)" help = "纠正被误标为全文的 Elsevier 预览页 / 损坏 PDF"
def add_arguments(self, parser): def add_arguments(self, parser):
parser.add_argument("--dry-run", action="store_true", parser.add_argument("--dry-run", action="store_true",
@ -36,12 +39,12 @@ class Command(BaseCommand):
parser.add_argument("--limit", type=int, default=0, parser.add_argument("--limit", type=int, default=0,
help="最多处理多少条 (0=不限)") help="最多处理多少条 (0=不限)")
parser.add_argument("--delete-file", action="store_true", parser.add_argument("--delete-file", action="store_true",
help="同时删除本地预览 PDF 文件") help="同时删除预览页文件(坏文件无论该开关都会删)")
def handle(self, *args, **opts): def handle(self, *args, **opts):
dry = opts["dry_run"] dry = opts["dry_run"]
limit = opts["limit"] limit = opts["limit"]
del_file = opts["delete_file"] del_preview = opts["delete_file"]
qs = Paper.objects.filter( qs = Paper.objects.filter(
has_fulltext_pdf=True, doi__startswith="10.1016" has_fulltext_pdf=True, doi__startswith="10.1016"
@ -50,7 +53,8 @@ class Command(BaseCommand):
self.stdout.write( self.stdout.write(
f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}") f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")
checked = fixed = only_pdf = missing = unreadable = 0 checked = preview = broken = only_pdf = deleted = 0
missing = unknown = 0
for paper in qs.iterator(): for paper in qs.iterator():
if limit and checked >= limit: if limit and checked >= limit:
break break
@ -64,43 +68,53 @@ class Command(BaseCommand):
with open(path, "rb") as f: with open(path, "rb") as f:
content = f.read() content = f.read()
except OSError: except OSError:
unreadable += 1 unknown += 1
continue continue
pages = _pdf_page_count(content) kind, pages = _inspect_pdf(content)
if pages is None: if kind == "ok":
unreadable += 1 continue
if kind == "unknown":
unknown += 1
continue continue
if pages > 1:
continue # 真全文, 跳过
fixed += 1 # kind in ('preview', 'broken'): 都要纠正标记
do_delete = (kind == "broken") or del_preview
only_pdf_case = not paper.has_fulltext_xml only_pdf_case = not paper.has_fulltext_xml
if kind == "preview":
preview += 1
tag = f"preview {pages}p"
else:
broken += 1
tag = "broken"
if only_pdf_case: if only_pdf_case:
only_pdf += 1 only_pdf += 1
self.stdout.write( self.stdout.write(
f"[preview {pages}p]{' (only-pdf)' if only_pdf_case else ''} " f"[{tag}]{' (only-pdf)' if only_pdf_case else ''}"
f"{paper.doi} {path}") f"{' +rm' if do_delete else ''} {paper.doi} {path}")
if dry: if dry:
continue continue
paper.has_fulltext_pdf = False paper.has_fulltext_pdf = False
update_fields = ["has_fulltext_pdf", "update_time"] update_fields = ["has_fulltext_pdf", "update_time"]
# 没有 XML 全文时, 之前的 has_fulltext 只是被这张假预览页置上的, 一并回退 # 没有 XML 全文时, has_fulltext 之前只是被这张假/坏 PDF 置上的, 一并回退
if not paper.has_fulltext_xml: if only_pdf_case:
paper.has_fulltext = False paper.has_fulltext = False
update_fields.insert(0, "has_fulltext") update_fields.insert(0, "has_fulltext")
paper.save(update_fields=update_fields) paper.save(update_fields=update_fields)
if "elsevier_pdf_preview_only" not in (paper.fail_reason or ""): reason = "elsevier_pdf_preview_only" if kind == "preview" else "pdf_broken"
paper.save_fail_reason("elsevier_pdf_preview_only") if reason not in (paper.fail_reason or ""):
if del_file: paper.save_fail_reason(reason)
if do_delete:
try: try:
os.remove(path) os.remove(path)
deleted += 1
except OSError: except OSError:
pass pass
self.stdout.write(self.style.SUCCESS( self.stdout.write(self.style.SUCCESS(
f"检查={checked} 预览页修复={fixed} (其中无XML全文/一并回退has_fulltext={only_pdf}) " f"检查={checked} 预览页={preview} 坏文件={broken} "
f"文件缺失={missing} 无法解析={unreadable}" f"(无XML全文一并回退has_fulltext={only_pdf}) 删除文件={deleted} "
f"文件缺失={missing} 未知/跳过={unknown}"
+ (" (dry-run, 未写库)" if dry else "") + (" (dry-run, 未写库)" if dry else "")
)) ))

View File

@ -639,6 +639,37 @@ def _is_elsevier_preview_pdf(content: bytes) -> bool:
return pages is not None and pages <= 1 return pages is not None and pages <= 1
def _inspect_pdf(content: bytes):
    """Classify a PDF blob that an earlier fetch stored on disk.

    Args:
        content: Raw bytes of the stored file (may be empty or None).

    Returns:
        A ``(kind, pages)`` tuple. ``kind`` is one of:

        - ``'broken'``  - no ``%PDF`` marker within the first 1024 bytes, or
          pypdf is installed and either failed to parse the data or reported
          no pages. Callers treat this as safe to delete and re-fetch.
        - ``'preview'`` - exactly one page (abstract-only preview).
        - ``'ok'``      - more than one page; treated as real full text.
        - ``'unknown'`` - the marker is present but no positive page count
          could be established without pypdf. Never reported as broken.

        ``pages`` is the page count; it is 0 when the marker check fails and
        None when parsing failed or the count could not be determined.
    """
    # Hard evidence #1: nothing that looks like a PDF header near the start.
    if not content or b"%PDF" not in content[:1024]:
        return "broken", 0

    from io import BytesIO
    import logging

    # Silence pypdf's warnings about recoverable structural problems.
    logging.getLogger("pypdf").setLevel(logging.CRITICAL)
    try:
        from pypdf import PdfReader
    except ImportError:
        # Without pypdf only the byte-scan helper is available. It cannot
        # prove a file is damaged, so anything short of a positive page
        # count is 'unknown'. In particular a count of 0 must not be
        # reported as a one-page preview: the caller would then clear the
        # full-text flag (and possibly delete the file) without evidence.
        pages = _pdf_page_count(content)
        if pages is None or pages < 1:
            return "unknown", None
        return ("preview" if pages == 1 else "ok"), pages

    # Hard evidence #2: a real parser rejects the data outright.
    # NOTE(review): this also catches errors that are not proof of
    # corruption (e.g. an encrypted file pypdf cannot open) - confirm
    # whether such files can occur in this corpus.
    try:
        pages = len(PdfReader(BytesIO(content), strict=False).pages)
    except Exception:
        return "broken", None
    if pages <= 0:
        return "broken", pages
    return ("preview" if pages == 1 else "ok"), pages
def _elsevier_fetch_pdf(req, paper): def _elsevier_fetch_pdf(req, paper):
"""同一 DOI 取 PDF, 成功落库返回 True。""" """同一 DOI 取 PDF, 成功落库返回 True。"""
try: try: