paper_server/apps/resm/management/commands/fix_preview_pdf.py

121 lines
5.0 KiB
Python

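"""One-off fix: correct historical records wrongly flagged as full-text PDFs
(Elsevier abstract-only preview pages / corrupt files).

Background:
    For unauthorized / in-press articles, the application/pdf endpoint of the
    Elsevier Article API returns a 1-page preview PDF that contains only the
    abstract (its magic number is still %PDF and its size is not small
    either). In addition, some historical records stored HTML error pages /
    truncated garbage as PDFs. The old fetch logic validated only the magic
    number + size, so both kinds were wrongly marked has_fulltext_pdf=True.

This command checks the local PDFs and handles two categories:
    - Preview page (1 page): has_fulltext_pdf is reset to False; the file is
      deleted **only when --delete-file is given**.
    - Broken file (not a PDF / pypdf fails to parse it): has_fulltext_pdf is
      reset to False; the file is **always deleted** (except in dry-run),
      because it is not valid full text at all -- keeping it is useless and
      would pollute downstream parsing.

    For both categories, when XML full text is missing
    (has_fulltext_xml=False), has_fulltext is rolled back to False as well, so
    the paper re-enters the download pipeline to look for the real full text;
    a fail_reason marker is also appended so the fetch task can exclude it.

Safety premise:
    "Broken" is only concluded on hard evidence -- the file does not start
    with %PDF, or pypdf is installed and parsing fails outright. If pypdf is
    not installed and the magic number is fine but the page count cannot be
    determined, the file is classified as unknown: **not processed, never
    deleted**.
    It is strongly recommended to `pip install pypdf` before running;
    otherwise only broken files with an obviously wrong magic number can be
    handled.

Usage:
    python manage.py fix_preview_pdf --dry-run
    python manage.py fix_preview_pdf                # fix flags + delete broken files, keep preview files
    python manage.py fix_preview_pdf --delete-file  # also delete preview files
"""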
import os
from django.core.management.base import BaseCommand
from apps.resm.models import Paper
from apps.resm.tasks import _inspect_pdf
class Command(BaseCommand):
    """Un-flag papers whose stored "full-text" PDF is a preview page or broken.

    Candidates are ``Paper`` rows with ``has_fulltext_pdf=True`` and a DOI
    starting with ``10.1016``. Each candidate's local PDF is read and handed
    to ``_inspect_pdf``; rows classified as ``preview`` or ``broken`` get
    their flags rolled back, and the file is optionally removed.
    """

    help = "纠正被误标为全文的 Elsevier 预览页 / 损坏 PDF"

    def add_arguments(self, parser) -> None:
        """Register the ``--dry-run``, ``--limit`` and ``--delete-file`` options."""
        parser.add_argument("--dry-run", action="store_true",
                            help="只统计, 不写库 / 不删文件")
        parser.add_argument("--limit", type=int, default=0,
                            help="最多处理多少条 (0=不限)")
        parser.add_argument("--delete-file", action="store_true",
                            help="同时删除预览页文件(坏文件无论该开关都会删)")

    def handle(self, *args, **opts) -> None:
        """Scan candidate papers, fix their flags and print a summary.

        Options read from ``opts``:
            dry_run: only report; skip every DB write and file deletion.
            limit: stop after this many candidates have been checked
                (0 means no limit).
            delete_file: also delete preview-page files; broken files are
                deleted regardless of this switch.
        """
        dry = opts["dry_run"]
        limit = opts["limit"]
        del_preview = opts["delete_file"]

        qs = Paper.objects.filter(
            has_fulltext_pdf=True, doi__startswith="10.1016"
        ).order_by("id")
        total = qs.count()
        self.stdout.write(
            f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")

        checked = preview = broken = only_pdf = deleted = 0
        missing = unknown = 0

        for paper in qs.iterator():
            if limit and checked >= limit:
                break
            checked += 1

            path = paper.init_paper_path("pdf")
            if not os.path.exists(path):
                missing += 1
                continue
            try:
                with open(path, "rb") as f:
                    content = f.read()
            except FileNotFoundError:
                # The file vanished between the existence check and open():
                # that is still a missing file, not an unreadable one.
                missing += 1
                continue
            except OSError:
                unknown += 1
                continue

            kind, pages = _inspect_pdf(content)
            if kind == "ok":
                continue
            if kind == "unknown":
                # Not enough evidence either way: leave the row and file alone.
                unknown += 1
                continue

            # kind is 'preview' or 'broken': both need their flags corrected.
            # Broken files are always removed; preview files only on request.
            do_delete = (kind == "broken") or del_preview
            only_pdf_case = not paper.has_fulltext_xml
            if kind == "preview":
                preview += 1
                tag = f"preview {pages}p"
            else:
                broken += 1
                tag = "broken"
            if only_pdf_case:
                only_pdf += 1

            self.stdout.write(
                f"[{tag}]{' (only-pdf)' if only_pdf_case else ''}"
                f"{' +rm' if do_delete else ''} {paper.doi} {path}")
            if dry:
                continue

            paper.has_fulltext_pdf = False
            update_fields = ["has_fulltext_pdf", "update_time"]
            # Without XML full text, has_fulltext was only ever set because of
            # this fake/broken PDF, so roll it back together with the PDF flag.
            if only_pdf_case:
                paper.has_fulltext = False
                update_fields.insert(0, "has_fulltext")
            paper.save(update_fields=update_fields)

            reason = "elsevier_pdf_preview_only" if kind == "preview" else "pdf_broken"
            if reason not in (paper.fail_reason or ""):
                paper.save_fail_reason(reason)

            if do_delete:
                try:
                    os.remove(path)
                except OSError as exc:
                    # The DB flags are already rolled back at this point, so a
                    # failed delete leaves a stale file behind. Report it
                    # instead of hiding it; the run itself keeps going.
                    self.stderr.write(self.style.WARNING(
                        f"删除文件失败: {path} ({exc})"))
                else:
                    deleted += 1

        self.stdout.write(self.style.SUCCESS(
            f"检查={checked} 预览页={preview} 坏文件={broken} "
            f"(无XML全文一并回退has_fulltext={only_pdf}) 删除文件={deleted} "
            f"文件缺失={missing} 未知/跳过={unknown}"
            + (" (dry-run, 未写库)" if dry else "")
        ))