"""One-off repair: fix historical records wrongly flagged as full-text PDFs
(Elsevier abstract-only preview pages / corrupt files).

Background: for unentitled / in-press articles, the ``application/pdf``
endpoint of the Elsevier Article API returns a 1-page preview PDF that contains
only the abstract (the magic number is still ``%PDF`` and the file is not
small). In addition, some historical records stored an HTML error page or
truncated garbage as the PDF. The old fetch logic only validated the magic
number and the size, so both cases were wrongly flagged
``has_fulltext_pdf=True``.

This command inspects the local PDF and handles two categories:

- Preview page (1 page): ``has_fulltext_pdf`` is reset to False; the file is
  deleted **only when --delete-file is given**.
- Broken file (not a PDF / pypdf failed to parse it): ``has_fulltext_pdf`` is
  reset to False; the file is **always deleted** (except in dry-run), because
  it is not valid full text at all, keeping it is useless and it pollutes
  downstream parsing.

For both categories, when there is no XML full text
(``has_fulltext_xml=False``), ``has_fulltext`` is rolled back to False as well
so the paper re-enters the download pipeline to look for the real full text; a
``fail_reason`` marker is also appended so fetch tasks can exclude it.

Safety premise: "broken" is only decided on hard evidence -- the file does not
start with ``%PDF``, or pypdf is installed and parsing fails outright. If pypdf
is not installed and the magic number is fine but the page count cannot be
determined, the file is classified as unknown: **not processed, never
deleted**. Running ``pip install pypdf`` first is strongly recommended,
otherwise only files with an obviously wrong magic number can be handled.

Usage:
    python manage.py fix_preview_pdf --dry-run
    python manage.py fix_preview_pdf                # fix flags + delete broken files, keep preview files
    python manage.py fix_preview_pdf --delete-file  # also delete preview files
"""
import os

from django.core.management.base import BaseCommand

from apps.resm.models import Paper
from apps.resm.tasks import _inspect_pdf


class Command(BaseCommand):
    """Management command that un-flags preview-only / broken Elsevier PDFs."""

    help = "纠正被误标为全文的 Elsevier 预览页 / 损坏 PDF"

    def add_arguments(self, parser):
        """Register the --dry-run, --limit and --delete-file options."""
        parser.add_argument("--dry-run", action="store_true",
                            help="只统计, 不写库 / 不删文件")
        parser.add_argument("--limit", type=int, default=0,
                            help="最多处理多少条 (0=不限)")
        parser.add_argument("--delete-file", action="store_true",
                            help="同时删除预览页文件(坏文件无论该开关都会删)")

    def handle(self, *args, **opts):
        """Scan candidate papers, fix their flags and optionally delete files.

        Candidates are papers with ``has_fulltext_pdf=True`` whose DOI starts
        with ``10.1016``. Each candidate's local PDF is classified by
        ``_inspect_pdf``; ``preview`` and ``broken`` results are corrected,
        ``ok`` and ``unknown`` results are left untouched.

        Options read from ``opts``:
            dry_run: only report, do not write to the database or delete files.
            limit: maximum number of candidates to check (0 means no limit).
            delete_file: also delete preview files (broken files are deleted
                regardless of this switch).
        """
        dry = opts["dry_run"]
        limit = opts["limit"]
        del_preview = opts["delete_file"]

        qs = Paper.objects.filter(
            has_fulltext_pdf=True, doi__startswith="10.1016"
        ).order_by("id")
        total = qs.count()
        self.stdout.write(
            f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")

        checked = preview = broken = only_pdf = deleted = 0
        missing = unknown = delete_failed = 0

        for paper in qs.iterator():
            if limit and checked >= limit:
                break
            checked += 1

            path = paper.init_paper_path("pdf")
            if not os.path.exists(path):
                missing += 1
                continue

            try:
                with open(path, "rb") as f:
                    content = f.read()
            except OSError as exc:
                # Unreadable files are skipped like "unknown", but say so:
                # otherwise they are indistinguishable from undecidable PDFs.
                unknown += 1
                self.stderr.write(f"读取失败 {path}: {exc}")
                continue

            kind, pages = _inspect_pdf(content)
            if kind == "ok":
                continue
            if kind == "unknown":
                unknown += 1
                continue

            # kind is 'preview' or 'broken': both need their flags corrected.
            # Broken files are always removed; previews only on request.
            do_delete = (kind == "broken") or del_preview
            only_pdf_case = not paper.has_fulltext_xml

            if kind == "preview":
                preview += 1
                tag = f"preview {pages}p"
            else:
                broken += 1
                tag = "broken"
            if only_pdf_case:
                only_pdf += 1

            self.stdout.write(
                f"[{tag}]{' (only-pdf)' if only_pdf_case else ''}"
                f"{' +rm' if do_delete else ''} {paper.doi} {path}")

            if dry:
                continue

            paper.has_fulltext_pdf = False
            update_fields = ["has_fulltext_pdf", "update_time"]
            # Without XML full text, has_fulltext was only ever set because of
            # this fake/broken PDF, so roll it back together with the PDF flag.
            if only_pdf_case:
                paper.has_fulltext = False
                update_fields.insert(0, "has_fulltext")
            paper.save(update_fields=update_fields)

            reason = ("elsevier_pdf_preview_only" if kind == "preview"
                      else "pdf_broken")
            # Substring check avoids appending the same marker on a re-run.
            if reason not in (paper.fail_reason or ""):
                paper.save_fail_reason(reason)

            if do_delete:
                try:
                    os.remove(path)
                except OSError as exc:
                    # Deletion stays best-effort (the DB flags are already
                    # fixed), but a file left on disk must be visible to the
                    # operator instead of being silently ignored.
                    delete_failed += 1
                    self.stderr.write(f"删除失败 {path}: {exc}")
                else:
                    deleted += 1

        self.stdout.write(self.style.SUCCESS(
            f"检查={checked} 预览页={preview} 坏文件={broken} "
            f"(无XML全文一并回退has_fulltext={only_pdf}) 删除文件={deleted} "
            f"删除失败={delete_failed} "
            f"文件缺失={missing} 未知/跳过={unknown}"
            + (" (dry-run, 未写库)" if dry else "")
        ))