class Command(BaseCommand):
    """One-off repair for Elsevier "abstract preview" PDFs stored as full text.

    Walks every paper with ``has_fulltext_pdf=True`` whose DOI starts with
    ``10.1016``, re-reads the locally stored PDF and counts its pages. When
    the PDF has at most one page the paper is reverted:

    * ``has_fulltext_pdf`` is set back to ``False``;
    * ``has_fulltext`` is also set back to ``False`` when the paper has no XML
      full text (``has_fulltext_xml`` is falsy);
    * the fail reason ``elsevier_pdf_preview_only`` is recorded unless it is
      already present;
    * with ``--delete-file`` the local PDF is removed.

    The PDFs must exist on the machine running the command.
    """

    help = "纠正被误标为全文的 Elsevier 摘要预览 PDF(1 页)"

    def add_arguments(self, parser):
        """Register the ``--dry-run``, ``--limit`` and ``--delete-file`` options."""
        parser.add_argument("--dry-run", action="store_true",
                            help="只统计, 不写库 / 不删文件")
        parser.add_argument("--limit", type=int, default=0,
                            help="最多处理多少条 (0=不限)")
        parser.add_argument("--delete-file", action="store_true",
                            help="同时删除本地预览 PDF 文件")

    def handle(self, *args, **opts):
        """Scan the candidate papers, revert preview-only PDFs, print a summary.

        In ``--dry-run`` mode the detected previews are only listed; nothing
        is written to the database and no file is deleted.
        """
        dry = opts["dry_run"]
        limit = opts["limit"]
        del_file = opts["delete_file"]

        qs = Paper.objects.filter(
            has_fulltext_pdf=True, doi__startswith="10.1016"
        ).order_by("id")
        total = qs.count()
        self.stdout.write(
            f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")

        checked = fixed = only_pdf = missing = unreadable = 0
        delete_failed = 0
        for paper in qs.iterator():
            # --limit caps the number of inspected papers, not the number fixed.
            if limit and checked >= limit:
                break
            checked += 1

            path = paper.init_paper_path("pdf")
            # EAFP: open directly instead of exists()-then-open, so a file that
            # disappears between the two calls is still counted as missing.
            try:
                with open(path, "rb") as f:
                    content = f.read()
            except (FileNotFoundError, NotADirectoryError):
                missing += 1
                continue
            except OSError:
                unreadable += 1
                continue

            pages = _pdf_page_count(content)
            if pages is None:
                unreadable += 1
                continue
            if pages > 1:
                continue  # genuine full text, leave untouched

            fixed += 1
            only_pdf_case = not paper.has_fulltext_xml
            if only_pdf_case:
                only_pdf += 1
            self.stdout.write(
                f"[preview {pages}p]{' (only-pdf)' if only_pdf_case else ''} "
                f"{paper.doi} {path}")
            if dry:
                continue

            paper.has_fulltext_pdf = False
            # NOTE(review): "update_time" is presumably an auto-updated
            # timestamp that must be listed to be persisted - confirm on Paper.
            update_fields = ["has_fulltext_pdf", "update_time"]
            # Without XML full text, has_fulltext was only ever set by the
            # bogus preview page, so revert it together with the PDF flag.
            if only_pdf_case:
                paper.has_fulltext = False
                update_fields.insert(0, "has_fulltext")
            paper.save(update_fields=update_fields)
            if "elsevier_pdf_preview_only" not in (paper.fail_reason or ""):
                paper.save_fail_reason("elsevier_pdf_preview_only")
            if del_file:
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass  # already gone, nothing left to clean up
                except OSError as exc:
                    # Deletion stays best-effort, but a failure is reported
                    # instead of being silently dropped.
                    delete_failed += 1
                    self.stderr.write(f"删除失败: {path} ({exc})")

        self.stdout.write(self.style.SUCCESS(
            f"检查={checked} 预览页修复={fixed} (其中无XML全文/一并回退has_fulltext={only_pdf}) "
            f"文件缺失={missing} 无法解析={unreadable}"
            + (f" 删除失败={delete_failed}" if delete_failed else "")
            + (" (dry-run, 未写库)" if dry else "")
        ))
# Byte-scan patterns used when pypdf is not installed. The negative lookahead
# keeps a longer name (e.g. "/Pages" while looking for "/Page") from matching.
_PDF_PAGES_NODE_RE = re.compile(rb"/Type\s*/Pages(?![A-Za-z0-9])")
_PDF_PAGE_LEAF_RE = re.compile(rb"/Type\s*/Page(?![A-Za-z0-9])")
_PDF_COUNT_RE = re.compile(rb"/Count\s+(\d+)")


def _scan_pdf_page_count(data):
    """Estimate the page count by scanning raw PDF bytes; None if nothing found.

    Only effective when the object tree is stored uncompressed (per the
    original author's note, Elsevier's abstract preview pages are).
    """
    node_counts = []
    for chunk in data.split(b"endobj"):
        # Only trust /Count inside a page-tree node. Outline dictionaries also
        # carry /Count, but there it is a bookmark count, not a page count.
        if _PDF_PAGES_NODE_RE.search(chunk):
            node_counts.extend(int(num) for num in _PDF_COUNT_RE.findall(chunk))
    if node_counts:
        # Nested page-tree nodes hold sub-totals; the root holds the largest.
        return max(node_counts)
    # No usable page-tree node: count the individual page objects instead.
    leaf_total = len(_PDF_PAGE_LEAF_RE.findall(data))
    return leaf_total or None


def _pdf_page_count(content: bytes):
    """Return the number of pages in a PDF, or None when it cannot be determined.

    pypdf is used when it is installed. When pypdf is missing, the raw bytes
    are scanned instead (see ``_scan_pdf_page_count``). When pypdf is
    installed but raises on this document, None is returned without trying
    the byte scan.

    NOTE(review): the previous docstring said parse failures also fall back
    to the byte scan, but the code has always returned None in that case; the
    code behaviour is kept here - confirm which one is intended.

    Args:
        content: Raw bytes of the PDF document.

    Returns:
        The page count as an int, or None for empty / non-bytes input or when
        no page information could be extracted.
    """
    if isinstance(content, memoryview):
        content = content.tobytes()
    if not isinstance(content, (bytes, bytearray)) or not content:
        return None
    try:
        from io import BytesIO
        from pypdf import PdfReader
        return len(PdfReader(BytesIO(content), strict=False).pages)
    except ImportError:
        pass  # pypdf not installed: fall through to the byte scan
    except Exception:
        # pypdf raises many different error types on damaged input; all of
        # them mean "page count unknown" for the callers.
        return None
    return _scan_pdf_page_count(content)


def _is_elsevier_preview_pdf(content: bytes) -> bool:
    """Tell whether a PDF returned by Elsevier is only an abstract preview page.

    For unentitled or in-press articles the Elsevier Article API's
    application/pdf endpoint returns a one-page PDF containing just the
    abstract (it still starts with %PDF and is not small), while the full-text
    XML may be available normally.

    Criterion: the page count is known and is at most 1. When the page count
    cannot be determined the result is False, so that a genuine full text is
    never rejected by mistake.
    """
    pages = _pdf_page_count(content)
    return pages is not None and pages <= 1
paper.save_fail_reason("elsevier_pdf_preview_only") + return "elsevier_pdf_preview_only" paper.save_file_pdf(res.content, save_obj=True) return "success" else: diff --git a/requirements.txt b/requirements.txt index d26eb45..5dcb395 100755 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ pillow>=10.0.0 opencv-python>=4.8.0 DrissionPage>=4.1.0 curl-cffi>=0.7.0 +pypdf>=4.0.0