fix(resm): 识别 Elsevier 摘要预览 PDF, 避免误标全文
Elsevier Article API 对未授权/in-press 文章, application/pdf 端点会返回仅含摘要的 1 页预览 PDF (魔数仍是 %PDF、体积也不小), 而全文 XML 可正常获取。旧逻辑只校验魔数+体积, 误将预览页落库并置 has_fulltext_pdf=True。

- tasks.py: 新增 _pdf_page_count / _is_elsevier_preview_pdf (优先 pypdf, 未安装时退化为字节扫描), _elsevier_fetch_pdf 与 save_pdf_from_elsevier 落库前排除 1 页预览页, 打 fail_reason=elsevier_pdf_preview_only; 补抓队列 qs_pdf 排除该标记, 避免无限重试
- 新增管理命令 fix_preview_pdf: 扫描存量误标记录, 回退 has_fulltext_pdf; 无 XML 全文者一并回退 has_fulltext, 让其重进下载链
- requirements.txt: 增加 pypdf>=4.0.0

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1e54070d6d
commit
e695e04de7
|
|
@ -0,0 +1,106 @@
|
|||
"""一次性修复: 把误标为全文 PDF 的 Elsevier "摘要预览页"(1 页)纠正回未下载状态。
|
||||
|
||||
背景:
|
||||
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含
|
||||
摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小), 而全文 XML 却能正常拿到。旧抓取
|
||||
逻辑只校验魔数 + 体积, 误将预览页落库并置 has_fulltext_pdf=True。
|
||||
|
||||
本命令重新核对本地 PDF 的页数, 对 <= 1 页者:
|
||||
- has_fulltext_pdf 置回 False
|
||||
- 若该论文有 XML 全文(has_fulltext_xml=True), 保留 has_fulltext=True;
|
||||
否则(此前只有这张假预览页冒充全文)一并把 has_fulltext 回退为 False,
|
||||
让它能重新进入下载链路去找真正的全文。
|
||||
- 追加 fail_reason 'elsevier_pdf_preview_only' (供 Elsevier 补抓队列排除, 避免无限重试)
|
||||
- 可选: 删除本地预览 PDF 文件 (--delete-file)
|
||||
|
||||
文件读取依赖本地存在 PDF (在跑抓取的服务器上执行)。建议先 --dry-run 看统计。
|
||||
|
||||
用法:
|
||||
python manage.py fix_preview_pdf --dry-run
|
||||
python manage.py fix_preview_pdf --delete-file
|
||||
"""
|
||||
import os
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from apps.resm.models import Paper
|
||||
from apps.resm.tasks import _pdf_page_count
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Un-flag Elsevier one-page "abstract preview" PDFs stored as full text.

    Walks every ``Paper`` with ``has_fulltext_pdf=True`` whose DOI starts with
    ``10.1016``, re-counts the pages of the local PDF, and for files with at
    most one page resets the full-text flags and records the
    ``elsevier_pdf_preview_only`` fail reason.
    """

    help = "纠正被误标为全文的 Elsevier 摘要预览 PDF(1 页)"

    def add_arguments(self, parser):
        """Register the ``--dry-run``, ``--limit`` and ``--delete-file`` options."""
        parser.add_argument("--dry-run", action="store_true",
                            help="只统计, 不写库 / 不删文件")
        parser.add_argument("--limit", type=int, default=0,
                            help="最多处理多少条 (0=不限)")
        parser.add_argument("--delete-file", action="store_true",
                            help="同时删除本地预览 PDF 文件")

    def handle(self, *args, **opts):
        """Scan candidate papers and roll back the ones holding a preview PDF.

        Options read from ``opts``:
            dry_run: only print what would be fixed; no DB write, no deletion.
            limit: stop after this many candidates were examined (0 = no limit).
            delete_file: also remove the local preview PDF after the rollback.
        """
        dry = opts["dry_run"]
        limit = opts["limit"]
        del_file = opts["delete_file"]

        qs = Paper.objects.filter(
            has_fulltext_pdf=True, doi__startswith="10.1016"
        ).order_by("id")
        total = qs.count()
        self.stdout.write(
            f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")

        checked = fixed = only_pdf = missing = unreadable = 0
        delete_failed = 0
        for paper in qs.iterator():
            # The limit applies to examined candidates, not to fixed records.
            if limit and checked >= limit:
                break
            checked += 1

            path = paper.init_paper_path("pdf")
            if not os.path.exists(path):
                missing += 1
                continue
            try:
                with open(path, "rb") as f:
                    content = f.read()
            except OSError:
                unreadable += 1
                continue

            pages = _pdf_page_count(content)
            if pages is None:
                # Page count could not be determined: leave the record alone.
                unreadable += 1
                continue
            if pages > 1:
                continue  # real full text, nothing to fix

            fixed += 1
            # Without XML full text, the preview page was the only "full text".
            only_pdf_case = not paper.has_fulltext_xml
            if only_pdf_case:
                only_pdf += 1
            self.stdout.write(
                f"[preview {pages}p]{' (only-pdf)' if only_pdf_case else ''} "
                f"{paper.doi} {path}")
            if dry:
                continue

            paper.has_fulltext_pdf = False
            # save(update_fields=...) writes only the listed columns, so
            # update_time is named explicitly (presumably an auto-updated
            # timestamp field -- confirm against the Paper model).
            update_fields = ["has_fulltext_pdf", "update_time"]
            if only_pdf_case:
                # has_fulltext was set only because of the fake preview page,
                # so roll it back too and let the paper re-enter the download
                # chain.
                paper.has_fulltext = False
                update_fields.insert(0, "has_fulltext")
            paper.save(update_fields=update_fields)
            if "elsevier_pdf_preview_only" not in (paper.fail_reason or ""):
                paper.save_fail_reason("elsevier_pdf_preview_only")
            if del_file:
                try:
                    os.remove(path)
                except OSError as exc:
                    # Deletion stays best-effort (the DB rollback is already
                    # done), but the operator must see that the file is still
                    # on disk.
                    delete_failed += 1
                    self.stderr.write(self.style.WARNING(
                        f"删除文件失败: {path} ({exc})"))

        self.stdout.write(self.style.SUCCESS(
            f"检查={checked} 预览页修复={fixed} (其中无XML全文/一并回退has_fulltext={only_pdf}) "
            f"文件缺失={missing} 无法解析={unreadable}"
            + (f" 删除失败={delete_failed}" if delete_failed else "")
            + (" (dry-run, 未写库)" if dry else "")
        ))
|
||||
|
|
@ -11,6 +11,7 @@ from lxml import etree
|
|||
from celery import current_app
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
import re
|
||||
from .d_oaurl import download_from_url_playwright
|
||||
import asyncio
|
||||
import sys
|
||||
|
|
@ -599,6 +600,41 @@ def _elsevier_fetch_xml(req, paper):
|
|||
return True, has_fulltext, None
|
||||
|
||||
|
||||
def _pdf_page_count(content: bytes):
    """Return the page count of a PDF byte string, or None when unknown.

    When pypdf can be imported, its (non-strict) reader decides the result:
    the parsed page count on success, None if parsing raises.

    Only when the import itself fails does the function fall back to a raw
    byte scan: the largest ``/Count <n>`` value found, else the number of
    ``/Type /Page`` markers, else None. The scan can only see objects that
    are stored uncompressed.

    NOTE(review): the previous docstring said parse errors also fall back to
    the byte scan, but the code returns None in that case -- confirm which
    behaviour is intended.
    """
    try:
        from io import BytesIO

        from pypdf import PdfReader

        reader = PdfReader(BytesIO(content), strict=False)
        return len(reader.pages)
    except ImportError:
        # pypdf unavailable: continue with the byte-level heuristic below.
        pass
    except Exception:
        return None

    try:
        largest_declared = max(
            map(int, re.findall(rb"/Count\s+(\d+)", content)),
            default=None,
        )
        if largest_declared is not None:
            return largest_declared
        # The lookahead skips names that continue with "s" or "R",
        # e.g. the "/Pages" tree nodes.
        page_markers = re.findall(rb"/Type\s*/Page(?![sR])", content)
        return len(page_markers) or None
    except Exception:
        return None
|
||||
|
||||
|
||||
def _is_elsevier_preview_pdf(content: bytes) -> bool:
    """Tell whether a PDF returned by Elsevier is only an abstract preview.

    Per the commit that introduced this check, the Elsevier Article API
    answers the application/pdf endpoint for unentitled or in-press articles
    with a one-page PDF holding just the abstract; it still starts with the
    %PDF magic and is not small, so magic/size checks do not catch it.

    Returns True only when the page count is known and is at most 1. An
    unknown page count yields False, so a genuine full text is never rejected
    because it could not be parsed.
    """
    page_total = _pdf_page_count(content)
    if page_total is None:
        # Undetermined: stay lenient and do not classify as a preview.
        return False
    return page_total <= 1
|
||||
|
||||
|
||||
def _elsevier_fetch_pdf(req, paper):
|
||||
"""同一 DOI 取 PDF, 成功落库返回 True。"""
|
||||
try:
|
||||
|
|
@ -617,6 +653,10 @@ def _elsevier_fetch_pdf(req, paper):
|
|||
res.headers.get("content-type", "").startswith("application/pdf")
|
||||
)
|
||||
if is_pdf and len(res.content) > 1024: # 至少1KB
|
||||
# 排除"摘要预览页"(1 页): 否则会被误标 has_fulltext_pdf=True
|
||||
if _is_elsevier_preview_pdf(res.content):
|
||||
paper.save_fail_reason("elsevier_pdf_preview_only")
|
||||
return False
|
||||
paper.save_file_pdf(res.content, save_obj=True)
|
||||
return True
|
||||
return False
|
||||
|
|
@ -645,7 +685,9 @@ def get_abstract_from_elsevier(number_of_task:int = 20, exclude_failed:bool=True
|
|||
# 存量补 PDF: 已有全文标记但还没下到 PDF
|
||||
qs_pdf = Paper.objects.filter(
|
||||
has_fulltext=True, has_fulltext_pdf=False, has_abstract=True
|
||||
).exclude(fetch_status="downloading").filter(doi__startswith="10.1016")
|
||||
).exclude(fetch_status="downloading"
|
||||
).exclude(fail_reason__contains="elsevier_pdf_preview_only"
|
||||
).filter(doi__startswith="10.1016")
|
||||
|
||||
if not qs.exists() and not qs_pdf.exists():
|
||||
return "done" # 不自重发, 交给 beat 轮询拉起
|
||||
|
|
@ -880,6 +922,9 @@ def save_pdf_from_elsevier(paper:Paper):
|
|||
except requests.RequestException as e:
|
||||
return f"elsevier_request_error: {str(e)}"
|
||||
if res.status_code == 200:
|
||||
if _is_elsevier_preview_pdf(res.content):
|
||||
paper.save_fail_reason("elsevier_pdf_preview_only")
|
||||
return "elsevier_pdf_preview_only"
|
||||
paper.save_file_pdf(res.content, save_obj=True)
|
||||
return "success"
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -26,3 +26,4 @@ pillow>=10.0.0
|
|||
opencv-python>=4.8.0
|
||||
DrissionPage>=4.1.0
|
||||
curl-cffi>=0.7.0
|
||||
pypdf>=4.0.0
|
||||
|
|
|
|||
Loading…
Reference in New Issue