fix(resm): 识别 Elsevier 摘要预览 PDF, 避免误标全文
Elsevier Article API 对未授权/in-press 文章, application/pdf 端点会返回仅含摘要的 1 页预览 PDF (魔数仍是 %PDF、体积也不小), 而全文 XML 可正常获取。旧逻辑只校验魔数+体积, 误将预览页落库并置 has_fulltext_pdf=True。 - tasks.py: 新增 _pdf_page_count / _is_elsevier_preview_pdf (优先 pypdf, 退化字节扫描), _elsevier_fetch_pdf 与 save_pdf_from_elsevier 落库前排除 1 页预览页, 打 fail_reason=elsevier_pdf_preview_only; 补抓队列 qs_pdf 排除该标记避免无限重试 - 新增管理命令 fix_preview_pdf: 扫描存量误标记录, 回退 has_fulltext_pdf; 无 XML 全文者一并回退 has_fulltext, 让其重进下载链 - requirements.txt: 增加 pypdf>=4.0.0 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1e54070d6d
commit
e695e04de7
|
|
@ -0,0 +1,106 @@
|
||||||
|
"""一次性修复: 把误标为全文 PDF 的 Elsevier "摘要预览页"(1 页)纠正回未下载状态。
|
||||||
|
|
||||||
|
背景:
|
||||||
|
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含
|
||||||
|
摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小), 而全文 XML 却能正常拿到。旧抓取
|
||||||
|
逻辑只校验魔数 + 体积, 误将预览页落库并置 has_fulltext_pdf=True。
|
||||||
|
|
||||||
|
本命令重新核对本地 PDF 的页数, 对 <= 1 页者:
|
||||||
|
- has_fulltext_pdf 置回 False
|
||||||
|
- 若该论文有 XML 全文(has_fulltext_xml=True), 保留 has_fulltext=True;
|
||||||
|
否则(此前只有这张假预览页冒充全文)一并把 has_fulltext 回退为 False,
|
||||||
|
让它能重新进入下载链路去找真正的全文。
|
||||||
|
- 追加 fail_reason 'elsevier_pdf_preview_only' (供 Elsevier 补抓队列排除, 避免无限重试)
|
||||||
|
- 可选: 删除本地预览 PDF 文件 (--delete-file)
|
||||||
|
|
||||||
|
文件读取依赖本地存在 PDF (在跑抓取的服务器上执行)。建议先 --dry-run 看统计。
|
||||||
|
|
||||||
|
用法:
|
||||||
|
python manage.py fix_preview_pdf --dry-run
|
||||||
|
python manage.py fix_preview_pdf --delete-file
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from apps.resm.models import Paper
|
||||||
|
from apps.resm.tasks import _pdf_page_count
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Revert papers whose stored "full-text PDF" is a one-page abstract preview.

    Candidates are papers with ``has_fulltext_pdf=True`` whose DOI starts with
    ``10.1016``. For each candidate the local PDF is read and its page count
    is taken from ``_pdf_page_count``; files with at most one page are treated
    as previews and their flags are reverted (unless ``--dry-run`` is given).
    """

    help = "纠正被误标为全文的 Elsevier 摘要预览 PDF(1 页)"

    def add_arguments(self, parser):
        """Register the ``--dry-run``, ``--limit`` and ``--delete-file`` options."""
        parser.add_argument("--dry-run", action="store_true",
                            help="只统计, 不写库 / 不删文件")
        parser.add_argument("--limit", type=int, default=0,
                            help="最多处理多少条 (0=不限)")
        parser.add_argument("--delete-file", action="store_true",
                            help="同时删除本地预览 PDF 文件")

    def handle(self, *args, **opts):
        """Scan candidate papers, revert preview-only ones and print a summary.

        Options read from ``opts``:
            dry_run: only report; skip database writes and file deletion.
            limit: stop after this many candidates were examined (0 = no limit).
            delete_file: also remove the local preview PDF after reverting.
        """
        dry = opts["dry_run"]
        limit = opts["limit"]
        del_file = opts["delete_file"]

        qs = Paper.objects.filter(
            has_fulltext_pdf=True, doi__startswith="10.1016"
        ).order_by("id")
        total = qs.count()
        self.stdout.write(
            f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")

        checked = fixed = only_pdf = missing = unreadable = 0
        for paper in qs.iterator():
            # The limit applies to examined candidates, including ones whose
            # file turns out to be missing or unreadable.
            if limit and checked >= limit:
                break
            checked += 1

            path = paper.init_paper_path("pdf")
            if not os.path.exists(path):
                missing += 1
                continue
            try:
                with open(path, "rb") as f:
                    content = f.read()
            except OSError:
                unreadable += 1
                continue

            pages = _pdf_page_count(content)
            if pages is None:
                # Page count could not be determined: leave the record alone.
                unreadable += 1
                continue
            if pages > 1:
                continue  # genuine full text, nothing to fix

            fixed += 1
            # Without XML full text, the one-page PDF was the only thing that
            # backed has_fulltext, so that flag has to be reverted as well.
            only_pdf_case = not paper.has_fulltext_xml
            if only_pdf_case:
                only_pdf += 1
            self.stdout.write(
                f"[preview {pages}p]{' (only-pdf)' if only_pdf_case else ''} "
                f"{paper.doi} {path}")
            if dry:
                continue

            paper.has_fulltext_pdf = False
            update_fields = ["has_fulltext_pdf", "update_time"]
            if only_pdf_case:
                paper.has_fulltext = False
                update_fields.insert(0, "has_fulltext")
            paper.save(update_fields=update_fields)
            # Tag the paper only once; save_fail_reason is a project helper
            # (presumably it appends to fail_reason -- verify against the model).
            if "elsevier_pdf_preview_only" not in (paper.fail_reason or ""):
                paper.save_fail_reason("elsevier_pdf_preview_only")
            if del_file:
                try:
                    os.remove(path)
                except OSError as exc:
                    # Deletion stays best-effort, but the operator must be told
                    # that a preview file was left behind on disk.
                    self.stderr.write(self.style.WARNING(
                        f"删除文件失败: {path} ({exc})"))

        self.stdout.write(self.style.SUCCESS(
            f"检查={checked} 预览页修复={fixed} (其中无XML全文/一并回退has_fulltext={only_pdf}) "
            f"文件缺失={missing} 无法解析={unreadable}"
            + (" (dry-run, 未写库)" if dry else "")
        ))
|
||||||
|
|
@ -11,6 +11,7 @@ from lxml import etree
|
||||||
from celery import current_app
|
from celery import current_app
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import random
|
import random
|
||||||
|
import re
|
||||||
from .d_oaurl import download_from_url_playwright
|
from .d_oaurl import download_from_url_playwright
|
||||||
import asyncio
|
import asyncio
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -599,6 +600,41 @@ def _elsevier_fetch_xml(req, paper):
|
||||||
return True, has_fulltext, None
|
return True, has_fulltext, None
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_page_count(content: bytes):
    """Return the page count of a PDF, or None when it cannot be determined.

    Strategy:
      1. Parse with pypdf (non-strict mode) when the package is importable.
      2. When pypdf is not installed *or* fails to parse the data, fall back
         to a raw byte scan: the largest ``/Count N`` value found wins; if
         there is none, the number of ``/Type /Page`` objects is returned.

    The byte scan only sees objects stored uncompressed in the file, so it
    may still yield None for PDFs whose object tree is compressed.

    Args:
        content: Raw bytes of the PDF document.

    Returns:
        The page count as an int, or None if neither strategy produced one.
    """
    from io import BytesIO

    try:
        from pypdf import PdfReader
    except ImportError:
        PdfReader = None

    if PdfReader is not None:
        try:
            return len(PdfReader(BytesIO(content), strict=False).pages)
        except Exception:
            # Deliberately broad: any parser failure simply means "try the
            # byte scan below" instead of giving up immediately.
            pass

    try:
        counts = [int(m) for m in re.findall(rb"/Count\s+(\d+)", content)]
        if counts:
            return max(counts)
        # The negative lookahead keeps "/Type /Pages" (tree nodes) from being
        # counted as leaf pages.
        n = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
        if n:
            return n
    except (TypeError, ValueError):
        # Non-bytes input or an unparsable number: treat as undeterminable.
        pass
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _is_elsevier_preview_pdf(content: bytes) -> bool:
    """Tell whether an Elsevier PDF response is only a one-page abstract preview.

    The decision rests solely on the page count reported by
    ``_pdf_page_count``: a known count of at most one page means "preview".
    When the count is unknown the answer is False, so a real full-text PDF
    is never rejected just because it could not be parsed.

    Args:
        content: Raw bytes of the PDF returned by the API.

    Returns:
        True if the page count is known and <= 1, otherwise False.
    """
    page_total = _pdf_page_count(content)
    if page_total is None:
        # Unknown page count: be lenient and do not classify as preview.
        return False
    return page_total <= 1
|
||||||
|
|
||||||
|
|
||||||
def _elsevier_fetch_pdf(req, paper):
|
def _elsevier_fetch_pdf(req, paper):
|
||||||
"""同一 DOI 取 PDF, 成功落库返回 True。"""
|
"""同一 DOI 取 PDF, 成功落库返回 True。"""
|
||||||
try:
|
try:
|
||||||
|
|
@ -617,6 +653,10 @@ def _elsevier_fetch_pdf(req, paper):
|
||||||
res.headers.get("content-type", "").startswith("application/pdf")
|
res.headers.get("content-type", "").startswith("application/pdf")
|
||||||
)
|
)
|
||||||
if is_pdf and len(res.content) > 1024: # 至少1KB
|
if is_pdf and len(res.content) > 1024: # 至少1KB
|
||||||
|
# 排除"摘要预览页"(1 页): 否则会被误标 has_fulltext_pdf=True
|
||||||
|
if _is_elsevier_preview_pdf(res.content):
|
||||||
|
paper.save_fail_reason("elsevier_pdf_preview_only")
|
||||||
|
return False
|
||||||
paper.save_file_pdf(res.content, save_obj=True)
|
paper.save_file_pdf(res.content, save_obj=True)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
@ -645,7 +685,9 @@ def get_abstract_from_elsevier(number_of_task:int = 20, exclude_failed:bool=True
|
||||||
# 存量补 PDF: 已有全文标记但还没下到 PDF
|
# 存量补 PDF: 已有全文标记但还没下到 PDF
|
||||||
qs_pdf = Paper.objects.filter(
|
qs_pdf = Paper.objects.filter(
|
||||||
has_fulltext=True, has_fulltext_pdf=False, has_abstract=True
|
has_fulltext=True, has_fulltext_pdf=False, has_abstract=True
|
||||||
).exclude(fetch_status="downloading").filter(doi__startswith="10.1016")
|
).exclude(fetch_status="downloading"
|
||||||
|
).exclude(fail_reason__contains="elsevier_pdf_preview_only"
|
||||||
|
).filter(doi__startswith="10.1016")
|
||||||
|
|
||||||
if not qs.exists() and not qs_pdf.exists():
|
if not qs.exists() and not qs_pdf.exists():
|
||||||
return "done" # 不自重发, 交给 beat 轮询拉起
|
return "done" # 不自重发, 交给 beat 轮询拉起
|
||||||
|
|
@ -880,6 +922,9 @@ def save_pdf_from_elsevier(paper:Paper):
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
return f"elsevier_request_error: {str(e)}"
|
return f"elsevier_request_error: {str(e)}"
|
||||||
if res.status_code == 200:
|
if res.status_code == 200:
|
||||||
|
if _is_elsevier_preview_pdf(res.content):
|
||||||
|
paper.save_fail_reason("elsevier_pdf_preview_only")
|
||||||
|
return "elsevier_pdf_preview_only"
|
||||||
paper.save_file_pdf(res.content, save_obj=True)
|
paper.save_file_pdf(res.content, save_obj=True)
|
||||||
return "success"
|
return "success"
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -26,3 +26,4 @@ pillow>=10.0.0
|
||||||
opencv-python>=4.8.0
|
opencv-python>=4.8.0
|
||||||
DrissionPage>=4.1.0
|
DrissionPage>=4.1.0
|
||||||
curl-cffi>=0.7.0
|
curl-cffi>=0.7.0
|
||||||
|
pypdf>=4.0.0
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue