feat(resm): 加 try_sciencedirect_pdf 单篇试验命令
验证"从 ScienceDirect 网页(pdfft)直下真 PDF"在当前服务器 IP / 机构会话下是否可行, 再决定要不要进流水线。流程: API 取 PII -> 拼 pdfft URL -> curl-cffi 伪装指纹请求 (可选注入机构 Cookie) -> _inspect_pdf 判定真全文/预览/被挡。仅单篇, 默认只探测。 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
88b51f97b0
commit
7e6142780a
|
|
@ -0,0 +1,148 @@
|
||||||
|
"""Single-paper trial: try to download the real typeset PDF of one DOI from the
ScienceDirect web page (pdfft).

Purpose: verify whether "direct download from the web page" works from the
current server IP / institutional session before deciding whether it should be
added to the pipeline.
No batching and no polling: it runs once for a single DOI, and by default it
only probes and does not write to the database (pass --save to persist).

Flow:
  1. Call the Elsevier API (text/xml) to get the PII for the DOI
  2. Build https://www.sciencedirect.com/science/article/pii/{PII}/pdfft?download=true
  3. Request it with curl-cffi impersonating a Chrome fingerprint
     (optionally injecting an institutional Cookie)
  4. Use _inspect_pdf to decide whether the result is the real full text
     (multi-page) or a preview page / broken content / blocked (403/HTML)

Notes:
  - ScienceDirect sits behind Cloudflare plus a paywall, and the page declares
    a TDM reservation (the API is the intended route).
    Non-institutional IPs usually get 403; only an institutional IP
    (subscription is IP-based) that also passes Cloudflare may succeed.
  - Bulk-crawling the web pages violates Elsevier's terms and may get the
    institution's IP banned; this command is only for single-paper
    feasibility checks.

Usage:
  python manage.py try_sciencedirect_pdf 10.1016/j.conbuildmat.2026.146897
  python manage.py try_sciencedirect_pdf <doi> --cookie-file cookie.txt
  python manage.py try_sciencedirect_pdf <doi> --cookie "EUID=...; SD_REMOTEACCESS=..." --save
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.core.management.base import BaseCommand, CommandError
|
||||||
|
|
||||||
|
from apps.resm.models import Paper
|
||||||
|
from apps.resm.pdf_utils import _inspect_pdf
|
||||||
|
|
||||||
|
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Probe whether one DOI's full-text PDF can be fetched from ScienceDirect.

    The command resolves the DOI to a PII through the Elsevier API, requests
    the ``pdfft`` download URL with a Chrome-impersonating curl-cffi client,
    and reports what came back. Nothing is persisted unless ``--save`` is
    given and the payload is classified as ``"ok"`` by ``_inspect_pdf``.
    """

    help = "单篇试验: 从 ScienceDirect 网页(pdfft)尝试下载真 PDF"

    def add_arguments(self, parser):
        """Register the positional DOI and the optional cookie / save flags."""
        parser.add_argument("doi", help="目标 DOI, 如 10.1016/j.conbuildmat.2026.146897")
        parser.add_argument("--cookie", default="", help="机构会话 Cookie 头(整串)")
        parser.add_argument("--cookie-file", default="",
                            help="从文件读取 Cookie(优先于 --cookie)")
        parser.add_argument("--save", action="store_true",
                            help="确认拿到真全文 PDF 时落盘到该 Paper")

    def handle(self, *args, **opts):
        """Run the single-paper probe and print a verdict.

        Raises:
            CommandError: if the DOI is empty, the cookie file cannot be read,
                no PII can be resolved, curl_cffi is missing, or the HTTP
                request itself fails.
        """
        doi = opts["doi"].strip()
        if not doi:
            raise CommandError("DOI 不能为空")

        cookie = opts["cookie"]
        if opts["cookie_file"]:
            # The file takes precedence over --cookie. Report an unreadable
            # file as a normal command error instead of a raw traceback.
            try:
                with open(opts["cookie_file"], "r", encoding="utf-8") as f:
                    cookie = f.read().strip()
            except (OSError, UnicodeDecodeError) as e:
                raise CommandError(f"读取 Cookie 文件失败: {e!r}") from e

        # 1. Resolve the PII
        pii = self._fetch_pii(doi)
        if not pii:
            raise CommandError(f"未能从 Elsevier API 取到 PII: {doi}")
        self.stdout.write(f"PII: {pii}")

        url = (f"https://www.sciencedirect.com/science/article/pii/{pii}"
               f"/pdfft?download=true")
        self.stdout.write(f"URL: {url}")

        # 2. Request through curl-cffi (imported lazily: it is only needed by
        #    this command)
        try:
            import curl_cffi.requests as cf
        except ImportError as e:
            raise CommandError("curl_cffi 未安装: pip install curl-cffi") from e

        headers = {
            "User-Agent": _UA,
            "Accept": "application/pdf,application/octet-stream,*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": f"https://www.sciencedirect.com/science/article/pii/{pii}",
        }
        if cookie:
            headers["Cookie"] = cookie

        # Broad catch is deliberate: this is the command's top-level boundary
        # and any transport failure should surface as a CommandError.
        try:
            r = cf.get(url, impersonate="chrome131", headers=headers,
                       timeout=60, allow_redirects=True)
        except Exception as e:
            raise CommandError(f"请求失败: {e!r}") from e

        ctype = r.headers.get("content-type", "")
        content = r.content
        self.stdout.write(
            f"HTTP {r.status_code} content-type={ctype} len={len(content)} "
            f"server={r.headers.get('server')} cf-ray={r.headers.get('cf-ray')}")
        self.stdout.write(f"最终 URL: {r.url}")

        # 3. Classify the payload
        if r.status_code != 200:
            self._diagnose(r.status_code, content)
            return
        kind, pages = _inspect_pdf(content)
        self.stdout.write(f"内容判定: kind={kind} pages={pages}")

        if kind == "ok":
            self.stdout.write(self.style.SUCCESS(
                f"✓ 拿到真全文 PDF ({pages} 页, {len(content)} bytes)"))
            if opts["save"]:
                paper = Paper.objects.filter(doi=doi).first()
                if not paper:
                    self.stdout.write("⚠ 库中无此 DOI, 未落盘")
                else:
                    paper.save_file_pdf(content, save_obj=True)
                    self.stdout.write(self.style.SUCCESS(
                        f"已落盘: {paper.init_paper_path('pdf')}"))
        elif kind == "preview":
            self.stdout.write(self.style.WARNING(
                "✗ 仍是预览页(1 页), 网页这条路对该篇无效"))
        elif kind == "broken":
            head = self._head(content)
            self.stdout.write(self.style.WARNING(
                f"✗ 不是有效 PDF(可能被挡/付费墙页). 内容开头: {head}"))
        else:
            self.stdout.write(self.style.WARNING(f"✗ 无法判定: {kind}"))

    @staticmethod
    def _head(content, limit=300):
        """Return the first ``limit`` bytes of ``content`` as one printable line."""
        return content[:limit].decode("utf-8", "replace").replace("\n", " ")

    def _fetch_pii(self, doi):
        """Look up the PII of ``doi`` via the Elsevier article API (text/xml).

        Returns:
            The PII reduced to ASCII letters and digits, or ``None`` when the
            response contains no usable ``pii`` element.

        Raises:
            CommandError: on a transport error, a non-200 status, or a body
                that is not well-formed XML.
        """
        import requests
        from lxml import etree
        from urllib.parse import quote

        api_headers = {"X-ELS-Insttoken": settings.ELSEVIER_INST_TOKEN,
                       "X-ELS-APIKey": settings.ELSEVIER_API_KEY}
        # Percent-encode the DOI so characters such as '#' or '?' cannot be
        # read as a fragment or query delimiter; '/' stays literal because the
        # DOI's prefix/suffix separator is part of the path.
        api_url = ("https://api.elsevier.com/content/article/doi/"
                   + quote(doi, safe="/"))
        try:
            rx = requests.get(
                api_url,
                params={"httpAccept": "text/xml"}, headers=api_headers,
                timeout=(3, 30))
        except requests.RequestException as e:
            raise CommandError(f"取 XML 失败: {e!r}") from e
        if rx.status_code != 200:
            raise CommandError(f"取 XML 非 200: {rx.status_code}")
        try:
            root = etree.fromstring(rx.content)
        except etree.XMLSyntaxError as e:
            raise CommandError(f"解析 XML 失败: {e!r}") from e
        # Match on local-name() so the lookup does not depend on the
        # namespace prefix used in the response.
        nodes = root.xpath("//*[local-name()='pii']/text()")
        if not nodes:
            return None
        # Strip every non-alphanumeric character before the PII goes into the
        # URL; an empty result is reported as "no PII".
        return re.sub(r"[^A-Za-z0-9]", "", nodes[0]) or None

    def _diagnose(self, status, content):
        """Print a best-guess explanation for a non-200 response.

        Args:
            status: HTTP status code of the final response.
            content: raw response body as bytes.
        """
        if status == 403:
            # Lowercase the prefix once so both marker probes are
            # case-insensitive.
            probe = content[:5000].lower()
            is_cf = b"cloudflare" in probe or b"cf-" in probe[:2000]
            tdm = b"tdm-reservation" in probe
            self.stdout.write(self.style.WARNING(
                "✗ 403 被拒。" + ("Cloudflare 挑战页。" if is_cf else "")
                + ("页面声明 TDM 预留(应走 API)。" if tdm else "")
                + " 多半是: 该服务器 IP 不在机构订阅网段, 或未带有效机构 Cookie。"))
        elif status in (401, 302, 301):
            self.stdout.write(self.style.WARNING(
                f"✗ {status}: 多半跳登录/未认证, 需要机构会话 Cookie。"))
        else:
            head = self._head(content)
            self.stdout.write(self.style.WARNING(f"✗ {status}. 开头: {head}"))
|
||||||
Loading…
Reference in New Issue