# apps/resm/management/commands/try_sciencedirect_pdf.py
"""Single-paper probe: try to download the typeset PDF of one DOI from the
ScienceDirect website (``pdfft`` endpoint).

Purpose: verify whether "direct web download" works from the current server
IP / institutional session before deciding whether it belongs in the pipeline.
No batching and no polling: the command runs once for a single DOI and, by
default, only probes without persisting anything (pass ``--save`` to store).

Flow:
  1. Call the Elsevier API (text/xml) to obtain the PII for the DOI.
  2. Build https://www.sciencedirect.com/science/article/pii/{PII}/pdfft?download=true
  3. Request it with curl-cffi impersonating a Chrome fingerprint
     (optionally injecting an institutional Cookie header).
  4. Use ``_inspect_pdf`` to decide whether the payload is the real full text
     (multi-page), a preview page, broken content, or a block (403/HTML).

Notes:
  - ScienceDirect sits behind Cloudflare plus a paywall, and its pages declare
    a TDM reservation (i.e. the API should be used instead). Non-institutional
    IPs usually get 403; only an institutional IP (IP-based subscription) that
    also passes Cloudflare has a chance.
  - Bulk-scraping the website violates Elsevier's terms and may get the
    institution's IP banned. This command is for single-paper feasibility
    checks only.

Usage:
    python manage.py try_sciencedirect_pdf 10.1016/j.conbuildmat.2026.146897
    python manage.py try_sciencedirect_pdf <doi> --cookie-file cookie.txt
    python manage.py try_sciencedirect_pdf <doi> --cookie "EUID=...; SD_REMOTEACCESS=..." --save
"""
import re
from urllib.parse import quote

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

from apps.resm.models import Paper
from apps.resm.pdf_utils import _inspect_pdf

# Kept in sync with the ``impersonate="chrome131"`` fingerprint used below.
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")


class Command(BaseCommand):
    """Probe whether one DOI's full-text PDF can be fetched from ScienceDirect."""

    help = "单篇试验: 从 ScienceDirect 网页(pdfft)尝试下载真 PDF"

    def add_arguments(self, parser):
        """Register the positional DOI and the optional cookie / save flags."""
        parser.add_argument("doi", help="目标 DOI, 如 10.1016/j.conbuildmat.2026.146897")
        parser.add_argument("--cookie", default="", help="机构会话 Cookie 头(整串)")
        parser.add_argument("--cookie-file", default="",
                            help="从文件读取 Cookie(优先于 --cookie)")
        parser.add_argument("--save", action="store_true",
                            help="确认拿到真全文 PDF 时落盘到该 Paper")

    def handle(self, *args, **opts):
        """Resolve the PII, request the pdfft URL once, and report the outcome.

        Raises:
            CommandError: if the cookie file cannot be read, the PII cannot be
                resolved, curl_cffi is not installed, or the HTTP request
                itself fails.
        """
        doi = opts["doi"].strip()
        cookie = opts["cookie"]
        if opts["cookie_file"]:
            # The file takes precedence over --cookie, as the option help states.
            try:
                with open(opts["cookie_file"], "r", encoding="utf-8") as f:
                    cookie = f.read().strip()
            except (OSError, UnicodeDecodeError) as e:
                raise CommandError(f"读取 Cookie 文件失败: {e!r}") from e

        # 1. Resolve the PII.
        pii = self._fetch_pii(doi)
        if not pii:
            raise CommandError(f"未能从 Elsevier API 取到 PII: {doi}")
        self.stdout.write(f"PII: {pii}")

        url = (f"https://www.sciencedirect.com/science/article/pii/{pii}"
               f"/pdfft?download=true")
        self.stdout.write(f"URL: {url}")

        # 2. Issue the request through curl-cffi. It is imported lazily so the
        #    command can report a friendly error when the package is missing.
        try:
            import curl_cffi.requests as cf
        except ImportError as e:
            raise CommandError("curl_cffi 未安装: pip install curl-cffi") from e

        headers = {
            "User-Agent": _UA,
            "Accept": "application/pdf,application/octet-stream,*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": f"https://www.sciencedirect.com/science/article/pii/{pii}",
        }
        if cookie:
            headers["Cookie"] = cookie

        try:
            r = cf.get(url, impersonate="chrome131", headers=headers,
                       timeout=60, allow_redirects=True)
        except Exception as e:
            # Top-level boundary: any transport failure becomes a CommandError.
            raise CommandError(f"请求失败: {e!r}") from e

        ctype = r.headers.get("content-type", "")
        content = r.content
        self.stdout.write(
            f"HTTP {r.status_code} content-type={ctype} len={len(content)} "
            f"server={r.headers.get('server')} cf-ray={r.headers.get('cf-ray')}")
        self.stdout.write(f"最终 URL: {r.url}")

        # 3. Classify the payload.
        if r.status_code != 200:
            self._diagnose(r.status_code, content)
            return
        kind, pages = _inspect_pdf(content)
        self.stdout.write(f"内容判定: kind={kind} pages={pages}")

        if kind == "ok":
            self.stdout.write(self.style.SUCCESS(
                f"✓ 拿到真全文 PDF ({pages} 页, {len(content)} bytes)"))
            if opts["save"]:
                paper = Paper.objects.filter(doi=doi).first()
                if not paper:
                    self.stdout.write("⚠ 库中无此 DOI, 未落盘")
                else:
                    paper.save_file_pdf(content, save_obj=True)
                    self.stdout.write(self.style.SUCCESS(
                        f"已落盘: {paper.init_paper_path('pdf')}"))
        elif kind == "preview":
            self.stdout.write(self.style.WARNING("✗ 仍是预览页(1 页), 网页这条路对该篇无效"))
        elif kind == "broken":
            head = self._preview(content)
            self.stdout.write(self.style.WARNING(
                f"✗ 不是有效 PDF(可能被挡/付费墙页). 内容开头: {head}"))
        else:
            self.stdout.write(self.style.WARNING(f"✗ 无法判定: {kind}"))

    @staticmethod
    def _preview(content):
        """Return the first 300 bytes of *content* as single-line text."""
        return content[:300].decode("utf-8", "replace").replace("\n", " ")

    def _fetch_pii(self, doi):
        """Look up the PII of *doi* through the Elsevier article API.

        Returns:
            The first ``pii`` text node with every non-alphanumeric character
            removed, or ``None`` when the XML has no ``pii`` element.

        Raises:
            CommandError: on a network error, a non-200 response, or a
                response body that is not well-formed XML.
        """
        import requests
        from lxml import etree

        api_headers = {"X-ELS-Insttoken": settings.ELSEVIER_INST_TOKEN,
                       "X-ELS-APIKey": settings.ELSEVIER_API_KEY}
        # Quote the DOI so characters such as '?' or '#' cannot truncate the
        # path; '/' stays literal because it separates DOI prefix and suffix.
        api_url = ("https://api.elsevier.com/content/article/doi/"
                   f"{quote(doi, safe='/')}")
        try:
            rx = requests.get(
                api_url,
                params={"httpAccept": "text/xml"}, headers=api_headers,
                timeout=(3, 30))
        except requests.RequestException as e:
            raise CommandError(f"取 XML 失败: {e!r}") from e
        if rx.status_code != 200:
            raise CommandError(f"取 XML 非 200: {rx.status_code}")
        try:
            root = etree.fromstring(rx.content)
        except etree.XMLSyntaxError as e:
            raise CommandError(f"解析 XML 失败: {e!r}") from e
        # Match on local-name() so the lookup is namespace-agnostic.
        nodes = root.xpath("//*[local-name()='pii']/text()")
        if not nodes:
            return None
        return re.sub(r"[^A-Za-z0-9]", "", nodes[0])

    def _diagnose(self, status, content):
        """Print a human-readable hint for a non-200 response.

        Args:
            status: HTTP status code of the response.
            content: Raw response body (bytes).
        """
        if status == 403:
            # Lower-case once so all marker checks are case-insensitive.
            probe = content[:5000].lower()
            is_cf = b"cloudflare" in probe or b"cf-" in probe[:2000]
            tdm = b"tdm-reservation" in probe
            self.stdout.write(self.style.WARNING(
                "✗ 403 被拒。" + ("Cloudflare 挑战页。" if is_cf else "")
                + ("页面声明 TDM 预留(应走 API)。" if tdm else "")
                + " 多半是: 该服务器 IP 不在机构订阅网段, 或未带有效机构 Cookie。"))
        elif status in (401, 302, 301):
            self.stdout.write(self.style.WARNING(
                f"✗ {status}: 多半跳登录/未认证, 需要机构会话 Cookie。"))
        else:
            head = self._preview(content)
            self.stdout.write(self.style.WARNING(f"✗ {status}. 开头: {head}"))