#!/usr/bin/env python
"""Standalone script: download typeset PDFs from the ScienceDirect website.

Decoupled from apps.resm and runnable on its own. The hard part is the
Cloudflare human check: a browser launched by Playwright carries automation
fingerprints, gets detected by Turnstile and loops forever on
"are you a robot". The recommended mode is therefore to connect to a real
Chrome that you started by hand (a human passes the check once); the script
only drives that browser to download.

Prerequisite: the machine's IP must be inside the institution's subscribed
range (ScienceDirect authorizes by IP).

Dependencies: playwright, requests, lxml, pypdf (optional, used for an exact
page count).

Credentials: ELSEVIER_API_KEY / ELSEVIER_INST_TOKEN are read from the
project's config/conf.py by default; alternatively pass --pii to supply the
PII directly and skip the API lookup.

Usage (recommended CDP mode):
  1) Start a separate Chrome with a debugging port (its own profile, so your
     everyday browser is unaffected):
       Windows: & "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" --remote-debugging-port=9222 --user-data-dir="D:\\chrome-sd-profile"
       Linux:   google-chrome --remote-debugging-port=9222 --user-data-dir=/tmp/sd-profile
  2) In that Chrome, open any ScienceDirect article by hand and pass the
     Cloudflare check yourself.
  3) Run this script (several DOIs at once are fine):
       python scripts/sd_download.py 10.1016/j.conbuildmat.2026.146897 --cdp http://localhost:9222 --out ./sd_pdfs

Without --cdp the script launches its own browser (most likely blocked by
Cloudflare; for debugging only).

Note: bulk-crawling ScienceDirect violates Elsevier's terms and may get the
institution's IP banned. Use only for a small number of back-fill downloads.
"""

import argparse
import asyncio
import os
import re
import sys
from io import BytesIO
from urllib.parse import quote

# Put the project root on sys.path so config/conf.py (credentials) is importable.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")

_STEALTH_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run",
    "--no-default-browser-check",
    "--disable-infobars",
    "--disable-extensions",
    "--disable-notifications",
]

# Lower-case substrings of page titles that indicate a Cloudflare interstitial.
# A bare "moment" is deliberately NOT listed: it also matches real article
# titles (e.g. "Bending moment capacity of ..."), which made such articles
# look like a permanent challenge page.
_CHALLENGE_KW = ("just a moment", "checking your browser", "attention required")


# ------------------------------ PII / PDF helpers ------------------------------

def get_creds():
    """Return (ELSEVIER_API_KEY, ELSEVIER_INST_TOKEN) from config/conf.py.

    Any failure while importing the config is reported with a warning and
    results in (None, None).
    """
    try:
        from config.conf import ELSEVIER_API_KEY, ELSEVIER_INST_TOKEN
        return ELSEVIER_API_KEY, ELSEVIER_INST_TOKEN
    except Exception as e:
        print(f"[warn] 读取 config/conf.py 凭证失败: {e!r}")
        return None, None


def fetch_pii(doi):
    """Look up the normalized PII for *doi* via the Elsevier API (text/xml).

    Returns the PII with every non-alphanumeric character stripped, or None
    when credentials are missing, the request fails, the status is not 200,
    the response is not parseable XML, or it contains no ``pii`` element.
    """
    import requests
    from lxml import etree

    key, token = get_creds()
    if not key:
        return None
    headers = {"X-ELS-APIKey": key}
    if token:
        headers["X-ELS-Insttoken"] = token

    # Percent-encode characters that would otherwise alter the URL structure
    # ('#', '?', '%', spaces, ...). Common DOI punctuation is left untouched so
    # ordinary DOIs produce exactly the same request as before.
    doi_path = quote(doi, safe="/():;,")
    try:
        r = requests.get(f"https://api.elsevier.com/content/article/doi/{doi_path}",
                         params={"httpAccept": "text/xml"},
                         headers=headers, timeout=(3, 30))
    except requests.RequestException as e:
        print(f"[warn] 取 PII 请求失败: {e!r}")
        return None
    if r.status_code != 200:
        print(f"[warn] 取 PII 非 200: {r.status_code}")
        return None
    try:
        root = etree.fromstring(r.content)
    except Exception as e:
        print(f"[warn] 解析 PII 响应失败: {e!r}")
        return None
    # Namespace-agnostic lookup of the first <pii> element's text.
    nodes = root.xpath("//*[local-name()='pii']/text()")
    if not nodes:
        return None
    return re.sub(r"[^A-Za-z0-9]", "", nodes[0])


def _scan_page_count(content):
    """Estimate the page count by scanning raw PDF bytes; None if undetermined.

    Uses the largest ``/Count N`` value if any is present, otherwise the number
    of ``/Type /Page`` objects (excluding ``/Pages`` and ``/PageR...``).
    """
    try:
        counts = [int(m) for m in re.findall(rb"/Count\s+(\d+)", content)]
        if counts:
            return max(counts)
        n = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
        if n:
            return n
    except (TypeError, ValueError):
        pass
    return None


def pdf_page_count(content: bytes):
    """Return the number of pages in *content*, or None if it cannot be told.

    pypdf is preferred when installed; if pypdf is installed but fails on the
    data the result is None. Only when pypdf is missing does the function fall
    back to a byte-level scan.
    """
    try:
        import logging
        # Silence pypdf's warnings about slightly malformed files.
        logging.getLogger("pypdf").setLevel(logging.CRITICAL)
        from pypdf import PdfReader
        return len(PdfReader(BytesIO(content), strict=False).pages)
    except ImportError:
        pass
    except Exception:
        return None
    return _scan_page_count(content)


def classify(content: bytes):
    """Classify downloaded bytes as a (kind, pages) pair.

    kind is one of:
      "broken"  - empty, no ``%PDF`` marker in the first 1024 bytes, or a
                  page count <= 0;
      "unknown" - looks like a PDF but the page count cannot be determined
                  (pages is None);
      "preview" - exactly one page;
      "ok"      - more than one page.
    """
    if not content or b"%PDF" not in content[:1024]:
        return "broken", 0
    pages = pdf_page_count(content)
    if pages is None:
        return "unknown", None
    if pages <= 0:
        return "broken", pages
    return ("preview" if pages == 1 else "ok"), pages


# ------------------------------ Browser download ------------------------------

def _is_challenge_title(title):
    """Return True if *title* contains one of the Cloudflare challenge keywords."""
    lowered = title.lower()
    return any(k in lowered for k in _CHALLENGE_KW)


async def _wait_challenge_cleared(page, rounds=30, interval=2000):
    """Poll the page title until it no longer looks like a challenge page.

    Waits *interval* milliseconds before each of up to *rounds* checks.
    Returns True as soon as a non-empty, non-challenge title is seen, and
    False if that never happens.
    """
    for _ in range(rounds):
        try:
            await page.wait_for_timeout(interval)
        except Exception:
            pass
        try:
            title = (await page.title()) or ""
        except Exception:
            continue  # navigation in progress; look again next round
        if title and not _is_challenge_title(title):
            return True
    return False


async def _find_pdfft_href(page):
    """Return the absolute href of the first link containing "pdfft", or None."""
    try:
        return await page.evaluate(
            "() => { const a=document.querySelector('a[href*=\"pdfft\"]');"
            " return a ? a.href : null; }")
    except Exception:
        return None


async def _grab_pdf(page, url, timeout):
    """Fetch the PDF at *url* through *page* and return its bytes (or None).

    First tries to catch a browser download event; if that yields nothing,
    navigates to the URL without ``?download=true`` and captures the first
    ``application/pdf`` response body, polling for up to 12 seconds.
    """
    # Preferred path: the browser download event.
    try:
        async with page.expect_download(timeout=min(timeout, 45000)) as dl:
            try:
                await page.goto(url, timeout=timeout)
            except Exception:
                # Navigation is expected to raise when it turns into a download.
                pass
        download = await dl.value
        path = await download.path()
        if path:
            with open(path, "rb") as f:
                data = f.read()
            if data:
                return data
    except Exception:
        pass

    # Fallback: intercept an inline application/pdf response.
    captured = {}

    async def on_resp(resp):
        try:
            if "application/pdf" in resp.headers.get("content-type", ""):
                captured["b"] = await resp.body()
        except Exception:
            pass

    page.on("response", on_resp)
    try:
        try:
            await page.goto(url.replace("?download=true", ""), timeout=timeout)
        except Exception:
            pass
        for _ in range(12):
            if captured.get("b"):
                break
            await page.wait_for_timeout(1000)
    finally:
        try:
            page.remove_listener("response", on_resp)
        except Exception:
            pass
    return captured.get("b")


async def download_one(pii, save_path, cdp_url=None, headless=False, timeout=60000):
    """Download the full-text PDF for *pii* to *save_path*.

    Args:
        pii: ScienceDirect article identifier used to build the article URL.
        save_path: destination file; parent directories are created.
        cdp_url: if given, connect to an already running Chrome over CDP;
            otherwise launch a browser (system Chrome first, then bundled
            Chromium).
        headless: launch headless (ignored in CDP mode).
        timeout: per-step timeout in milliseconds.

    Returns:
        (ok, msg): ok is True only when a multi-page PDF was written; msg is
        a short status string.
    """
    from playwright.async_api import async_playwright

    article = f"https://www.sciencedirect.com/science/article/pii/{pii}"
    pdf_url = f"{article}/pdfft?download=true"

    async with async_playwright() as p:
        connected = bool(cdp_url)
        if connected:
            browser = await p.chromium.connect_over_cdp(cdp_url)
        else:
            try:
                browser = await p.chromium.launch(headless=headless, channel="chrome",
                                                  args=_STEALTH_ARGS)
            except Exception:
                # No system Chrome available: fall back to bundled Chromium.
                browser = await p.chromium.launch(headless=headless, args=_STEALTH_ARGS)
        page = None
        try:
            if connected:
                # Reuse the existing context so the session that already
                # passed the Cloudflare check is shared.
                ctx = browser.contexts[0] if browser.contexts else await browser.new_context()
            else:
                ctx = await browser.new_context(
                    viewport={"width": 1920, "height": 1080}, user_agent=_UA,
                    locale="en-US", accept_downloads=True)
            page = await ctx.new_page()
            page.set_default_timeout(timeout)
            if not connected:
                await page.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>false});")

            try:
                await page.goto(article, wait_until="domcontentloaded", timeout=40000)
            except Exception:
                pass
            rounds = 60 if connected else 20
            if not await _wait_challenge_cleared(page, rounds=rounds):
                return False, "cloudflare_not_cleared"

            # Reload to get a clean article page.
            try:
                await page.goto(article, wait_until="domcontentloaded", timeout=40000)
                await page.wait_for_timeout(4000)
            except Exception:
                pass

            href = await _find_pdfft_href(page) or pdf_url
            body = await _grab_pdf(page, href, timeout)
            if not body:
                return False, "no_pdf_captured"
            kind, pages = classify(body)
            if kind != "ok":
                head = body[:160].decode("utf-8", "replace").replace("\n", " ")
                return False, f"not_fulltext kind={kind} pages={pages} len={len(body)} head={head!r}"
            os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
            with open(save_path, "wb") as f:
                f.write(body)
            return True, f"ok pages={pages} len={len(body)}"
        finally:
            # Guard the two cleanups separately so a failing page.close()
            # cannot prevent browser.close() from running.
            if connected and page:
                try:
                    await page.close()
                except Exception:
                    pass
            try:
                await browser.close()
            except Exception:
                pass


# ------------------------------ CLI ------------------------------

def _safe_filename(doi):
    """Turn a DOI into a file-name stem valid on Windows and POSIX.

    Replaces each of ``\\ / : * ? " < > |`` with ``_``. For DOIs whose only
    special character is ``/`` this equals ``doi.replace("/", "_")``.
    """
    return re.sub(r'[\\/:*?"<>|]', "_", doi)


async def _run(args):
    """Download every DOI in *args.doi* sequentially and print a summary."""
    os.makedirs(args.out, exist_ok=True)
    ok_n = 0
    for doi in args.doi:
        doi = doi.strip()
        pii = args.pii or fetch_pii(doi)
        if not pii:
            print(f"[FAIL] {doi}: 取不到 PII")
            continue
        save = os.path.join(args.out, _safe_filename(doi) + ".pdf")
        print(f"[..] {doi} PII={pii} -> {save}")
        ok, msg = await download_one(pii, save, cdp_url=args.cdp or None,
                                     headless=args.headless,
                                     timeout=args.timeout * 1000)
        if ok:
            ok_n += 1
            print(f"[OK] {doi}: {msg}")
        else:
            print(f"[FAIL] {doi}: {msg}")
    print(f"完成: {ok_n}/{len(args.doi)} 成功")


def main():
    """Parse command-line arguments and run the downloads."""
    ap = argparse.ArgumentParser(description="从 ScienceDirect 网页下载 PDF(独立脚本)")
    ap.add_argument("doi", nargs="+", help="一个或多个 DOI")
    ap.add_argument("--cdp", default="", help="连接手动启动的 Chrome, 如 http://localhost:9222(推荐)")
    ap.add_argument("--out", default="./sd_pdfs", help="PDF 输出目录")
    ap.add_argument("--pii", default="", help="直接指定 PII(仅单篇时用, 跳过 API 取号)")
    ap.add_argument("--headless", action="store_true", help="无头(非 CDP 模式, 调试用)")
    ap.add_argument("--timeout", type=int, default=60, help="单步超时(秒)")
    args = ap.parse_args()
    # One explicit PII would otherwise be reused for every DOI, saving the
    # same article under several different names.
    if args.pii and len(args.doi) > 1:
        ap.error("--pii 只能与单个 DOI 一起使用")
    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    asyncio.run(_run(args))


if __name__ == "__main__":
    main()