Compare commits

..

No commits in common. "main" and "fix/elsevier-preview-pdf" have entirely different histories.

7 changed files with 100 additions and 618 deletions

1
.gitignore vendored
View File

@ -26,4 +26,3 @@ sh/*
temp/* temp/*
nohup.out nohup.out
scripts/* scripts/*
!scripts/sd_download.py

View File

@ -1,67 +1,34 @@
"""一次性修复: 纠正被误标为全文 PDF 的历史记录(Elsevier 摘要预览页 / 损坏文件) """一次性修复: 把误标为全文 PDF 的 Elsevier "摘要预览页"(1 页)纠正回未下载状态
背景: 背景:
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含 Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含
摘要的 1 页预览 PDF(魔数仍是 %PDF体积也不小); 另有部分历史记录把 HTML 错误页 / 摘要的 1 页预览 PDF(魔数仍是 %PDF体积也不小), 而全文 XML 却能正常拿到旧抓取
被截断的垃圾当 PDF 存了旧抓取逻辑只校验魔数 + 体积, 都会误标 has_fulltext_pdf=True 逻辑只校验魔数 + 体积, 误将预览页落库并置 has_fulltext_pdf=True
本命令核对本地 PDF, 分两类处理: 本命令重新核对本地 PDF 的页数, <= 1 页者:
- 预览页(1 ): has_fulltext_pdf 置回 False; 文件**仅在 --delete-file **删除 - has_fulltext_pdf 置回 False
- 损坏文件( PDF / pypdf 解析失败): has_fulltext_pdf 置回 False; 文件**总是删除** - 若该论文有 XML 全文(has_fulltext_xml=True), 保留 has_fulltext=True;
(dry-run 除外), 因为它根本不是有效全文, 留着无用且会污染下游解析 否则(此前只有这张假预览页冒充全文)一并把 has_fulltext 回退为 False,
两类在缺少 XML 全文(has_fulltext_xml=False), 一并把 has_fulltext 回退 False, 让它能重新进入下载链路去找真正的全文
让其重新进入下载链路去找真正的全文; 并追加 fail_reason 标记供抓取任务排除 - 追加 fail_reason 'elsevier_pdf_preview_only' ( Elsevier 补抓队列排除, 避免无限重试)
- 可选: 删除本地预览 PDF 文件 (--delete-file)
性能: 文件读取依赖本地存在 PDF (在跑抓取的服务器上执行)建议先 --dry-run 看统计
读文件 + pypdf 解析是 CPU/IO 密集, ProcessPoolExecutor 并行(--workers, 默认 CPU 核数);
数据库写入留在主进程串行(坏文件仅占少数, 非瓶颈, 也避免子进程共享 DB 连接)
安全前提:
"损坏"只在铁证下判定 文件不以 %PDF 开头, 或已装 pypdf 且解析直接失败
若未装 pypdf 且魔数正常但页数判不出, 归为 unknown, **不处理绝不删除**
强烈建议先 `pip install pypdf` 再跑, 否则只能处理魔数明显不符的坏文件
用法: 用法:
python manage.py fix_preview_pdf --dry-run python manage.py fix_preview_pdf --dry-run
python manage.py fix_preview_pdf # 纠正标记 + 删坏文件, 保留预览页文件 python manage.py fix_preview_pdf --delete-file
python manage.py fix_preview_pdf --delete-file # 并删除预览页文件
python manage.py fix_preview_pdf --workers 16 # 指定并发进程数
""" """
import os import os
from concurrent.futures import ProcessPoolExecutor
from django.conf import settings
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.utils import timezone
from apps.resm.models import Paper from apps.resm.models import Paper
from apps.resm.pdf_utils import classify_pdf_file from apps.resm.tasks import _pdf_page_count
def _pdf_path(doi, pub_date):
    """Compute the on-disk location of a paper's PDF without touching the filesystem.

    Slashes in ``doi`` are replaced by underscores to form the file name. The
    file lives under ``<BASE_DIR>/media/papers/<year>/<month>/<day>/`` when
    ``pub_date`` is given, and under ``<BASE_DIR>/media/papers/unknown/`` when
    it is ``None``. No directory is created.
    """
    if pub_date is not None:
        date_parts = (str(pub_date.year), str(pub_date.month), str(pub_date.day))
    else:
        date_parts = ("unknown",)
    folder = os.path.join(settings.BASE_DIR, "media/papers", *date_parts)
    file_name = doi.replace("/", "_") + ".pdf"
    return os.path.join(folder, file_name)
def _batched(iterable, size):
batch = []
for item in iterable:
batch.append(item)
if len(batch) >= size:
yield batch
batch = []
if batch:
yield batch
class Command(BaseCommand): class Command(BaseCommand):
help = "纠正被误标为全文的 Elsevier 预览页 / 损坏 PDF(多进程并发)" help = "纠正被误标为全文的 Elsevier 摘要预览 PDF(1 页)"
def add_arguments(self, parser): def add_arguments(self, parser):
parser.add_argument("--dry-run", action="store_true", parser.add_argument("--dry-run", action="store_true",
@ -69,98 +36,71 @@ class Command(BaseCommand):
parser.add_argument("--limit", type=int, default=0, parser.add_argument("--limit", type=int, default=0,
help="最多处理多少条 (0=不限)") help="最多处理多少条 (0=不限)")
parser.add_argument("--delete-file", action="store_true", parser.add_argument("--delete-file", action="store_true",
help="同时删除预览页文件(坏文件无论该开关都会删)") help="同时删除本地预览 PDF 文件")
parser.add_argument("--workers", type=int, default=0,
help="并发进程数 (0=CPU 核数)")
parser.add_argument("--batch", type=int, default=2000,
help="每批处理多少条(控制内存与进度粒度)")
def handle(self, *args, **opts): def handle(self, *args, **opts):
dry = opts["dry_run"] dry = opts["dry_run"]
limit = opts["limit"] limit = opts["limit"]
del_preview = opts["delete_file"] del_file = opts["delete_file"]
batch = max(1, opts["batch"])
workers = opts["workers"] or (os.cpu_count() or 4)
qs = Paper.objects.filter( qs = Paper.objects.filter(
has_fulltext_pdf=True, doi__startswith="10.1016" has_fulltext_pdf=True, doi__startswith="10.1016"
).order_by("id") ).order_by("id")
total = qs.count() total = qs.count()
self.stdout.write( self.stdout.write(
f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}; " f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")
f"workers={workers} batch={batch}"
+ (" (dry-run)" if dry else ""))
rows_iter = qs.values( checked = fixed = only_pdf = missing = unreadable = 0
"id", "doi", "publication_date", "has_fulltext_xml", "fail_reason" for paper in qs.iterator():
).iterator(chunk_size=batch) if limit and checked >= limit:
break
checked += 1
checked = preview = broken = only_pdf = deleted = 0 path = paper.init_paper_path("pdf")
missing = unknown = 0 if not os.path.exists(path):
missing += 1
continue
try:
with open(path, "rb") as f:
content = f.read()
except OSError:
unreadable += 1
continue
with ProcessPoolExecutor(max_workers=workers) as ex: pages = _pdf_page_count(content)
stop = False if pages is None:
for chunk in _batched(rows_iter, batch): unreadable += 1
if stop: continue
break if pages > 1:
paths = [_pdf_path(r["doi"], r["publication_date"]) for r in chunk] continue # 真全文, 跳过
results = ex.map(classify_pdf_file, paths, chunksize=32)
for r, (_path, kind, pages) in zip(chunk, results):
if limit and checked >= limit:
stop = True
break
checked += 1
if kind == "missing": fixed += 1
missing += 1 only_pdf_case = not paper.has_fulltext_xml
continue if only_pdf_case:
if kind in ("ok", "unknown", "unreadable"): only_pdf += 1
if kind != "ok": self.stdout.write(
unknown += 1 f"[preview {pages}p]{' (only-pdf)' if only_pdf_case else ''} "
continue f"{paper.doi} {path}")
if dry:
continue
# kind in ('preview', 'broken'): 纠正标记 paper.has_fulltext_pdf = False
do_delete = (kind == "broken") or del_preview update_fields = ["has_fulltext_pdf", "update_time"]
only_pdf_case = not r["has_fulltext_xml"] # 没有 XML 全文时, 之前的 has_fulltext 只是被这张假预览页置上的, 一并回退
if kind == "preview": if not paper.has_fulltext_xml:
preview += 1 paper.has_fulltext = False
tag = f"preview {pages}p" update_fields.insert(0, "has_fulltext")
reason = "elsevier_pdf_preview_only" paper.save(update_fields=update_fields)
else: if "elsevier_pdf_preview_only" not in (paper.fail_reason or ""):
broken += 1 paper.save_fail_reason("elsevier_pdf_preview_only")
tag = "broken" if del_file:
reason = "pdf_broken" try:
if only_pdf_case: os.remove(path)
only_pdf += 1 except OSError:
self.stdout.write( pass
f"[{tag}]{' (only-pdf)' if only_pdf_case else ''}"
f"{' +rm' if do_delete else ''} {r['doi']} {_path}")
if dry:
continue
fr = r["fail_reason"]
if reason not in (fr or ""):
fr = f"{fr};{reason}" if fr else f";{reason}"
upd = {"has_fulltext_pdf": False, "fail_reason": fr,
"update_time": timezone.now()}
if only_pdf_case:
upd["has_fulltext"] = False
Paper.objects.filter(id=r["id"]).update(**upd)
if do_delete:
try:
os.remove(_path)
deleted += 1
except OSError:
pass
self.stdout.write(
f" 进度 checked={checked}/{total} preview={preview} "
f"broken={broken} deleted={deleted} missing={missing} "
f"unknown={unknown}")
self.stdout.write(self.style.SUCCESS( self.stdout.write(self.style.SUCCESS(
f"完成 检查={checked} 预览页={preview} 坏文件={broken} " f"检查={checked} 预览页修复={fixed} (其中无XML全文/一并回退has_fulltext={only_pdf}) "
f"(无XML全文一并回退has_fulltext={only_pdf}) 删除文件={deleted} " f"文件缺失={missing} 无法解析={unreadable}"
f"文件缺失={missing} 未知/跳过={unknown}"
+ (" (dry-run, 未写库)" if dry else "") + (" (dry-run, 未写库)" if dry else "")
)) ))

View File

@ -1,87 +0,0 @@
"""种子数据:对接《全球材料前沿动态简报》三、前沿科技 检索清单,补充期刊 / 关键词监控。
简报已列但 0009 已收录的期刊(Ceramics International / CCR / CCC / Construction and
Building Materials)不重复添加;此处只补简报新增项:
- 一级检索源(Nature/Science 系顶刊):Nature Materials、Nature Communications、
  Communications Materials、Science Advances、Nature Reviews Materials、Scientific Reports
- 二级检索源补充(建材 TOP):Engineering Structures、Materials Today
- 统一检索关键词(简报第三节):低碳水泥 / 储能建材 / 碳化机理 / 固废基胶凝 / 建材碳捕集
(OpenAlex 语料为英文, value 用英文搜索词,name 标中文)
期刊监控只按 ISSN 过滤、不带主题词,Nature Communications / Scientific Reports 等综合性
大刊会拉入非建材论文;简报要求的"建材主题 + TOP5 筛选"需在下游按关键词二次筛选,本表不承担。
全部复用每天 05:00 的 monitor_papers 周期任务(0009 已注册),无需新增调度。
get_or_create 保证迁移可安全重跑。
"""
from django.db import migrations
from apps.utils.snowflake import idWorker
# Tier-1 sources: top Nature/Science-family materials journals, as (ISSN, title).
JOURNALS_TIER1 = [
    ("1476-1122", "Nature Materials"),
    ("2041-1723", "Nature Communications"),
    ("2662-4443", "Communications Materials"),
    ("2375-2548", "Science Advances"),
    ("2058-8437", "Nature Reviews Materials"),
    ("2045-2322", "Scientific Reports"),
]
NOTE_TIER1 = "前沿顶刊"

# Tier-2 additions: building-materials / inorganic non-metallic journals that the
# briefing lists but migration 0009 did not seed, as (ISSN, title).
JOURNALS_TIER2 = [
    ("0141-0296", "Engineering Structures"),
    ("1369-7021", "Materials Today"),
]
NOTE_TIER2 = "建材TOP顶刊"

# Shared search keywords, as (English search term, Chinese display name).
SEARCHES = [
    ("low carbon cement", "低碳水泥"),
    ("energy storage building material", "储能建筑材料"),
    ("concrete carbonation", "混凝土碳化机理"),
    ("geopolymer", "工业固废基地聚物"),
    ("supplementary cementitious material", "固废基胶凝材料"),
    ("carbon capture cement", "建材碳捕集"),
]
NOTE_SEARCH = "低碳建材前沿"
def seed(apps, schema_editor):
    """Insert the journal and keyword monitors defined by this migration.

    Rows are looked up by ``(type, value)`` through ``get_or_create``; the
    ``defaults`` (fresh id, display name, note, active flag, 7-day window) are
    only supplied for that call, so re-running the migration does not add
    duplicates for rows that already exist.
    """
    PaperMonitor = apps.get_model("resm", "PaperMonitor")
    # (monitor type, (value, display name) pairs, note), in insertion order.
    plan = (
        ("journal", JOURNALS_TIER1, NOTE_TIER1),
        ("journal", JOURNALS_TIER2, NOTE_TIER2),
        ("search", SEARCHES, NOTE_SEARCH),
    )
    for monitor_type, entries, note in plan:
        for value, label in entries:
            PaperMonitor.objects.get_or_create(
                type=monitor_type,
                value=value,
                defaults={
                    "id": idWorker.get_id(),
                    "name": label,
                    "note": note,
                    "is_active": True,
                    "days": 7,
                },
            )
def unseed(apps, schema_editor):
    """Reverse of ``seed``: delete the monitors this migration lists.

    Deletion matches on ``type`` and ``value`` only, so a row with one of these
    values is removed whether or not ``seed`` was the one that created it.
    """
    PaperMonitor = apps.get_model("resm", "PaperMonitor")
    issns = [issn for issn, _title in JOURNALS_TIER1]
    issns.extend(issn for issn, _title in JOURNALS_TIER2)
    PaperMonitor.objects.filter(type="journal", value__in=issns).delete()
    terms = [term for term, _label in SEARCHES]
    PaperMonitor.objects.filter(type="search", value__in=terms).delete()
class Migration(migrations.Migration):
    """Data migration that seeds extra journal and keyword monitors."""

    # Must run after the previous seed migration of the same app.
    dependencies = [
        ("resm", "0010_seed_ensure_fetch_running"),
    ]

    # ``seed`` is the forward step, ``unseed`` the reverse step.
    operations = [
        migrations.RunPython(seed, unseed),
    ]

View File

@ -1,94 +0,0 @@
"""PDF 解析/分类工具(纯 stdlib + pypdf, 不依赖 Django)。
独立成模块, 以便 ProcessPoolExecutor 的子进程能安全导入(fork/spawn 均可),
不会牵连 Django 模型与配置。tasks.py 从这里复用这些函数。
"""
import os
import re
def _pdf_page_count(content: bytes):
"""返回 PDF 页数; 无法确定时返回 None。
优先用 pypdf 精确解析; 未安装或解析异常时退化为字节扫描
(对未压缩对象树有效, Elsevier 的摘要预览页正属此类)"""
try:
from io import BytesIO
import logging
# 坏 PDF 会让 pypdf 刷大量恢复日志, 这里只关心页数, 静音其 logger
logging.getLogger("pypdf").setLevel(logging.CRITICAL)
from pypdf import PdfReader
return len(PdfReader(BytesIO(content), strict=False).pages)
except ImportError:
pass
except Exception:
return None
try:
counts = [int(m) for m in re.findall(rb"/Count\s+(\d+)", content)]
if counts:
return max(counts)
n = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
if n:
return n
except Exception:
pass
return None
def _is_elsevier_preview_pdf(content: bytes) -> bool:
    """Tell whether a PDF is an abstract-only preview, judged by page count.

    Returns True only when the page count can be determined and is at most 1.
    An undeterminable page count yields False, so a real full text is never
    rejected on missing evidence.
    """
    page_total = _pdf_page_count(content)
    if page_total is None:
        return False
    return page_total <= 1
def _inspect_pdf(content: bytes):
"""对历史落库的 PDF 文件分类, 返回 (kind, pages)。
kind:
'broken' - PDF(魔数不符) pypdf 解析直接失败 -> 可安全删除重抓
'preview' - 1 页摘要预览页
'ok' - 多页, 视为真全文, 不处理
'unknown' - 魔数正常但页数判不出(通常因未装 pypdf) -> 不处理, 绝不当坏文件
pages: 页数; 无法确定为 None"""
if not content or b"%PDF" not in content[:1024]:
return "broken", 0
try:
from io import BytesIO
import logging
logging.getLogger("pypdf").setLevel(logging.CRITICAL)
from pypdf import PdfReader
except ImportError:
# 没装 pypdf: 只能靠字节扫描, 判不出就 unknown(从宽, 不误判为坏)
pages = _pdf_page_count(content)
if pages is None:
return "unknown", None
return ("preview" if pages <= 1 else "ok"), pages
try:
pages = len(PdfReader(BytesIO(content), strict=False).pages)
except Exception:
return "broken", None
if pages <= 0:
return "broken", pages
return ("preview" if pages == 1 else "ok"), pages
def classify_pdf_file(path: str):
    """Read the file at ``path`` and classify it; returns ``(path, kind, pages)``.

    Besides the kinds produced by ``_inspect_pdf`` two I/O outcomes exist:
      'missing'    - ``os.path.exists`` reports the path as absent
      'unreadable' - checking or reading the file raised ``OSError``
    In both cases pages is None.
    """
    try:
        present = os.path.exists(path)
        if present:
            with open(path, "rb") as handle:
                raw = handle.read()
    except OSError:
        return path, "unreadable", None

    if not present:
        return path, "missing", None

    kind, pages = _inspect_pdf(raw)
    return path, kind, pages

View File

@ -11,7 +11,7 @@ from lxml import etree
from celery import current_app from celery import current_app
from datetime import datetime, timedelta from datetime import datetime, timedelta
import random import random
from .pdf_utils import _is_elsevier_preview_pdf import re
from .d_oaurl import download_from_url_playwright from .d_oaurl import download_from_url_playwright
import asyncio import asyncio
import sys import sys
@ -600,6 +600,41 @@ def _elsevier_fetch_xml(req, paper):
return True, has_fulltext, None return True, has_fulltext, None
def _pdf_page_count(content: bytes):
"""返回 PDF 页数; 无法确定时返回 None。
优先用 pypdf 精确解析; 未安装或解析异常时退化为字节扫描
(对未压缩对象树有效, Elsevier 的摘要预览页正属此类)"""
try:
from io import BytesIO
from pypdf import PdfReader
return len(PdfReader(BytesIO(content), strict=False).pages)
except ImportError:
pass
except Exception:
return None
try:
counts = [int(m) for m in re.findall(rb"/Count\s+(\d+)", content)]
if counts:
return max(counts)
n = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
if n:
return n
except Exception:
pass
return None
def _is_elsevier_preview_pdf(content: bytes) -> bool:
    """Report whether the PDF bytes are an abstract-only preview page.

    The verdict is True only if the page count is known and is at most 1. An
    unknown page count gives False, so genuine full texts are not discarded
    when the count cannot be established.
    """
    n_pages = _pdf_page_count(content)
    return False if n_pages is None else n_pages <= 1
def _elsevier_fetch_pdf(req, paper): def _elsevier_fetch_pdf(req, paper):
"""同一 DOI 取 PDF, 成功落库返回 True。""" """同一 DOI 取 PDF, 成功落库返回 True。"""
try: try:

View File

@ -1,311 +0,0 @@
#!/usr/bin/env python
"""独立脚本: 从 ScienceDirect 网页下载排版 PDF。
apps.resm 解耦, 独立运行核心难点是 Cloudflare 人机校验: Playwright 自建
浏览器带自动化指纹会被 Turnstile 识破而死循环("are you a robot"), 因此推荐
连接你手动启动的真实 Chrome(由真人过一次验证), 脚本只负责驱动它下载
前提: 运行方 IP 在机构订阅网段(ScienceDirect IP 授权)
依赖: playwright, requests, lxml, pypdf(可选, 用于精确判页数)
凭证: 默认从项目 config/conf.py 读取 ELSEVIER_API_KEY / ELSEVIER_INST_TOKEN,
也可用 --pii 直接给 PII 跳过取号
用法(推荐 CDP 模式):
1) 单独起一个带调试端口的 Chrome(独立档案, 不影响日常浏览器):
Windows:
& "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" \
--remote-debugging-port=9222 --user-data-dir="D:\\chrome-sd-profile"
Linux:
google-chrome --remote-debugging-port=9222 --user-data-dir=/tmp/sd-profile
2) 在该 Chrome 里手动打开任一 ScienceDirect 文章, 亲手过掉 Cloudflare 验证
3) 运行本脚本(可一次多篇):
python scripts/sd_download.py 10.1016/j.conbuildmat.2026.146897 \
--cdp http://localhost:9222 --out ./sd_pdfs
不加 --cdp 时脚本自行启动浏览器(大概率被 Cloudflare , 仅调试用)
提示: 批量爬 ScienceDirect 违反 Elsevier 条款且可能导致机构 IP 被封, 仅供少量补抓
"""
import argparse
import asyncio
import os
import re
import sys
from io import BytesIO
# 项目根入 sys.path, 以便读取 config/conf.py 的凭证
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT not in sys.path:
sys.path.insert(0, ROOT)
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
_STEALTH_ARGS = [
"--disable-blink-features=AutomationControlled",
"--no-first-run", "--no-default-browser-check",
"--disable-infobars", "--disable-extensions", "--disable-notifications",
]
_CHALLENGE_KW = ("just a moment", "moment", "checking your browser",
"attention required")
# ------------------------------ PII / PDF 工具 ------------------------------
def get_creds():
    """Load the Elsevier API key and institution token from ``config.conf``.

    Returns ``(api_key, inst_token)``. If the import fails for any reason a
    warning is printed and ``(None, None)`` is returned instead of raising.
    """
    try:
        from config.conf import ELSEVIER_API_KEY, ELSEVIER_INST_TOKEN
    except Exception as exc:
        print(f"[warn] 读取 config/conf.py 凭证失败: {exc!r}")
        return None, None
    return ELSEVIER_API_KEY, ELSEVIER_INST_TOKEN
def fetch_pii(doi):
    """Look up the article's PII through the Elsevier article API.

    Requests the ``text/xml`` representation for ``doi``, takes the text of the
    first element whose local name is ``pii`` and strips every character that
    is not an ASCII letter or digit. Returns None when no API key is
    configured, the request fails, the status is not 200, the body is not
    parseable XML, or no ``pii`` element is present.
    """
    import requests
    from lxml import etree

    api_key, inst_token = get_creds()
    if not api_key:
        return None

    auth_headers = {"X-ELS-APIKey": api_key}
    if inst_token:
        auth_headers["X-ELS-Insttoken"] = inst_token

    endpoint = f"https://api.elsevier.com/content/article/doi/{doi}"
    try:
        resp = requests.get(
            endpoint,
            params={"httpAccept": "text/xml"},
            headers=auth_headers,
            timeout=(3, 30),
        )
    except requests.RequestException as exc:
        print(f"[warn] 取 PII 请求失败: {exc!r}")
        return None

    if resp.status_code != 200:
        print(f"[warn] 取 PII 非 200: {resp.status_code}")
        return None

    try:
        tree = etree.fromstring(resp.content)
    except Exception:
        return None

    matches = tree.xpath("//*[local-name()='pii']/text()")
    if not matches:
        return None
    return re.sub(r"[^A-Za-z0-9]", "", matches[0])
def pdf_page_count(content: bytes):
    """Return the page count of PDF bytes, or None if it cannot be determined.

    pypdf is used when importable; a parsing error then yields None. When
    pypdf is not importable the bytes are scanned: the largest ``/Count``
    value is returned, otherwise the number of ``/Type /Page`` objects.
    """
    try:
        import logging

        # Silence pypdf's recovery chatter on damaged files.
        logging.getLogger("pypdf").setLevel(logging.CRITICAL)
        from pypdf import PdfReader

        document = PdfReader(BytesIO(content), strict=False)
        return len(document.pages)
    except ImportError:
        pass  # no pypdf: fall through to the byte scan
    except Exception:
        return None

    try:
        declared = re.findall(rb"/Count\s+(\d+)", content)
        if declared:
            return max(map(int, declared))
        leaves = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
    except Exception:
        return None
    return leaves if leaves else None
def classify(content: bytes):
    """Classify downloaded bytes and return ``(kind, pages)``.

    kind is 'broken' (empty, no ``%PDF`` marker in the first 1024 bytes, or a
    page count of zero or less), 'unknown' (page count undeterminable),
    'preview' (exactly one page) or 'ok' (more than one page).
    """
    looks_like_pdf = bool(content) and b"%PDF" in content[:1024]
    if not looks_like_pdf:
        return "broken", 0

    page_total = pdf_page_count(content)
    if page_total is None:
        verdict = "unknown"
    elif page_total <= 0:
        verdict = "broken"
    elif page_total == 1:
        verdict = "preview"
    else:
        verdict = "ok"
    return verdict, page_total
# ------------------------------ 浏览器下载 ------------------------------
async def _wait_challenge_cleared(page, rounds=30, interval=2000):
for _ in range(rounds):
try:
await page.wait_for_timeout(interval)
except Exception:
pass
try:
title = (await page.title()) or ""
except Exception:
continue # 跳转中, 下一轮再看
if title and not any(k in title.lower() for k in _CHALLENGE_KW):
return True
return False
async def _find_pdfft_href(page):
try:
return await page.evaluate(
"() => { const a=document.querySelector('a[href*=\"pdfft\"]');"
" return a ? a.href : null; }")
except Exception:
return None
async def _grab_pdf(page, url, timeout):
# 优先 download 事件
try:
async with page.expect_download(timeout=min(timeout, 45000)) as dl:
try:
await page.goto(url, timeout=timeout)
except Exception:
pass
download = await dl.value
path = await download.path()
if path:
with open(path, "rb") as f:
data = f.read()
if data:
return data
except Exception:
pass
# 退化: 拦截内联 application/pdf 响应
captured = {}
async def on_resp(resp):
try:
if "application/pdf" in resp.headers.get("content-type", ""):
captured["b"] = await resp.body()
except Exception:
pass
page.on("response", on_resp)
try:
try:
await page.goto(url.replace("?download=true", ""), timeout=timeout)
except Exception:
pass
for _ in range(12):
if captured.get("b"):
break
await page.wait_for_timeout(1000)
finally:
try:
page.remove_listener("response", on_resp)
except Exception:
pass
return captured.get("b")
async def download_one(pii, save_path, cdp_url=None, headless=False,
                       timeout=60000):
    """Download the article PDF for ``pii`` to ``save_path``; returns ``(ok, msg)``.

    With ``cdp_url`` the script attaches to an already running browser over
    CDP and reuses its first context; otherwise it launches its own Chromium
    (trying the ``chrome`` channel first). The article page is opened, the
    title is polled until the challenge page is gone (60 rounds when attached,
    20 otherwise), the page is reloaded, and the PDF is fetched from the
    page's ``pdfft`` link or the default ``/pdfft?download=true`` URL. The
    file is written only when ``classify`` reports 'ok'.

    Failure messages: ``cloudflare_not_cleared``, ``no_pdf_captured`` or
    ``not_fulltext ...`` with diagnostic details.
    """
    from playwright.async_api import async_playwright

    article = f"https://www.sciencedirect.com/science/article/pii/{pii}"
    pdf_url = f"{article}/pdfft?download=true"

    async with async_playwright() as pw:
        via_cdp = bool(cdp_url)
        if via_cdp:
            browser = await pw.chromium.connect_over_cdp(cdp_url)
        else:
            try:
                browser = await pw.chromium.launch(
                    headless=headless, channel="chrome", args=_STEALTH_ARGS)
            except Exception:
                # Chrome channel not usable: use the default Chromium build.
                browser = await pw.chromium.launch(
                    headless=headless, args=_STEALTH_ARGS)

        page = None
        try:
            if via_cdp:
                existing = browser.contexts
                context = existing[0] if existing else await browser.new_context()
            else:
                context = await browser.new_context(
                    viewport={"width": 1920, "height": 1080},
                    user_agent=_UA, locale="en-US", accept_downloads=True)
            page = await context.new_page()
            page.set_default_timeout(timeout)
            if not via_cdp:
                await page.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>false});")

            try:
                await page.goto(article, wait_until="domcontentloaded",
                                timeout=40000)
            except Exception:
                pass

            poll_rounds = 60 if via_cdp else 20
            cleared = await _wait_challenge_cleared(page, rounds=poll_rounds)
            if not cleared:
                return False, "cloudflare_not_cleared"

            # Reload to get the article page itself after the challenge.
            try:
                await page.goto(article, wait_until="domcontentloaded",
                                timeout=40000)
                await page.wait_for_timeout(4000)
            except Exception:
                pass

            target = await _find_pdfft_href(page) or pdf_url
            body = await _grab_pdf(page, target, timeout)
            if not body:
                return False, "no_pdf_captured"

            kind, pages = classify(body)
            if kind != "ok":
                head = body[:160].decode("utf-8", "replace").replace("\n", " ")
                return False, f"not_fulltext kind={kind} pages={pages} len={len(body)} head={head!r}"

            out_dir = os.path.dirname(os.path.abspath(save_path))
            os.makedirs(out_dir, exist_ok=True)
            with open(save_path, "wb") as out:
                out.write(body)
            return True, f"ok pages={pages} len={len(body)}"
        finally:
            # NOTE(review): source indentation was lost; assumed layout is that
            # the tab is closed only when attached over CDP, then the browser
            # handle is closed in both modes. Confirm against the repository.
            try:
                if via_cdp and page:
                    await page.close()
                await browser.close()
            except Exception:
                pass
# ------------------------------ CLI ------------------------------
async def _run(args):
    """Process every DOI in ``args.doi`` in order and print a per-DOI result.

    For each DOI the PII comes from ``args.pii`` when set, otherwise from
    ``fetch_pii``. The PDF is saved in ``args.out`` under the DOI with ``/``
    replaced by ``_``. A summary line with the success count is printed last.
    """
    os.makedirs(args.out, exist_ok=True)
    succeeded = 0
    for raw_doi in args.doi:
        doi = raw_doi.strip()
        pii = args.pii or fetch_pii(doi)
        if not pii:
            print(f"[FAIL] {doi}: 取不到 PII")
            continue

        save = os.path.join(args.out, doi.replace("/", "_") + ".pdf")
        print(f"[..] {doi} PII={pii} -> {save}")
        ok, msg = await download_one(
            pii,
            save,
            cdp_url=args.cdp or None,
            headless=args.headless,
            timeout=args.timeout * 1000,  # CLI value is seconds
        )
        if not ok:
            print(f"[FAIL] {doi}: {msg}")
            continue
        succeeded += 1
        print(f"[OK] {doi}: {msg}")
    print(f"完成: {succeeded}/{len(args.doi)} 成功")
def main():
    """Parse the command line and run the download loop.

    Arguments: one or more DOIs, ``--cdp`` (address of a manually started
    Chrome), ``--out`` (output directory), ``--pii`` (explicit PII, skips the
    API lookup), ``--headless`` and ``--timeout`` (seconds per step).

    ``--pii`` is rejected when more than one DOI is given: the same PII would
    be used for every DOI, saving one article under several DOI file names.
    """
    ap = argparse.ArgumentParser(description="从 ScienceDirect 网页下载 PDF(独立脚本)")
    ap.add_argument("doi", nargs="+", help="一个或多个 DOI")
    ap.add_argument("--cdp", default="",
                    help="连接手动启动的 Chrome, 如 http://localhost:9222(推荐)")
    ap.add_argument("--out", default="./sd_pdfs", help="PDF 输出目录")
    ap.add_argument("--pii", default="", help="直接指定 PII(仅单篇时用, 跳过 API 取号)")
    ap.add_argument("--headless", action="store_true", help="无头(非 CDP 模式, 调试用)")
    ap.add_argument("--timeout", type=int, default=60, help="单步超时(秒)")
    args = ap.parse_args()

    # Enforce what the --pii help text already states (single article only).
    if args.pii and len(args.doi) > 1:
        ap.error("--pii 仅可在只传入一个 DOI 时使用")

    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    asyncio.run(_run(args))


if __name__ == "__main__":
    main()