Compare commits
No commits in common. "main" and "fix/elsevier-preview-pdf" have entirely different histories.
main
...
fix/elsevier-preview-pdf
|
|
@@ -26,4 +26,3 @@ sh/*
|
||||||
temp/*
|
temp/*
|
||||||
nohup.out
|
nohup.out
|
||||||
scripts/*
|
scripts/*
|
||||||
!scripts/sd_download.py
|
|
||||||
|
|
@@ -1,67 +1,34 @@
|
||||||
"""一次性修复: 纠正被误标为全文 PDF 的历史记录(Elsevier 摘要预览页 / 损坏文件)。
|
"""一次性修复: 把误标为全文 PDF 的 Elsevier "摘要预览页"(1 页)纠正回未下载状态。
|
||||||
|
|
||||||
背景:
|
背景:
|
||||||
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含
|
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回仅含
|
||||||
摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小); 另有部分历史记录把 HTML 错误页 /
|
摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小), 而全文 XML 却能正常拿到。旧抓取
|
||||||
被截断的垃圾当 PDF 存了。旧抓取逻辑只校验魔数 + 体积, 都会误标 has_fulltext_pdf=True。
|
逻辑只校验魔数 + 体积, 误将预览页落库并置 has_fulltext_pdf=True。
|
||||||
|
|
||||||
本命令核对本地 PDF, 分两类处理:
|
本命令重新核对本地 PDF 的页数, 对 <= 1 页者:
|
||||||
- 预览页(1 页): has_fulltext_pdf 置回 False; 文件**仅在 --delete-file 时**删除。
|
- has_fulltext_pdf 置回 False
|
||||||
- 损坏文件(非 PDF / pypdf 解析失败): has_fulltext_pdf 置回 False; 文件**总是删除**
|
- 若该论文有 XML 全文(has_fulltext_xml=True), 保留 has_fulltext=True;
|
||||||
(dry-run 除外), 因为它根本不是有效全文, 留着无用且会污染下游解析。
|
否则(此前只有这张假预览页冒充全文)一并把 has_fulltext 回退为 False,
|
||||||
两类在缺少 XML 全文(has_fulltext_xml=False)时, 一并把 has_fulltext 回退 False,
|
让它能重新进入下载链路去找真正的全文。
|
||||||
让其重新进入下载链路去找真正的全文; 并追加 fail_reason 标记供抓取任务排除。
|
- 追加 fail_reason 'elsevier_pdf_preview_only' (供 Elsevier 补抓队列排除, 避免无限重试)
|
||||||
|
- 可选: 删除本地预览 PDF 文件 (--delete-file)
|
||||||
|
|
||||||
性能:
|
文件读取依赖本地存在 PDF (在跑抓取的服务器上执行)。建议先 --dry-run 看统计。
|
||||||
读文件 + pypdf 解析是 CPU/IO 密集, 用 ProcessPoolExecutor 并行(--workers, 默认 CPU 核数);
|
|
||||||
数据库写入留在主进程串行(坏文件仅占少数, 非瓶颈, 也避免子进程共享 DB 连接)。
|
|
||||||
|
|
||||||
安全前提:
|
|
||||||
"损坏"只在铁证下判定 —— 文件不以 %PDF 开头, 或已装 pypdf 且解析直接失败。
|
|
||||||
若未装 pypdf 且魔数正常但页数判不出, 归为 unknown, **不处理、绝不删除**。
|
|
||||||
强烈建议先 `pip install pypdf` 再跑, 否则只能处理魔数明显不符的坏文件。
|
|
||||||
|
|
||||||
用法:
|
用法:
|
||||||
python manage.py fix_preview_pdf --dry-run
|
python manage.py fix_preview_pdf --dry-run
|
||||||
python manage.py fix_preview_pdf # 纠正标记 + 删坏文件, 保留预览页文件
|
python manage.py fix_preview_pdf --delete-file
|
||||||
python manage.py fix_preview_pdf --delete-file # 并删除预览页文件
|
|
||||||
python manage.py fix_preview_pdf --workers 16 # 指定并发进程数
|
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from django.utils import timezone
|
|
||||||
|
|
||||||
from apps.resm.models import Paper
|
from apps.resm.models import Paper
|
||||||
from apps.resm.pdf_utils import classify_pdf_file
|
from apps.resm.tasks import _pdf_page_count
|
||||||
|
|
||||||
|
|
||||||
def _pdf_path(doi, pub_date):
|
|
||||||
"""按 doi + publication_date 推算 PDF 落盘路径(不创建目录, 只读用)。"""
|
|
||||||
safe = doi.replace("/", "_")
|
|
||||||
if pub_date is None:
|
|
||||||
d = os.path.join(settings.BASE_DIR, "media/papers", "unknown")
|
|
||||||
else:
|
|
||||||
d = os.path.join(settings.BASE_DIR, "media/papers",
|
|
||||||
str(pub_date.year), str(pub_date.month), str(pub_date.day))
|
|
||||||
return os.path.join(d, f"{safe}.pdf")
|
|
||||||
|
|
||||||
|
|
||||||
def _batched(iterable, size):
|
|
||||||
batch = []
|
|
||||||
for item in iterable:
|
|
||||||
batch.append(item)
|
|
||||||
if len(batch) >= size:
|
|
||||||
yield batch
|
|
||||||
batch = []
|
|
||||||
if batch:
|
|
||||||
yield batch
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
help = "纠正被误标为全文的 Elsevier 预览页 / 损坏 PDF(多进程并发)"
|
help = "纠正被误标为全文的 Elsevier 摘要预览 PDF(1 页)"
|
||||||
|
|
||||||
def add_arguments(self, parser):
|
def add_arguments(self, parser):
|
||||||
parser.add_argument("--dry-run", action="store_true",
|
parser.add_argument("--dry-run", action="store_true",
|
||||||
|
|
@@ -69,98 +36,71 @@ class Command(BaseCommand):
|
||||||
parser.add_argument("--limit", type=int, default=0,
|
parser.add_argument("--limit", type=int, default=0,
|
||||||
help="最多处理多少条 (0=不限)")
|
help="最多处理多少条 (0=不限)")
|
||||||
parser.add_argument("--delete-file", action="store_true",
|
parser.add_argument("--delete-file", action="store_true",
|
||||||
help="同时删除预览页文件(坏文件无论该开关都会删)")
|
help="同时删除本地预览 PDF 文件")
|
||||||
parser.add_argument("--workers", type=int, default=0,
|
|
||||||
help="并发进程数 (0=CPU 核数)")
|
|
||||||
parser.add_argument("--batch", type=int, default=2000,
|
|
||||||
help="每批处理多少条(控制内存与进度粒度)")
|
|
||||||
|
|
||||||
def handle(self, *args, **opts):
|
def handle(self, *args, **opts):
|
||||||
dry = opts["dry_run"]
|
dry = opts["dry_run"]
|
||||||
limit = opts["limit"]
|
limit = opts["limit"]
|
||||||
del_preview = opts["delete_file"]
|
del_file = opts["delete_file"]
|
||||||
batch = max(1, opts["batch"])
|
|
||||||
workers = opts["workers"] or (os.cpu_count() or 4)
|
|
||||||
|
|
||||||
qs = Paper.objects.filter(
|
qs = Paper.objects.filter(
|
||||||
has_fulltext_pdf=True, doi__startswith="10.1016"
|
has_fulltext_pdf=True, doi__startswith="10.1016"
|
||||||
).order_by("id")
|
).order_by("id")
|
||||||
total = qs.count()
|
total = qs.count()
|
||||||
self.stdout.write(
|
self.stdout.write(
|
||||||
f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}; "
|
f"候选(has_fulltext_pdf=True 且 DOI 以 10.1016 开头): {total}")
|
||||||
f"workers={workers} batch={batch}"
|
|
||||||
+ (" (dry-run)" if dry else ""))
|
|
||||||
|
|
||||||
rows_iter = qs.values(
|
checked = fixed = only_pdf = missing = unreadable = 0
|
||||||
"id", "doi", "publication_date", "has_fulltext_xml", "fail_reason"
|
for paper in qs.iterator():
|
||||||
).iterator(chunk_size=batch)
|
if limit and checked >= limit:
|
||||||
|
break
|
||||||
|
checked += 1
|
||||||
|
|
||||||
checked = preview = broken = only_pdf = deleted = 0
|
path = paper.init_paper_path("pdf")
|
||||||
missing = unknown = 0
|
if not os.path.exists(path):
|
||||||
|
missing += 1
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
content = f.read()
|
||||||
|
except OSError:
|
||||||
|
unreadable += 1
|
||||||
|
continue
|
||||||
|
|
||||||
with ProcessPoolExecutor(max_workers=workers) as ex:
|
pages = _pdf_page_count(content)
|
||||||
stop = False
|
if pages is None:
|
||||||
for chunk in _batched(rows_iter, batch):
|
unreadable += 1
|
||||||
if stop:
|
continue
|
||||||
break
|
if pages > 1:
|
||||||
paths = [_pdf_path(r["doi"], r["publication_date"]) for r in chunk]
|
continue # 真全文, 跳过
|
||||||
results = ex.map(classify_pdf_file, paths, chunksize=32)
|
|
||||||
for r, (_path, kind, pages) in zip(chunk, results):
|
|
||||||
if limit and checked >= limit:
|
|
||||||
stop = True
|
|
||||||
break
|
|
||||||
checked += 1
|
|
||||||
|
|
||||||
if kind == "missing":
|
fixed += 1
|
||||||
missing += 1
|
only_pdf_case = not paper.has_fulltext_xml
|
||||||
continue
|
if only_pdf_case:
|
||||||
if kind in ("ok", "unknown", "unreadable"):
|
only_pdf += 1
|
||||||
if kind != "ok":
|
self.stdout.write(
|
||||||
unknown += 1
|
f"[preview {pages}p]{' (only-pdf)' if only_pdf_case else ''} "
|
||||||
continue
|
f"{paper.doi} {path}")
|
||||||
|
if dry:
|
||||||
|
continue
|
||||||
|
|
||||||
# kind in ('preview', 'broken'): 纠正标记
|
paper.has_fulltext_pdf = False
|
||||||
do_delete = (kind == "broken") or del_preview
|
update_fields = ["has_fulltext_pdf", "update_time"]
|
||||||
only_pdf_case = not r["has_fulltext_xml"]
|
# 没有 XML 全文时, 之前的 has_fulltext 只是被这张假预览页置上的, 一并回退
|
||||||
if kind == "preview":
|
if not paper.has_fulltext_xml:
|
||||||
preview += 1
|
paper.has_fulltext = False
|
||||||
tag = f"preview {pages}p"
|
update_fields.insert(0, "has_fulltext")
|
||||||
reason = "elsevier_pdf_preview_only"
|
paper.save(update_fields=update_fields)
|
||||||
else:
|
if "elsevier_pdf_preview_only" not in (paper.fail_reason or ""):
|
||||||
broken += 1
|
paper.save_fail_reason("elsevier_pdf_preview_only")
|
||||||
tag = "broken"
|
if del_file:
|
||||||
reason = "pdf_broken"
|
try:
|
||||||
if only_pdf_case:
|
os.remove(path)
|
||||||
only_pdf += 1
|
except OSError:
|
||||||
self.stdout.write(
|
pass
|
||||||
f"[{tag}]{' (only-pdf)' if only_pdf_case else ''}"
|
|
||||||
f"{' +rm' if do_delete else ''} {r['doi']} {_path}")
|
|
||||||
if dry:
|
|
||||||
continue
|
|
||||||
|
|
||||||
fr = r["fail_reason"]
|
|
||||||
if reason not in (fr or ""):
|
|
||||||
fr = f"{fr};{reason}" if fr else f";{reason}"
|
|
||||||
upd = {"has_fulltext_pdf": False, "fail_reason": fr,
|
|
||||||
"update_time": timezone.now()}
|
|
||||||
if only_pdf_case:
|
|
||||||
upd["has_fulltext"] = False
|
|
||||||
Paper.objects.filter(id=r["id"]).update(**upd)
|
|
||||||
if do_delete:
|
|
||||||
try:
|
|
||||||
os.remove(_path)
|
|
||||||
deleted += 1
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
self.stdout.write(
|
|
||||||
f" 进度 checked={checked}/{total} preview={preview} "
|
|
||||||
f"broken={broken} deleted={deleted} missing={missing} "
|
|
||||||
f"unknown={unknown}")
|
|
||||||
|
|
||||||
self.stdout.write(self.style.SUCCESS(
|
self.stdout.write(self.style.SUCCESS(
|
||||||
f"完成 检查={checked} 预览页={preview} 坏文件={broken} "
|
f"检查={checked} 预览页修复={fixed} (其中无XML全文/一并回退has_fulltext={only_pdf}) "
|
||||||
f"(无XML全文一并回退has_fulltext={only_pdf}) 删除文件={deleted} "
|
f"文件缺失={missing} 无法解析={unreadable}"
|
||||||
f"文件缺失={missing} 未知/跳过={unknown}"
|
|
||||||
+ (" (dry-run, 未写库)" if dry else "")
|
+ (" (dry-run, 未写库)" if dry else "")
|
||||||
))
|
))
|
||||||
|
|
|
||||||
|
|
@@ -1,87 +0,0 @@
|
||||||
"""种子数据:对接《全球材料前沿动态简报》三、前沿科技 检索清单,补充期刊 / 关键词监控。
|
|
||||||
|
|
||||||
简报已列但 0009 已收录的期刊(Ceramics International / CCR / CCC / Construction and
|
|
||||||
Building Materials)不重复添加;此处只补简报新增项:
|
|
||||||
|
|
||||||
- 一级检索源(Nature/Science 系顶刊):Nature Materials、Nature Communications、
|
|
||||||
Communications Materials、Science Advances、Nature Reviews Materials、Scientific Reports
|
|
||||||
- 二级检索源补充(建材 TOP):Engineering Structures、Materials Today
|
|
||||||
- 统一检索关键词(简报第三节):低碳水泥 / 储能建材 / 碳化机理 / 固废基胶凝 / 建材碳捕集 等
|
|
||||||
(OpenAlex 语料为英文,故 value 用英文搜索词,name 标中文)
|
|
||||||
|
|
||||||
期刊监控只按 ISSN 过滤、不带主题词,Nature Communications / Scientific Reports 等综合性
|
|
||||||
大刊会拉入非建材论文;简报要求的"建材主题 + TOP5 筛选"需在下游按关键词二次筛选,本表不承担。
|
|
||||||
全部复用每天 05:00 的 monitor_papers 周期任务(0009 已注册),无需新增调度。
|
|
||||||
get_or_create 保证迁移可安全重跑。
|
|
||||||
"""
|
|
||||||
from django.db import migrations
|
|
||||||
from apps.utils.snowflake import idWorker
|
|
||||||
|
|
||||||
# 一级检索源:Nature/Science 系材料类顶刊(简报「前沿科技」一级)
|
|
||||||
JOURNALS_TIER1 = [
|
|
||||||
("1476-1122", "Nature Materials"),
|
|
||||||
("2041-1723", "Nature Communications"),
|
|
||||||
("2662-4443", "Communications Materials"),
|
|
||||||
("2375-2548", "Science Advances"),
|
|
||||||
("2058-8437", "Nature Reviews Materials"),
|
|
||||||
("2045-2322", "Scientific Reports"),
|
|
||||||
]
|
|
||||||
NOTE_TIER1 = "前沿顶刊"
|
|
||||||
|
|
||||||
# 二级检索源补充:建材 / 无机非金属国际 TOP(简报已列、0009 未收录的)
|
|
||||||
JOURNALS_TIER2 = [
|
|
||||||
("0141-0296", "Engineering Structures"),
|
|
||||||
("1369-7021", "Materials Today"),
|
|
||||||
]
|
|
||||||
NOTE_TIER2 = "建材TOP顶刊"
|
|
||||||
|
|
||||||
# 统一检索关键词(简报第三节,英文搜索词 + 中文名)
|
|
||||||
SEARCHES = [
|
|
||||||
("low carbon cement", "低碳水泥"),
|
|
||||||
("energy storage building material", "储能建筑材料"),
|
|
||||||
("concrete carbonation", "混凝土碳化机理"),
|
|
||||||
("geopolymer", "工业固废基地聚物"),
|
|
||||||
("supplementary cementitious material", "固废基胶凝材料"),
|
|
||||||
("carbon capture cement", "建材碳捕集"),
|
|
||||||
]
|
|
||||||
NOTE_SEARCH = "低碳建材前沿"
|
|
||||||
|
|
||||||
|
|
||||||
def seed(apps, schema_editor):
|
|
||||||
PaperMonitor = apps.get_model("resm", "PaperMonitor")
|
|
||||||
for issn, name in JOURNALS_TIER1:
|
|
||||||
PaperMonitor.objects.get_or_create(
|
|
||||||
type="journal", value=issn,
|
|
||||||
defaults={"id": idWorker.get_id(), "name": name, "note": NOTE_TIER1,
|
|
||||||
"is_active": True, "days": 7},
|
|
||||||
)
|
|
||||||
for issn, name in JOURNALS_TIER2:
|
|
||||||
PaperMonitor.objects.get_or_create(
|
|
||||||
type="journal", value=issn,
|
|
||||||
defaults={"id": idWorker.get_id(), "name": name, "note": NOTE_TIER2,
|
|
||||||
"is_active": True, "days": 7},
|
|
||||||
)
|
|
||||||
for term, name in SEARCHES:
|
|
||||||
PaperMonitor.objects.get_or_create(
|
|
||||||
type="search", value=term,
|
|
||||||
defaults={"id": idWorker.get_id(), "name": name, "note": NOTE_SEARCH,
|
|
||||||
"is_active": True, "days": 7},
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def unseed(apps, schema_editor):
|
|
||||||
PaperMonitor = apps.get_model("resm", "PaperMonitor")
|
|
||||||
journals = [i for i, _ in JOURNALS_TIER1] + [i for i, _ in JOURNALS_TIER2]
|
|
||||||
PaperMonitor.objects.filter(type="journal", value__in=journals).delete()
|
|
||||||
PaperMonitor.objects.filter(type="search", value__in=[t for t, _ in SEARCHES]).delete()
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
("resm", "0010_seed_ensure_fetch_running"),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.RunPython(seed, unseed),
|
|
||||||
]
|
|
||||||
|
|
@@ -1,94 +0,0 @@
|
||||||
"""PDF 解析/分类工具(纯 stdlib + pypdf, 不依赖 Django)。
|
|
||||||
|
|
||||||
独立成模块, 以便 ProcessPoolExecutor 的子进程能安全导入(fork/spawn 均可),
|
|
||||||
不会牵连 Django 模型与配置。tasks.py 从这里复用这些函数。
|
|
||||||
"""
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def _pdf_page_count(content: bytes):
|
|
||||||
"""返回 PDF 页数; 无法确定时返回 None。
|
|
||||||
|
|
||||||
优先用 pypdf 精确解析; 未安装或解析异常时退化为字节扫描
|
|
||||||
(对未压缩对象树有效, Elsevier 的摘要预览页正属此类)。"""
|
|
||||||
try:
|
|
||||||
from io import BytesIO
|
|
||||||
import logging
|
|
||||||
# 坏 PDF 会让 pypdf 刷大量恢复日志, 这里只关心页数, 静音其 logger
|
|
||||||
logging.getLogger("pypdf").setLevel(logging.CRITICAL)
|
|
||||||
from pypdf import PdfReader
|
|
||||||
return len(PdfReader(BytesIO(content), strict=False).pages)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
counts = [int(m) for m in re.findall(rb"/Count\s+(\d+)", content)]
|
|
||||||
if counts:
|
|
||||||
return max(counts)
|
|
||||||
n = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
|
|
||||||
if n:
|
|
||||||
return n
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _is_elsevier_preview_pdf(content: bytes) -> bool:
|
|
||||||
"""判断 Elsevier 返回的 PDF 是否为"摘要预览页"。
|
|
||||||
|
|
||||||
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回
|
|
||||||
仅含摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小), 全文 XML 却可能正常。
|
|
||||||
判据: 能确定页数且 <= 1 页。无法确定页数时返回 False(从宽, 不误杀真全文)。"""
|
|
||||||
pages = _pdf_page_count(content)
|
|
||||||
return pages is not None and pages <= 1
|
|
||||||
|
|
||||||
|
|
||||||
def _inspect_pdf(content: bytes):
|
|
||||||
"""对历史落库的 PDF 文件分类, 返回 (kind, pages)。
|
|
||||||
|
|
||||||
kind:
|
|
||||||
'broken' - 非 PDF(魔数不符)或 pypdf 解析直接失败 -> 可安全删除重抓
|
|
||||||
'preview' - 1 页摘要预览页
|
|
||||||
'ok' - 多页, 视为真全文, 不处理
|
|
||||||
'unknown' - 魔数正常但页数判不出(通常因未装 pypdf) -> 不处理, 绝不当坏文件
|
|
||||||
pages: 页数; 无法确定为 None。"""
|
|
||||||
if not content or b"%PDF" not in content[:1024]:
|
|
||||||
return "broken", 0
|
|
||||||
try:
|
|
||||||
from io import BytesIO
|
|
||||||
import logging
|
|
||||||
logging.getLogger("pypdf").setLevel(logging.CRITICAL)
|
|
||||||
from pypdf import PdfReader
|
|
||||||
except ImportError:
|
|
||||||
# 没装 pypdf: 只能靠字节扫描, 判不出就 unknown(从宽, 不误判为坏)
|
|
||||||
pages = _pdf_page_count(content)
|
|
||||||
if pages is None:
|
|
||||||
return "unknown", None
|
|
||||||
return ("preview" if pages <= 1 else "ok"), pages
|
|
||||||
try:
|
|
||||||
pages = len(PdfReader(BytesIO(content), strict=False).pages)
|
|
||||||
except Exception:
|
|
||||||
return "broken", None
|
|
||||||
if pages <= 0:
|
|
||||||
return "broken", pages
|
|
||||||
return ("preview" if pages == 1 else "ok"), pages
|
|
||||||
|
|
||||||
|
|
||||||
def classify_pdf_file(path: str):
|
|
||||||
"""并发 worker 入口: 读取并分类单个 PDF 文件路径。
|
|
||||||
|
|
||||||
返回 (path, kind, pages)。除 _inspect_pdf 的四种 kind 外, 另有 IO 结果:
|
|
||||||
'missing' - 文件不存在
|
|
||||||
'unreadable' - 打开失败(权限等)
|
|
||||||
设计为纯函数(仅 stdlib + pypdf), 可被进程池安全 pickle / 导入。"""
|
|
||||||
try:
|
|
||||||
if not os.path.exists(path):
|
|
||||||
return path, "missing", None
|
|
||||||
with open(path, "rb") as f:
|
|
||||||
content = f.read()
|
|
||||||
except OSError:
|
|
||||||
return path, "unreadable", None
|
|
||||||
kind, pages = _inspect_pdf(content)
|
|
||||||
return path, kind, pages
|
|
||||||
|
|
@@ -11,7 +11,7 @@ from lxml import etree
|
||||||
from celery import current_app
|
from celery import current_app
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import random
|
import random
|
||||||
from .pdf_utils import _is_elsevier_preview_pdf
|
import re
|
||||||
from .d_oaurl import download_from_url_playwright
|
from .d_oaurl import download_from_url_playwright
|
||||||
import asyncio
|
import asyncio
|
||||||
import sys
|
import sys
|
||||||
|
|
@@ -600,6 +600,41 @@ def _elsevier_fetch_xml(req, paper):
|
||||||
return True, has_fulltext, None
|
return True, has_fulltext, None
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_page_count(content: bytes):
|
||||||
|
"""返回 PDF 页数; 无法确定时返回 None。
|
||||||
|
|
||||||
|
优先用 pypdf 精确解析; 未安装或解析异常时退化为字节扫描
|
||||||
|
(对未压缩对象树有效, Elsevier 的摘要预览页正属此类)。"""
|
||||||
|
try:
|
||||||
|
from io import BytesIO
|
||||||
|
from pypdf import PdfReader
|
||||||
|
return len(PdfReader(BytesIO(content), strict=False).pages)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
counts = [int(m) for m in re.findall(rb"/Count\s+(\d+)", content)]
|
||||||
|
if counts:
|
||||||
|
return max(counts)
|
||||||
|
n = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
|
||||||
|
if n:
|
||||||
|
return n
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _is_elsevier_preview_pdf(content: bytes) -> bool:
|
||||||
|
"""判断 Elsevier 返回的 PDF 是否为"摘要预览页"。
|
||||||
|
|
||||||
|
Elsevier Article API 对未授权 / in-press 文章, application/pdf 端点会返回
|
||||||
|
仅含摘要的 1 页预览 PDF(魔数仍是 %PDF、体积也不小), 全文 XML 却可能正常。
|
||||||
|
判据: 能确定页数且 <= 1 页。无法确定页数时返回 False(从宽, 不误杀真全文)。"""
|
||||||
|
pages = _pdf_page_count(content)
|
||||||
|
return pages is not None and pages <= 1
|
||||||
|
|
||||||
|
|
||||||
def _elsevier_fetch_pdf(req, paper):
|
def _elsevier_fetch_pdf(req, paper):
|
||||||
"""同一 DOI 取 PDF, 成功落库返回 True。"""
|
"""同一 DOI 取 PDF, 成功落库返回 True。"""
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@@ -1,311 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
"""独立脚本: 从 ScienceDirect 网页下载排版 PDF。
|
|
||||||
|
|
||||||
与 apps.resm 解耦, 独立运行。核心难点是 Cloudflare 人机校验: Playwright 自建
|
|
||||||
浏览器带自动化指纹会被 Turnstile 识破而死循环("are you a robot"), 因此推荐
|
|
||||||
连接你手动启动的真实 Chrome(由真人过一次验证), 脚本只负责驱动它下载。
|
|
||||||
|
|
||||||
前提: 运行方 IP 在机构订阅网段(ScienceDirect 按 IP 授权)。
|
|
||||||
|
|
||||||
依赖: playwright, requests, lxml, pypdf(可选, 用于精确判页数)。
|
|
||||||
凭证: 默认从项目 config/conf.py 读取 ELSEVIER_API_KEY / ELSEVIER_INST_TOKEN,
|
|
||||||
也可用 --pii 直接给 PII 跳过取号。
|
|
||||||
|
|
||||||
用法(推荐 CDP 模式):
|
|
||||||
1) 单独起一个带调试端口的 Chrome(独立档案, 不影响日常浏览器):
|
|
||||||
Windows:
|
|
||||||
& "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" \
|
|
||||||
--remote-debugging-port=9222 --user-data-dir="D:\\chrome-sd-profile"
|
|
||||||
Linux:
|
|
||||||
google-chrome --remote-debugging-port=9222 --user-data-dir=/tmp/sd-profile
|
|
||||||
2) 在该 Chrome 里手动打开任一 ScienceDirect 文章, 亲手过掉 Cloudflare 验证。
|
|
||||||
3) 运行本脚本(可一次多篇):
|
|
||||||
python scripts/sd_download.py 10.1016/j.conbuildmat.2026.146897 \
|
|
||||||
--cdp http://localhost:9222 --out ./sd_pdfs
|
|
||||||
|
|
||||||
不加 --cdp 时脚本自行启动浏览器(大概率被 Cloudflare 拦, 仅调试用)。
|
|
||||||
|
|
||||||
提示: 批量爬 ScienceDirect 违反 Elsevier 条款且可能导致机构 IP 被封, 仅供少量补抓。
|
|
||||||
"""
|
|
||||||
import argparse
|
|
||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
# 项目根入 sys.path, 以便读取 config/conf.py 的凭证
|
|
||||||
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
||||||
if ROOT not in sys.path:
|
|
||||||
sys.path.insert(0, ROOT)
|
|
||||||
|
|
||||||
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
||||||
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
|
||||||
_STEALTH_ARGS = [
|
|
||||||
"--disable-blink-features=AutomationControlled",
|
|
||||||
"--no-first-run", "--no-default-browser-check",
|
|
||||||
"--disable-infobars", "--disable-extensions", "--disable-notifications",
|
|
||||||
]
|
|
||||||
_CHALLENGE_KW = ("just a moment", "moment", "checking your browser",
|
|
||||||
"attention required")
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------ PII / PDF 工具 ------------------------------
|
|
||||||
|
|
||||||
def get_creds():
|
|
||||||
try:
|
|
||||||
from config.conf import ELSEVIER_API_KEY, ELSEVIER_INST_TOKEN
|
|
||||||
return ELSEVIER_API_KEY, ELSEVIER_INST_TOKEN
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[warn] 读取 config/conf.py 凭证失败: {e!r}")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_pii(doi):
|
|
||||||
"""调 Elsevier API(text/xml) 取归一化 PII; 失败返回 None。"""
|
|
||||||
import requests
|
|
||||||
from lxml import etree
|
|
||||||
key, token = get_creds()
|
|
||||||
if not key:
|
|
||||||
return None
|
|
||||||
headers = {"X-ELS-APIKey": key}
|
|
||||||
if token:
|
|
||||||
headers["X-ELS-Insttoken"] = token
|
|
||||||
try:
|
|
||||||
r = requests.get(f"https://api.elsevier.com/content/article/doi/{doi}",
|
|
||||||
params={"httpAccept": "text/xml"}, headers=headers,
|
|
||||||
timeout=(3, 30))
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"[warn] 取 PII 请求失败: {e!r}")
|
|
||||||
return None
|
|
||||||
if r.status_code != 200:
|
|
||||||
print(f"[warn] 取 PII 非 200: {r.status_code}")
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
root = etree.fromstring(r.content)
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
nodes = root.xpath("//*[local-name()='pii']/text()")
|
|
||||||
if not nodes:
|
|
||||||
return None
|
|
||||||
return re.sub(r"[^A-Za-z0-9]", "", nodes[0])
|
|
||||||
|
|
||||||
|
|
||||||
def pdf_page_count(content: bytes):
|
|
||||||
"""返回页数, 判不出返回 None。优先 pypdf, 退化字节扫描。"""
|
|
||||||
try:
|
|
||||||
import logging
|
|
||||||
logging.getLogger("pypdf").setLevel(logging.CRITICAL)
|
|
||||||
from pypdf import PdfReader
|
|
||||||
return len(PdfReader(BytesIO(content), strict=False).pages)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
counts = [int(m) for m in re.findall(rb"/Count\s+(\d+)", content)]
|
|
||||||
if counts:
|
|
||||||
return max(counts)
|
|
||||||
n = len(re.findall(rb"/Type\s*/Page(?![sR])", content))
|
|
||||||
if n:
|
|
||||||
return n
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def classify(content: bytes):
|
|
||||||
"""(kind, pages): broken / preview(1页) / ok(多页) / unknown。"""
|
|
||||||
if not content or b"%PDF" not in content[:1024]:
|
|
||||||
return "broken", 0
|
|
||||||
pages = pdf_page_count(content)
|
|
||||||
if pages is None:
|
|
||||||
return "unknown", None
|
|
||||||
if pages <= 0:
|
|
||||||
return "broken", pages
|
|
||||||
return ("preview" if pages == 1 else "ok"), pages
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------ 浏览器下载 ------------------------------
|
|
||||||
|
|
||||||
async def _wait_challenge_cleared(page, rounds=30, interval=2000):
|
|
||||||
for _ in range(rounds):
|
|
||||||
try:
|
|
||||||
await page.wait_for_timeout(interval)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
title = (await page.title()) or ""
|
|
||||||
except Exception:
|
|
||||||
continue # 跳转中, 下一轮再看
|
|
||||||
if title and not any(k in title.lower() for k in _CHALLENGE_KW):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
async def _find_pdfft_href(page):
|
|
||||||
try:
|
|
||||||
return await page.evaluate(
|
|
||||||
"() => { const a=document.querySelector('a[href*=\"pdfft\"]');"
|
|
||||||
" return a ? a.href : null; }")
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
async def _grab_pdf(page, url, timeout):
|
|
||||||
# 优先 download 事件
|
|
||||||
try:
|
|
||||||
async with page.expect_download(timeout=min(timeout, 45000)) as dl:
|
|
||||||
try:
|
|
||||||
await page.goto(url, timeout=timeout)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
download = await dl.value
|
|
||||||
path = await download.path()
|
|
||||||
if path:
|
|
||||||
with open(path, "rb") as f:
|
|
||||||
data = f.read()
|
|
||||||
if data:
|
|
||||||
return data
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
# 退化: 拦截内联 application/pdf 响应
|
|
||||||
captured = {}
|
|
||||||
|
|
||||||
async def on_resp(resp):
|
|
||||||
try:
|
|
||||||
if "application/pdf" in resp.headers.get("content-type", ""):
|
|
||||||
captured["b"] = await resp.body()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
page.on("response", on_resp)
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
await page.goto(url.replace("?download=true", ""), timeout=timeout)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
for _ in range(12):
|
|
||||||
if captured.get("b"):
|
|
||||||
break
|
|
||||||
await page.wait_for_timeout(1000)
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
page.remove_listener("response", on_resp)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return captured.get("b")
|
|
||||||
|
|
||||||
|
|
||||||
async def download_one(pii, save_path, cdp_url=None, headless=False,
|
|
||||||
timeout=60000):
|
|
||||||
"""返回 (ok, msg)。"""
|
|
||||||
from playwright.async_api import async_playwright
|
|
||||||
article = f"https://www.sciencedirect.com/science/article/pii/{pii}"
|
|
||||||
pdf_url = f"{article}/pdfft?download=true"
|
|
||||||
|
|
||||||
async with async_playwright() as p:
|
|
||||||
connected = bool(cdp_url)
|
|
||||||
if connected:
|
|
||||||
browser = await p.chromium.connect_over_cdp(cdp_url)
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
browser = await p.chromium.launch(headless=headless,
|
|
||||||
channel="chrome",
|
|
||||||
args=_STEALTH_ARGS)
|
|
||||||
except Exception:
|
|
||||||
browser = await p.chromium.launch(headless=headless,
|
|
||||||
args=_STEALTH_ARGS)
|
|
||||||
page = None
|
|
||||||
try:
|
|
||||||
if connected:
|
|
||||||
ctx = browser.contexts[0] if browser.contexts else await browser.new_context()
|
|
||||||
else:
|
|
||||||
ctx = await browser.new_context(
|
|
||||||
viewport={"width": 1920, "height": 1080},
|
|
||||||
user_agent=_UA, locale="en-US", accept_downloads=True)
|
|
||||||
page = await ctx.new_page()
|
|
||||||
page.set_default_timeout(timeout)
|
|
||||||
if not connected:
|
|
||||||
await page.add_init_script(
|
|
||||||
"Object.defineProperty(navigator,'webdriver',{get:()=>false});")
|
|
||||||
|
|
||||||
try:
|
|
||||||
await page.goto(article, wait_until="domcontentloaded",
|
|
||||||
timeout=40000)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
rounds = 60 if connected else 20
|
|
||||||
if not await _wait_challenge_cleared(page, rounds=rounds):
|
|
||||||
return False, "cloudflare_not_cleared"
|
|
||||||
|
|
||||||
# reload 拿干净文章页
|
|
||||||
try:
|
|
||||||
await page.goto(article, wait_until="domcontentloaded",
|
|
||||||
timeout=40000)
|
|
||||||
await page.wait_for_timeout(4000)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
href = await _find_pdfft_href(page) or pdf_url
|
|
||||||
body = await _grab_pdf(page, href, timeout)
|
|
||||||
if not body:
|
|
||||||
return False, "no_pdf_captured"
|
|
||||||
kind, pages = classify(body)
|
|
||||||
if kind != "ok":
|
|
||||||
head = body[:160].decode("utf-8", "replace").replace("\n", " ")
|
|
||||||
return False, f"not_fulltext kind={kind} pages={pages} len={len(body)} head={head!r}"
|
|
||||||
os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
|
|
||||||
with open(save_path, "wb") as f:
|
|
||||||
f.write(body)
|
|
||||||
return True, f"ok pages={pages} len={len(body)}"
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
if connected and page:
|
|
||||||
await page.close()
|
|
||||||
await browser.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------ CLI ------------------------------
|
|
||||||
|
|
||||||
async def _run(args):
|
|
||||||
os.makedirs(args.out, exist_ok=True)
|
|
||||||
ok_n = 0
|
|
||||||
for doi in args.doi:
|
|
||||||
doi = doi.strip()
|
|
||||||
pii = args.pii or fetch_pii(doi)
|
|
||||||
if not pii:
|
|
||||||
print(f"[FAIL] {doi}: 取不到 PII")
|
|
||||||
continue
|
|
||||||
save = os.path.join(args.out, doi.replace("/", "_") + ".pdf")
|
|
||||||
print(f"[..] {doi} PII={pii} -> {save}")
|
|
||||||
ok, msg = await download_one(pii, save, cdp_url=args.cdp or None,
|
|
||||||
headless=args.headless, timeout=args.timeout * 1000)
|
|
||||||
if ok:
|
|
||||||
ok_n += 1
|
|
||||||
print(f"[OK] {doi}: {msg}")
|
|
||||||
else:
|
|
||||||
print(f"[FAIL] {doi}: {msg}")
|
|
||||||
print(f"完成: {ok_n}/{len(args.doi)} 成功")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
ap = argparse.ArgumentParser(description="从 ScienceDirect 网页下载 PDF(独立脚本)")
|
|
||||||
ap.add_argument("doi", nargs="+", help="一个或多个 DOI")
|
|
||||||
ap.add_argument("--cdp", default="",
|
|
||||||
help="连接手动启动的 Chrome, 如 http://localhost:9222(推荐)")
|
|
||||||
ap.add_argument("--out", default="./sd_pdfs", help="PDF 输出目录")
|
|
||||||
ap.add_argument("--pii", default="", help="直接指定 PII(仅单篇时用, 跳过 API 取号)")
|
|
||||||
ap.add_argument("--headless", action="store_true", help="无头(非 CDP 模式, 调试用)")
|
|
||||||
ap.add_argument("--timeout", type=int, default=60, help="单步超时(秒)")
|
|
||||||
args = ap.parse_args()
|
|
||||||
|
|
||||||
if sys.platform == "win32":
|
|
||||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
|
||||||
asyncio.run(_run(args))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
Loading…
Reference in New Issue