119 lines
3.7 KiB
Python
119 lines
3.7 KiB
Python
"""paper_server 客户端 helper。base_url 默认硬编码,env 可覆盖。"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import httpx
|
|
|
|
# Server root: the PAPER_SERVER_URL environment variable wins when set,
# otherwise the built-in default is used. Trailing slashes are dropped so the
# endpoint paths below join cleanly.
_BASE_URL = os.getenv("PAPER_SERVER_URL", "http://paper.xxhhcty.xyz:8080").rstrip("/")
_API = _BASE_URL + "/api/resm/paper"
_PDF = _BASE_URL + "/resm/paper"  # PDF download lives at /resm/paper/<id>/pdf/
_TIMEOUT = 30.0  # passed as timeout= to the metadata requests
|
|
|
|
# Keys copied from each server record into the trimmed rows that search() returns.
_LIST_FIELDS = (
    "id", "doi", "title",
    "first_author", "publication_year", "publication_name",
    "has_fulltext_pdf", "has_abstract", "type",
)
|
|
|
|
|
|
def _safe_doi(doi: str) -> str:
    """Return *doi* with every "/" swapped for "_", for use as a file name stem."""
    return "_".join(doi.split("/"))
|
|
|
|
|
|
def _is_doi(s: str) -> bool:
    """Tell whether *s* looks like a DOI.

    It must contain a "/" and, ignoring leading whitespace, start with "10.".
    """
    if "/" not in s:
        return False
    return s.lstrip().startswith("10.")
|
|
|
|
|
|
def _resolve_to_id(id_or_doi: str) -> str:
    """Turn an id-or-DOI argument into a paper id.

    Input that does not look like a DOI is returned as-is (whitespace-stripped).
    A DOI is looked up through the list endpoint and must match exactly one
    record.

    Raises:
        ValueError: the DOI matched no record, or more than one.
        httpx.HTTPStatusError: the lookup request returned an error status.
    """
    key = id_or_doi.strip()
    if not _is_doi(key):
        return key

    resp = httpx.get(f"{_API}/", params={"doi": key}, timeout=_TIMEOUT)
    resp.raise_for_status()
    payload = resp.json()

    # The rows are either wrapped in a dict under "results" or are the payload itself.
    if isinstance(payload, dict) and "results" in payload:
        matches = payload["results"]
    else:
        matches = payload

    if not matches:
        raise ValueError(f"doi 未命中: {key}")
    if len(matches) > 1:
        raise ValueError(f"doi 命中多条({len(matches)}): {key}")
    return matches[0]["id"]
|
|
|
|
|
|
def search(
    keyword: str = "",
    year: Optional[int] = None,
    doi: str = "",
    has_pdf: Optional[bool] = None,
    limit: int = 10,
) -> list[dict]:
    """Search papers and return a trimmed list of records.

    Args:
        keyword: Sent as the server's ``search`` parameter, which matches
            title / first_author / first_author_institution.
        year: Exact publication year (the server currently supports exact
            match only).
        doi: Exact DOI (0 or 1 hit).
        has_pdf: True returns only papers whose PDF is already downloaded,
            False only papers without one, None applies no filter.
        limit: Maximum number of rows; default 10, capped at 50. Zero or a
            negative value returns an empty list without contacting the server.

    Returns:
        One dict per paper holding exactly the keys in ``_LIST_FIELDS``; a key
        the server did not send maps to None.

    Raises:
        httpx.HTTPStatusError: the request returned an error status.
    """
    if limit <= 0:
        # A negative limit would otherwise be sent as page_size and then hit
        # the final slice as results[:-k], silently dropping trailing rows
        # instead of limiting them. Nothing can be returned, so skip the call.
        return []
    limit = min(limit, 50)

    params: dict[str, Any] = {"page_size": limit}
    if keyword:
        params["search"] = keyword
    if year is not None:
        params["publication_year"] = year
    if doi:
        params["doi"] = doi
    # Identity checks on purpose: only real booleans switch the filter on.
    if has_pdf is True:
        params["has_fulltext"] = "true"
    elif has_pdf is False:
        params["has_fulltext"] = "false"

    r = httpx.get(_API + "/", params=params, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()
    # The rows are either wrapped in a dict under "results" or are the payload itself.
    results = data.get("results") if isinstance(data, dict) and "results" in data else data
    # Slice again locally in case the server returned more rows than page_size.
    return [{k: p.get(k) for k in _LIST_FIELDS} for p in results[:limit]]
|
|
|
|
|
|
def get_paper(id_or_doi: str) -> dict:
    """Fetch one paper's metadata plus abstract from the retrieve endpoint.

    Accepts either a paper id or a DOI. The ``abstract`` field is supplied by
    paper_server's retrieve serializer and is an empty string when the paper
    has no PaperAbstract row.
    """
    paper_id = _resolve_to_id(id_or_doi)
    detail_url = f"{_API}/{paper_id}/"
    resp = httpx.get(detail_url, timeout=_TIMEOUT)
    resp.raise_for_status()
    return resp.json()
|
|
|
|
|
|
def fetch_pdf(id_or_doi: str, working_dir: str) -> str:
    """Download a paper's PDF under ``<working_dir>/papers/`` and return its relative path.

    The file is stored as ``papers/<safe_doi>.pdf``. A record without a DOI is
    stored under its paper id instead. An existing non-empty file is reused
    without downloading again.

    Args:
        id_or_doi: Paper id or DOI.
        working_dir: Directory that contains (or will contain) ``papers/``.

    Returns:
        The path relative to ``working_dir``, e.g. ``'papers/<safe_doi>.pdf'``.

    Raises:
        RuntimeError: the record's ``has_fulltext_pdf`` is falsy.
        ValueError: a DOI argument matched zero or several records.
        httpx.HTTPStatusError: a request returned an error status.
    """
    paper = get_paper(id_or_doi)
    if not paper.get("has_fulltext_pdf"):
        reason = paper.get("fail_reason") or "no PDF on server"
        raise RuntimeError(f"paper has no PDF: id={paper.get('id')} reason={reason}")

    # A missing/None DOI used to crash here, and an empty one mapped every such
    # paper to the same "papers/.pdf"; name those files by paper id instead.
    doi = paper.get("doi")
    stem = _safe_doi(doi) if doi else str(paper["id"])
    rel = f"papers/{stem}.pdf"
    dest = Path(working_dir) / rel
    if dest.exists() and dest.stat().st_size > 0:
        return rel

    dest.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a sibling temp file and rename only once the body is complete.
    # Writing straight to dest would leave a truncated, non-empty file behind on
    # an interrupted download, which the reuse check above would then accept.
    partial = dest.with_name(dest.name + ".part")
    try:
        with httpx.stream("GET", f"{_PDF}/{paper['id']}/pdf/", timeout=60.0) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in resp.iter_bytes(chunk_size=64 * 1024):
                    f.write(chunk)
        partial.replace(dest)
    finally:
        # After a successful rename the temp name is gone; otherwise clean it up.
        if partial.exists():
            partial.unlink()
    return rel
|