178 lines
6.0 KiB
Python
178 lines
6.0 KiB
Python
"""paper_server 客户端 helper。base_url 默认硬编码,env 可覆盖。"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import httpx
|
|
|
|
# Built-in server root, used when PAPER_SERVER_URL is unset or blank.
_DEFAULT_BASE_URL = "http://paper.xxhhcty.xyz:8080"

# An empty / whitespace-only (or bare "/") override would otherwise produce
# host-less URLs such as "/api/resm/paper", so fall back to the default then.
_BASE_URL = (
    os.environ.get("PAPER_SERVER_URL", "").strip().rstrip("/") or _DEFAULT_BASE_URL
)

# REST endpoint root for paper records (list: "<_API>/", detail: "<_API>/<id>/").
_API = f"{_BASE_URL}/api/resm/paper"

# PDF download root: /resm/paper/<id>/pdf/
_PDF = f"{_BASE_URL}/resm/paper"

# Timeout in seconds passed to the JSON API requests.
_TIMEOUT = 30.0
|
|
|
|
# Keys copied, in this order, from each server record into the trimmed
# dicts returned by search().
_LIST_FIELDS = (
    # identity
    "id", "doi", "title",
    # authorship
    "first_author", "first_author_institution",
    # publication info
    "publication_year", "publication_date", "publication_name",
    # availability flags
    "has_fulltext_pdf", "has_fulltext_xml", "has_abstract", "is_oa",
    # record type and content / links
    "type", "abstract", "pdf_url", "xml_url",
)
|
|
|
|
|
|
def _safe_doi(doi: str) -> str:
    """Return *doi* with every "/" replaced by "_".

    Only the forward slash is rewritten; all other characters pass through.
    """
    pieces = doi.split("/")
    return "_".join(pieces)
|
|
|
|
|
|
def _is_doi(s: str) -> bool:
    """Heuristically decide whether *s* is a DOI rather than a record id.

    A string counts as a DOI when it contains a "/" and, ignoring leading
    whitespace, starts with "10.".
    """
    if "/" not in s:
        return False
    trimmed = s.lstrip()
    return trimmed.startswith("10.")
|
|
|
|
|
|
def _resolve_to_id(id_or_doi: str) -> str:
    """Turn an id-or-DOI string into a paper id.

    A value that does not look like a DOI (see ``_is_doi``) is returned
    stripped and otherwise unchanged. A DOI is looked up through the list
    endpoint with a ``doi`` filter and must match exactly one record.

    Raises:
        ValueError: the input is blank, the DOI matches zero or several
            records, or the server payload has an unexpected shape.
        httpx.HTTPStatusError: the lookup request returned an error status.
    """
    s = id_or_doi.strip()
    if not s:
        # An empty id would later build ".../paper//", i.e. a different route.
        raise ValueError("empty paper id / doi")
    if not _is_doi(s):
        return s

    r = httpx.get(_API + "/", params={"doi": s}, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()

    # The list endpoint may answer with a paginated envelope ({"results": [...]})
    # or with a bare list; anything else is not a usable lookup result.
    if isinstance(data, dict):
        if "results" not in data:
            raise ValueError(f"unexpected doi lookup response (no 'results'): {s}")
        results = data["results"]
    else:
        results = data

    if not results:
        raise ValueError(f"doi 未命中: {s}")
    if not isinstance(results, list):
        raise ValueError(
            f"unexpected doi lookup response ({type(results).__name__}): {s}"
        )
    if len(results) > 1:
        raise ValueError(f"doi 命中多条({len(results)}): {s}")
    # Normalise to str so both branches honour the declared return type.
    return str(results[0]["id"])
|
|
|
|
|
|
def _bool_param(flag: Optional[bool]) -> Optional[str]:
    """Map True/False to the query strings "true"/"false"; anything else to None."""
    if flag is True:
        return "true"
    if flag is False:
        return "false"
    return None


def search(
    keyword: str = "",
    year: Optional[int] = None,
    year_gte: Optional[int] = None,
    year_lte: Optional[int] = None,
    doi: str = "",
    first_author: str = "",
    publication_name: str = "",
    has_pdf: Optional[bool] = None,
    is_oa: Optional[bool] = None,
    limit: int = 10,
) -> list[dict]:
    """Search papers and return a trimmed list of records.

    Each returned dict holds exactly the keys in ``_LIST_FIELDS`` (including
    ``abstract``); a key the server did not send maps to ``None``.

    The matching semantics below are those described by the original notes
    for the server side; this function only forwards the values.

    Args:
        keyword: Sent as ``search``. Fuzzy match on title / first_author /
            first_author_institution. The corpus is mainly English, so
            prefer English keywords (translate Chinese input to English
            technical terms first).
        year: Exact publication year.
        year_gte: Lower bound of a publication-year range.
        year_lte: Upper bound of a publication-year range.
        doi: Exact DOI (expected to hit 0 or 1 record).
        first_author: Exact first-author name.
        publication_name: Exact journal name.
        has_pdf: True = only records with a PDF, False = only records
            without one, None = no filter.
        is_oa: True = only open-access records, False = only non-OA,
            None = no filter.
        limit: Maximum number of records to return; default 10, capped at
            50. A value <= 0 returns an empty list without any request.

    Raises:
        httpx.HTTPStatusError: the server answered with an error status.
        ValueError: the response body is neither a list nor a dict
            carrying a ``results`` list.
    """
    if limit <= 0:
        # A negative limit would make the final slice drop rows from the tail
        # instead of capping the result; nothing was asked for, so return nothing.
        return []
    limit = min(limit, 50)

    params: dict[str, Any] = {"page_size": limit}
    # (query parameter, value) pairs in the order they are sent; None = omit.
    optional = (
        ("search", keyword or None),
        ("publication_year", year),
        ("publication_year_gte", year_gte),
        ("publication_year_lte", year_lte),
        ("doi", doi or None),
        ("first_author", first_author or None),
        ("publication_name", publication_name or None),
        ("has_fulltext_pdf", _bool_param(has_pdf)),
        ("is_oa", _bool_param(is_oa)),
    )
    params.update((name, value) for name, value in optional if value is not None)

    r = httpx.get(_API + "/", params=params, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()

    # Accept both a paginated envelope ({"results": [...]}) and a bare list.
    rows = data["results"] if isinstance(data, dict) and "results" in data else data
    if rows is None:
        rows = []
    if not isinstance(rows, list):
        raise ValueError(f"unexpected search response type: {type(rows).__name__}")
    return [{k: p.get(k) for k in _LIST_FIELDS} for p in rows[:limit]]
|
|
|
|
|
|
def get_paper(id_or_doi: str) -> dict:
    """Fetch the full metadata record (including abstract) of one paper.

    The list endpoint already carries ``abstract``, so the normal workflow
    does not need this call; use it when a single id / DOI is given and
    every field of the record is wanted.

    Args:
        id_or_doi: A paper id, or a DOI that ``_resolve_to_id`` turns into one.

    Returns:
        The decoded JSON body of the detail endpoint.

    Raises:
        ValueError: propagated from ``_resolve_to_id`` (e.g. DOI not found).
        httpx.HTTPStatusError: the server answered with an error status.
    """
    from urllib.parse import quote  # local import: only needed here

    pid = _resolve_to_id(id_or_doi)
    # Percent-encode the id so characters such as "/", "?" or "#" in a
    # caller-supplied value cannot change which route is requested.
    segment = quote(str(pid), safe="")
    r = httpx.get(f"{_API}/{segment}/", timeout=_TIMEOUT)
    r.raise_for_status()
    return r.json()
|
|
|
|
|
|
def _stream_to(url: str, dest: Path) -> None:
    """Stream the body of ``GET url`` into the file *dest*.

    Parent directories of *dest* are created as needed. The body is first
    written to a sibling ``<name>.part`` file and only renamed onto *dest*
    once the download has finished, so an interrupted transfer never leaves
    a truncated file at *dest* (callers treat any non-empty *dest* as a
    finished download).

    Raises:
        httpx.HTTPStatusError: the server answered with an error status.
        OSError: the file could not be written or renamed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_name(dest.name + ".part")
    try:
        with httpx.stream("GET", url, timeout=60.0) as resp:
            resp.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in resp.iter_bytes(chunk_size=64 * 1024):
                    f.write(chunk)
        # Rename within the same directory so the final file appears all at once.
        os.replace(tmp, dest)
    except BaseException:
        # Best-effort cleanup of the partial file; never mask the real error.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def fetch_pdf(id_or_doi: str, working_dir: str) -> str:
    """Download a paper's PDF to ``<working_dir>/papers/<safe_doi>.pdf``.

    An already present, non-empty file is reused without downloading again.

    Args:
        id_or_doi: Paper id or DOI, resolved through ``get_paper``.
        working_dir: Directory under which the ``papers/`` folder lives.

    Returns:
        The path relative to *working_dir*: ``'papers/<safe_doi>.pdf'``.

    Raises:
        RuntimeError: the record has ``has_fulltext_pdf`` false (the message
            carries the record's ``fail_reason`` when present), or the
            record has no DOI to derive the file name from.
    """
    paper = get_paper(id_or_doi)
    if not paper.get("has_fulltext_pdf"):
        reason = paper.get("fail_reason") or "no PDF on server"
        raise RuntimeError(f"paper has no PDF: id={paper.get('id')} reason={reason}")

    doi = paper.get("doi")
    if not doi:
        # Without a DOI there is no file name; fail clearly rather than with
        # a KeyError / AttributeError.
        raise RuntimeError(f"paper has no DOI to name the file: id={paper.get('id')}")

    rel = f"papers/{_safe_doi(doi)}.pdf"
    dest = Path(working_dir) / rel
    # is_file() so that a directory at this path is never taken for a cached download.
    if dest.is_file() and dest.stat().st_size > 0:
        return rel
    _stream_to(f"{_PDF}/{paper['id']}/pdf/", dest)
    return rel
|
|
|
|
|
|
def fetch_xml(id_or_doi: str, working_dir: str) -> str:
    """Download a paper's XML to ``<working_dir>/papers/<safe_doi>.xml``.

    The XML is fetched from the ``xml_url`` field of the record (per the
    original notes this is a static media link on paper_server; the PDF
    view has no XML counterpart). An already present, non-empty file is
    reused without downloading again.

    Args:
        id_or_doi: Paper id or DOI, resolved through ``get_paper``.
        working_dir: Directory under which the ``papers/`` folder lives.

    Returns:
        The path relative to *working_dir*: ``'papers/<safe_doi>.xml'``.

    Raises:
        RuntimeError: the record has ``has_fulltext_xml`` false, its
            ``xml_url`` is empty, or it has no DOI to derive the file
            name from.
    """
    paper = get_paper(id_or_doi)
    if not paper.get("has_fulltext_xml"):
        raise RuntimeError(f"paper has no XML: id={paper.get('id')}")

    xml_url = paper.get("xml_url") or ""
    if not xml_url:
        # Original note: a missing publication_date (files under the "unknown"
        # directory) means paper_server does not expose a media URL for the XML.
        raise RuntimeError(
            f"paper xml_url unavailable (likely missing publication_date): id={paper.get('id')}"
        )

    doi = paper.get("doi")
    if not doi:
        # Without a DOI there is no file name; fail clearly rather than with
        # a KeyError / AttributeError.
        raise RuntimeError(f"paper has no DOI to name the file: id={paper.get('id')}")

    rel = f"papers/{_safe_doi(doi)}.xml"
    dest = Path(working_dir) / rel
    # is_file() so that a directory at this path is never taken for a cached download.
    if dest.is_file() and dest.stat().st_size > 0:
        return rel
    _stream_to(xml_url, dest)
    return rel
|