zcbot/skills/research/paper.py

119 lines
3.7 KiB
Python

"""paper_server 客户端 helper。base_url 默认硬编码,env 可覆盖。"""
from __future__ import annotations
import os
from pathlib import Path
from typing import Any, Optional
import httpx
# Root URL of the paper_server deployment.  The PAPER_SERVER_URL environment
# variable overrides the built-in default; trailing slashes are stripped so
# that path segments can be appended directly.
_BASE_URL = os.getenv("PAPER_SERVER_URL", "http://paper.xxhhcty.xyz:8080").rstrip("/")

# JSON endpoints (list / retrieve).
_API = _BASE_URL + "/api/resm/paper"

# PDF download prefix; the full path is /resm/paper/<id>/pdf/
_PDF = _BASE_URL + "/resm/paper"

# Timeout, in seconds, passed to the httpx.get() calls in this module.
_TIMEOUT = 30.0

# Keys kept for every record returned by search().
_LIST_FIELDS = (
    "id",
    "doi",
    "title",
    "first_author",
    "publication_year",
    "publication_name",
    "has_fulltext_pdf",
    "has_abstract",
    "type",
)
def _safe_doi(doi: str) -> str:
    """Return *doi* with every "/" replaced by "_".

    The result contains no forward slash, so it can be used as a single
    file-name component (fetch_pdf() builds its target file name from it).
    """
    return "_".join(doi.split("/"))
def _is_doi(s: str) -> bool:
    """Tell whether *s* looks like a DOI rather than a server-side id.

    A string counts as a DOI when it contains a "/" and, ignoring leading
    whitespace, starts with "10.".
    """
    if "/" not in s:
        return False
    return s.lstrip().startswith("10.")
def _resolve_to_id(id_or_doi: str) -> str:
    """Turn an id-or-DOI string into a paper id.

    The input is stripped first.  If it does not look like a DOI (see
    _is_doi) it is returned as-is.  Otherwise the list endpoint is queried
    with ``doi=<value>`` and the ``id`` of the single match is returned.

    Raises:
        ValueError: the DOI matched no record, or matched more than one.
        httpx.HTTPStatusError: the list request returned an error status.
    """
    candidate = id_or_doi.strip()
    if _is_doi(candidate):
        response = httpx.get(_API + "/", params={"doi": candidate}, timeout=_TIMEOUT)
        response.raise_for_status()
        payload = response.json()

        # The endpoint may answer with a paginated envelope ({"results": [...]})
        # or with a bare list; accept both shapes.
        if isinstance(payload, dict) and "results" in payload:
            matches = payload.get("results")
        else:
            matches = payload

        if not matches:
            raise ValueError(f"doi 未命中: {candidate}")
        hit_count = len(matches)
        if hit_count > 1:
            raise ValueError(f"doi 命中多条({hit_count}): {candidate}")
        return matches[0]["id"]

    return candidate
def search(
    keyword: str = "",
    year: Optional[int] = None,
    doi: str = "",
    has_pdf: Optional[bool] = None,
    limit: int = 10,
) -> list[dict]:
    """Search papers and return a trimmed list of records.

    Args:
        keyword: Sent as the ``search`` query parameter when non-empty.
            (Per the original note the server matches it against title /
            first_author / first_author_institution -- server-side
            behaviour, not verifiable from this module.)
        year: Sent as ``publication_year`` when not None (exact year).
        doi: Sent as ``doi`` when non-empty.
        has_pdf: ``True`` sends ``has_fulltext=true``, ``False`` sends
            ``has_fulltext=false``, ``None`` omits the filter.
        limit: Maximum number of records to return.  Defaults to 10 and is
            capped at 50.  A value of zero or less yields an empty list
            without contacting the server.

    Returns:
        A list of dicts, each holding exactly the keys in ``_LIST_FIELDS``
        (keys missing from the server record map to ``None``).

    Raises:
        httpx.HTTPStatusError: the list request returned an error status.
    """
    limit = min(limit, 50)
    if limit <= 0:
        # A negative limit would be sent as a negative page_size and would turn
        # results[:limit] into "drop items from the tail"; zero can only ever
        # produce an empty list.  Short-circuit both cases.
        return []

    params: dict[str, Any] = {"page_size": limit}
    if keyword:
        params["search"] = keyword
    if year is not None:
        params["publication_year"] = year
    if doi:
        params["doi"] = doi
    # Identity checks on purpose: only real booleans toggle the filter.
    if has_pdf is True:
        params["has_fulltext"] = "true"
    elif has_pdf is False:
        params["has_fulltext"] = "false"

    r = httpx.get(_API + "/", params=params, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()

    # Accept both a paginated envelope ({"results": [...]}) and a bare list.
    results = data.get("results") if isinstance(data, dict) and "results" in data else data
    if not results:
        # Covers an empty list as well as a null "results" value.
        return []
    return [{k: p.get(k) for k in _LIST_FIELDS} for p in results[:limit]]
def get_paper(id_or_doi: str) -> dict:
    """Fetch one paper record from the retrieve endpoint.

    *id_or_doi* is first resolved to an id via _resolve_to_id(); the decoded
    JSON body of ``GET <api>/<id>/`` is returned unchanged.

    NOTE(review): the original docstring states that the ``abstract`` field
    comes from the server's retrieve serializer and is an empty string when
    the paper has no abstract row -- server-side behaviour, confirm there.

    Raises:
        ValueError: propagated from _resolve_to_id() for an unmatched or
            ambiguous DOI.
        httpx.HTTPStatusError: the request returned an error status.
    """
    detail_url = f"{_API}/{_resolve_to_id(id_or_doi)}/"
    response = httpx.get(detail_url, timeout=_TIMEOUT)
    response.raise_for_status()
    return response.json()
def fetch_pdf(id_or_doi: str, working_dir: str) -> str:
    """Download a paper's PDF under *working_dir* and return its relative path.

    The target is ``<working_dir>/papers/<safe_doi>.pdf`` where ``safe_doi``
    is the paper's DOI passed through _safe_doi().  A non-empty file already
    present at that location is reused without downloading again.

    The body is streamed into a sibling ``.part`` file and moved onto the
    final name only after it has been written completely.  An interrupted
    transfer therefore never leaves a truncated PDF that a later call would
    mistake for a finished download.

    Args:
        id_or_doi: Paper id or DOI, resolved through get_paper().
        working_dir: Directory under which ``papers/`` is created.

    Returns:
        The path relative to *working_dir*: ``'papers/<safe_doi>.pdf'``.

    Raises:
        RuntimeError: the paper record has a falsy ``has_fulltext_pdf``.
        httpx.HTTPStatusError: the PDF request returned an error status.
    """
    paper = get_paper(id_or_doi)
    if not paper.get("has_fulltext_pdf"):
        reason = paper.get("fail_reason") or "no PDF on server"
        raise RuntimeError(f"paper has no PDF: id={paper.get('id')} reason={reason}")

    safe = _safe_doi(paper["doi"])
    rel = f"papers/{safe}.pdf"
    dest = Path(working_dir) / rel
    if dest.exists() and dest.stat().st_size > 0:
        return rel

    dest.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as dest, so the final rename stays on one filesystem.
    tmp = dest.with_name(dest.name + ".part")
    try:
        with httpx.stream("GET", f"{_PDF}/{paper['id']}/pdf/", timeout=60.0) as resp:
            resp.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in resp.iter_bytes(chunk_size=64 * 1024):
                    f.write(chunk)
        # Only a fully written file is ever visible under the final name.
        tmp.replace(dest)
    finally:
        # After a successful replace() the temp file is gone and this is a
        # no-op; after a failure it removes the partial download.
        if tmp.exists():
            tmp.unlink()
    return rel