"""paper_server 客户端 helper。base_url 默认硬编码,env 可覆盖。""" from __future__ import annotations import os from pathlib import Path from typing import Any, Optional import httpx _BASE_URL = os.environ.get("PAPER_SERVER_URL", "http://paper.xxhhcty.xyz:8080").rstrip("/") _API = f"{_BASE_URL}/api/resm/paper" _PDF = f"{_BASE_URL}/resm/paper" # /resm/paper//pdf/ _TIMEOUT = 30.0 _LIST_FIELDS = ( "id", "doi", "title", "first_author", "publication_year", "publication_name", "has_fulltext_pdf", "has_abstract", "type", ) def _safe_doi(doi: str) -> str: return doi.replace("/", "_") def _is_doi(s: str) -> bool: return "/" in s and s.lstrip().startswith("10.") def _resolve_to_id(id_or_doi: str) -> str: """传 id 直接返,传 doi → 调 list 接口取 id。命中 0 / 多条都抛。""" s = id_or_doi.strip() if not _is_doi(s): return s r = httpx.get(_API + "/", params={"doi": s}, timeout=_TIMEOUT) r.raise_for_status() data = r.json() results = data.get("results") if isinstance(data, dict) and "results" in data else data if not results: raise ValueError(f"doi 未命中: {s}") if len(results) > 1: raise ValueError(f"doi 命中多条({len(results)}): {s}") return results[0]["id"] def search( keyword: str = "", year: Optional[int] = None, doi: str = "", has_pdf: Optional[bool] = None, limit: int = 10, ) -> list[dict]: """搜文献,返回精简列表。 keyword: paper_server search 字段,匹配 title / first_author / first_author_institution year: 精确年份(paper_server 当前只支持 exact) doi: 精确 DOI(命中 0/1 条) has_pdf: True 仅返已下好 PDF;False 仅返没 PDF;None 都返 limit: 默认 10,上限 50 """ if limit > 50: limit = 50 params: dict[str, Any] = {"page_size": limit} if keyword: params["search"] = keyword if year is not None: params["publication_year"] = year if doi: params["doi"] = doi if has_pdf is True: params["has_fulltext"] = "true" elif has_pdf is False: params["has_fulltext"] = "false" r = httpx.get(_API + "/", params=params, timeout=_TIMEOUT) r.raise_for_status() data = r.json() results = data.get("results") if isinstance(data, dict) and "results" in data else data return [{k: p.get(k) for k in _LIST_FIELDS} for p in results[:limit]] def get_paper(id_or_doi: str) -> dict: """取单条 metadata + abstract(走 retrieve 端点)。 abstract 字段由 paper_server retrieve serializer 提供;无 PaperAbstract 行时返空串。 """ pid = _resolve_to_id(id_or_doi) r = httpx.get(f"{_API}/{pid}/", timeout=_TIMEOUT) r.raise_for_status() return r.json() def fetch_pdf(id_or_doi: str, working_dir: str) -> str: """下载 PDF 到 /papers/.pdf,返回相对路径 'papers/.pdf'。 已存在跳过下载直接复用。paper.has_fulltext_pdf=False → 抛 RuntimeError。 """ paper = get_paper(id_or_doi) if not paper.get("has_fulltext_pdf"): reason = paper.get("fail_reason") or "no PDF on server" raise RuntimeError(f"paper has no PDF: id={paper.get('id')} reason={reason}") safe = _safe_doi(paper["doi"]) rel = f"papers/{safe}.pdf" dest = Path(working_dir) / rel if dest.exists() and dest.stat().st_size > 0: return rel dest.parent.mkdir(parents=True, exist_ok=True) with httpx.stream("GET", f"{_PDF}/{paper['id']}/pdf/", timeout=60.0) as resp: resp.raise_for_status() with open(dest, "wb") as f: for chunk in resp.iter_bytes(chunk_size=64 * 1024): f.write(chunk) return rel