"""paper_server 客户端 helper。base_url 默认硬编码,env 可覆盖。""" from __future__ import annotations import os from pathlib import Path from typing import Any, Optional import httpx _BASE_URL = os.environ.get("PAPER_SERVER_URL", "http://paper.xxhhcty.xyz:8080").rstrip("/") _API = f"{_BASE_URL}/api/resm/paper" _TIMEOUT = 30.0 _LIST_FIELDS = ( "id", "doi", "title", "first_author", "first_author_institution", "publication_year", "publication_date", "publication_name", "has_fulltext_pdf", "has_fulltext_xml", "has_abstract", "is_oa", "type", "abstract", "pdf_url", "xml_url", ) def _safe_doi(doi: str) -> str: return doi.replace("/", "_") def _is_doi(s: str) -> bool: return "/" in s and s.lstrip().startswith("10.") def _resolve_to_id(id_or_doi: str) -> str: """传 id 直接返,传 doi → 调 list 接口取 id。命中 0 / 多条都抛。""" s = id_or_doi.strip() if not _is_doi(s): return s r = httpx.get(_API + "/", params={"doi": s}, timeout=_TIMEOUT) r.raise_for_status() data = r.json() results = data.get("results") if isinstance(data, dict) and "results" in data else data if not results: raise ValueError(f"doi 未命中: {s}") if len(results) > 1: raise ValueError(f"doi 命中多条({len(results)}): {s}") return results[0]["id"] def search( keyword: str = "", year: Optional[int] = None, year_gte: Optional[int] = None, year_lte: Optional[int] = None, doi: str = "", first_author: str = "", publication_name: str = "", has_pdf: Optional[bool] = None, is_oa: Optional[bool] = None, limit: int = 10, ) -> list[dict]: """搜文献,返回精简列表(每条含 abstract 字段,有就是文本,没就是空串)。 keyword: paper_server SearchFilter,模糊匹配 title / first_author / first_author_institution 库里主语料是英文,**优先英文关键词**(用户中文输入要先转专业英文术语) year: 精确年份 year_gte/year_lte: 年份范围(做"近 N 年文献"用) doi: 精确 DOI(命中 0/1 条) first_author: 精确作者名 publication_name: 精确期刊名 has_pdf: True 仅返 PDF 已下好的;False 仅返没 PDF 的;None 都返 is_oa: True 仅返 OA 的;False 仅返非 OA;None 都返 limit: 默认 10,上限 50 """ if limit > 50: limit = 50 params: dict[str, Any] = {"page_size": limit} if keyword: params["search"] = keyword if year is not None: params["publication_year"] = year if year_gte is not None: params["publication_year_gte"] = year_gte if year_lte is not None: params["publication_year_lte"] = year_lte if doi: params["doi"] = doi if first_author: params["first_author"] = first_author if publication_name: params["publication_name"] = publication_name if has_pdf is True: params["has_fulltext_pdf"] = "true" elif has_pdf is False: params["has_fulltext_pdf"] = "false" if is_oa is True: params["is_oa"] = "true" elif is_oa is False: params["is_oa"] = "false" r = httpx.get(_API + "/", params=params, timeout=_TIMEOUT) r.raise_for_status() data = r.json() results = data.get("results") if isinstance(data, dict) and "results" in data else data return [{k: p.get(k) for k in _LIST_FIELDS} for p in results[:limit]] def get_paper(id_or_doi: str) -> dict: """取单条完整 metadata + abstract。 list 端点已带 abstract,正常工作流不需要调本函数;仅在用户给单个 id/DOI 想拿全字段时用。 """ pid = _resolve_to_id(id_or_doi) r = httpx.get(f"{_API}/{pid}/", timeout=_TIMEOUT) r.raise_for_status() return r.json() def _stream_to(url: str, dest: Path) -> None: dest.parent.mkdir(parents=True, exist_ok=True) with httpx.stream("GET", url, timeout=60.0) as resp: resp.raise_for_status() with open(dest, "wb") as f: for chunk in resp.iter_bytes(chunk_size=64 * 1024): f.write(chunk) def fetch_pdf(id_or_doi: str, working_dir: str) -> str: """下载 PDF 到 /papers/.pdf,返回相对路径 'papers/.pdf'。 走 paper_server media 静态直链(从 list/retrieve 返回的 pdf_url 字段),跟 fetch_xml 同范式。 paper.has_fulltext_pdf=False / pdf_url 空(publication_date 缺失时)→ 抛 RuntimeError。 已存在跳过下载直接复用。 """ paper = get_paper(id_or_doi) if not paper.get("has_fulltext_pdf"): reason = paper.get("fail_reason") or "no PDF on server" raise RuntimeError(f"paper has no PDF: id={paper.get('id')} reason={reason}") pdf_url = paper.get("pdf_url") or "" if not pdf_url: raise RuntimeError( f"paper pdf_url unavailable (likely missing publication_date): id={paper.get('id')}" ) safe = _safe_doi(paper["doi"]) rel = f"papers/{safe}.pdf" dest = Path(working_dir) / rel if dest.exists() and dest.stat().st_size > 0: return rel _stream_to(pdf_url, dest) return rel def fetch_xml(id_or_doi: str, working_dir: str) -> str: """下载 XML 到 /papers/.xml,返回相对路径 'papers/.xml'。 XML 走 paper_server media 静态直链(由 list/retrieve 返回的 xml_url 字段提供); paper_pdf_view 只覆盖 PDF,XML 没对应 API。 paper.has_fulltext_xml=False / xml_url 空 → 抛 RuntimeError。 已存在跳过下载直接复用。 """ paper = get_paper(id_or_doi) if not paper.get("has_fulltext_xml"): raise RuntimeError(f"paper has no XML: id={paper.get('id')}") xml_url = paper.get("xml_url") or "" if not xml_url: # publication_date 缺失(unknown 目录)→ paper_server 没暴露这层 media URL raise RuntimeError( f"paper xml_url unavailable (likely missing publication_date): id={paper.get('id')}" ) safe = _safe_doi(paper["doi"]) rel = f"papers/{safe}.xml" dest = Path(working_dir) / rel if dest.exists() and dest.stat().st_size > 0: return rel _stream_to(xml_url, dest) return rel