# zcbot/skills/research/paper.py

"""Client helpers for paper_server. The base URL has a hard-coded default that an environment variable can override."""
from __future__ import annotations

import os
from pathlib import Path
from typing import Any, Optional

import httpx

_BASE_URL = os.environ.get("PAPER_SERVER_URL", "http://paper.xxhhcty.xyz:8080").rstrip("/")
_API = f"{_BASE_URL}/api/resm/paper"
_PDF = f"{_BASE_URL}/resm/paper"  # /resm/paper/<id>/pdf/
_TIMEOUT = 30.0

_LIST_FIELDS = (
    "id",
    "doi",
    "title",
    "first_author",
    "publication_year",
    "publication_name",
    "has_fulltext_pdf",
    "has_abstract",
    "type",
)


def _safe_doi(doi: str) -> str:
    return doi.replace("/", "_")


def _is_doi(s: str) -> bool:
    return "/" in s and s.lstrip().startswith("10.")


def _resolve_to_id(id_or_doi: str) -> str:
    """传 id 直接返,传 doi → 调 list 接口取 id。命中 0 / 多条都抛。"""
    s = id_or_doi.strip()
    if not _is_doi(s):
        return s
    r = httpx.get(_API + "/", params={"doi": s}, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()
    results = data.get("results") if isinstance(data, dict) and "results" in data else data
    if not results:
        raise ValueError(f"doi 未命中: {s}")
    if len(results) > 1:
        raise ValueError(f"doi 命中多条({len(results)}): {s}")
    return results[0]["id"]


def search(
    keyword: str = "",
    year: Optional[int] = None,
    doi: str = "",
    has_pdf: Optional[bool] = None,
    limit: int = 10,
) -> list[dict]:
    """Search papers and return a trimmed list of records.

    Args:
        keyword: sent as the paper_server ``search`` parameter, which matches
            title / first_author / first_author_institution.
        year: exact publication year (paper_server currently supports exact
            match only).
        doi: exact DOI (matches zero or one record).
        has_pdf: True returns only papers whose PDF is already downloaded,
            False only papers without a PDF, None returns both.
        limit: maximum number of records, default 10. Values above 50 are
            clamped to 50; zero or negative values yield an empty list
            without contacting the server.

    Returns:
        One dict per hit, restricted to the keys in ``_LIST_FIELDS``; keys
        missing from a server record map to None.

    Raises:
        ValueError: the response does not carry a list of records.
        httpx.HTTPStatusError: the request returned an error status.
    """
    # Clamp both ends. A negative limit would otherwise reach the final slice
    # as results[:-k] and silently drop trailing hits.
    limit = max(0, min(limit, 50))
    if limit == 0:
        return []
    params: dict[str, Any] = {"page_size": limit}
    if keyword:
        params["search"] = keyword
    if year is not None:
        params["publication_year"] = year
    if doi:
        params["doi"] = doi
    if has_pdf is True:
        params["has_fulltext"] = "true"
    elif has_pdf is False:
        params["has_fulltext"] = "false"
    r = httpx.get(_API + "/", params=params, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()
    # Paginated responses wrap the hits in "results"; otherwise the payload
    # itself is expected to be the list of hits.
    results = data.get("results") if isinstance(data, dict) and "results" in data else data
    if not isinstance(results, list):
        raise ValueError(f"unexpected search response type: {type(results).__name__}")
    return [{k: p.get(k) for k in _LIST_FIELDS} for p in results[:limit]]


def get_paper(id_or_doi: str) -> dict:
    """Fetch one paper's metadata and abstract through the retrieve endpoint.

    The argument may be a paper id or a DOI; it is resolved to an id first.
    Per the original note, the ``abstract`` field comes from paper_server's
    retrieve serializer and is an empty string when the paper has no
    PaperAbstract row.

    Raises whatever id resolution raises, plus httpx.HTTPStatusError when the
    retrieve request returns an error status.
    """
    paper_id = _resolve_to_id(id_or_doi)
    url = f"{_API}/{paper_id}/"
    response = httpx.get(url, timeout=_TIMEOUT)
    response.raise_for_status()
    payload = response.json()
    return payload


def fetch_pdf(id_or_doi: str, working_dir: str) -> str:
    """Download a paper's PDF into ``<working_dir>/papers/`` and return its relative path.

    The file is named ``<safe_doi>.pdf``, where the DOI has "/" replaced by
    "_". A paper whose DOI is missing or empty is named after its id instead,
    so DOI-less papers do not all share ``papers/.pdf``. An existing non-empty
    file is reused without downloading again.

    Args:
        id_or_doi: paper id or DOI, resolved through ``get_paper``.
        working_dir: directory under which ``papers/`` is created.

    Returns:
        The path relative to ``working_dir``, e.g. ``papers/<safe_doi>.pdf``.

    Raises:
        RuntimeError: the paper record has a falsy ``has_fulltext_pdf``.
        httpx.HTTPStatusError: the PDF request returned an error status.
    """
    paper = get_paper(id_or_doi)
    if not paper.get("has_fulltext_pdf"):
        reason = paper.get("fail_reason") or "no PDF on server"
        raise RuntimeError(f"paper has no PDF: id={paper.get('id')} reason={reason}")
    doi = paper.get("doi")
    safe = _safe_doi(doi) if doi else str(paper["id"])
    rel = f"papers/{safe}.pdf"
    dest = Path(working_dir) / rel
    if dest.exists() and dest.stat().st_size > 0:
        return rel
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a sibling ".part" file and rename only after the transfer
    # finishes. Writing straight to dest would leave a truncated file behind
    # on failure, and the cache check above would later return it as valid.
    partial = dest.with_name(dest.name + ".part")
    try:
        with httpx.stream("GET", f"{_PDF}/{paper['id']}/pdf/", timeout=60.0) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in resp.iter_bytes(chunk_size=64 * 1024):
                    f.write(chunk)
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
    return rel