"""paper_server 客户端 helper。base_url 默认硬编码,env 可覆盖。"""
from __future__ import annotations

import os
from pathlib import Path
from typing import Any, Optional

import httpx

# Root URL of the paper_server instance: the value of the PAPER_SERVER_URL
# environment variable when it is set, otherwise the built-in default.
# Trailing "/" characters are stripped so path segments can be appended
# with a single "/".
_BASE_URL = os.environ.get("PAPER_SERVER_URL", "http://paper.xxhhcty.xyz:8080").rstrip("/")
# Base path of the paper endpoints; callers append "/" (list) or "/<id>/" (detail).
_API = f"{_BASE_URL}/api/resm/paper"
# Value passed as `timeout=` to the httpx.get metadata requests in this module.
_TIMEOUT = 30.0

# Keys copied from each server record into the trimmed dicts returned by
# search(). A key that is absent from a record comes back as None.
_LIST_FIELDS = (
    "id",
    "doi",
    "title",
    "first_author",
    "first_author_institution",
    "publication_year",
    "publication_date",
    "publication_name",
    "has_fulltext_pdf",
    "has_fulltext_xml",
    "has_abstract",
    "is_oa",
    "type",
    "abstract",
    "pdf_url",
    "xml_url",
)


def _safe_doi(doi: str) -> str:
    """Return *doi* with every "/" turned into "_".

    The result is used by the fetch helpers as a single file-name component.
    """
    segments = doi.split("/")
    return "_".join(segments)


def _is_doi(s: str) -> bool:
    """Tell whether *s* looks like a DOI rather than a server-side id.

    A string counts as a DOI when it contains a "/" and, ignoring leading
    whitespace, starts with "10.".
    """
    if "/" not in s:
        return False
    without_leading_ws = s.lstrip()
    return without_leading_ws.startswith("10.")


def _resolve_to_id(id_or_doi: str) -> str:
    """Turn an id-or-DOI string into a paper id.

    A value that does not look like a DOI (see ``_is_doi``) is returned as is,
    stripped of surrounding whitespace. A DOI is looked up through the list
    endpoint with a ``doi`` query parameter, and the ``id`` of the single
    matching record is returned unchanged.
    NOTE(review): the server may return a non-string id (e.g. an int); the
    value is passed through without conversion - confirm against the API.

    Raises:
        ValueError: if the input is blank, if the DOI matches zero or more
            than one record, or if the response is not a list of records.
    """
    s = id_or_doi.strip()
    if not s:
        # A blank id would otherwise make the caller request "<api>//".
        raise ValueError("id_or_doi is empty")
    if not _is_doi(s):
        return s
    r = httpx.get(_API + "/", params={"doi": s}, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()
    # Accept both a paginated payload ({"results": [...]}) and a bare list.
    results = data.get("results") if isinstance(data, dict) and "results" in data else data
    if not results:
        raise ValueError(f"doi 未命中: {s}")
    if not isinstance(results, list):
        # E.g. an error object without "results": fail with a clear message
        # instead of an opaque KeyError on results[0].
        raise ValueError(
            f"unexpected response shape for doi lookup: {type(results).__name__}"
        )
    if len(results) > 1:
        raise ValueError(f"doi 命中多条({len(results)}): {s}")
    return results[0]["id"]


def search(
    keyword: str = "",
    year: Optional[int] = None,
    year_gte: Optional[int] = None,
    year_lte: Optional[int] = None,
    doi: str = "",
    first_author: str = "",
    publication_name: str = "",
    has_pdf: Optional[bool] = None,
    is_oa: Optional[bool] = None,
    limit: int = 10,
) -> list[dict]:
    """Search papers and return a trimmed list of records.

    Each returned dict contains exactly the keys in ``_LIST_FIELDS``; a key the
    server did not send is present with the value ``None``.

    Args:
        keyword: Free-text query, sent as the ``search`` parameter. The server
            is expected to fuzzy-match it against title / first_author /
            first_author_institution. The corpus is mainly English, so prefer
            English keywords (translate Chinese user input into the proper
            English technical terms first).
        year: Exact publication year.
        year_gte: Lower bound of the publication year range.
        year_lte: Upper bound of the publication year range (together with
            ``year_gte`` this covers "papers from the last N years").
        doi: DOI filter (expected to match zero or one record).
        first_author: Author-name filter (expected to be an exact match).
        publication_name: Journal-name filter (expected to be an exact match).
        has_pdf: ``True`` -> only papers whose PDF is available on the server;
            ``False`` -> only papers without one; ``None`` -> no filter.
        is_oa: ``True`` -> only open-access papers; ``False`` -> only
            non-open-access papers; ``None`` -> no filter.
        limit: Maximum number of records to return. Defaults to 10, capped at
            50. A value of 0 or less returns an empty list without contacting
            the server.

    Returns:
        At most ``limit`` trimmed records, in server order.

    Raises:
        ValueError: if the response payload is not a list of records.
        Whatever ``httpx.get`` / ``raise_for_status()`` raise on transport or
        HTTP errors.
    """
    limit = min(limit, 50)
    if limit <= 0:
        # A negative limit would be sent as page_size and then make
        # results[:limit] silently drop items from the tail.
        return []

    params: dict[str, Any] = {"page_size": limit}
    if keyword:
        params["search"] = keyword
    # Year filters are tested against None so an explicit 0 is still forwarded.
    year_filters = (
        ("publication_year", year),
        ("publication_year_gte", year_gte),
        ("publication_year_lte", year_lte),
    )
    for name, value in year_filters:
        if value is not None:
            params[name] = value
    text_filters = (
        ("doi", doi),
        ("first_author", first_author),
        ("publication_name", publication_name),
    )
    for name, value in text_filters:
        if value:
            params[name] = value
    # Tri-state flags: only the real True / False singletons add a filter.
    for name, flag in (("has_fulltext_pdf", has_pdf), ("is_oa", is_oa)):
        if flag is True:
            params[name] = "true"
        elif flag is False:
            params[name] = "false"

    r = httpx.get(_API + "/", params=params, timeout=_TIMEOUT)
    r.raise_for_status()
    data = r.json()
    # Accept both a paginated payload ({"results": [...]}) and a bare list.
    results = data["results"] if isinstance(data, dict) and "results" in data else data
    if results is None:
        results = []
    if not isinstance(results, list):
        raise ValueError(f"unexpected search response shape: {type(results).__name__}")
    return [{k: p.get(k) for k in _LIST_FIELDS} for p in results[:limit]]


def get_paper(id_or_doi: str) -> dict:
    """Fetch the full record of a single paper from the detail endpoint.

    The list endpoint already carries the abstract, so the normal workflow
    does not need this function; use it only when a single id / DOI is given
    and every field of the record is wanted.

    Args:
        id_or_doi: A server-side paper id, or a DOI (resolved to an id via
            ``_resolve_to_id``).

    Returns:
        The decoded JSON body of the detail response.

    Raises:
        ValueError: if the id resolves to an empty value, or on whatever
            ``_resolve_to_id`` rejects.
        Whatever ``httpx.get`` / ``raise_for_status()`` raise on transport or
        HTTP errors.
    """
    from urllib.parse import quote

    pid = _resolve_to_id(id_or_doi)
    segment = str(pid).strip()
    if not segment:
        # An empty segment would request "<api>//" instead of a detail URL.
        raise ValueError("paper id is empty")
    # Percent-encode the id so characters such as "/", "?" or "#" in a
    # caller-supplied value cannot change which endpoint is requested.
    r = httpx.get(f"{_API}/{quote(segment, safe='')}/", timeout=_TIMEOUT)
    r.raise_for_status()
    return r.json()


def _stream_to(url: str, dest: Path) -> None:
    """Stream the body of a GET request for *url* into the file *dest*.

    The parent directory of *dest* is created when missing. The body is first
    written to a sibling ``<name>.part`` file and moved onto *dest* with
    ``os.replace`` only once it has been received completely. An interrupted
    download therefore never leaves a truncated file at *dest*, which matters
    because the fetch helpers reuse any non-empty *dest* as a finished
    download.

    Any exception from the request, ``raise_for_status()`` or the file write
    propagates to the caller after the partial file has been removed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        with httpx.stream("GET", url, timeout=60.0) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in resp.iter_bytes(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial, dest)
    except BaseException:
        # Best-effort cleanup; the original error is what the caller must see.
        try:
            os.unlink(partial)
        except OSError:
            pass
        raise


def fetch_pdf(id_or_doi: str, working_dir: str) -> str:
    """Download a paper's PDF into ``<working_dir>/papers/<safe_doi>.pdf``.

    The file is fetched from the ``pdf_url`` field of the paper record (a
    direct link to the server's static media), the same approach fetch_xml
    uses. If a non-empty file already exists at the destination, the download
    is skipped and the file is reused.

    Args:
        id_or_doi: Paper id or DOI, as accepted by ``get_paper``.
        working_dir: Directory under which the ``papers/`` folder lives.

    Returns:
        The path relative to *working_dir*: ``'papers/<safe_doi>.pdf'``.

    Raises:
        RuntimeError: if the record reports no PDF, if ``pdf_url`` is empty
            (presumably when publication_date is missing on the server), or
            if the record has no DOI to derive the file name from.
    """
    paper = get_paper(id_or_doi)
    pid = paper.get("id")
    if not paper.get("has_fulltext_pdf"):
        reason = paper.get("fail_reason") or "no PDF on server"
        raise RuntimeError(f"paper has no PDF: id={pid} reason={reason}")
    pdf_url = paper.get("pdf_url") or ""
    if not pdf_url:
        raise RuntimeError(
            f"paper pdf_url unavailable (likely missing publication_date): id={pid}"
        )
    doi = paper.get("doi")
    if not doi:
        # Without this guard a missing / null DOI surfaces as KeyError or
        # AttributeError rather than the RuntimeError used for every other
        # "cannot fetch" condition.
        raise RuntimeError(f"paper has no DOI, cannot derive file name: id={pid}")
    rel = f"papers/{_safe_doi(doi)}.pdf"
    dest = Path(working_dir) / rel
    # Reuse an earlier download; an empty file is treated as missing.
    if dest.exists() and dest.stat().st_size > 0:
        return rel
    _stream_to(pdf_url, dest)
    return rel


def fetch_xml(id_or_doi: str, working_dir: str) -> str:
    """Download a paper's XML into ``<working_dir>/papers/<safe_doi>.xml``.

    The file is fetched from the ``xml_url`` field of the paper record (a
    direct link to the server's static media); per the original notes, the
    server's paper_pdf_view covers PDF only and there is no dedicated XML
    API. If a non-empty file already exists at the destination, the download
    is skipped and the file is reused.

    Args:
        id_or_doi: Paper id or DOI, as accepted by ``get_paper``.
        working_dir: Directory under which the ``papers/`` folder lives.

    Returns:
        The path relative to *working_dir*: ``'papers/<safe_doi>.xml'``.

    Raises:
        RuntimeError: if the record reports no XML, if ``xml_url`` is empty,
            or if the record has no DOI to derive the file name from.
    """
    paper = get_paper(id_or_doi)
    pid = paper.get("id")
    if not paper.get("has_fulltext_xml"):
        raise RuntimeError(f"paper has no XML: id={pid}")
    xml_url = paper.get("xml_url") or ""
    if not xml_url:
        # Presumably publication_date is missing (the "unknown" directory), in
        # which case the server does not expose a media URL for the file.
        raise RuntimeError(
            f"paper xml_url unavailable (likely missing publication_date): id={pid}"
        )
    doi = paper.get("doi")
    if not doi:
        # Without this guard a missing / null DOI surfaces as KeyError or
        # AttributeError rather than the RuntimeError used for every other
        # "cannot fetch" condition.
        raise RuntimeError(f"paper has no DOI, cannot derive file name: id={pid}")
    rel = f"papers/{_safe_doi(doi)}.xml"
    dest = Path(working_dir) / rel
    # Reuse an earlier download; an empty file is treated as missing.
    if dest.exists() and dest.stat().st_size > 0:
        return rel
    _stream_to(xml_url, dest)
    return rel