# zcbot/core/probe.py

"""能力探测: 用真实 LLM 调用对账 yaml 中声称的能力。

只在用户显式触发(`cli.py probe`)时跑——会花 API 额度,不进启动路径。
不修改 yaml,只输出对比报告;让用户自己判断要不要改档案。

四项探测:
- basic_chat:连通性。失败则跳过其余。
- parallel_tools:给两个独立工具,看 single response 里 tool_calls 数量。
- thinking_mode:对 declared=True 的模型传 reasoning_effort,看 API 是否接受 + 是否产出 thinking。
- long_context(opt-in):needle-in-haystack 简化版,默认探 reliable_context 的 1/8。
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, List, Optional

from .capabilities import ModelCapabilities
from .llm import LLM


@dataclass
class ProbeResult:
    """Outcome of one capability probe: the declared value next to the observed one."""

    # Probe identifier, e.g. "basic_chat" or "parallel_tools".
    name: str
    # What the capability profile claims for this probe.
    declared: Any
    # What the probe actually saw; the probes in this module pass None for
    # their "error" and "skip" results.
    observed: Any
    # Verdict: "ok" / "mismatch" / "skip" / "error".
    status: str
    # Optional human-readable explanation of the verdict.
    detail: str = field(default="")


@dataclass
class ProbeReport:
    """Ordered collection of probe results gathered for one model."""

    # Identifier of the model the results belong to.
    model: str
    # Results in the order they were added; every report gets its own list.
    results: List[ProbeResult] = field(default_factory=list)

    def add(self, r: ProbeResult) -> None:
        """Append one probe result to the report."""
        self.results.append(r)

    @property
    def has_mismatch(self) -> bool:
        """Whether at least one recorded result has status "mismatch"."""
        for result in self.results:
            if result.status == "mismatch":
                return True
        return False


def _msg_dict(msg: Any) -> dict:
    """Return a dict view of a message object, or ``{}`` if it offers none.

    ``model_dump()`` is tried first, then ``dict()``; the first of these
    attributes that exists on ``msg`` is called and its result returned.
    """
    for dumper in ("model_dump", "dict"):
        if hasattr(msg, dumper):
            return getattr(msg, dumper)()
    return {}


# ----- 单项 probe -----

def probe_basic_chat(llm: LLM) -> ProbeResult:
    """Connectivity check: ask the model to reply with the word "pong".

    Args:
        llm: Client whose ``chat(messages=...)`` result exposes
            ``choices[0].message.content``.

    Returns:
        A ``basic_chat`` result. Status is "ok" when the reply contains
        "pong" (case-insensitive), "mismatch" when it does not, and "error"
        when the call or the handling of its response raises.
    """
    ping = {"role": "user", "content": "Reply with exactly the word: pong"}
    try:
        response = llm.chat(messages=[ping])
        # A missing/None content is treated as an empty reply.
        reply = (response.choices[0].message.content or "").strip()
        if "pong" in reply.lower():
            verdict, note = "ok", ""
        else:
            verdict, note = "mismatch", f"expected 'pong', got: {reply[:80]!r}"
        return ProbeResult(
            name="basic_chat",
            declared="reachable",
            observed=reply[:40] or "<empty>",
            status=verdict,
            detail=note,
        )
    except Exception as exc:
        failure = f"{type(exc).__name__}: {exc}"
        return ProbeResult(
            name="basic_chat",
            declared="reachable",
            observed=None,
            status="error",
            detail=failure,
        )


def probe_parallel_tools(llm: LLM, caps: ModelCapabilities) -> ProbeResult:
    """Check whether the model emits several tool calls in one response.

    Offers two independent tools (``get_weather`` and ``get_time``), asks for
    both in a single turn with ``parallel_tool_calls=True``, and counts the
    ``tool_calls`` on the first choice.

    Args:
        llm: Client whose ``chat(...)`` accepts ``messages``, ``tools`` and
            ``parallel_tool_calls``.
        caps: Capability profile; ``parallel_tools`` is the declared value.

    Returns:
        A ``parallel_tools`` result. ``observed`` is True when at least two
        tool calls came back; status is "ok" when that equals
        ``bool(caps.parallel_tools)``, "mismatch" otherwise, and "error" when
        the call or the handling of its response raises.
    """

    def _function_tool(name: str, description: str, arg: str) -> dict:
        # Function-tool schema with one required string argument.
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": {arg: {"type": "string"}},
                    "required": [arg],
                },
            },
        }

    claimed = caps.parallel_tools
    tool_specs = [
        _function_tool("get_weather", "Get current weather for a city.", "city"),
        _function_tool("get_time", "Get current time in a timezone.", "tz"),
    ]
    request_text = (
        "I need two independent pieces of information at the same time: the weather "
        "in Beijing AND the current time in Tokyo. Please call BOTH tools in this "
        "single turn (in parallel)."
    )
    try:
        response = llm.chat(
            messages=[{"role": "user", "content": request_text}],
            tools=tool_specs,
            parallel_tool_calls=True,
        )
        # tool_calls may be absent or None when the model called nothing.
        calls = getattr(response.choices[0].message, "tool_calls", None) or []
        call_count = len(calls)
        saw_parallel = call_count >= 2
        if saw_parallel == bool(claimed):
            verdict = "ok"
        else:
            verdict = "mismatch"
        return ProbeResult(
            name="parallel_tools",
            declared=claimed,
            observed=saw_parallel,
            status=verdict,
            detail=f"{call_count} tool_calls in single response",
        )
    except Exception as exc:
        failure = f"{type(exc).__name__}: {exc}"
        return ProbeResult(
            name="parallel_tools",
            declared=claimed,
            observed=None,
            status="error",
            detail=failure,
        )


def probe_thinking_mode(llm: LLM, caps: ModelCapabilities) -> ProbeResult:
    """Check that a model declared as thinking-capable really returns thinking.

    Skipped when ``caps.thinking_mode`` is falsy. Otherwise one request is
    sent with ``reasoning_effort`` set, and the reply message is searched for
    ``reasoning_content`` / ``thinking``, first as attributes and then as
    keys of its dumped dict.

    Args:
        llm: Client whose ``chat(...)`` accepts ``messages`` and
            ``reasoning_effort``.
        caps: Capability profile; reads ``thinking_mode``,
            ``default_reasoning_effort`` and ``reasoning_effort_levels``.

    Returns:
        A ``thinking_mode`` result: "skip" when not declared, "ok" when
        thinking content came back, "mismatch" when it did not or when the
        request raised (any exception is reported as a rejection).
    """
    if not caps.thinking_mode:
        return ProbeResult(
            name="thinking_mode",
            declared=False,
            observed=None,
            status="skip",
            detail="declared false; skipping (cap-side flag controls API forwarding)",
        )

    # Effort preference: profile default, else first listed level, else "medium".
    effort = caps.default_reasoning_effort
    if not effort:
        levels = caps.reasoning_effort_levels
        effort = levels[0] if levels else "medium"

    fields = ("reasoning_content", "thinking")
    try:
        response = llm.chat(
            messages=[{"role": "user", "content": "Briefly: what is 17 * 23?"}],
            reasoning_effort=effort,
        )
        message = response.choices[0].message
        dumped = _msg_dict(message)
        # Attributes are consulted before dict keys; any() stops at the first hit.
        produced = any(getattr(message, key, None) for key in fields) or any(
            dumped.get(key) for key in fields
        )
        if produced:
            verdict, outcome = "ok", "thinking content returned"
        else:
            verdict, outcome = "mismatch", "no thinking content in response"
        return ProbeResult(
            name="thinking_mode",
            declared=True,
            observed=produced,
            status=verdict,
            detail=f"reasoning_effort={effort} accepted; {outcome}",
        )
    except Exception as exc:
        return ProbeResult(
            name="thinking_mode",
            declared=True,
            observed=False,
            status="mismatch",
            detail=f"reasoning_effort rejected: {type(exc).__name__}: {exc}",
        )


def probe_long_context(
    llm: LLM, caps: ModelCapabilities, target_chars: Optional[int] = None
) -> ProbeResult:
    """Simplified needle-in-a-haystack probe.

    Builds filler text of about ``target_chars`` characters, plants a secret
    token line in the middle of it, and asks the model to reply with only the
    token.

    Args:
        llm: Client whose ``chat(messages=...)`` result exposes
            ``choices[0].message.content``.
        caps: Capability profile; ``reliable_context`` sizes the default
            haystack and is echoed in the result's ``declared`` field.
        target_chars: Desired haystack size in characters. Defaults to
            ``caps.reliable_context * 4 // 8`` (the factor 4 is presumably a
            chars-per-token estimate -- confirm against the profile's units).
            The value is always clamped to the range [2_000, 200_000].

    Returns:
        A ``long_context`` result. Status is "ok" when the reply contains the
        secret token, "mismatch" when it does not, and "error" when the call
        or the handling of its response raises.
    """
    if target_chars is None:
        target_chars = caps.reliable_context * 4 // 8
    target_chars = max(2_000, min(target_chars, 200_000))

    secret = "K7-ZULU-9213"
    needle = f"\n>>> SECRET TOKEN: {secret} <<<\n"
    filler = "The quick brown fox jumps over the lazy dog. "

    # Size the haystack in single sentences rather than large fixed blocks:
    # the text then tracks target_chars to within one sentence, so the lower
    # clamp is actually reachable, and the needle always has filler on both
    # sides instead of landing at the very start for small targets.
    n_sentences = target_chars // len(filler)
    before = n_sentences // 2
    haystack = filler * before + needle + filler * (n_sentences - before)

    prompt = (
        "Below is a long block of text. Somewhere in it a SECRET TOKEN is recorded "
        "after the marker '>>> SECRET TOKEN:'. Reply with ONLY the token value, "
        "nothing else.\n\n" + haystack
    )
    try:
        resp = llm.chat(messages=[{"role": "user", "content": prompt}])
        text = (resp.choices[0].message.content or "").strip()
        ok = secret in text
        return ProbeResult(
            name="long_context",
            declared=f"reliable_context={caps.reliable_context}",
            observed=f"{len(haystack)} chars sent; secret {'recovered' if ok else 'missed'}",
            status="ok" if ok else "mismatch",
            detail=f"reply head: {text[:80]!r}",
        )
    except Exception as e:
        return ProbeResult(
            name="long_context",
            declared=f"reliable_context={caps.reliable_context}",
            observed=None,
            status="error",
            detail=f"{type(e).__name__}: {e}",
        )


# ----- 顶层入口 -----

def probe_capabilities(
    caps: ModelCapabilities,
    llm: LLM,
    *,
    include_long_context: bool = False,
) -> ProbeReport:
    """Run the probes against one model and collect them into a report.

    The connectivity probe runs first; if its status is "error" the report is
    returned with that single entry. Otherwise the parallel-tools and
    thinking-mode probes follow, plus the long-context probe when
    ``include_long_context`` is true.

    Args:
        caps: Capability profile; ``model_id`` names the report.
        llm: Client handed to every probe.
        include_long_context: Opt in to the long-context probe.

    Returns:
        The report, with results in execution order.
    """
    report = ProbeReport(model=caps.model_id)

    connectivity = probe_basic_chat(llm)
    report.add(connectivity)
    if connectivity.status == "error":
        # The endpoint did not answer; further probes would only fail too.
        return report

    followups = [probe_parallel_tools, probe_thinking_mode]
    if include_long_context:
        followups.append(probe_long_context)
    for run_probe in followups:
        report.add(run_probe(llm, caps))
    return report