"""能力探测: 用真实 LLM 调用对账 yaml 中声称的能力。 只在用户显式触发(`cli.py probe`)时跑——会花 API 额度,不进启动路径。 不修改 yaml,只输出对比报告;让用户自己判断要不要改档案。 四项探测: - basic_chat:连通性。失败则跳过其余。 - parallel_tools:给两个独立工具,看 single response 里 tool_calls 数量。 - thinking_mode:对 declared=True 的模型传 reasoning_effort,看 API 是否接受 + 是否产出 thinking。 - long_context(opt-in):needle-in-haystack 简化版,默认探 reliable_context 的 1/8。 """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any, List, Optional from .capabilities import ModelCapabilities from .llm import LLM @dataclass class ProbeResult: name: str declared: Any observed: Any status: str # "ok" / "mismatch" / "skip" / "error" detail: str = "" @dataclass class ProbeReport: model: str results: List[ProbeResult] = field(default_factory=list) def add(self, r: ProbeResult) -> None: self.results.append(r) @property def has_mismatch(self) -> bool: return any(r.status == "mismatch" for r in self.results) def _msg_dict(msg: Any) -> dict: if hasattr(msg, "model_dump"): return msg.model_dump() if hasattr(msg, "dict"): return msg.dict() return {} # ----- 单项 probe ----- def probe_basic_chat(llm: LLM) -> ProbeResult: try: resp = llm.chat( messages=[{"role": "user", "content": "Reply with exactly the word: pong"}], ) text = (resp.choices[0].message.content or "").strip() ok = "pong" in text.lower() return ProbeResult( name="basic_chat", declared="reachable", observed=text[:40] or "", status="ok" if ok else "mismatch", detail="" if ok else f"expected 'pong', got: {text[:80]!r}", ) except Exception as e: return ProbeResult( name="basic_chat", declared="reachable", observed=None, status="error", detail=f"{type(e).__name__}: {e}", ) def probe_parallel_tools(llm: LLM, caps: ModelCapabilities) -> ProbeResult: declared = caps.parallel_tools tools = [ { "type": "function", "function": { "name": "get_weather", "description": "Get current weather for a city.", "parameters": { "type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"], }, }, }, { "type": "function", "function": { "name": "get_time", "description": "Get current time in a timezone.", "parameters": { "type": "object", "properties": {"tz": {"type": "string"}}, "required": ["tz"], }, }, }, ] user_msg = ( "I need two independent pieces of information at the same time: the weather " "in Beijing AND the current time in Tokyo. Please call BOTH tools in this " "single turn (in parallel)." ) try: resp = llm.chat( messages=[{"role": "user", "content": user_msg}], tools=tools, parallel_tool_calls=True, ) tool_calls = getattr(resp.choices[0].message, "tool_calls", None) or [] n = len(tool_calls) observed = n >= 2 return ProbeResult( name="parallel_tools", declared=declared, observed=observed, status="ok" if observed == bool(declared) else "mismatch", detail=f"{n} tool_calls in single response", ) except Exception as e: return ProbeResult( name="parallel_tools", declared=declared, observed=None, status="error", detail=f"{type(e).__name__}: {e}", ) def probe_thinking_mode(llm: LLM, caps: ModelCapabilities) -> ProbeResult: declared = caps.thinking_mode if not declared: return ProbeResult( name="thinking_mode", declared=False, observed=None, status="skip", detail="declared false; skipping (cap-side flag controls API forwarding)", ) effort = ( caps.default_reasoning_effort or (caps.reasoning_effort_levels[0] if caps.reasoning_effort_levels else "medium") ) try: resp = llm.chat( messages=[{"role": "user", "content": "Briefly: what is 17 * 23?"}], reasoning_effort=effort, ) msg = resp.choices[0].message d = _msg_dict(msg) rc = ( getattr(msg, "reasoning_content", None) or getattr(msg, "thinking", None) or d.get("reasoning_content") or d.get("thinking") ) observed = bool(rc) return ProbeResult( name="thinking_mode", declared=True, observed=observed, status="ok" if observed else "mismatch", detail=( f"reasoning_effort={effort} accepted; " + ("thinking content returned" if observed else "no thinking content in response") ), ) except Exception as e: return ProbeResult( name="thinking_mode", declared=True, observed=False, status="mismatch", detail=f"reasoning_effort rejected: {type(e).__name__}: {e}", ) def probe_long_context( llm: LLM, caps: ModelCapabilities, target_chars: Optional[int] = None ) -> ProbeResult: """needle-in-haystack 简化版。默认探 reliable_context * 4 / 8 字符,上限 200K。""" if target_chars is None: target_chars = caps.reliable_context * 4 // 8 target_chars = max(2_000, min(target_chars, 200_000)) SECRET = "K7-ZULU-9213" pad = "The quick brown fox jumps over the lazy dog. " * 200 n_blocks = max(1, target_chars // len(pad)) middle = n_blocks // 2 parts: List[str] = [] for i in range(n_blocks): if i == middle: parts.append(f"\n>>> SECRET TOKEN: {SECRET} <<<\n") parts.append(pad) haystack = "".join(parts) prompt = ( "Below is a long block of text. Somewhere in it a SECRET TOKEN is recorded " "after the marker '>>> SECRET TOKEN:'. Reply with ONLY the token value, " "nothing else.\n\n" + haystack ) try: resp = llm.chat(messages=[{"role": "user", "content": prompt}]) text = (resp.choices[0].message.content or "").strip() ok = SECRET in text return ProbeResult( name="long_context", declared=f"reliable_context={caps.reliable_context}", observed=f"{len(haystack)} chars sent; secret {'recovered' if ok else 'missed'}", status="ok" if ok else "mismatch", detail=f"reply head: {text[:80]!r}", ) except Exception as e: return ProbeResult( name="long_context", declared=f"reliable_context={caps.reliable_context}", observed=None, status="error", detail=f"{type(e).__name__}: {e}", ) # ----- 顶层入口 ----- def probe_capabilities( caps: ModelCapabilities, llm: LLM, *, include_long_context: bool = False, ) -> ProbeReport: report = ProbeReport(model=caps.model_id) report.add(probe_basic_chat(llm)) if report.results[0].status == "error": return report report.add(probe_parallel_tools(llm, caps)) report.add(probe_thinking_mode(llm, caps)) if include_long_context: report.add(probe_long_context(llm, caps)) return report