zcbot/core/probe.py

244 lines
7.8 KiB
Python

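"""Capability probing: reconcile the capabilities claimed in the YAML profile against real LLM calls.

Runs only when the user explicitly triggers it (`cli.py probe`) -- it spends API quota, so it stays off the startup path.
It never modifies the YAML; it only prints a comparison report and lets the user decide whether to update the profile.

Four probes:
- basic_chat: connectivity. If it fails, the remaining probes are skipped.
- parallel_tools: offers two independent tools and counts the tool_calls in a single response.
- thinking_mode: for models with declared=True, passes reasoning_effort and checks whether the API accepts it and whether thinking content is produced.
- long_context (opt-in): simplified needle-in-a-haystack; by default probes 1/8 of reliable_context.
"""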
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, List, Optional
from .capabilities import ModelCapabilities
from .llm import LLM
@dataclass
class ProbeResult:
    """Outcome of a single capability probe.

    Attributes are documented inline below; ``detail`` is the only optional
    field and defaults to the empty string.
    """

    # Identifier of the probe that produced this result (e.g. "basic_chat").
    name: str
    # What the capability profile claims for this capability.
    declared: Any
    # What the probe actually observed.
    observed: Any
    # One of: "ok" / "mismatch" / "skip" / "error".
    status: str
    # Free-form explanation accompanying the status.
    detail: str = ""
@dataclass
class ProbeReport:
    """Ordered collection of probe results gathered for one model."""

    # Identifier of the model the probes were run against.
    model: str
    # Results in the order they were added; each report gets its own list.
    results: List[ProbeResult] = field(default_factory=list)

    def add(self, r: ProbeResult) -> None:
        """Append one probe result to the end of the report."""
        self.results.append(r)

    @property
    def has_mismatch(self) -> bool:
        """True if at least one recorded result has status "mismatch"."""
        for entry in self.results:
            if entry.status == "mismatch":
                return True
        return False
def _msg_dict(msg: Any) -> dict:
if hasattr(msg, "model_dump"):
return msg.model_dump()
if hasattr(msg, "dict"):
return msg.dict()
return {}
# ----- individual probes -----
def probe_basic_chat(llm: LLM) -> ProbeResult:
    """Connectivity check: ask the model to reply with the word "pong".

    Returns:
        A "basic_chat" result with ``declared="reachable"``.
        * "ok"       -- the (case-insensitive) reply contains "pong".
        * "mismatch" -- a reply came back but does not contain "pong".
        * "error"    -- the call or the reply extraction raised; ``observed``
          is ``None`` and ``detail`` carries the exception type and message.
    """
    probe_name = "basic_chat"
    expectation = "reachable"
    request = [{"role": "user", "content": "Reply with exactly the word: pong"}]
    try:
        response = llm.chat(messages=request)
        # A missing/None content is treated as an empty reply.
        reply = (response.choices[0].message.content or "").strip()
        if "pong" in reply.lower():
            verdict, note = "ok", ""
        else:
            verdict, note = "mismatch", f"expected 'pong', got: {reply[:80]!r}"
        return ProbeResult(
            name=probe_name,
            declared=expectation,
            # Only the head of the reply is kept; an empty reply is made visible.
            observed=reply[:40] or "<empty>",
            status=verdict,
            detail=note,
        )
    except Exception as exc:
        return ProbeResult(
            name=probe_name,
            declared=expectation,
            observed=None,
            status="error",
            detail=f"{type(exc).__name__}: {exc}",
        )
def probe_parallel_tools(llm: LLM, caps: ModelCapabilities) -> ProbeResult:
    """Check whether the model emits two tool calls in a single response.

    Two independent single-argument tools are offered and the prompt asks for
    both to be called in the same turn, with ``parallel_tool_calls=True``.

    Returns:
        A "parallel_tools" result with ``declared=caps.parallel_tools``.
        * ``observed`` is True when the response carries at least two
          tool calls, False otherwise.
        * "ok" when ``observed`` equals ``bool(declared)``, else "mismatch".
        * "error" (``observed=None``) when the call or the extraction raised.
    """
    declared = caps.parallel_tools

    def _single_string_arg_tool(tool_name: str, summary: str, arg: str) -> dict:
        # Function-tool schema that takes exactly one required string argument.
        return {
            "type": "function",
            "function": {
                "name": tool_name,
                "description": summary,
                "parameters": {
                    "type": "object",
                    "properties": {arg: {"type": "string"}},
                    "required": [arg],
                },
            },
        }

    tools = [
        _single_string_arg_tool("get_weather", "Get current weather for a city.", "city"),
        _single_string_arg_tool("get_time", "Get current time in a timezone.", "tz"),
    ]
    user_msg = (
        "I need two independent pieces of information at the same time: the weather "
        "in Beijing AND the current time in Tokyo. Please call BOTH tools in this "
        "single turn (in parallel)."
    )
    probe_name = "parallel_tools"
    try:
        response = llm.chat(
            messages=[{"role": "user", "content": user_msg}],
            tools=tools,
            parallel_tool_calls=True,
        )
        # A missing or None tool_calls attribute counts as zero calls.
        calls = getattr(response.choices[0].message, "tool_calls", None) or []
        call_count = len(calls)
        saw_parallel = call_count >= 2
        if saw_parallel == bool(declared):
            verdict = "ok"
        else:
            verdict = "mismatch"
        return ProbeResult(
            name=probe_name,
            declared=declared,
            observed=saw_parallel,
            status=verdict,
            detail=f"{call_count} tool_calls in single response",
        )
    except Exception as exc:
        return ProbeResult(
            name=probe_name,
            declared=declared,
            observed=None,
            status="error",
            detail=f"{type(exc).__name__}: {exc}",
        )
def probe_thinking_mode(llm: LLM, caps: ModelCapabilities) -> ProbeResult:
    """Check that a model declared as thinking-capable accepts ``reasoning_effort``
    and returns thinking content.

    The effort passed is ``caps.default_reasoning_effort``, else the first entry
    of ``caps.reasoning_effort_levels``, else ``"medium"``.

    Returns:
        A "thinking_mode" result.
        * "skip"     -- ``caps.thinking_mode`` is falsy; no API call is made.
        * "ok"       -- the call succeeded and the message carries a non-empty
          ``reasoning_content`` or ``thinking`` (attribute or dumped-dict key).
        * "mismatch" -- the call succeeded without thinking content, or the
          call itself raised (reported as "reasoning_effort rejected").
        * "error"    -- the call succeeded but the response could not be read
          (e.g. no choices); ``observed`` is ``None``.
    """
    declared = caps.thinking_mode
    if not declared:
        return ProbeResult(
            name="thinking_mode",
            declared=False,
            observed=None,
            status="skip",
            detail="declared false; skipping (cap-side flag controls API forwarding)",
        )

    effort = caps.default_reasoning_effort or (
        caps.reasoning_effort_levels[0] if caps.reasoning_effort_levels else "medium"
    )

    # Step 1: only the API call decides "accepted vs rejected". Keeping this
    # try minimal stops response-parsing problems from being reported as a
    # rejection of reasoning_effort.
    # NOTE(review): transport failures raised by llm.chat still land here;
    # they cannot be told apart without knowing the client's exception types.
    try:
        resp = llm.chat(
            messages=[{"role": "user", "content": "Briefly: what is 17 * 23?"}],
            reasoning_effort=effort,
        )
    except Exception as e:
        return ProbeResult(
            name="thinking_mode",
            declared=True,
            observed=False,
            status="mismatch",
            detail=f"reasoning_effort rejected: {type(e).__name__}: {e}",
        )

    # Step 2: the parameter was accepted; look for thinking content. A failure
    # here is a probe error (same shape as the other probes), not a mismatch.
    try:
        msg = resp.choices[0].message
        d = _msg_dict(msg)
        # Thinking text may sit under either name, as an attribute or only in
        # the dumped dict; the first non-empty hit wins.
        rc = (
            getattr(msg, "reasoning_content", None)
            or getattr(msg, "thinking", None)
            or d.get("reasoning_content")
            or d.get("thinking")
        )
        observed = bool(rc)
    except Exception as e:
        return ProbeResult(
            name="thinking_mode",
            declared=True,
            observed=None,
            status="error",
            detail=f"{type(e).__name__}: {e}",
        )

    return ProbeResult(
        name="thinking_mode",
        declared=True,
        observed=observed,
        status="ok" if observed else "mismatch",
        detail=(
            f"reasoning_effort={effort} accepted; "
            + ("thinking content returned" if observed else "no thinking content in response")
        ),
    )
def probe_long_context(
    llm: LLM, caps: ModelCapabilities, target_chars: Optional[int] = None
) -> ProbeResult:
    """Simplified needle-in-a-haystack probe.

    A secret token is placed in the middle of repeated filler text and the
    model is asked to return only that token.

    Args:
        llm: client used for the single chat call.
        caps: capability profile; ``reliable_context`` is read from it.
        target_chars: approximate haystack size in characters. Defaults to
            ``caps.reliable_context * 4 // 8`` and is always clamped to the
            range 2,000..200,000.

    Returns:
        A "long_context" result: "ok" if the reply contains the token,
        "mismatch" if not, "error" (``observed=None``) if the call raised.
        ``observed`` reports the number of haystack characters actually sent.
    """
    if target_chars is None:
        # NOTE(review): presumably ~4 chars per token, i.e. 1/8 of the
        # reliable window -- confirm the unit of reliable_context.
        target_chars = caps.reliable_context * 4 // 8
    target_chars = max(2_000, min(target_chars, 200_000))

    secret = "K7-ZULU-9213"
    needle = f"\n>>> SECRET TOKEN: {secret} <<<\n"
    sentence = "The quick brown fox jumps over the lazy dog. "

    # Pad at sentence granularity so the haystack tracks target_chars closely
    # (within one sentence plus the needle) and the needle stays centred even
    # for the smallest targets.
    n_sentences = max(1, target_chars // len(sentence))
    n_before = n_sentences // 2
    haystack = "".join(
        (sentence * n_before, needle, sentence * (n_sentences - n_before))
    )

    prompt = (
        "Below is a long block of text. Somewhere in it a SECRET TOKEN is recorded "
        "after the marker '>>> SECRET TOKEN:'. Reply with ONLY the token value, "
        "nothing else.\n\n" + haystack
    )
    declared_label = f"reliable_context={caps.reliable_context}"
    try:
        resp = llm.chat(messages=[{"role": "user", "content": prompt}])
        text = (resp.choices[0].message.content or "").strip()
        ok = secret in text
        return ProbeResult(
            name="long_context",
            declared=declared_label,
            observed=f"{len(haystack)} chars sent; secret {'recovered' if ok else 'missed'}",
            status="ok" if ok else "mismatch",
            detail=f"reply head: {text[:80]!r}",
        )
    except Exception as e:
        return ProbeResult(
            name="long_context",
            declared=declared_label,
            observed=None,
            status="error",
            detail=f"{type(e).__name__}: {e}",
        )
# ----- top-level entry point -----
def probe_capabilities(
    caps: ModelCapabilities,
    llm: LLM,
    *,
    include_long_context: bool = False,
) -> ProbeReport:
    """Run the probe suite for one model and collect the results.

    ``basic_chat`` always runs first. If it ends in status "error", the report
    is returned with that single result. Otherwise ``parallel_tools`` and
    ``thinking_mode`` run in that order, followed by ``long_context`` only when
    ``include_long_context`` is true.

    Returns:
        A ``ProbeReport`` for ``caps.model_id`` with results in execution order.
    """
    report = ProbeReport(model=caps.model_id)

    connectivity = probe_basic_chat(llm)
    report.add(connectivity)
    if connectivity.status == "error":
        # The model could not be reached; skip the remaining probes.
        return report

    for run_probe in (probe_parallel_tools, probe_thinking_mode):
        report.add(run_probe(llm, caps))

    if include_long_context:
        report.add(probe_long_context(llm, caps))
    return report