244 lines
7.8 KiB
Python
244 lines
7.8 KiB
Python
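"""Capability probing: reconcile the capabilities claimed in the yaml against real LLM calls.

Runs only when the user triggers it explicitly (`cli.py probe`) -- it spends API
quota, so it is kept off the startup path.
It never modifies the yaml; it only outputs a comparison report and leaves it to
the user to decide whether the profile should be changed.

Four probes:
- basic_chat: connectivity. If it fails, the remaining probes are skipped.
- parallel_tools: offers two independent tools and counts the tool_calls in a single response.
- thinking_mode: for models with declared=True, passes reasoning_effort and checks
  whether the API accepts it and whether thinking content is produced.
- long_context (opt-in): simplified needle-in-haystack; by default probes 1/8 of reliable_context.
"""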
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, List, Optional
|
|
|
|
from .capabilities import ModelCapabilities
|
|
from .llm import LLM
|
|
|
|
|
|
@dataclass
class ProbeResult:
    """Outcome of a single capability probe.

    Pairs what the capability profile declares with what the probe observed,
    plus a status string and an optional human-readable explanation.
    """

    # Identifier of the probe that produced this result (e.g. "basic_chat").
    name: str
    # Value claimed for this capability; type varies per probe.
    declared: Any
    # Value seen during the probe; type varies per probe.
    observed: Any
    # One of: "ok" / "mismatch" / "skip" / "error".
    status: str
    # Free-form explanation; empty when there is nothing to add.
    detail: str = ""
|
|
|
|
|
|
@dataclass
class ProbeReport:
    """Ordered collection of probe results gathered for one model."""

    # Identifier of the model the results belong to.
    model: str
    # Results in the order they were added; each instance gets its own list.
    results: List[ProbeResult] = field(default_factory=list)

    def add(self, r: ProbeResult) -> None:
        """Append one probe result to the end of ``results``."""
        self.results.append(r)

    @property
    def has_mismatch(self) -> bool:
        """Return True when at least one stored result has status "mismatch"."""
        for entry in self.results:
            if entry.status == "mismatch":
                return True
        return False
|
|
|
|
|
|
def _msg_dict(msg: Any) -> dict:
|
|
if hasattr(msg, "model_dump"):
|
|
return msg.model_dump()
|
|
if hasattr(msg, "dict"):
|
|
return msg.dict()
|
|
return {}
|
|
|
|
|
|
# ----- 单项 probe -----
|
|
|
|
def probe_basic_chat(llm: LLM) -> ProbeResult:
    """Check connectivity by asking the model to answer with the word "pong".

    Args:
        llm: Client whose ``chat`` method is used for the request.

    Returns:
        A ``basic_chat`` result: "ok" when the reply contains "pong"
        (case-insensitive), "mismatch" when it does not, and "error" when
        anything raised while calling the model or reading its reply.
    """
    try:
        response = llm.chat(
            messages=[{"role": "user", "content": "Reply with exactly the word: pong"}],
        )
        # A missing/None content is treated as an empty reply.
        reply = (response.choices[0].message.content or "").strip()
        if "pong" in reply.lower():
            verdict, note = "ok", ""
        else:
            verdict, note = "mismatch", f"expected 'pong', got: {reply[:80]!r}"
        return ProbeResult(
            name="basic_chat",
            declared="reachable",
            observed=reply[:40] or "<empty>",
            status=verdict,
            detail=note,
        )
    except Exception as exc:
        # Any failure here is reported, not raised, so the caller can decide
        # whether to continue with further probes.
        return ProbeResult(
            name="basic_chat",
            declared="reachable",
            observed=None,
            status="error",
            detail=f"{type(exc).__name__}: {exc}",
        )
|
|
|
|
|
|
def probe_parallel_tools(llm: LLM, caps: ModelCapabilities) -> ProbeResult:
    """Check whether the model emits two tool calls in a single response.

    Offers two independent single-argument tools, asks for both in one turn,
    and compares "at least two tool_calls came back" with ``caps.parallel_tools``.

    Args:
        llm: Client whose ``chat`` method is used for the request.
        caps: Capability profile; only ``parallel_tools`` is read here.

    Returns:
        A ``parallel_tools`` result: "ok" when the observation equals
        ``bool(declared)``, "mismatch" otherwise, "error" if the call raised.
    """
    declared = caps.parallel_tools

    def _string_arg_tool(tool_name: str, description: str, arg: str) -> dict:
        # Function-tool schema with exactly one required string argument.
        return {
            "type": "function",
            "function": {
                "name": tool_name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": {arg: {"type": "string"}},
                    "required": [arg],
                },
            },
        }

    tools = [
        _string_arg_tool("get_weather", "Get current weather for a city.", "city"),
        _string_arg_tool("get_time", "Get current time in a timezone.", "tz"),
    ]
    user_msg = (
        "I need two independent pieces of information at the same time: the weather "
        "in Beijing AND the current time in Tokyo. Please call BOTH tools in this "
        "single turn (in parallel)."
    )
    try:
        response = llm.chat(
            messages=[{"role": "user", "content": user_msg}],
            tools=tools,
            parallel_tool_calls=True,
        )
        # tool_calls may be absent or None when the model answered in text.
        calls = getattr(response.choices[0].message, "tool_calls", None) or []
        call_count = len(calls)
        observed = call_count >= 2
        if observed == bool(declared):
            verdict = "ok"
        else:
            verdict = "mismatch"
        return ProbeResult(
            name="parallel_tools",
            declared=declared,
            observed=observed,
            status=verdict,
            detail=f"{call_count} tool_calls in single response",
        )
    except Exception as exc:
        return ProbeResult(
            name="parallel_tools",
            declared=declared,
            observed=None,
            status="error",
            detail=f"{type(exc).__name__}: {exc}",
        )
|
|
|
|
|
|
def probe_thinking_mode(llm: LLM, caps: ModelCapabilities) -> ProbeResult:
    """Check that a model declared as thinking-capable returns thinking content.

    Skipped when ``caps.thinking_mode`` is falsy. Otherwise sends a short
    arithmetic question with ``reasoning_effort`` set and looks for a
    ``reasoning_content`` or ``thinking`` field on the reply message.

    Args:
        llm: Client whose ``chat`` method is used for the request.
        caps: Capability profile; reads ``thinking_mode``,
            ``default_reasoning_effort`` and ``reasoning_effort_levels``.

    Returns:
        A ``thinking_mode`` result: "skip" when not declared, "ok" when
        thinking content came back, "mismatch" when it did not or when the
        call raised.
    """
    if not caps.thinking_mode:
        return ProbeResult(
            name="thinking_mode",
            declared=False,
            observed=None,
            status="skip",
            detail="declared false; skipping (cap-side flag controls API forwarding)",
        )

    # Effort preference: profile default, else first listed level, else "medium".
    effort = caps.default_reasoning_effort
    if not effort:
        levels = caps.reasoning_effort_levels
        effort = levels[0] if levels else "medium"

    try:
        response = llm.chat(
            messages=[{"role": "user", "content": "Briefly: what is 17 * 23?"}],
            reasoning_effort=effort,
        )
        message = response.choices[0].message
        dumped = _msg_dict(message)
        # Look on the object first, then in its dict dump; stop at the first
        # truthy value.
        thinking = getattr(message, "reasoning_content", None)
        if not thinking:
            thinking = getattr(message, "thinking", None)
        if not thinking:
            thinking = dumped.get("reasoning_content")
        if not thinking:
            thinking = dumped.get("thinking")
        observed = bool(thinking)
        if observed:
            verdict, outcome = "ok", "thinking content returned"
        else:
            verdict, outcome = "mismatch", "no thinking content in response"
        return ProbeResult(
            name="thinking_mode",
            declared=True,
            observed=observed,
            status=verdict,
            detail=f"reasoning_effort={effort} accepted; " + outcome,
        )
    except Exception as exc:
        # NOTE(review): every exception is reported as the parameter being
        # rejected, which also covers transient failures -- confirm intended.
        return ProbeResult(
            name="thinking_mode",
            declared=True,
            observed=False,
            status="mismatch",
            detail=f"reasoning_effort rejected: {type(exc).__name__}: {exc}",
        )
|
|
|
|
|
|
def probe_long_context(
    llm: LLM, caps: ModelCapabilities, target_chars: Optional[int] = None
) -> ProbeResult:
    """Simplified needle-in-haystack probe.

    Hides a secret token in the middle of repeated filler text and asks the
    model to return it.

    Args:
        llm: Client whose ``chat`` method is used for the request.
        caps: Capability profile; ``reliable_context`` is read for the default
            size and for the report.
        target_chars: Approximate haystack size in characters. Defaults to
            ``caps.reliable_context * 4 // 8``. The value is clamped to the
            range 2,000..200,000.

    Returns:
        A ``long_context`` result: "ok" when the reply contains the token,
        "mismatch" when it does not, "error" if the call raised.
    """
    if target_chars is None:
        target_chars = caps.reliable_context * 4 // 8
    target_chars = min(target_chars, 200_000)
    target_chars = max(2_000, target_chars)

    needle = "K7-ZULU-9213"
    filler = "The quick brown fox jumps over the lazy dog. " * 200
    # The haystack is built from whole filler blocks, at least one.
    block_count = max(1, target_chars // len(filler))
    before = block_count // 2
    after = block_count - before
    # The marker line sits immediately before the middle filler block.
    haystack = (
        filler * before
        + f"\n>>> SECRET TOKEN: {needle} <<<\n"
        + filler * after
    )
    prompt = (
        "Below is a long block of text. Somewhere in it a SECRET TOKEN is recorded "
        "after the marker '>>> SECRET TOKEN:'. Reply with ONLY the token value, "
        "nothing else.\n\n" + haystack
    )
    try:
        response = llm.chat(messages=[{"role": "user", "content": prompt}])
        reply = (response.choices[0].message.content or "").strip()
        found = needle in reply
        if found:
            verdict, outcome = "ok", "recovered"
        else:
            verdict, outcome = "mismatch", "missed"
        return ProbeResult(
            name="long_context",
            declared=f"reliable_context={caps.reliable_context}",
            observed=f"{len(haystack)} chars sent; secret {outcome}",
            status=verdict,
            detail=f"reply head: {reply[:80]!r}",
        )
    except Exception as exc:
        return ProbeResult(
            name="long_context",
            declared=f"reliable_context={caps.reliable_context}",
            observed=None,
            status="error",
            detail=f"{type(exc).__name__}: {exc}",
        )
|
|
|
|
|
|
# ----- 顶层入口 -----
|
|
|
|
def probe_capabilities(
    caps: ModelCapabilities,
    llm: LLM,
    *,
    include_long_context: bool = False,
) -> ProbeReport:
    """Run the capability probes for one model and collect their results.

    Args:
        caps: Capability profile to check; ``model_id`` labels the report.
        llm: Client used for every probe request.
        include_long_context: When true, the long-context probe is run last.

    Returns:
        A report with one result per executed probe, in execution order. If
        the connectivity probe ends in "error" the report contains only that
        result.
    """
    report = ProbeReport(model=caps.model_id)

    connectivity = probe_basic_chat(llm)
    report.add(connectivity)
    # An unreachable endpoint makes the remaining probes pointless.
    if connectivity.status == "error":
        return report

    followups = [probe_parallel_tools, probe_thinking_mode]
    if include_long_context:
        followups.append(probe_long_context)
    for run_probe in followups:
        report.add(run_probe(llm, caps))
    return report
|