Add web_search and web_fetch tools via Bocha AI search API

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
caoqianming 2026-05-25 11:37:33 +08:00
parent ade7f3d1e1
commit fe95df0b9d
7 changed files with 288 additions and 0 deletions

View File

@ -31,6 +31,8 @@
- **dev SPA 右侧文件列表长名称 hover 显示全路径**:`web/static/dev.html` 在右 pane 文件行 `.file-row .name` 和"选入…"源文件列表 `.sp-row .sp-name` 上补 `title`,内容取 `e.rel || e.name`,保留现有 ellipsis 截断视觉,鼠标悬停可看完整相对路径/名称。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。 - **dev SPA 右侧文件列表长名称 hover 显示全路径**:`web/static/dev.html` 在右 pane 文件行 `.file-row .name` 和"选入…"源文件列表 `.sp-row .sp-name` 上补 `title`,内容取 `e.rel || e.name`,保留现有 ellipsis 截断视觉,鼠标悬停可看完整相对路径/名称。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。
- **dev SPA 左侧滚动条只覆盖 task 列表**:`web/static/dev.html` 左 pane 改成 flex column,顶部 4 行 pane-head(任务标题/新建/搜索筛选/排序)固定不参与滚动;`#task-list` 与 `#task-sentinel` 包进 `#task-scroll`,并把 IntersectionObserver root 从 `#pane-left` 改到 `#task-scroll`,保证无限滚动仍按列表区域触发。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。 - **dev SPA 左侧滚动条只覆盖 task 列表**:`web/static/dev.html` 左 pane 改成 flex column,顶部 4 行 pane-head(任务标题/新建/搜索筛选/排序)固定不参与滚动;`#task-list` 与 `#task-sentinel` 包进 `#task-scroll`,并把 IntersectionObserver root 从 `#pane-left` 改到 `#task-scroll`,保证无限滚动仍按列表区域触发。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。
- **接入博查 Web Search + Web Fetch 两个 tool**:`core/bocha_client.py`(BochaConfig/BochaClient/BochaError,POST `/v1/web-search`,Bearer 认证),`tools/web_search.py`(WebSearchTool,调用 BochaClient 并把结果格式化成列表,count 限制 1-20),`tools/web_fetch.py`(httpx + html2text,SSRF 内网屏蔽,截断 8000 字符);`config/web/bocha.yaml` 配置 API key env(`BOCHA_API_KEY`);`core/agent_builder.py` 注册 — web_fetch 无条件挂,web_search 仅在 env 设了 BOCHA_API_KEY 时挂(跟媒体 tool 同范式)。`requirements.txt` 加 `httpx>=0.27.0` + `html2text>=2024.0`。`DESIGN.md` 不动(纯新 tool 无架构变化);`RUN.md` 不动(运行方式无变化)。
### 2026-05-22 ### 2026-05-22
- **dev SPA 手机端对话面板顶栏 + chat-meta 紧凑化**:`web/static/dev.html` 手机段(≤640px)对 `#pane-mid > .pane-head``flex-wrap: wrap` + 按钮 `white-space: nowrap`,消除 5 个按钮(导出对话记录/清空对话/完成/废弃/删除)在 320-360px 视口被挤压后"完\n成"这种逐字竖排;同时藏掉 `.label`("对话",mobile-tabs 已亮态指示)和 `.spacer`(flex-wrap 下 spacer 会强制后续按钮换行影响视觉一致)。`#chat-meta` 同段把 `gap` 8px → 6px、藏 `.tid`(8 位 UUID 前缀手机用户用不上)、`.desc` 加 `max-width:60vw` ellipsis(避免长 description 独占一行);三个 model 下拉 label "模型/生图/生视频" 用 `.mdl-text / .mdl-icon` 双 span 渲染,桌面显文字 + 手机显 emoji(💬🖼🎬)—— `renderModelDropdown / renderImageModelDropdown / renderVideoModelDropdown` 三处统一。改动只在手机视口生效,桌面零变化。否决:(a) 折叠成 ⋯ 浮层菜单(用户拒,多一次点击);(b) 改图标按钮(5 个动作含义不直观需 tooltip);(c) 把 emoji 应用到桌面(无解决问题且改动用户已习惯的桌面态)。 - **dev SPA 手机端对话面板顶栏 + chat-meta 紧凑化**:`web/static/dev.html` 手机段(≤640px)对 `#pane-mid > .pane-head``flex-wrap: wrap` + 按钮 `white-space: nowrap`,消除 5 个按钮(导出对话记录/清空对话/完成/废弃/删除)在 320-360px 视口被挤压后"完\n成"这种逐字竖排;同时藏掉 `.label`("对话",mobile-tabs 已亮态指示)和 `.spacer`(flex-wrap 下 spacer 会强制后续按钮换行影响视觉一致)。`#chat-meta` 同段把 `gap` 8px → 6px、藏 `.tid`(8 位 UUID 前缀手机用户用不上)、`.desc` 加 `max-width:60vw` ellipsis(避免长 description 独占一行);三个 model 下拉 label "模型/生图/生视频" 用 `.mdl-text / .mdl-icon` 双 span 渲染,桌面显文字 + 手机显 emoji(💬🖼🎬)—— `renderModelDropdown / renderImageModelDropdown / renderVideoModelDropdown` 三处统一。改动只在手机视口生效,桌面零变化。否决:(a) 折叠成 ⋯ 浮层菜单(用户拒,多一次点击);(b) 改图标按钮(5 个动作含义不直观需 tooltip);(c) 把 emoji 应用到桌面(无解决问题且改动用户已习惯的桌面态)。

4
config/web/bocha.yaml Normal file
View File

@ -0,0 +1,4 @@
# 博查 (Bocha AI) 联网搜索 API 配置
# 不给 BOCHA_API_KEY 环境变量时,web_search tool 不会注册
bocha_api_key_env: BOCHA_API_KEY
bocha_base_url: https://api.bochaai.com/v1

View File

@ -41,8 +41,11 @@ from tools.seedance import SeedanceTool
from tools.seedream import SeedreamTool from tools.seedream import SeedreamTool
from tools.shell import ShellTool from tools.shell import ShellTool
from tools.skill_tool import LoadSkillTool from tools.skill_tool import LoadSkillTool
from tools.web_fetch import WebFetchTool
from tools.web_search import WebSearchTool
from core.ark_client import ArkConfig from core.ark_client import ArkConfig
from core.bocha_client import BochaConfig
def load_config() -> dict: def load_config() -> dict:
@ -347,6 +350,10 @@ def build_agent(
t = cls(base_dir=tool_base, user_root=ur_path) t = cls(base_dir=tool_base, user_root=ur_path)
tools[t.name] = t tools[t.name] = t
# web_fetch 无需 API key,始终可用
wf = WebFetchTool(base_dir=tool_base, user_root=ur_path)
tools[wf.name] = wf
if skills.skills: if skills.skills:
ls = LoadSkillTool(registry=skills, base_dir=tool_base, user_root=ur_path) ls = LoadSkillTool(registry=skills, base_dir=tool_base, user_root=ur_path)
tools[ls.name] = ls tools[ls.name] = ls
@ -424,6 +431,12 @@ def build_agent(
) )
tools[seedance_tool.name] = seedance_tool tools[seedance_tool.name] = seedance_tool
# 博查联网搜索:仅当 BOCHA_API_KEY 设了才挂
bocha_cfg = BochaConfig.load()
if bocha_cfg is not None:
ws = WebSearchTool(cfg=bocha_cfg)
tools[ws.name] = ws
sink = ConsoleEventSink(console) if console else None sink = ConsoleEventSink(console) if console else None
agent = AgentLoop(llm, tools, session, caps, user_id=uid, sink=sink) agent = AgentLoop(llm, tools, session, caps, user_id=uid, sink=sink)
if cancel_check is not None: if cancel_check is not None:

96
core/bocha_client.py Normal file
View File

@ -0,0 +1,96 @@
"""博查 (Bocha AI) Web Search API 客户端,共享给 web_search tool。"""
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import httpx
import yaml
from core.paths import ROOT
_BOCHA_YAML = ROOT / "config" / "web" / "bocha.yaml"
class BochaError(RuntimeError):
    """Single exception type used for every failed Bocha API call."""
@dataclass
class BochaConfig:
    """Connection settings for the Bocha web-search API."""

    # Secret that BochaClient sends as the Bearer token.
    api_key: str
    # API root; load() strips any trailing slash.
    base_url: str

    @classmethod
    def load(cls, path: Optional[Path] = None) -> Optional["BochaConfig"]:
        """Build a config from ``bocha.yaml`` plus the process environment.

        The YAML file names the environment variable that holds the API key
        (``bocha_api_key_env``) and may override the API root
        (``bocha_base_url``).

        Args:
            path: YAML file to read. Defaults to the module-level
                ``_BOCHA_YAML`` location.

        Returns:
            A populated config, or ``None`` when the YAML file does not exist
            or the API-key environment variable is unset or blank. Callers
            use ``None`` as the signal not to register the web_search tool.
        """
        p = path or _BOCHA_YAML
        if not p.exists():
            return None
        data = yaml.safe_load(p.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            # An empty file yields None and a malformed one may yield a list
            # or scalar; none of those support .get(), so use the defaults.
            data = {}
        env = str(data.get("bocha_api_key_env") or "BOCHA_API_KEY")
        key = os.environ.get(env, "").strip()
        if not key:
            return None
        base_url = str(data.get("bocha_base_url") or "https://api.bochaai.com/v1")
        return cls(api_key=key, base_url=base_url.rstrip("/"))
class BochaClient:
    """Thin httpx wrapper around the Bocha web-search endpoint.

    Sends ``POST <base_url>/web-search`` with Bearer authentication and turns
    transport failures, HTTP error statuses and malformed bodies into
    ``BochaError``. Usable as a context manager; leaving the ``with`` block
    closes the underlying httpx client.
    """

    def __init__(self, cfg: "BochaConfig", timeout_s: float = 15.0) -> None:
        """Create the underlying httpx client.

        Args:
            cfg: API key and base URL used for every request.
            timeout_s: Timeout in seconds handed to httpx.
        """
        self.cfg = cfg
        self._client = httpx.Client(
            base_url=cfg.base_url,
            headers={
                "Authorization": f"Bearer {cfg.api_key}",
                "Content-Type": "application/json",
            },
            timeout=timeout_s,
        )

    def search(self, query: str, count: int = 10, freshness: str = "noLimit") -> dict:
        """Run one web search and return the decoded JSON object.

        Args:
            query: Search string.
            count: Requested number of results; clamped to 1-50.
            freshness: One of noLimit, oneDay, oneWeek, oneMonth, oneYear.

        Returns:
            The response body as a dict.

        Raises:
            BochaError: On timeout, transport error, HTTP status >= 400, or a
                body that is not a JSON object.
        """
        body = {"query": query, "count": min(max(count, 1), 50), "freshness": freshness}
        try:
            resp = self._client.post("/web-search", json=body)
        except httpx.TimeoutException as e:
            raise BochaError(f"博查搜索超时: {e}") from e
        except httpx.HTTPError as e:
            raise BochaError(f"博查网络错误: {e}") from e
        return self._parse(resp)

    @staticmethod
    def _parse(resp: "httpx.Response") -> dict:
        """Translate an HTTP response into a dict or a ``BochaError``.

        Raises:
            BochaError: If the status is >= 400, the body is not valid JSON,
                or the JSON value is not an object.
        """
        if resp.status_code >= 400:
            try:
                payload = resp.json()
            except ValueError:
                payload = None
            # The error body is not guaranteed to be a JSON object (a proxy may
            # answer with a string or list), and "message" may be null.
            msg = payload.get("message") if isinstance(payload, dict) else None
            raise BochaError(
                f"博查 API → HTTP {resp.status_code}: {msg or resp.text[:300]}"
            )
        try:
            payload = resp.json()
        except ValueError as e:
            raise BochaError(f"博查 API → invalid JSON: {e}") from e
        if not isinstance(payload, dict):
            # Callers index the result with .get(); fail here, with the
            # documented exception type, rather than later with AttributeError.
            raise BochaError(
                f"博查 API → unexpected JSON payload type: {type(payload).__name__}"
            )
        return payload

    def close(self) -> None:
        """Close the underlying httpx client."""
        self._client.close()

    def __enter__(self) -> "BochaClient":
        return self

    def __exit__(self, *_: object) -> None:
        self.close()

View File

@ -11,6 +11,10 @@ matplotlib>=3.8.0
# 素材摄取: PDF/DOCX/PPTX/XLSX/HTML/URL → Markdown (ppt 阶段零 + proposal 阶段零) # 素材摄取: PDF/DOCX/PPTX/XLSX/HTML/URL → Markdown (ppt 阶段零 + proposal 阶段零)
markitdown[pdf,docx,pptx,xlsx]>=0.0.1 markitdown[pdf,docx,pptx,xlsx]>=0.0.1
# 联网搜索 / web fetch
httpx>=0.27.0
html2text>=2024.0
# §7 B 阶段: Storage 落 PG # §7 B 阶段: Storage 落 PG
sqlalchemy>=2.0.0 sqlalchemy>=2.0.0
psycopg[binary]>=3.1.0 psycopg[binary]>=3.1.0

106
tools/web_fetch.py Normal file
View File

@ -0,0 +1,106 @@
"""Web Fetch: 抓取任意 URL 并返回 markdown 文本。"""
from __future__ import annotations
import ipaddress
import re
import socket
import html2text
import httpx
from .base import Tool
_SSRF_BLOCKED = {
ipaddress.ip_network(n)
for n in (
"127.0.0.0/8", "10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16",
"169.254.0.0/16", "0.0.0.0/8", "::1/128", "fc00::/7", "fe80::/10",
)
}
_MAX_CHARS = 8000
_TIMEOUT = 15.0
_UA = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
_h2t = html2text.HTML2Text()
_h2t.ignore_links = False
_h2t.ignore_images = True
_h2t.body_width = 0
_h2t.skip_internal_links = True
def _check_ssrf(url: str) -> str | None:
"""返回 None 表示安全;否则返回错误信息字符串。"""
import urllib.parse
parsed = urllib.parse.urlparse(url)
host = parsed.hostname
if not host:
return f"invalid URL: no host in {url!r}"
try:
ip = ipaddress.ip_address(host)
except ValueError:
try:
ip = ipaddress.ip_address(socket.getaddrinfo(host, None, 0, socket.SOCK_STREAM, socket.IPPROTO_TCP)[0][4][0])
except (OSError, IndexError, ValueError):
return f"cannot resolve host: {host!r}"
for net in _SSRF_BLOCKED:
if ip in net:
return f"blocked internal/private host: {host} ({ip})"
return None
class WebFetchTool(Tool):
    """Tool that downloads one URL and returns its content as markdown text."""

    name = "web_fetch"
    description = (
        "Fetch a web page and return its content as markdown text. "
        "Use this to read the full content of a URL found in search results or referenced by the user. "
        "Results are truncated to 8000 characters."
    )
    parameters = {
        "type": "object",
        "properties": {
            "url": {"type": "string", "description": "The URL to fetch"},
        },
        "required": ["url"],
    }

    def execute(self, url: str) -> str:
        """Fetch ``url`` and return its text, or an ``[Error] ...`` string.

        Redirects are followed by hand so that every hop goes through the
        SSRF check; with automatic redirects a public URL could bounce the
        request to an internal address after the initial check had passed.

        Args:
            url: Absolute http(s) URL to download.

        Returns:
            Markdown (for HTML) or the raw body (for plain text), truncated to
            ``_MAX_CHARS`` characters. Failures are reported as a string
            starting with ``[Error]`` instead of being raised.
        """
        import urllib.parse

        max_redirects = 5
        target = url
        try:
            with httpx.Client(
                headers={"User-Agent": _UA},
                timeout=_TIMEOUT,
                follow_redirects=False,
            ) as client:
                for _hop in range(max_redirects + 1):
                    err = _check_ssrf(target)
                    if err:
                        return f"[Error] {err}"
                    resp = client.get(target)
                    if not resp.has_redirect_location:
                        break
                    # Location may be relative; resolve it against this hop.
                    target = urllib.parse.urljoin(
                        str(resp.url), resp.headers["location"]
                    )
                else:
                    return f"[Error] too many redirects (more than {max_redirects})"
        except httpx.TimeoutException:
            return f"[Error] request timed out after {_TIMEOUT:.0f}s"
        except httpx.HTTPError as e:
            return f"[Error] request failed: {e}"

        if resp.status_code >= 400:
            return f"[Error] HTTP {resp.status_code}"

        content_type = resp.headers.get("content-type", "")
        # Media types are case-insensitive, so compare in lower case.
        media = content_type.lower()
        is_html = "text/html" in media
        if not is_html and "text/plain" not in media:
            return f"[Error] unsupported content type: {content_type} — only HTML/text pages are supported"

        if is_html:
            try:
                text = _h2t.handle(resp.text)
            except Exception as e:
                return f"[Error] failed to convert HTML to text: {e}"
        else:
            # Plain text is already readable; running it through the HTML
            # converter would reflow its line structure.
            text = resp.text

        # Collapse runs of blank lines.
        text = re.sub(r"\n{3,}", "\n\n", text).strip()

        if len(text) > _MAX_CHARS:
            text = text[:_MAX_CHARS] + f"\n\n...(truncated, {len(text) - _MAX_CHARS} more chars)"

        return text

63
tools/web_search.py Normal file
View File

@ -0,0 +1,63 @@
"""Web Search: 通过博查 API 搜索互联网,返回结果列表。"""
from __future__ import annotations
from .base import Tool
from core.bocha_client import BochaClient, BochaConfig, BochaError
class WebSearchTool(Tool):
    """Tool that queries the Bocha search API and formats the hits as text."""

    name = "web_search"
    description = (
        "Search the web using Bocha AI search engine. "
        "Returns titles, URLs, and summaries for each result. "
        "Use this to find current information, news, documentation, or anything on the public internet."
    )
    parameters = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "Search query string"},
            "count": {
                "type": "integer",
                "default": 10,
                "description": "Number of results to return (1-20, default 10)",
            },
            "freshness": {
                "type": "string",
                "enum": ["noLimit", "oneDay", "oneWeek", "oneMonth", "oneYear"],
                "default": "noLimit",
                "description": "Filter results by recency",
            },
        },
        "required": ["query"],
    }

    def __init__(self, cfg: BochaConfig) -> None:
        """Store the API configuration used for every search.

        Args:
            cfg: Bocha API key and base URL.
        """
        super().__init__()
        self._cfg = cfg

    def execute(self, query: str, count: int = 10, freshness: str = "noLimit") -> str:
        """Search the web and return a numbered, human-readable result list.

        Args:
            query: Search string; must not be blank.
            count: Desired number of results, clamped to 1-20. A value that
                cannot be read as an integer falls back to 10.
            freshness: Recency filter forwarded unchanged to the API.

        Returns:
            The formatted results, a "(no results found ...)" note, or a
            string starting with ``[Error]``. API failures are reported in the
            return value instead of being raised.
        """
        if not isinstance(query, str) or not query.strip():
            return "[Error] web search failed: query must be a non-empty string"

        # Tool arguments come from the model and may not be a clean integer.
        try:
            count = int(count)
        except (TypeError, ValueError, OverflowError):
            count = 10
        count = min(max(count, 1), 20)

        try:
            with BochaClient(self._cfg) as client:
                data = client.search(query=query, count=count, freshness=freshness)
        except BochaError as e:
            return f"[Error] web search failed: {e}"

        # Walk data -> webPages -> value defensively: any level may be
        # missing, null, or of an unexpected type.
        payload = data.get("data") if isinstance(data, dict) else None
        web_pages = payload.get("webPages") if isinstance(payload, dict) else None
        raw = web_pages.get("value") if isinstance(web_pages, dict) else None
        results = [r for r in raw if isinstance(r, dict)] if isinstance(raw, list) else []
        if not results:
            return f"(no results found for query: {query!r})"

        lines = [f"Search results for: {query!r}\n"]
        for i, r in enumerate(results, 1):
            name = r.get("name") or "?"
            url = r.get("url") or ""
            summary = r.get("summary") or r.get("snippet") or ""
            lines.append(f"{i}. **{name}**")
            if url:
                lines.append(f"   URL: {url}")
            if summary:
                lines.append(f"   {summary}")
            lines.append("")
        return "\n".join(lines)