Add web_search and web_fetch tools via Bocha AI search API

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 11:37:33 +08:00 · 2026-05-25 11:37:33 +08:00 · fe95df0b9d
parent ade7f3d1e1
commit fe95df0b9d
7 changed files with 288 additions and 0 deletions
--- a/PROGRESS.md
+++ b/PROGRESS.md
@ -31,6 +31,8 @@
 - **dev SPA 右侧文件列表长名称 hover 显示全路径**:`web/static/dev.html` 在右 pane 文件行 `.file-row .name` 和"选入…"源文件列表 `.sp-row .sp-name` 上补 `title`,内容取 `e.rel || e.name`,保留现有 ellipsis 截断视觉,鼠标悬停可看完整相对路径/名称。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。
 - **dev SPA 左侧滚动条只覆盖 task 列表**:`web/static/dev.html` 左 pane 改成 flex column,顶部 4 行 pane-head(任务标题/新建/搜索筛选/排序)固定不参与滚动;`#task-list` 与 `#task-sentinel` 包进 `#task-scroll`,并把 IntersectionObserver root 从 `#pane-left` 改到 `#task-scroll`,保证无限滚动仍按列表区域触发。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。
 - **接入博查 Web Search + Web Fetch 两个 tool**:`core/bocha_client.py`(BochaConfig/BochaClient, POST `/v1/web-search`,Bearer 认证),`tools/web_search.py`(WebSearchTool,调用 BochaClient 并把结果格式化为列表),`tools/web_fetch.py`(httpx + html2text,SSRF 内网屏蔽,截断 8000 字符);`config/web/bocha.yaml` 配置 API key env(`BOCHA_API_KEY`);`core/agent_builder.py` 注册 — web_fetch 无条件挂,web_search 仅在 env 设了 BOCHA_API_KEY 时挂(跟媒体 tool 同范式)。`requirements.txt` 加 `httpx>=0.27.0` + `html2text>=2024.0`。`DESIGN.md` 不动(纯新 tool 无架构变化);`RUN.md` 不动(运行方式无变化)。
 ### 2026-05-22
 - **dev SPA 手机端对话面板顶栏 + chat-meta 紧凑化**:`web/static/dev.html` 手机段(≤640px)对 `#pane-mid > .pane-head` 加 `flex-wrap: wrap` + 按钮 `white-space: nowrap`,消除 5 个按钮(导出对话记录/清空对话/完成/废弃/删除)在 320-360px 视口被挤压后"完\n成"这种逐字竖排;同时藏掉 `.label`("对话",mobile-tabs 已亮态指示)和 `.spacer`(flex-wrap 下 spacer 会强制后续按钮换行影响视觉一致)。`#chat-meta` 同段把 `gap` 8px → 6px、藏 `.tid`(8 位 UUID 前缀手机用户用不上)、`.desc` 加 `max-width:60vw` ellipsis(避免长 description 独占一行);三个 model 下拉 label "模型/生图/生视频" 用 `.mdl-text / .mdl-icon` 双 span 渲染,桌面显文字 + 手机显 emoji(💬🖼🎬)—— `renderModelDropdown / renderImageModelDropdown / renderVideoModelDropdown` 三处统一。改动只在手机视口生效,桌面零变化。否决:(a) 折叠成 ⋯ 浮层菜单(用户拒,多一次点击);(b) 改图标按钮(5 个动作含义不直观需 tooltip);(c) 把 emoji 应用到桌面(无解决问题且改动用户已习惯的桌面态)。
--- a/config/web/bocha.yaml
+++ b/config/web/bocha.yaml
@ -0,0 +1,4 @@
 # 博查 (Bocha AI) 联网搜索 API 配置
 # 不给 BOCHA_API_KEY 环境变量时,web_search tool 不会注册
 bocha_api_key_env: BOCHA_API_KEY
 bocha_base_url: https://api.bochaai.com/v1
--- a/core/agent_builder.py
+++ b/core/agent_builder.py
@ -41,8 +41,11 @@ from tools.seedance import SeedanceTool
 from tools.seedream import SeedreamTool
 from tools.shell import ShellTool
 from tools.skill_tool import LoadSkillTool
 from tools.web_fetch import WebFetchTool
 from tools.web_search import WebSearchTool
 from core.ark_client import ArkConfig
 from core.bocha_client import BochaConfig
 def load_config() -> dict:
@ -347,6 +350,10 @@ def build_agent(
        t = cls(base_dir=tool_base, user_root=ur_path)
        tools[t.name] = t
    # web_fetch 无需 API key,始终可用
    wf = WebFetchTool(base_dir=tool_base, user_root=ur_path)
    tools[wf.name] = wf
    if skills.skills:
        ls = LoadSkillTool(registry=skills, base_dir=tool_base, user_root=ur_path)
        tools[ls.name] = ls
@ -424,6 +431,12 @@ def build_agent(
            )
            tools[seedance_tool.name] = seedance_tool
    # 博查联网搜索:仅当 BOCHA_API_KEY 设了才挂
    bocha_cfg = BochaConfig.load()
    if bocha_cfg is not None:
        ws = WebSearchTool(cfg=bocha_cfg)
        tools[ws.name] = ws
    sink = ConsoleEventSink(console) if console else None
    agent = AgentLoop(llm, tools, session, caps, user_id=uid, sink=sink)
    if cancel_check is not None:
--- a/core/bocha_client.py
+++ b/core/bocha_client.py
@ -0,0 +1,96 @@
 """博查 (Bocha AI) Web Search API 客户端,共享给 web_search tool。"""
 from __future__ import annotations
 import os
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 import httpx
 import yaml
 from core.paths import ROOT
 _BOCHA_YAML = ROOT / "config" / "web" / "bocha.yaml"
 class BochaError(RuntimeError):
    """Unified exception type for a failed Bocha API call."""
@dataclass
class BochaConfig:
    """Connection settings for the Bocha web-search API.

    Instances are normally built through :meth:`load`, which combines the
    YAML config file with the API key taken from the environment.
    """

    # API key resolved from the environment variable named in the YAML file.
    api_key: str
    # API root URL; ``load`` strips any trailing slash.
    base_url: str

    @classmethod
    def load(cls, path: Optional[Path] = None) -> Optional["BochaConfig"]:
        """Read the YAML config and resolve the API key from the environment.

        Args:
            path: Config file to read; defaults to the module-level
                ``_BOCHA_YAML`` location.

        Returns:
            A populated config, or ``None`` when the config file is missing
            or the API-key environment variable is unset/blank. Callers use
            ``None`` to decide not to register the search tool.
        """
        p = path or _BOCHA_YAML
        # is_file() instead of exists(): a directory at this path means the
        # YAML file is absent, and read_text() on it would raise.
        if not p.is_file():
            return None
        data = yaml.safe_load(p.read_text(encoding="utf-8"))
        # An empty file yields None and a malformed one may yield a list or
        # scalar; treat anything that is not a mapping as "no settings".
        if not isinstance(data, dict):
            data = {}
        # str() guards against a non-string YAML value (e.g. a bare number),
        # which os.environ.get would reject with TypeError.
        env = str(data.get("bocha_api_key_env") or "BOCHA_API_KEY")
        key = os.environ.get(env, "").strip()
        if not key:
            return None
        base_url = str(data.get("bocha_base_url") or "https://api.bochaai.com/v1")
        return cls(api_key=key, base_url=base_url.rstrip("/"))
 class BochaClient:
    """Thin httpx wrapper for the Bocha web-search endpoint.

    Sends ``POST <base_url>/web-search`` with Bearer authentication and
    translates transport, HTTP and decoding failures into ``BochaError``.
    Usable as a context manager so the underlying connection pool is closed.
    """

    def __init__(self, cfg: BochaConfig, timeout_s: float = 15.0) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            cfg: API key and base URL.
            timeout_s: Timeout in seconds applied to each request.
        """
        self.cfg = cfg
        self._client = httpx.Client(
            base_url=cfg.base_url,
            headers={
                "Authorization": f"Bearer {cfg.api_key}",
                "Content-Type": "application/json",
            },
            timeout=timeout_s,
        )

    def search(self, query: str, count: int = 10, freshness: str = "noLimit") -> dict:
        """Call the web-search endpoint and return the decoded JSON object.

        Args:
            query: Search query string.
            count: Requested number of results; clamped to 1-50.
            freshness: One of noLimit, oneDay, oneWeek, oneMonth, oneYear.

        Raises:
            BochaError: On timeout, network failure, HTTP status >= 400, or
                a response body that is not a JSON object.
        """
        body = {"query": query, "count": min(max(count, 1), 50), "freshness": freshness}
        try:
            resp = self._client.post("/web-search", json=body)
        # TimeoutException is a subclass of HTTPError, so it must come first.
        except httpx.TimeoutException as e:
            raise BochaError(f"博查搜索超时: {e}") from e
        except httpx.HTTPError as e:
            raise BochaError(f"博查网络错误: {e}") from e
        return self._parse(resp)

    @staticmethod
    def _parse(resp: httpx.Response) -> dict:
        """Validate *resp* and return its JSON object.

        Raises:
            BochaError: For HTTP status >= 400 (with the server's ``message``
                field when available, else the first 300 characters of the
                body), for undecodable JSON, or for JSON that is not an
                object.
        """
        if resp.status_code >= 400:
            fallback = resp.text[:300]
            try:
                err_body = resp.json()
            except ValueError:
                msg = fallback
            else:
                # The error body may be a list or scalar; only a mapping can
                # carry a "message" field.
                msg = err_body.get("message", fallback) if isinstance(err_body, dict) else fallback
            raise BochaError(f"博查 API → HTTP {resp.status_code}: {msg}")
        try:
            payload = resp.json()
        except ValueError as e:
            raise BochaError(f"博查 API → invalid JSON: {e}") from e
        # Callers index the result as a dict; reject anything else here so the
        # failure surfaces as BochaError rather than AttributeError later.
        if not isinstance(payload, dict):
            raise BochaError(f"博查 API → unexpected JSON type: {type(payload).__name__}")
        return payload

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> "BochaClient":
        return self

    def __exit__(self, *_: object) -> None:
        self.close()
--- a/requirements.txt
+++ b/requirements.txt
@ -11,6 +11,10 @@ matplotlib>=3.8.0
 # 素材摄取: PDF/DOCX/PPTX/XLSX/HTML/URL → Markdown (ppt 阶段零 + proposal 阶段零)
 markitdown[pdf,docx,pptx,xlsx]>=0.0.1
 # 联网搜索 / web fetch
 httpx>=0.27.0
 html2text>=2024.0
 # §7 B 阶段: Storage 落 PG
 sqlalchemy>=2.0.0
 psycopg[binary]>=3.1.0
--- a/tools/web_fetch.py
+++ b/tools/web_fetch.py
@ -0,0 +1,106 @@
 """Web Fetch: 抓取任意 URL 并返回 markdown 文本。"""
 from __future__ import annotations
 import ipaddress
 import re
 import socket
 import html2text
 import httpx
 from .base import Tool
 # Networks that web_fetch refuses to contact (SSRF guard).
 _SSRF_BLOCKED = set(
    map(
        ipaddress.ip_network,
        [
            "127.0.0.0/8",     # IPv4 loopback
            "10.0.0.0/8",      # RFC 1918 private
            "172.16.0.0/12",   # RFC 1918 private
            "192.168.0.0/16",  # RFC 1918 private
            "169.254.0.0/16",  # IPv4 link-local
            "0.0.0.0/8",       # "this network"
            "::1/128",         # IPv6 loopback
            "fc00::/7",        # IPv6 unique local
            "fe80::/10",       # IPv6 link-local
        ],
    )
 )
 # Maximum number of characters of converted text returned to the caller.
 _MAX_CHARS = 8000
 # Per-request timeout in seconds.
 _TIMEOUT = 15.0
 # User-Agent header sent with every fetch.
 _UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
 )
 # Module-level html2text converter used to turn fetched HTML into markdown.
 _h2t = html2text.HTML2Text()
 # Keep hyperlinks in the output.
 _h2t.ignore_links = False
 # Drop image references from the output.
 _h2t.ignore_images = True
 # 0 turns off html2text's line wrapping (per html2text docs — confirm).
 _h2t.body_width = 0
 # Presumably omits same-page anchor links — confirm against html2text docs.
 _h2t.skip_internal_links = True
 def _check_ssrf(url: str) -> str | None:
    """Check whether *url* may be fetched without reaching an internal host.

    The host is taken from the URL; a literal IP is checked directly, a name
    is resolved and EVERY returned address is checked. IPv4-mapped IPv6
    addresses are unwrapped first so ``::ffff:127.0.0.1`` is treated as
    ``127.0.0.1``.

    Args:
        url: Absolute URL supplied by the caller.

    Returns:
        ``None`` when the URL is considered safe, otherwise a human-readable
        error message.

    Note:
        This validates the address seen at check time only; the HTTP client
        resolves the name again when connecting, so DNS rebinding is not
        fully prevented here.
    """
    import urllib.parse

    try:
        parsed = urllib.parse.urlparse(url)
        host = parsed.hostname
    except ValueError as e:
        # urlparse rejects e.g. an unterminated "[" in the netloc.
        return f"invalid URL: {url!r} ({e})"
    if not host:
        return f"invalid URL: no host in {url!r}"
    if parsed.scheme.lower() not in ("http", "https"):
        return f"unsupported URL scheme: {parsed.scheme!r}"

    try:
        candidates = [ipaddress.ip_address(host)]
    except ValueError:
        # Not an IP literal: resolve the name and collect every address.
        try:
            infos = socket.getaddrinfo(host, None, 0, socket.SOCK_STREAM, socket.IPPROTO_TCP)
            candidates = [ipaddress.ip_address(info[4][0]) for info in infos]
        except (OSError, ValueError):
            return f"cannot resolve host: {host!r}"
        if not candidates:
            return f"cannot resolve host: {host!r}"

    for ip in candidates:
        # An IPv4-mapped IPv6 address connects to the embedded IPv4 host, so
        # it must be judged by the IPv4 rules.
        mapped = getattr(ip, "ipv4_mapped", None)
        if mapped is not None:
            ip = mapped
        if (
            ip.is_loopback
            or ip.is_private
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_reserved
            or ip.is_unspecified
            or any(ip in net for net in _SSRF_BLOCKED)
        ):
            return f"blocked internal/private host: {host} ({ip})"
    return None
 class WebFetchTool(Tool):
    """Tool that downloads an HTTP(S) page and returns its text as markdown.

    Every URL contacted, including each redirect target, is first passed
    through ``_check_ssrf`` so a public page cannot bounce the request to an
    internal address.
    """

    name = "web_fetch"
    description = (
        "Fetch a web page and return its content as markdown text. "
        "Use this to read the full content of a URL found in search results or referenced by the user. "
        "Results are truncated to 8000 characters."
    )
    parameters = {
        "type": "object",
        "properties": {
            "url": {"type": "string", "description": "The URL to fetch"},
        },
        "required": ["url"],
    }

    def execute(self, url: str) -> str:
        """Fetch *url* and return its content converted to markdown.

        Args:
            url: Absolute http(s) URL.

        Returns:
            The converted text, truncated to ``_MAX_CHARS`` characters with a
            trailing note when cut, or a string starting with ``[Error]``
            describing why the fetch failed. This method reports failures in
            the return value rather than raising.
        """
        import urllib.parse

        max_hops = 10
        target = url
        # Redirects are followed by hand so each hop can be re-validated;
        # one client is reused so cookies set along the chain are kept.
        with httpx.Client(
            headers={"User-Agent": _UA},
            timeout=_TIMEOUT,
            follow_redirects=False,
        ) as client:
            for _ in range(max_hops + 1):
                err = _check_ssrf(target)
                if err:
                    return f"[Error] {err}"
                try:
                    resp = client.get(target)
                except httpx.TimeoutException:
                    return f"[Error] request timed out after {_TIMEOUT:.0f}s"
                # InvalidURL is not an HTTPError subclass, so name it explicitly.
                except (httpx.HTTPError, httpx.InvalidURL) as e:
                    return f"[Error] request failed: {e}"
                location = resp.headers.get("location")
                if resp.status_code not in (301, 302, 303, 307, 308) or not location:
                    break
                # Location may be relative; resolve it against the current URL.
                target = urllib.parse.urljoin(str(resp.url), location)
            else:
                return f"[Error] too many redirects (more than {max_hops})"

        if resp.status_code >= 400:
            return f"[Error] HTTP {resp.status_code}"
        content_type = resp.headers.get("content-type", "")
        if "text/html" not in content_type and "text/plain" not in content_type:
            return f"[Error] unsupported content type: {content_type} — only HTML/text pages are supported"
        try:
            text = _h2t.handle(resp.text)
        except Exception as e:
            # Boundary catch: any converter failure becomes an error string.
            return f"[Error] failed to convert HTML to text: {e}"
        # Collapse runs of three or more newlines into one blank line.
        text = re.sub(r"\n{3,}", "\n\n", text).strip()
        if len(text) > _MAX_CHARS:
            text = text[:_MAX_CHARS] + f"\n\n...(truncated, {len(text) - _MAX_CHARS} more chars)"
        return text
--- a/tools/web_search.py
+++ b/tools/web_search.py
@ -0,0 +1,63 @@
 """Web Search: 通过博查 API 搜索互联网,返回结果列表。"""
 from __future__ import annotations
 from .base import Tool
 from core.bocha_client import BochaClient, BochaConfig, BochaError
 class WebSearchTool(Tool):
    """Tool that queries the Bocha search API and formats the hits as text."""

    name = "web_search"
    description = (
        "Search the web using Bocha AI search engine. "
        "Returns titles, URLs, and summaries for each result. "
        "Use this to find current information, news, documentation, or anything on the public internet."
    )
    parameters = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "Search query string"},
            "count": {
                "type": "integer",
                "default": 10,
                "description": "Number of results to return (1-20, default 10)",
            },
            "freshness": {
                "type": "string",
                "enum": ["noLimit", "oneDay", "oneWeek", "oneMonth", "oneYear"],
                "default": "noLimit",
                "description": "Filter results by recency",
            },
        },
        "required": ["query"],
    }

    def __init__(self, cfg: BochaConfig) -> None:
        """Store the Bocha configuration used for each search call."""
        super().__init__()
        self._cfg = cfg

    def execute(self, query: str, count: int = 10, freshness: str = "noLimit") -> str:
        """Run a search and return a numbered, markdown-style result list.

        Args:
            query: Search query string.
            count: Requested number of results; clamped to 1-20. A value that
                cannot be converted to an integer falls back to 10.
            freshness: Recency filter forwarded to the API unchanged.

        Returns:
            The formatted results, a "(no results found ...)" note, or a
            string starting with ``[Error]`` when the API call fails.
        """
        # The argument comes from a model-generated tool call and may not be
        # numeric; fall back to the default instead of raising.
        try:
            count = min(max(int(count), 1), 20)
        except (TypeError, ValueError):
            count = 10
        try:
            # A short-lived client per call; the context manager closes it.
            with BochaClient(self._cfg) as client:
                data = client.search(query=query, count=count, freshness=freshness)
        except BochaError as e:
            return f"[Error] web search failed: {e}"

        # Walk data -> webPages -> value defensively: any level may be
        # missing, null, or of an unexpected type.
        payload = data.get("data") if isinstance(data, dict) else None
        web_pages = payload.get("webPages") if isinstance(payload, dict) else None
        raw_results = web_pages.get("value") if isinstance(web_pages, dict) else None
        results = [r for r in raw_results if isinstance(r, dict)] if isinstance(raw_results, list) else []
        if not results:
            return f"(no results found for query: {query!r})"

        lines = [f"Search results for: {query!r}\n"]
        for i, r in enumerate(results, 1):
            # "or" also covers a key that is present but null.
            name = r.get("name") or "?"
            url = r.get("url") or ""
            summary = r.get("summary") or r.get("snippet") or ""
            lines.append(f"{i}. **{name}**")
            if url:
                lines.append(f"   URL: {url}")
            if summary:
                lines.append(f"   {summary}")
            lines.append("")
        return "\n".join(lines)