diff --git a/PROGRESS.md b/PROGRESS.md index d84a586..169c1f2 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -31,6 +31,8 @@ - **dev SPA 右侧文件列表长名称 hover 显示全路径**:`web/static/dev.html` 在右 pane 文件行 `.file-row .name` 和"选入…"源文件列表 `.sp-row .sp-name` 上补 `title`,内容取 `e.rel || e.name`,保留现有 ellipsis 截断视觉,鼠标悬停可看完整相对路径/名称。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。 - **dev SPA 左侧滚动条只覆盖 task 列表**:`web/static/dev.html` 左 pane 改成 flex column,顶部 4 行 pane-head(任务标题/新建/搜索筛选/排序)固定不参与滚动;`#task-list` 与 `#task-sentinel` 包进 `#task-scroll`,并把 IntersectionObserver root 从 `#pane-left` 改到 `#task-scroll`,保证无限滚动仍按列表区域触发。`DESIGN.md` 不动(无架构/心智模型变化);`RUN.md` 不动(运行方式无变化)。 +- **接入博查 Web Search + Web Fetch 两个 tool**:`core/bocha_client.py`(BochaConfig/BochaClient, POST `/v1/web-search`,Bearer 认证),`tools/web_search.py`(WebSearchTool,调用 BochaClient 并格式化结果),`tools/web_fetch.py`(httpx + html2text,SSRF 内网屏蔽,截断 8000 字符);`config/web/bocha.yaml` 配置 API key env(`BOCHA_API_KEY`);`core/agent_builder.py` 注册 — web_fetch 无条件挂,web_search 仅在 env 设了 BOCHA_API_KEY 时挂(跟媒体 tool 同范式)。`requirements.txt` 加 `httpx>=0.27.0` + `html2text>=2024.0`。`DESIGN.md` 不动(纯新 tool 无架构变化);`RUN.md` 不动(运行方式无变化)。 + ### 2026-05-22 - **dev SPA 手机端对话面板顶栏 + chat-meta 紧凑化**:`web/static/dev.html` 手机段(≤640px)对 `#pane-mid > .pane-head` 加 `flex-wrap: wrap` + 按钮 `white-space: nowrap`,消除 5 个按钮(导出对话记录/清空对话/完成/废弃/删除)在 320-360px 视口被挤压后"完\n成"这种逐字竖排;同时藏掉 `.label`("对话",mobile-tabs 已亮态指示)和 `.spacer`(flex-wrap 下 spacer 会强制后续按钮换行影响视觉一致)。`#chat-meta` 同段把 `gap` 8px → 6px、藏 `.tid`(8 位 UUID 前缀手机用户用不上)、`.desc` 加 `max-width:60vw` ellipsis(避免长 description 独占一行);三个 model 下拉 label "模型/生图/生视频" 用 `.mdl-text / .mdl-icon` 双 span 渲染,桌面显文字 + 手机显 emoji(💬🖼🎬)—— `renderModelDropdown / renderImageModelDropdown / renderVideoModelDropdown` 三处统一。改动只在手机视口生效,桌面零变化。否决:(a) 折叠成 ⋯ 浮层菜单(用户拒,多一次点击);(b) 改图标按钮(5 个动作含义不直观需 tooltip);(c) 把 emoji 应用到桌面(无解决问题且改动用户已习惯的桌面态)。 diff --git a/config/web/bocha.yaml b/config/web/bocha.yaml new file mode 100644 index 0000000..740ea43 --- /dev/null +++ b/config/web/bocha.yaml @@ -0,0 +1,4 @@ +# 博查 (Bocha AI) 联网搜索 
class BochaError(RuntimeError):
    """Single exception type raised for every Bocha API failure."""


@dataclass
class BochaConfig:
    """Credentials and endpoint used to talk to the Bocha Web Search API."""

    # Bearer token sent in the Authorization header.
    api_key: str
    # API root; ``load`` strips any trailing slash before storing it.
    base_url: str

    @classmethod
    def load(cls, path: Optional[Path] = None) -> Optional["BochaConfig"]:
        """Build a config from ``bocha.yaml`` plus the process environment.

        Args:
            path: YAML file to read. Defaults to the module-level
                ``_BOCHA_YAML`` location.

        Returns:
            A ``BochaConfig``, or ``None`` when the YAML file does not exist
            or the API-key environment variable is unset/blank. Callers use
            ``None`` as the signal to skip registering the search tool.
        """
        p = path or _BOCHA_YAML
        if not p.exists():
            return None
        loaded = yaml.safe_load(p.read_text(encoding="utf-8"))
        # An empty file parses to None and a malformed one may parse to a
        # list/scalar; treat both as "no overrides" rather than crashing on
        # ``.get`` while the agent is being built.
        data = loaded if isinstance(loaded, dict) else {}
        env = str(data.get("bocha_api_key_env") or "BOCHA_API_KEY")
        key = os.environ.get(env, "").strip()
        if not key:
            return None
        return cls(
            api_key=key,
            base_url=str(data.get("bocha_base_url") or "https://api.bochaai.com/v1").rstrip("/"),
        )


class BochaClient:
    """Thin httpx wrapper: POST ``/web-search`` with Bearer auth and error translation.

    Usable as a context manager so the underlying HTTP client is closed.
    """

    def __init__(self, cfg: BochaConfig, timeout_s: float = 15.0) -> None:
        """Create the HTTP client.

        Args:
            cfg: API key and base URL.
            timeout_s: Timeout (seconds) handed to ``httpx.Client``.
        """
        self.cfg = cfg
        self._client = httpx.Client(
            base_url=cfg.base_url,
            headers={
                "Authorization": f"Bearer {cfg.api_key}",
                "Content-Type": "application/json",
            },
            timeout=timeout_s,
        )

    def search(self, query: str, count: int = 10, freshness: str = "noLimit") -> dict:
        """Call the Bocha Web Search API and return the decoded JSON object.

        Args:
            query: Search query string.
            count: Requested number of results; clamped to 1-50 before sending.
            freshness: One of ``noLimit``, ``oneDay``, ``oneWeek``,
                ``oneMonth``, ``oneYear``; passed through unchanged.

        Returns:
            The raw response payload as a dict.

        Raises:
            BochaError: On timeout, transport failure, HTTP status >= 400,
                or a body that is not a JSON object.
        """
        body = {"query": query, "count": min(max(count, 1), 50), "freshness": freshness}
        try:
            resp = self._client.post("/web-search", json=body)
        except httpx.TimeoutException as e:
            raise BochaError(f"博查搜索超时: {e}") from e
        except httpx.HTTPError as e:
            raise BochaError(f"博查网络错误: {e}") from e
        return self._parse(resp)

    @staticmethod
    def _parse(resp: httpx.Response) -> dict:
        """Translate an HTTP response into a dict or a ``BochaError``.

        Args:
            resp: Response object exposing ``status_code``, ``text`` and ``json()``.

        Returns:
            The decoded JSON object for responses with status < 400.

        Raises:
            BochaError: For status >= 400, undecodable JSON, or a JSON body
                whose top level is not an object.
        """
        if resp.status_code >= 400:
            try:
                payload = resp.json()
            except ValueError:
                payload = None
            # The error body may be a JSON list/scalar or carry a null
            # message; fall back to the raw text so the caller always gets a
            # BochaError with something readable instead of an AttributeError.
            msg = payload.get("message") if isinstance(payload, dict) else None
            if not msg:
                msg = resp.text[:300]
            raise BochaError(f"博查 API → HTTP {resp.status_code}: {msg}")
        try:
            data = resp.json()
        except ValueError as e:
            raise BochaError(f"博查 API → invalid JSON: {e}") from e
        if not isinstance(data, dict):
            # Honour the ``-> dict`` contract: callers immediately use ``.get``.
            raise BochaError(f"博查 API → unexpected payload type: {type(data).__name__}")
        return data

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> "BochaClient":
        return self

    def __exit__(self, *_: object) -> None:
        self.close()
def _check_ssrf(url: str) -> str | None:
    """Check that *url* does not point at an internal/private address.

    Every address the host resolves to is tested against ``_SSRF_BLOCKED``,
    not just the first, so a name with mixed public/private records is
    rejected. IPv4-mapped IPv6 addresses are unwrapped before the test so
    ``::ffff:127.0.0.1`` cannot bypass the IPv4 ranges.

    NOTE: this is a resolve-then-connect check; the HTTP client resolves the
    name again when connecting, so DNS rebinding is not fully prevented.

    Args:
        url: Absolute URL about to be fetched.

    Returns:
        ``None`` when the URL is considered safe, otherwise an error message.
    """
    import urllib.parse

    parsed = urllib.parse.urlparse(url)
    host = parsed.hostname
    if not host:
        return f"invalid URL: no host in {url!r}"
    if parsed.scheme.lower() not in ("http", "https"):
        return f"unsupported URL scheme: {parsed.scheme!r}"

    try:
        candidates = [ipaddress.ip_address(host)]
    except ValueError:
        # Not an IP literal: resolve the name and validate every answer.
        try:
            infos = socket.getaddrinfo(host, None, 0, socket.SOCK_STREAM, socket.IPPROTO_TCP)
            candidates = [ipaddress.ip_address(info[4][0]) for info in infos]
        except (OSError, IndexError, ValueError):
            return f"cannot resolve host: {host!r}"
        if not candidates:
            return f"cannot resolve host: {host!r}"

    for ip in candidates:
        # ``ipv4_mapped`` exists only on IPv6 addresses and is None unless
        # the address is of the ::ffff:a.b.c.d form.
        effective = getattr(ip, "ipv4_mapped", None) or ip
        if any(effective in net for net in _SSRF_BLOCKED):
            return f"blocked internal/private host: {host} ({ip})"
    return None


class WebFetchTool(Tool):
    """Tool that downloads a URL and returns its content as markdown text."""

    name = "web_fetch"
    description = (
        "Fetch a web page and return its content as markdown text. "
        "Use this to read the full content of a URL found in search results or referenced by the user. "
        "Results are truncated to 8000 characters."
    )
    parameters = {
        "type": "object",
        "properties": {
            "url": {"type": "string", "description": "The URL to fetch"},
        },
        "required": ["url"],
    }

    # Redirects are followed by hand so each hop can be SSRF-checked; the
    # budget matches the number of redirects httpx follows by default.
    _MAX_REDIRECTS = 20
    # Status codes treated as redirects when a Location header is present.
    _REDIRECT_STATUSES = (301, 302, 303, 307, 308)

    def execute(self, url: str) -> str:
        """Fetch *url* and return markdown text, or an ``[Error] ...`` string.

        Args:
            url: Absolute http(s) URL to download.

        Returns:
            The page converted to markdown (blank-line runs collapsed and
            truncated to ``_MAX_CHARS``), or a string starting with
            ``[Error]`` describing why the fetch was refused or failed.
        """
        import urllib.parse

        current = url
        for _ in range(self._MAX_REDIRECTS + 1):
            # Re-validate on every hop: a public URL may redirect to an
            # internal address, which a single up-front check would miss.
            err = _check_ssrf(current)
            if err:
                return f"[Error] {err}"

            try:
                resp = httpx.get(
                    current,
                    headers={"User-Agent": _UA},
                    timeout=_TIMEOUT,
                    follow_redirects=False,
                )
            except httpx.TimeoutException:
                return f"[Error] request timed out after {_TIMEOUT:.0f}s"
            except httpx.HTTPError as e:
                return f"[Error] request failed: {e}"

            location = resp.headers.get("location")
            if resp.status_code not in self._REDIRECT_STATUSES or not location:
                break
            # Location may be relative; resolve it against the current URL.
            current = urllib.parse.urljoin(current, location)
        else:
            return f"[Error] too many redirects (more than {self._MAX_REDIRECTS})"

        if resp.status_code >= 400:
            return f"[Error] HTTP {resp.status_code}"

        content_type = resp.headers.get("content-type", "")
        if "text/html" not in content_type and "text/plain" not in content_type:
            return f"[Error] unsupported content type: {content_type} — only HTML/text pages are supported"

        try:
            text = _h2t.handle(resp.text)
        except Exception as e:
            return f"[Error] failed to convert HTML to text: {e}"

        # Collapse runs of three or more newlines into one blank line.
        text = re.sub(r"\n{3,}", "\n\n", text).strip()

        if len(text) > _MAX_CHARS:
            omitted = len(text) - _MAX_CHARS
            text = text[:_MAX_CHARS] + f"\n\n...(truncated, {omitted} more chars)"

        return text


class WebSearchTool(Tool):
    """Tool that queries the Bocha search API and formats the hits as text."""

    name = "web_search"
    description = (
        "Search the web using Bocha AI search engine. "
        "Returns titles, URLs, and summaries for each result. "
        "Use this to find current information, news, documentation, or anything on the public internet."
    )
    parameters = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "Search query string"},
            "count": {
                "type": "integer",
                "default": 10,
                "description": "Number of results to return (1-20, default 10)",
            },
            "freshness": {
                "type": "string",
                "enum": ["noLimit", "oneDay", "oneWeek", "oneMonth", "oneYear"],
                "default": "noLimit",
                "description": "Filter results by recency",
            },
        },
        "required": ["query"],
    }

    def __init__(self, cfg: BochaConfig) -> None:
        """Store the Bocha configuration used for each search call.

        Args:
            cfg: API key and base URL for the Bocha client.
        """
        # NOTE(review): other tools are constructed with base_dir/user_root;
        # this relies on Tool.__init__ accepting no arguments — confirm.
        super().__init__()
        self._cfg = cfg

    def execute(self, query: str, count: int = 10, freshness: str = "noLimit") -> str:
        """Run a web search and return a numbered, human-readable result list.

        Args:
            query: Search query string.
            count: Desired number of results; non-numeric values fall back
                to 10 and the value is clamped to 1-20.
            freshness: Recency filter forwarded to the API unchanged.

        Returns:
            Formatted results, a "no results" notice, or a string starting
            with ``[Error]`` when the API call fails.
        """
        # Arguments come from a model-generated tool call, so tolerate
        # values such as "ten" or None instead of raising.
        try:
            count = int(count)
        except (TypeError, ValueError):
            count = 10
        count = min(max(count, 1), 20)

        try:
            with BochaClient(self._cfg) as client:
                data = client.search(query=query, count=count, freshness=freshness)
        except BochaError as e:
            return f"[Error] web search failed: {e}"

        def _mapping(value: object) -> dict:
            # The payload is third-party JSON; treat any non-object as empty.
            return value if isinstance(value, dict) else {}

        web_pages = _mapping(_mapping(_mapping(data).get("data")).get("webPages"))
        raw_results = web_pages.get("value")
        results = [r for r in raw_results if isinstance(r, dict)] if isinstance(raw_results, list) else []

        if not results:
            return f"(no results found for query: {query!r})"

        lines = [f"Search results for: {query!r}\n"]
        for i, r in enumerate(results, 1):
            name = r.get("name") or "?"
            url = r.get("url", "")
            summary = r.get("summary") or r.get("snippet", "")
            lines.append(f"{i}. **{name}**")
            if url:
                lines.append(f"   URL: {url}")
            if summary:
                lines.append(f"   {summary}")
            lines.append("")
        return "\n".join(lines)