"""Web Fetch: 抓取任意 URL 并返回 markdown 文本。""" from __future__ import annotations import ipaddress import re import socket import html2text import httpx from .base import Tool _SSRF_BLOCKED = { ipaddress.ip_network(n) for n in ( "127.0.0.0/8", "10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16", "169.254.0.0/16", "0.0.0.0/8", "::1/128", "fc00::/7", "fe80::/10", ) } _MAX_CHARS = 8000 _TIMEOUT = 15.0 _UA = ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" ) _h2t = html2text.HTML2Text() _h2t.ignore_links = False _h2t.ignore_images = True _h2t.body_width = 0 _h2t.skip_internal_links = True def _check_ssrf(url: str) -> str | None: """返回 None 表示安全;否则返回错误信息字符串。""" import urllib.parse parsed = urllib.parse.urlparse(url) host = parsed.hostname if not host: return f"invalid URL: no host in {url!r}" try: ip = ipaddress.ip_address(host) except ValueError: try: ip = ipaddress.ip_address(socket.getaddrinfo(host, None, 0, socket.SOCK_STREAM, socket.IPPROTO_TCP)[0][4][0]) except (OSError, IndexError, ValueError): return f"cannot resolve host: {host!r}" for net in _SSRF_BLOCKED: if ip in net: return f"blocked internal/private host: {host} ({ip})" return None class WebFetchTool(Tool): name = "web_fetch" description = ( "Fetch a web page and return its content as markdown text. " "Use this to read the full content of a URL found in search results or referenced by the user. " "Results are truncated to 8000 characters." ) parameters = { "type": "object", "properties": { "url": {"type": "string", "description": "The URL to fetch"}, }, "required": ["url"], } def execute(self, url: str) -> str: err = _check_ssrf(url) if err: return f"[Error] {err}" try: resp = httpx.get( url, headers={"User-Agent": _UA}, timeout=_TIMEOUT, follow_redirects=True, ) except httpx.TimeoutException: return f"[Error] request timed out after {_TIMEOUT:.0f}s" except httpx.HTTPError as e: return f"[Error] request failed: {e}" if resp.status_code >= 400: return f"[Error] HTTP {resp.status_code}" content_type = resp.headers.get("content-type", "") if "text/html" not in content_type and "text/plain" not in content_type: return f"[Error] unsupported content type: {content_type} — only HTML/text pages are supported" try: text = _h2t.handle(resp.text) except Exception as e: return f"[Error] failed to convert HTML to text: {e}" # 压缩多余空行 text = re.sub(r"\n{3,}", "\n\n", text).strip() if len(text) > _MAX_CHARS: text = text[:_MAX_CHARS] + f"\n\n...(truncated, {len(text) - _MAX_CHARS} more chars)" return text