From 3a3f8d86cc4f8d38f15b20d47949a55502a744b3 Mon Sep 17 00:00:00 2001 From: caoqianming Date: Wed, 27 May 2026 11:28:53 +0800 Subject: [PATCH] =?UTF-8?q?sandbox:=20yaml=20sandbox.dns=20=E6=98=BE?= =?UTF-8?q?=E5=BC=8F=20--dns=20=E6=B3=A8=E5=85=A5(=E7=BB=95=E5=BC=80=20dae?= =?UTF-8?q?mon=20=E4=B8=8A=E6=B8=B8=E6=8E=A2=E6=B5=8B)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 腾讯云轻量等场景 docker daemon 探测 host systemd-resolved 上游 DNS 不稳, 即使 init.sh ACCEPT 127.0.0.11:53 例外,embedded DNS 自己 forward 不出去 仍跪。显式 docker run --dns 8.8.8.8 --dns 114.114.114.114 直接写容器 /etc/resolv.conf 绕开上游探测。 - agent.yaml 加 sandbox.dns 列表,默 [8.8.8.8, 114.114.114.114] - SandboxPool 加 dns 字段(env: ZCBOT_SANDBOX_DNS 逗号分隔 override), _docker_run 每个 ip 加 --dns flag - RUN.md 故障兜底 DNS 失败那行补充第二层根因 + 解法 Co-Authored-By: Claude Opus 4.7 (1M context) --- RUN.md | 2 +- config/agent.yaml | 9 ++++++++- core/sandbox/pool.py | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/RUN.md b/RUN.md index 1c3ef89..87c7ac6 100644 --- a/RUN.md +++ b/RUN.md @@ -483,7 +483,7 @@ sudo xfs_quota -x -c "limit -p bhard=10g zcbot_" /opt | `POST /v1/files/upload` 返 413 `已达磁盘配额上限` | per-user 5GB(yaml `quotas.disk_bytes_per_user`)。让用户在 dev SPA 右侧文件栏删旧产物 / 大文件,或改 yaml 升配重启 web | | `[warn] network zcbot-sandbox-net is --internal (legacy)` | 上一版 sandbox network 创建时带了 `--internal`(完全禁 outbound),当前 dogfood 阶段放开。`docker stop $(docker ps -aq -f label=zcbot.product=sandbox) ; docker network rm zcbot-sandbox-net`,重启 web 自动 recreate 为非 internal | | tool write/edit 返 `[Error] 已达磁盘配额上限` | 同 upload 413,见上 | -| 容器内 `curl https://www.baidu.com` 报 `Temporary failure in name resolution` | iptables `127.0.0.0/8 DROP` 把 docker embedded DNS(`127.0.0.11:53`)也挡了;init.sh 已加 `127.0.0.11/32 udp/tcp 53 ACCEPT` 在 DROP 之前。重 build 镜像后,容器 `cat /etc/resolv.conf` 应该是 `nameserver 127.0.0.11`,`getent hosts www.baidu.com` 应该返 IP | +| 容器内 `curl https://www.baidu.com` 报 `Temporary failure in name resolution` | 两层:① iptables `127.0.0.0/8 DROP` 把 docker embedded DNS(`127.0.0.11:53`)挡了 ── init.sh 已加 `127.0.0.11/32 udp/tcp 53 ACCEPT`;② docker embedded DNS 上游探测 host systemd-resolved 失败(腾讯云轻量等场景常见)── yaml `sandbox.dns` 显式指定 `8.8.8.8 / 114.114.114.114`,docker run `--dns` 直接注入容器 `/etc/resolv.conf`,绕过上游探测路径。重 build 镜像 + `docker rm -f $(docker ps -aq -f label=zcbot.product=sandbox)` 让新 image + 新 dns 配置生效 | | 启动报 `PLATFORM_KEY env not set` / `JWT_SECRET env not set` | D' 过渡 auth 强制双 env 必填。生成 `python -c "import secrets;print(secrets.token_urlsafe(48))"` 各填一,写 `.env` 重起 | | `/v1/auth/login_password` 返 403 `invalid email or password` | 邮箱不存在 / `password_hash` 列为空(platform_key 入口建的 user) / 密码错。`SELECT user_id, email, password_hash IS NOT NULL AS has_pw FROM users WHERE email=...` 核对;无行 → `main.py user add`;有行无密码 → `UPDATE users SET password_hash=...`(用 `.venv/Scripts/python.exe -c "from web.auth import hash_password;print(hash_password('xxx'))"` 算)或 `user add --user-id` 接到现有 user_id | | `main.py user add` 报 `IntegrityError ... uq_users_email` | 邮箱已存在,改 email 或先 `DELETE FROM users WHERE email=...`(先清该 user 的 tasks) | diff --git a/config/agent.yaml b/config/agent.yaml index 35a2afd..fed4c52 100644 --- a/config/agent.yaml +++ b/config/agent.yaml @@ -24,4 +24,11 @@ quotas: sandbox: memory: 2g # --memory (env: ZCBOT_SANDBOX_MEMORY) cpus: 1.0 # --cpus (env: ZCBOT_SANDBOX_CPUS) - pids_limit: 256 # --pids-limit (env: ZCBOT_SANDBOX_PIDS_LIMIT) + pids_limit: 256 # --pids-limit (env: ZCBOT_SANDBOX_PIDS_LIMIT) + # 容器 DNS server 显式配置(docker run --dns,容器 /etc/resolv.conf 直接写, + # 绕过 docker daemon 上游 DNS 探测路径;腾讯云轻量 / 部分云上 daemon 探测 + # systemd-resolved 上游会失败,导致 embedded DNS 127.0.0.11 forward 出去也跪)。 + # 默公共 DNS,国内访问通畅;留空(`dns: []`)走 docker 默认探测。 + dns: + - "8.8.8.8" # Google + - "114.114.114.114" # 国内通用 diff --git a/core/sandbox/pool.py b/core/sandbox/pool.py index 1809a6c..7b74156 100644 --- a/core/sandbox/pool.py +++ b/core/sandbox/pool.py @@ -89,6 +89,7 @@ class SandboxPool: memory: Optional[str] = None, cpus: Optional[str] = None, pids_limit: Optional[int] = None, + dns: Optional[List[str]] = None, ) -> None: """ user_root_base: per-user 子树父目录,典型 `/users`。bind mount 源 @@ -126,6 +127,12 @@ class SandboxPool: os.getenv("ZCBOT_SANDBOX_PIDS_LIMIT") or (pids_limit if pids_limit is not None else DEFAULT_PIDS_LIMIT) ) + # DNS:env(逗号分隔)> caller > 默空(让 docker 自己探测) + env_dns = os.getenv("ZCBOT_SANDBOX_DNS", "").strip() + if env_dns: + self.dns: List[str] = [x.strip() for x in env_dns.split(",") if x.strip()] + else: + self.dns = list(dns) if dns else [] self._dict_lock = threading.Lock() # 保护 _locks / _last_active 的字典级 race self._locks: Dict[UUID, threading.Lock] = {} self._last_active: Dict[UUID, int] = {} @@ -177,6 +184,10 @@ class SandboxPool: "-e", f"ZCBOT_PG_IPS={self.pg_ips}", "--restart=no", ] + # 显式 DNS(绕过 docker daemon 上游探测,腾讯云轻量等场景下 daemon 探测 + # host systemd-resolved 上游不稳) + for dns_ip in self.dns: + cmd += ["--dns", dns_ip] # repo skills 只读 mount ── fs 工具进容器后(read/glob/grep)能 access # SKILL.md 内引用的 references/*.md。host 上 zcbot/skills/ 是项目代码, # 跟用户 working_dir 正交,只读防容器内进程改 skill 实现。 @@ -256,10 +267,14 @@ def setup_pool( """ ensure_network() cfg = sandbox_cfg or {} + dns_cfg = cfg.get("dns") or [] + if not isinstance(dns_cfg, list): + dns_cfg = [] return SandboxPool( user_root_base=user_root_base, repo_root=repo_root, memory=cfg.get("memory") if isinstance(cfg.get("memory"), str) else None, cpus=str(cfg["cpus"]) if cfg.get("cpus") is not None else None, pids_limit=int(cfg["pids_limit"]) if cfg.get("pids_limit") is not None else None, + dns=[str(x) for x in dns_cfg], )