"""DockerExecutor:`shell` / `run_python` 走 docker exec,其余 in-process(§7.5 #6)。 Backend 二分(§7.5 #6 信任域): - host in-process:`read/write/edit/glob/grep/load_skill/web_*/seedream/seedance` 原本就在 host 持凭据(Bocha key / ARK key)或走 `paths.py::resolve_user_path` 校验 (user-rooted 安全边界已存),塞容器无收益付 ~200ms exec overhead × N 次 - container exec:`shell` / `run_python` —— 执行模型生成的任意代码,必须容器隔离 容器准入(per call): 1. `pool.ensure(user_id)` —— 拿到 / 起 `zcbot-sandbox-` 容器(per-user lock 已串行化) 2. `docker exec --user 1000:1000 --workdir /workspace/ setsid bash -c ''` 3. timeout 到 → 杀 docker CLI 客户端(Popen.kill()) 4. 完成 → `pool.mark_active(user_id)` 刷 idle 计时 run_python tmp .py 落 host 侧 `/.zcbot_tmp//.py`(bind mount 自动可见于容器 `/workspace/.zcbot_tmp//`),执行完 unlink。dotfile 起头让 `/v1/files` API 天然过滤(`web/app.py:169` startswith(".")),用户视野不污染。 Cancel limitation(第一版接受): - docker exec 客户端断开后,容器内 server 端进程**不会**因此终止 —— 这是 docker 设计 - 第一版只杀 docker CLI(Popen.kill());容器内残留进程靠 idle 5min reaper / 下次 ensure 时 rm -f 兜底 - 升级触发(§7.5 #3 PGID 协议):用户反馈"取消了但还在烧 CPU" / 多次 cancel 后 容器内进程堆积 → 启用「ZCBOT_EXEC_ID env + PGID 写文件 + 二次 exec kill」协议 """ from __future__ import annotations import os import secrets import subprocess import time from pathlib import Path from typing import Any, Dict, List, Optional from uuid import UUID from .executor import ExecCtx, Executor, ToolResult from .executor_host import HostExecutor from .sandbox import SandboxPool CONTAINER_TOOLS = frozenset({"shell", "run_python"}) # 容器内非 root 用户:与 Dockerfile HOST_UID/HOST_GID build-arg 默认值对齐。 # 部署机 host 上 zcbot 账号 uid 若非 1000,镜像 build 时透传 HOST_UID + 这里 # env `ZCBOT_SANDBOX_EXEC_USER` 同步改(详 RUN.md "Sandbox 部署"段)。 DEFAULT_EXEC_USER = "1000:1000" # host 侧 tmp 脚本目录(user_root 内 dotfile,被 /v1/files API 隐藏) TMP_SUBDIR = ".zcbot_tmp" class DockerExecutor(Executor): """组合 HostExecutor + docker exec dispatch shell/run_python。 host backend 仍承担 schema 列表 + 大部分 tool 执行;本类只在 shell/run_python 命中时夺路接管,docker exec 在 per-user 容器里跑。 """ def __init__( self, host: HostExecutor, pool: SandboxPool, user_id: UUID, user_root: Path, working_dir: Path, ) -> None: self.host = host self.pool = pool self.user_id = user_id self.user_root = user_root.resolve() self.working_dir = working_dir.resolve() # 容器内对应路径 /workspace/ try: wd_rel = self.working_dir.relative_to(self.user_root) self.container_workdir = "/workspace/" + wd_rel.as_posix() except ValueError: # working_dir 不在 user_root 下 —— 防御性兜底,正常路径不会到这里 self.container_workdir = "/workspace" self.exec_user = os.getenv("ZCBOT_SANDBOX_EXEC_USER", DEFAULT_EXEC_USER) # ── Executor 接口 ──────────────────────────────────────── def has_tool(self, name: str) -> bool: return self.host.has_tool(name) def schemas(self) -> List[Dict[str, Any]]: return self.host.schemas() def call_tool(self, name: str, args: Dict[str, Any], ctx: ExecCtx) -> ToolResult: if name not in CONTAINER_TOOLS: return self.host.call_tool(name, args, ctx) if not self.host.has_tool(name): # caps.enable_run_python=False 等场景下,host 没装 run_python → schema 也没暴露 return ToolResult(content=f"[Error] unknown tool: {name}", exit_code=2) try: if name == "shell": return self._exec_shell(args, ctx) if name == "run_python": return self._exec_python(args, ctx) except Exception as e: return ToolResult( content=f"[Error executing {name} via docker] {type(e).__name__}: {e}", exit_code=1, ) return ToolResult(content=f"[Error] unhandled container tool: {name}", exit_code=2) # ── shell ──────────────────────────────────────────────── def _exec_shell(self, args: Dict[str, Any], ctx: ExecCtx) -> ToolResult: cmd = args.get("command") if not isinstance(cmd, str) or not cmd.strip(): return ToolResult( content="[Error] bad arguments to shell: command must be non-empty string", exit_code=2, ) timeout = int(args.get("timeout") or 60) container = self.pool.ensure(self.user_id) argv = self._docker_exec_argv(container) + ["setsid", "bash", "-c", cmd] result = self._run_subprocess(argv, timeout=timeout, ctx=ctx) self.pool.mark_active(self.user_id) return result # ── run_python ─────────────────────────────────────────── def _exec_python(self, args: Dict[str, Any], ctx: ExecCtx) -> ToolResult: code = args.get("code") if not isinstance(code, str): return ToolResult( content="[Error] bad arguments to run_python: code must be string", exit_code=2, ) timeout = int(args.get("timeout") or 120) # tmp .py 落 host 侧 `.zcbot_tmp//.py`; # 容器内对应 /workspace/.zcbot_tmp//.py tmp_root = self.user_root / TMP_SUBDIR / str(ctx.task_id) tmp_root.mkdir(parents=True, exist_ok=True) rand_name = f"{int(time.time() * 1000)}-{secrets.token_hex(4)}.py" host_script = tmp_root / rand_name container_script = f"/workspace/{TMP_SUBDIR}/{ctx.task_id}/{rand_name}" host_script.write_text(code, encoding="utf-8") try: container = self.pool.ensure(self.user_id) argv = self._docker_exec_argv( container, extra_env={ "PYTHONIOENCODING": "utf-8", "PYTHONPATH": "/workspace", }, ) + ["setsid", "python", container_script] result = self._run_subprocess(argv, timeout=timeout, ctx=ctx) self.pool.mark_active(self.user_id) return result finally: try: host_script.unlink() except OSError: pass # ── helpers ────────────────────────────────────────────── def _docker_exec_argv( self, container: str, extra_env: Optional[Dict[str, str]] = None ) -> List[str]: argv = [ "docker", "exec", "--user", self.exec_user, "--workdir", self.container_workdir, ] env: Dict[str, str] = {} if extra_env: env.update(extra_env) for k, v in env.items(): argv.extend(["-e", f"{k}={v}"]) argv.append(container) return argv def _run_subprocess( self, argv: List[str], timeout: int, ctx: ExecCtx ) -> ToolResult: """跑 docker exec 子进程,带 cancel 协作 poll。 cancel 命中 / timeout 到 → Popen.kill() 杀 docker CLI 客户端; 容器内 server 端进程接受 limitation(见模块头注释)。 """ cancel_check = ctx.cancel_check try: proc = subprocess.Popen( argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding="utf-8", errors="replace", ) except FileNotFoundError as e: return ToolResult(content=f"[Error] docker CLI not found: {e}", exit_code=2) start = time.monotonic() cancel_hit = False timeout_hit = False stdout: str = "" stderr: str = "" while True: try: stdout, stderr = proc.communicate(timeout=0.5) break except subprocess.TimeoutExpired: if cancel_check is not None and cancel_check(): cancel_hit = True proc.kill() stdout, stderr = proc.communicate() break if time.monotonic() - start > timeout: timeout_hit = True proc.kill() stdout, stderr = proc.communicate() break if timeout_hit: return ToolResult( content=f"[Error] command timed out after {timeout}s", exit_code=124, ) if cancel_hit: return ToolResult( content="[Error] command cancelled by user", exit_code=130, ) parts: List[str] = [] if stdout: parts.append(f"[stdout]\n{stdout.rstrip()}") if stderr: parts.append(f"[stderr]\n{stderr.rstrip()}") parts.append(f"[exit {proc.returncode}]") return ToolResult(content="\n".join(parts), exit_code=proc.returncode)