# zcbot/core/executor_docker.py

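"""DockerExecutor: `shell` / `run_python` go through docker exec; everything else runs in-process (§7.5 #6).

Backend split (§7.5 #6 trust domains):
- host in-process: `read/write/edit/glob/grep/load_skill/web_*/seedream/seedance`
  These already hold credentials on the host (Bocha key / ARK key) or are validated by
  `paths.py::resolve_user_path` (the user-rooted security boundary already exists);
  moving them into the container gains nothing and costs ~200ms of exec overhead × N calls.
- container exec: `shell` / `run_python` -- these execute arbitrary model-generated code
  and must be isolated in a container.

Container admission (per call):
1. `pool.ensure(user_id)` -- get / start the `zcbot-sandbox-<uid>` container
   (already serialized by a per-user lock)
2. `docker exec --user 1000:1000 --workdir /workspace/<wd_name> <c> setsid bash -c '<cmd>'`
3. timeout reached -> kill the docker CLI client (Popen.kill())
4. finished -> `pool.mark_active(user_id)` refreshes the idle timer

run_python writes its tmp .py on the host side at
`<user_root>/.zcbot_tmp/<task_id>/<rand>.py` (the bind mount makes it automatically
visible inside the container at `/workspace/.zcbot_tmp/<task_id>/`) and unlinks it after
execution. The leading dot lets the `/v1/files` API filter it out naturally
(`web/app.py:169` startswith(".")), so the user's view is not polluted.

Cancel limitation (accepted for the first version):
- After the docker exec client disconnects, the server-side process inside the container
  is **not** terminated as a result -- this is by Docker's design.
- The first version only kills the docker CLI (Popen.kill()); processes left behind in
  the container are cleaned up by the 5-minute idle reaper / the `rm -f` on the next
  ensure.
- Upgrade trigger (§7.5 #3 PGID protocol): users reporting "cancelled but still burning
  CPU" / processes piling up in the container after repeated cancels -> enable the
  "ZCBOT_EXEC_ID env + write PGID to a file + second exec to kill" protocol.
"""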
from __future__ import annotations
import os
import secrets
import subprocess
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
from uuid import UUID
from .executor import ExecCtx, Executor, ToolResult
from .executor_host import HostExecutor
from .sandbox import SandboxPool
# Tool names that DockerExecutor.call_tool runs inside the sandbox container;
# every other tool name is delegated to the host executor.
CONTAINER_TOOLS = frozenset({"shell", "run_python"})
# Non-root user inside the container: aligned with the default values of the
# Dockerfile HOST_UID/HOST_GID build-args. If the zcbot account's uid on the
# deployment host is not 1000, pass HOST_UID through at image build time and change
# the env `ZCBOT_SANDBOX_EXEC_USER` here to match (see the "Sandbox deployment"
# section of RUN.md).
DEFAULT_EXEC_USER = "1000:1000"
# Host-side tmp script directory (a dotfile inside user_root, hidden by the
# /v1/files API).
TMP_SUBDIR = ".zcbot_tmp"
class DockerExecutor(Executor):
    """Combine a HostExecutor with `docker exec` dispatch for shell/run_python.

    The host backend still provides the schema list and executes most tools.
    This class only takes over when the tool name is in CONTAINER_TOOLS, in
    which case the command runs through `docker exec` in the per-user
    container obtained from the sandbox pool.
    """

    def __init__(
        self,
        host: HostExecutor,
        pool: SandboxPool,
        user_id: UUID,
        user_root: Path,
        working_dir: Path,
    ) -> None:
        """Store collaborators and derive the in-container working directory.

        Args:
            host: Executor that handles every non-container tool and owns the
                tool schemas.
            pool: Sandbox pool used to obtain the user's container name and to
                refresh its activity marker.
            user_id: Owner of the sandbox container.
            user_root: Host directory of the user; resolved to an absolute path.
            working_dir: Host working directory; resolved to an absolute path.
        """
        self.host = host
        self.pool = pool
        self.user_id = user_id
        self.user_root = user_root.resolve()
        self.working_dir = working_dir.resolve()
        # Matching in-container path: /workspace/<wd_name>
        try:
            wd_rel = self.working_dir.relative_to(self.user_root).as_posix()
        except ValueError:
            # working_dir is not under user_root -- defensive fallback, the
            # normal path never gets here.
            wd_rel = "."
        # relative_to() yields "." when working_dir == user_root; map that to
        # plain "/workspace" instead of "/workspace/.".
        if wd_rel == ".":
            self.container_workdir = "/workspace"
        else:
            self.container_workdir = "/workspace/" + wd_rel
        self.exec_user = os.getenv("ZCBOT_SANDBOX_EXEC_USER", DEFAULT_EXEC_USER)

    # ── Executor interface ───────────────────────────────────
    def has_tool(self, name: str) -> bool:
        """Return whether the host executor exposes the tool `name`."""
        return self.host.has_tool(name)

    def schemas(self) -> List[Dict[str, Any]]:
        """Return the tool schemas exactly as reported by the host executor."""
        return self.host.schemas()

    def call_tool(self, name: str, args: Dict[str, Any], ctx: ExecCtx) -> ToolResult:
        """Run tool `name`, in the container for CONTAINER_TOOLS, else on the host.

        Any exception raised while running a container tool is converted into
        a ToolResult with exit_code 1 rather than propagated.
        """
        if name not in CONTAINER_TOOLS:
            return self.host.call_tool(name, args, ctx)
        if not self.host.has_tool(name):
            # E.g. caps.enable_run_python=False: the host did not install
            # run_python, so its schema is not exposed either.
            return ToolResult(content=f"[Error] unknown tool: {name}", exit_code=2)
        try:
            if name == "shell":
                return self._exec_shell(args, ctx)
            if name == "run_python":
                return self._exec_python(args, ctx)
        except Exception as e:
            # Tool-call boundary: report the failure as a tool result.
            return ToolResult(
                content=f"[Error executing {name} via docker] {type(e).__name__}: {e}",
                exit_code=1,
            )
        return ToolResult(content=f"[Error] unhandled container tool: {name}", exit_code=2)

    @staticmethod
    def _parse_timeout(raw: Any, default: int) -> Optional[int]:
        """Convert a `timeout` argument to whole seconds.

        Falsy values (missing, None, 0, "") select `default`. Returns None
        when the value cannot be converted to an int.
        """
        try:
            return int(raw or default)
        except (TypeError, ValueError, OverflowError):
            return None

    # ── shell ────────────────────────────────────────────────
    def _exec_shell(self, args: Dict[str, Any], ctx: ExecCtx) -> ToolResult:
        """Run `args["command"]` with `setsid bash -c` inside the user's container.

        Returns an exit_code 2 result for a missing/blank command or an
        unparseable timeout; the default timeout is 60 seconds.
        """
        cmd = args.get("command")
        if not isinstance(cmd, str) or not cmd.strip():
            return ToolResult(
                content="[Error] bad arguments to shell: command must be non-empty string",
                exit_code=2,
            )
        timeout = self._parse_timeout(args.get("timeout"), 60)
        if timeout is None:
            return ToolResult(
                content="[Error] bad arguments to shell: timeout must be a number of seconds",
                exit_code=2,
            )
        container = self.pool.ensure(self.user_id)
        argv = self._docker_exec_argv(container) + ["setsid", "bash", "-c", cmd]
        result = self._run_subprocess(argv, timeout=timeout, ctx=ctx)
        self.pool.mark_active(self.user_id)
        return result

    # ── run_python ───────────────────────────────────────────
    def _exec_python(self, args: Dict[str, Any], ctx: ExecCtx) -> ToolResult:
        """Write `args["code"]` to a tmp script and run it with python in the container.

        The script is written on the host at
        `<user_root>/.zcbot_tmp/<task_id>/<rand>.py` and addressed inside the
        container as `/workspace/.zcbot_tmp/<task_id>/<rand>.py`. It is
        removed afterwards whether or not execution succeeded. The default
        timeout is 120 seconds.
        """
        code = args.get("code")
        if not isinstance(code, str):
            return ToolResult(
                content="[Error] bad arguments to run_python: code must be string",
                exit_code=2,
            )
        timeout = self._parse_timeout(args.get("timeout"), 120)
        if timeout is None:
            return ToolResult(
                content="[Error] bad arguments to run_python: timeout must be a number of seconds",
                exit_code=2,
            )
        tmp_root = self.user_root / TMP_SUBDIR / str(ctx.task_id)
        tmp_root.mkdir(parents=True, exist_ok=True)
        # Millisecond timestamp + random suffix keeps script names unique.
        rand_name = f"{int(time.time() * 1000)}-{secrets.token_hex(4)}.py"
        host_script = tmp_root / rand_name
        container_script = f"/workspace/{TMP_SUBDIR}/{ctx.task_id}/{rand_name}"
        try:
            # Written inside the try so a failed/partial write is cleaned up too.
            host_script.write_text(code, encoding="utf-8")
            container = self.pool.ensure(self.user_id)
            argv = self._docker_exec_argv(
                container,
                extra_env={
                    "PYTHONIOENCODING": "utf-8",
                    "PYTHONPATH": "/workspace",
                },
            ) + ["setsid", "python", container_script]
            result = self._run_subprocess(argv, timeout=timeout, ctx=ctx)
            self.pool.mark_active(self.user_id)
            return result
        finally:
            # Best-effort cleanup: the script may already be gone.
            try:
                host_script.unlink()
            except OSError:
                pass

    # ── helpers ──────────────────────────────────────────────
    def _docker_exec_argv(
        self, container: str, extra_env: Optional[Dict[str, str]] = None
    ) -> List[str]:
        """Build the `docker exec` argv prefix up to and including the container name.

        Args:
            container: Name of the target container.
            extra_env: Optional environment variables, each passed as `-e K=V`.
        """
        argv = [
            "docker", "exec",
            "--user", self.exec_user,
            "--workdir", self.container_workdir,
        ]
        for key, value in (extra_env or {}).items():
            argv.extend(["-e", f"{key}={value}"])
        argv.append(container)
        return argv

    def _run_subprocess(
        self, argv: List[str], timeout: int, ctx: ExecCtx
    ) -> ToolResult:
        """Run the docker exec subprocess, polling cooperatively for cancel.

        On cancel or timeout the docker CLI client is killed with
        Popen.kill(); what happens to the process inside the container is the
        accepted limitation described in the module docstring.

        Returns:
            exit_code 124 on timeout, 130 on cancel, 2 when the docker CLI is
            missing; otherwise the captured stdout/stderr plus the CLI's
            return code.
        """
        cancel_check = ctx.cancel_check
        try:
            proc = subprocess.Popen(
                argv,
                # Do not let the child inherit the host process's stdin.
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                encoding="utf-8",
                errors="replace",
            )
        except FileNotFoundError as e:
            return ToolResult(content=f"[Error] docker CLI not found: {e}", exit_code=2)

        start = time.monotonic()
        cancel_hit = False
        timeout_hit = False
        try:
            while True:
                try:
                    # Short timeout so cancel/deadline are re-checked twice a second.
                    stdout, stderr = proc.communicate(timeout=0.5)
                    break
                except subprocess.TimeoutExpired:
                    if cancel_check is not None and cancel_check():
                        cancel_hit = True
                    elif time.monotonic() - start > timeout:
                        timeout_hit = True
                    else:
                        continue
                    proc.kill()
                    stdout, stderr = proc.communicate()
                    break
        except BaseException:
            # Unexpected failure (e.g. cancel_check raised): do not leave the
            # docker CLI running or unreaped with open pipes.
            proc.kill()
            proc.communicate()
            raise

        if timeout_hit:
            return ToolResult(
                content=f"[Error] command timed out after {timeout}s",
                exit_code=124,
            )
        if cancel_hit:
            return ToolResult(
                content="[Error] command cancelled by user",
                exit_code=130,
            )
        parts: List[str] = []
        if stdout:
            parts.append(f"[stdout]\n{stdout.rstrip()}")
        if stderr:
            parts.append(f"[stderr]\n{stderr.rstrip()}")
        parts.append(f"[exit {proc.returncode}]")
        return ToolResult(content="\n".join(parts), exit_code=proc.returncode)