"""Sandbox 部署前置对账(`main.py sandbox check`)。 跑 5 项独立探测,各自打 `[ok]` / `[warn]` / `[err]`,汇总后返 exit code。 外部用户开放前所有项必须 `[ok]`。 探测项与 §7.5 协议对应: 1. Docker daemon 可达 -- ZCBOT_SANDBOX_BACKEND=docker 启用必备 2. `zcbot-sandbox:latest` 镜像存在 -- 缺则 pool.ensure 时 docker run 报 "Unable to find image" 3. `zcbot-sandbox-net` network 存在 -- 缺也无所谓(init_pool 内自动 ensure),但提前预热 4. 镜像 HOST_UID 与 host zcbot uid 对齐 -- 错配会让 exec 进来后 write /workspace 时 EACCES 5. user_root_base fs 类型可 quota -- §7.5 #4,xfs prjquota / ext4 project / zfs;否则 "扫描间隙打满共享 fs"会拖死同节点其他 user(攻击者写满速度 >> 应用层周期扫描) """ from __future__ import annotations import os import shutil import subprocess import sys from pathlib import Path from typing import Tuple from .pool import DEFAULT_IMAGE from .network import NETWORK_NAME # 颜色用 ANSI(终端不支持的环境自动退化为 plain;click.echo 不强求 click context) def _ok(msg: str) -> None: print(f"[ok] {msg}") def _warn(msg: str) -> None: print(f"[warn] {msg}") def _err(msg: str) -> None: print(f"[err] {msg}") def _run(argv, timeout: int = 10) -> Tuple[int, str, str]: """统一 subprocess.run wrapper。docker CLI 不存在 → returncode=127,stderr 给原因。""" if shutil.which(argv[0]) is None: return 127, "", f"{argv[0]} not found in PATH" try: r = subprocess.run(argv, capture_output=True, text=True, timeout=timeout) return r.returncode, r.stdout.strip(), r.stderr.strip() except subprocess.TimeoutExpired: return 124, "", f"timed out after {timeout}s" except Exception as e: return 1, "", f"{type(e).__name__}: {e}" # -- 探测项 ------------------------------------------------ def check_docker_daemon() -> bool: rc, out, err = _run(["docker", "version", "--format", "{{.Server.Version}}"]) if rc == 0 and out: _ok(f"docker daemon reachable (server={out})") return True if rc == 127: _err("docker CLI not found -- apt install docker.io / docker-ce") elif "permission denied" in err.lower(): _err(f"docker daemon not reachable: {err} -- usermod -aG docker $USER + relogin") else: _err(f"docker daemon not reachable: {err or 'unknown'}") return False def check_image_present() -> bool: image = os.getenv("ZCBOT_SANDBOX_IMAGE", DEFAULT_IMAGE) rc, _, err = _run(["docker", "image", "inspect", image]) if rc == 0: _ok(f"image present: {image}") return True _err( f"image not found: {image} -- " f"`docker build -f deploy/sandbox/Dockerfile " f"--build-arg HOST_UID=$(id -u) --build-arg HOST_GID=$(id -g) " f"-t {image} .`" ) return False def check_network_present() -> bool: rc, _, _ = _run(["docker", "network", "inspect", NETWORK_NAME]) if rc == 0: _ok(f"network present: {NETWORK_NAME}") return True _warn( f"network missing: {NETWORK_NAME} -- lifespan 启动会自动 ensure;" f"或手动 `docker network create --internal {NETWORK_NAME}`" ) return True # warn 不算失败 def check_host_uid_alignment() -> bool: """镜像内 zcbot 用户 uid 与 host 当前 uid 对齐。 bind mount 让 host fs owner 直接落进容器;镜像 build 时若漏传 `HOST_UID`, 容器内默 uid=1000,host 实际跑 zcbot 服务的账号若 uid≠1000 → exec 写 /workspace 全 EACCES。这里用 `docker run --rm --entrypoint id -u zcbot` 拿镜像 uid, 与 host `os.getuid()` 比对(假设 zcbot 用户跑 check 子命令)。 """ image = os.getenv("ZCBOT_SANDBOX_IMAGE", DEFAULT_IMAGE) rc, out, err = _run( ["docker", "run", "--rm", "--entrypoint", "id", image, "-u", "zcbot"] ) if rc != 0: _warn( f"image uid check skipped: {err or 'unknown'} -- " f"if image not built yet 先跑 build 再来" ) return True try: image_uid = int(out) except ValueError: _warn(f"image uid unexpected output: {out!r}") return True if sys.platform == "win32": _warn( f"image zcbot uid={image_uid}; host uid check skipped on Windows " f"(Linux 部署机上跑 check 才有意义)" ) return True host_uid = os.getuid() # type: ignore[attr-defined] if image_uid == host_uid: _ok(f"HOST_UID aligned: image zcbot uid={image_uid} == host uid={host_uid}") return True _err( f"HOST_UID mismatch: image zcbot uid={image_uid}, host uid={host_uid} -- " f"重 build 镜像 `docker build --build-arg HOST_UID={host_uid} ...`" ) return False def detect_fs_quota(target: Path) -> Tuple[str, str]: """探测 target 所在 fs 是否可 quota,返 (level, msg)。 level ∈ {"ok", "warn"} —— fs quota 永不视为 err(不阻塞 web 启动)。 给 CLI 与 lifespan 共用 —— CLI 走 _ok/_warn 打印,lifespan 走 print。 识别: - xfs:mount options 含 `prjquota` 或 `pquota` → ok;否则 warn(fs 支持但未 enable) - ext4:mount options 含 `prjquota` 或 `project,quota` → ok - zfs:任何 → ok(dataset quota 在 zfs set 层,这里不深入) - btrfs:警告 quota 群组复杂 - tmpfs / overlay / 其他:warn(典型 Docker-in-Docker 或本地 dev,生产部署不应该) """ if sys.platform == "win32": return "warn", "fs quota check skipped on Windows (Linux 部署机才有意义)" # findmnt 在多数 Linux 发行版自带(util-linux) rc, out, err = _run([ "findmnt", "--target", str(target), "-no", "FSTYPE,OPTIONS", ]) if rc != 0 or not out: return "warn", ( f"fs quota check skipped: cannot detect fs for {target} " f"({err or 'findmnt missing'})" ) parts = out.split() fstype = parts[0].lower() if parts else "" options = parts[1] if len(parts) > 1 else "" opts = set(options.split(",")) if fstype == "xfs": if "prjquota" in opts or "pquota" in opts: return "ok", f"fs quota: xfs with prjquota on {target}" return "warn", ( f"fs quota: xfs on {target} but NO prjquota mount option -- " f"`sudo mount -o remount,prjquota ` + `xfs_quota -x ...`" ) if fstype == "ext4": if "prjquota" in opts or ("project" in opts and "quota" in opts): return "ok", f"fs quota: ext4 with project quota on {target}" return "warn", ( f"fs quota: ext4 on {target} but NO project quota option -- " f"`tune2fs -O project,quota ` + remount + `quota -P`" ) if fstype == "zfs": return "ok", f"fs quota: zfs on {target} (dataset quota via `zfs set quota=...`)" if fstype == "btrfs": return "warn", ( f"fs quota: btrfs on {target} -- qgroup 配置复杂,生产部署" f"推荐 xfs prjquota;如必须用 btrfs 自行验 `btrfs qgroup`" ) return "warn", ( f"fs quota: {fstype or ''} on {target} -- " f"非主流 quota-able 类型,外部用户开放前换 xfs/ext4/zfs 单独分区" ) def check_fs_quota_capable() -> bool: """CLI 入口:探测 workspace/users/ 所在 fs。返 True(永不 err)。""" from core.agent_builder import load_config, resolve_workspace try: cfg = load_config() workspace = resolve_workspace(None, cfg) target = (workspace / "users").resolve() except Exception as e: _warn(f"fs quota check: cannot resolve workspace path: {e}") return True level, msg = detect_fs_quota(target) if level == "ok": _ok(msg) else: _warn(msg) return True # -- 汇总入口 --------------------------------------------- CHECK_NAMES = [ ("docker daemon", "check_docker_daemon"), ("image present", "check_image_present"), ("network present", "check_network_present"), ("HOST_UID alignment", "check_host_uid_alignment"), ("fs quota capable", "check_fs_quota_capable"), ] def run_sandbox_check() -> int: """跑所有探测,返 exit code(0=全 ok 或仅 warn;1=有 err)。 err vs warn 分界: - err = docker backend 启动会 fail-fast 的根因(daemon / 镜像 / HOST_UID) - warn = 不阻塞启动但外部用户开放前要清(network 缺 / fs 不可 quota) 通过模块全局 lookup 拿函数引用(不固化进 CHECKS 元组),让 unittest patch `core.sandbox.check.check_xxx` 对本函数生效。 """ print("--- sandbox deployment check ---\n") ok_count = 0 module = sys.modules[__name__] for label, fn_name in CHECK_NAMES: fn = getattr(module, fn_name) try: if fn(): ok_count += 1 except Exception as e: _err(f"{label}: unexpected {type(e).__name__}: {e}") total = len(CHECK_NAMES) print() if ok_count == total: print(f"[summary] {ok_count}/{total} checks passed -- docker backend ready") return 0 failed = total - ok_count print( f"[summary] {ok_count}/{total} passed, {failed} failed -- " f"修完上面的 [err] 项再启 docker backend" ) return 1