zcbot/core/sandbox/check.py

259 lines
9.2 KiB
Python

"""Sandbox 部署前置对账(`main.py sandbox check`)。
跑 5 项独立探测,各自打 `[ok]` / `[warn]` / `[err]`,汇总后返 exit code。
外部用户开放前所有项必须 `[ok]`。
探测项与 §7.5 协议对应:
1. Docker daemon 可达 -- ZCBOT_SANDBOX_BACKEND=docker 启用必备
2. `zcbot-sandbox:latest` 镜像存在 -- 缺则 pool.ensure 时 docker run 报 "Unable to find image"
3. `zcbot-sandbox-net` network 存在 -- 缺也无所谓(init_pool 内自动 ensure),但提前预热
4. 镜像 HOST_UID 与 host zcbot uid 对齐 -- 错配会让 exec 进来后 write /workspace 时 EACCES
5. user_root_base fs 类型可 quota -- §7.5 #4,xfs prjquota / ext4 project / zfs;否则
"扫描间隙打满共享 fs"会拖死同节点其他 user(攻击者写满速度 >> 应用层周期扫描)
"""
from __future__ import annotations
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Tuple
from .pool import DEFAULT_IMAGE
from .network import NETWORK_NAME
# Status-line helpers: plain print() output tagged [ok] / [warn] / [err]
# (no ANSI colour codes are emitted and no click context is required).
def _ok(msg: str) -> None:
print(f"[ok] {msg}")
def _warn(msg: str) -> None:
print(f"[warn] {msg}")
def _err(msg: str) -> None:
print(f"[err] {msg}")
def _run(argv, timeout: int = 10) -> Tuple[int, str, str]:
"""统一 subprocess.run wrapper。docker CLI 不存在 → returncode=127,stderr 给原因。"""
if shutil.which(argv[0]) is None:
return 127, "", f"{argv[0]} not found in PATH"
try:
r = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
return r.returncode, r.stdout.strip(), r.stderr.strip()
except subprocess.TimeoutExpired:
return 124, "", f"timed out after {timeout}s"
except Exception as e:
return 1, "", f"{type(e).__name__}: {e}"
# -- 探测项 ------------------------------------------------
def check_docker_daemon() -> bool:
    """Check that the docker CLI exists and can talk to the daemon.

    Asks ``docker version`` for the server version only; a zero exit code with
    non-empty output counts as reachable.

    Returns:
        True when the daemon answered, False otherwise (an ``[err]`` line with
        a remediation hint is printed for the failure cases).
    """
    status, server_version, stderr = _run(
        ["docker", "version", "--format", "{{.Server.Version}}"]
    )
    if status == 0 and server_version:
        _ok(f"docker daemon reachable (server={server_version})")
        return True

    # Pick the most specific diagnosis, then report it once.
    if status == 127:
        problem = "docker CLI not found -- apt install docker.io / docker-ce"
    elif "permission denied" in stderr.lower():
        problem = f"docker daemon not reachable: {stderr} -- usermod -aG docker $USER + relogin"
    else:
        problem = f"docker daemon not reachable: {stderr or 'unknown'}"
    _err(problem)
    return False
def check_image_present() -> bool:
    """Check that the sandbox image exists in the local docker image store.

    The image name comes from ``ZCBOT_SANDBOX_IMAGE`` and falls back to
    ``DEFAULT_IMAGE``.

    Returns:
        True when ``docker image inspect`` exits 0; otherwise False after
        printing an ``[err]`` line that includes the build command.
    """
    image = os.environ.get("ZCBOT_SANDBOX_IMAGE", DEFAULT_IMAGE)
    status = _run(["docker", "image", "inspect", image])[0]
    if status != 0:
        message = (
            f"image not found: {image} -- "
            f"`docker build -f deploy/sandbox/Dockerfile "
            f"--build-arg HOST_UID=$(id -u) --build-arg HOST_GID=$(id -g) "
            f"-t {image} .`"
        )
        _err(message)
        return False

    _ok(f"image present: {image}")
    return True
def check_network_present() -> bool:
    """Check whether the sandbox docker network already exists.

    A missing network is reported as ``[warn]`` only (the printed hint says it
    is ensured automatically at lifespan startup), so this probe never fails.

    Returns:
        Always True.
    """
    status, _stdout, _stderr = _run(["docker", "network", "inspect", NETWORK_NAME])
    if status != 0:
        _warn(
            f"network missing: {NETWORK_NAME} -- lifespan 启动会自动 ensure;"
            f"或手动 `docker network create --internal {NETWORK_NAME}`"
        )
    else:
        _ok(f"network present: {NETWORK_NAME}")
    # A warning does not count as a failure.
    return True
def check_host_uid_alignment() -> bool:
    """Verify the image's ``zcbot`` uid equals the uid running this check.

    Runs ``id -u zcbot`` in a throwaway container of the sandbox image
    (``docker run --rm --entrypoint id <image> -u zcbot``) and compares the
    result with ``os.getuid()``.  Per the original notes, the workspace is
    bind-mounted, so an image built without ``HOST_UID`` (container uid
    defaulting to 1000) on a host whose service account has another uid makes
    writes to /workspace fail with EACCES.  The comparison assumes this
    subcommand is run as the zcbot service user.

    Returns:
        False only on a confirmed uid mismatch.  True when the uids match, and
        also when the probe cannot be completed (docker run failed, output is
        not an integer, or the host is Windows) -- those cases print ``[warn]``.
    """
    image = os.environ.get("ZCBOT_SANDBOX_IMAGE", DEFAULT_IMAGE)
    probe_cmd = ["docker", "run", "--rm", "--entrypoint", "id", image, "-u", "zcbot"]
    status, stdout, stderr = _run(probe_cmd)

    if status != 0:
        _warn(
            f"image uid check skipped: {stderr or 'unknown'} -- "
            f"if image not built yet 先跑 build 再来"
        )
        return True

    try:
        image_uid = int(stdout)
    except ValueError:
        _warn(f"image uid unexpected output: {stdout!r}")
        return True

    # os.getuid() does not exist on Windows, so the comparison is skipped there.
    if sys.platform == "win32":
        _warn(
            f"image zcbot uid={image_uid}; host uid check skipped on Windows "
            f"(Linux 部署机上跑 check 才有意义)"
        )
        return True

    host_uid = os.getuid()  # type: ignore[attr-defined]
    if image_uid != host_uid:
        _err(
            f"HOST_UID mismatch: image zcbot uid={image_uid}, host uid={host_uid} -- "
            f"重 build 镜像 `docker build --build-arg HOST_UID={host_uid} ...`"
        )
        return False

    _ok(f"HOST_UID aligned: image zcbot uid={image_uid} == host uid={host_uid}")
    return True
def detect_fs_quota(target: Path) -> Tuple[str, str]:
    """Report whether the filesystem holding *target* can enforce quotas.

    Shared by the CLI check and the lifespan hook; the caller decides how to
    print the result.  The level is never "err": a missing quota setup must
    not block web startup.

    Recognition rules (filesystem type and mount options from ``findmnt``):
      - xfs:   ``prjquota`` or ``pquota`` option -> ok, otherwise warn
               (supported by the fs but not enabled).
      - ext4:  ``prjquota``, or both ``project`` and ``quota`` -> ok,
               otherwise warn.
      - zfs:   always ok (dataset quotas are set via ``zfs set``; not
               inspected here).
      - btrfs: always warn (qgroup setup is left to the operator).
      - anything else (tmpfs, overlay, ...): warn.

    Args:
        target: Path whose filesystem is inspected.  It does not have to
            exist yet; the nearest existing ancestor is probed instead.

    Returns:
        ``(level, msg)`` where ``level`` is ``"ok"`` or ``"warn"`` and ``msg``
        is a human-readable explanation that always names *target*.
    """
    if sys.platform == "win32":
        return "warn", "fs quota check skipped on Windows (Linux 部署机才有意义)"

    # findmnt cannot resolve a path that does not exist yet (e.g. a users/
    # directory that has not been created), so walk up to the closest
    # existing ancestor -- that is the filesystem the directory will live on.
    probe = Path(target)
    while not os.path.exists(probe) and probe != probe.parent:
        probe = probe.parent

    # findmnt ships with util-linux on most Linux distributions.
    rc, out, err = _run([
        "findmnt", "--target", str(probe), "-no", "FSTYPE,OPTIONS",
    ])
    if rc != 0 or not out:
        # A missing findmnt binary always yields a non-empty reason from _run,
        # so an empty reason means findmnt ran but produced no mount entry.
        return "warn", (
            f"fs quota check skipped: cannot detect fs for {target} "
            f"({err or 'findmnt returned no mount entry'})"
        )

    # Output is "<fstype> <comma-separated options>".
    parts = out.split()
    fstype = parts[0].lower() if parts else ""
    options = parts[1] if len(parts) > 1 else ""
    opts = set(options.split(","))

    if fstype == "xfs":
        if "prjquota" in opts or "pquota" in opts:
            return "ok", f"fs quota: xfs with prjquota on {target}"
        return "warn", (
            f"fs quota: xfs on {target} but NO prjquota mount option -- "
            f"`sudo mount -o remount,prjquota <mountpoint>` + `xfs_quota -x ...`"
        )

    if fstype == "ext4":
        if "prjquota" in opts or ("project" in opts and "quota" in opts):
            return "ok", f"fs quota: ext4 with project quota on {target}"
        return "warn", (
            f"fs quota: ext4 on {target} but NO project quota option -- "
            f"`tune2fs -O project,quota <dev>` + remount + `quota -P`"
        )

    if fstype == "zfs":
        return "ok", f"fs quota: zfs on {target} (dataset quota via `zfs set quota=...`)"

    if fstype == "btrfs":
        return "warn", (
            f"fs quota: btrfs on {target} -- qgroup 配置复杂,生产部署"
            f"推荐 xfs prjquota;如必须用 btrfs 自行验 `btrfs qgroup`"
        )

    return "warn", (
        f"fs quota: {fstype or '<unknown>'} on {target} -- "
        f"非主流 quota-able 类型,外部用户开放前换 xfs/ext4/zfs 单独分区"
    )
def check_fs_quota_capable() -> bool:
    """CLI probe: report quota capability of the fs holding ``<workspace>/users``.

    Resolves the workspace from the loaded config, then delegates to
    ``detect_fs_quota`` and prints its verdict as ``[ok]`` or ``[warn]``.

    Returns:
        Always True -- filesystem quota problems are warnings, never errors.
        This includes failing to load the config helpers or to resolve the
        workspace path, which is reported as ``[warn]``.
    """
    try:
        # Imported lazily, and inside the try so that an import failure is
        # downgraded to a warning like every other resolution failure instead
        # of escaping and being counted as an [err] by the runner.
        from core.agent_builder import load_config, resolve_workspace

        cfg = load_config()
        workspace = resolve_workspace(None, cfg)
        target = (workspace / "users").resolve()
    except Exception as e:
        _warn(f"fs quota check: cannot resolve workspace path: {e}")
        return True

    level, msg = detect_fs_quota(target)
    if level == "ok":
        _ok(msg)
    else:
        _warn(msg)
    return True
# -- 汇总入口 ---------------------------------------------
# Ordered (label, function name) pairs for the probes run by
# run_sandbox_check().  Functions are stored by name, not by reference, and
# are looked up on this module at call time.
CHECK_NAMES = [
    ("docker daemon", "check_docker_daemon"),
    ("image present", "check_image_present"),
    ("network present", "check_network_present"),
    ("HOST_UID alignment", "check_host_uid_alignment"),
    ("fs quota capable", "check_fs_quota_capable"),
]
def run_sandbox_check() -> int:
    """Run every probe in ``CHECK_NAMES`` and return a process exit code.

    A probe passes when it returns a truthy value; probes that only warn
    return True and therefore pass.  A probe that raises is reported as
    ``[err]`` and counted as failed.

    Probe functions are resolved by name on this module at call time rather
    than captured up front, so patching ``core.sandbox.check.check_xxx`` in
    tests takes effect here.

    Returns:
        0 when all probes passed (ok or warn only), 1 when at least one failed.
    """
    print("--- sandbox deployment check ---\n")

    this_module = sys.modules[__name__]
    passed = 0
    for label, fn_name in CHECK_NAMES:
        probe = getattr(this_module, fn_name)
        try:
            succeeded = probe()
        except Exception as exc:
            _err(f"{label}: unexpected {type(exc).__name__}: {exc}")
            continue
        if succeeded:
            passed += 1

    total = len(CHECK_NAMES)
    failed = total - passed
    print()
    if failed:
        print(
            f"[summary] {passed}/{total} passed, {failed} failed -- "
            f"修完上面的 [err] 项再启 docker backend"
        )
        return 1

    print(f"[summary] {passed}/{total} checks passed -- docker backend ready")
    return 0