259 lines
9.2 KiB
Python
259 lines
9.2 KiB
Python
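"""Sandbox pre-deployment reconciliation (`main.py sandbox check`).

Runs 5 independent probes; each prints `[ok]` / `[warn]` / `[err]`, and an
exit code is returned after the summary. Every item must be `[ok]` before the
service is opened to external users.

Probes, mapped to the §7.5 protocol:
  1. Docker daemon reachable -- required when ZCBOT_SANDBOX_BACKEND=docker
  2. `zcbot-sandbox:latest` image exists -- otherwise docker run fails with
     "Unable to find image" during pool.ensure
  3. `zcbot-sandbox-net` network exists -- harmless if missing (init_pool
     ensures it automatically), but checking early pre-warms it
  4. Image HOST_UID matches the host zcbot uid -- a mismatch causes EACCES
     when an exec'd process writes to /workspace
  5. user_root_base fs type supports quota -- §7.5 #4: xfs prjquota /
     ext4 project / zfs; otherwise "filling the shared fs between scans" can
     starve other users on the same node (an attacker fills the disk far
     faster than the application-level periodic scan runs)
"""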
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Tuple
|
|
|
|
from .pool import DEFAULT_IMAGE
|
|
from .network import NETWORK_NAME
|
|
|
|
|
|
# Status printers: plain `[ok]` / `[warn]` / `[err]` prefixes written with print()
# (no ANSI colours are emitted, and no click context is required).
|
|
def _ok(msg: str) -> None:
    """Print *msg* to stdout as a passing item, prefixed with ``[ok]``."""
    print(f"[ok] {msg}")
|
|
|
|
|
|
def _warn(msg: str) -> None:
    """Print *msg* to stdout as a non-blocking warning, prefixed with ``[warn]``."""
    print(f"[warn] {msg}")
|
|
|
|
|
|
def _err(msg: str) -> None:
    """Print *msg* to stdout as a blocking failure, prefixed with ``[err]``."""
    print(f"[err] {msg}")
|
|
|
|
|
|
def _run(argv, timeout: int = 10) -> Tuple[int, str, str]:
    """Run *argv* as a subprocess and return ``(returncode, stdout, stderr)``.

    This wrapper never raises: every failure mode is mapped to a return code
    so callers only have to branch on the returned tuple.

    Args:
        argv: Command and its arguments as a sequence of strings (no shell).
        timeout: Seconds to wait for the command before giving up.

    Returns:
        ``(rc, stdout, stderr)`` with both streams stripped of surrounding
        whitespace. Synthetic return codes produced here:
        127 = executable not found on PATH, 124 = timed out,
        1 = empty command or any other exception while launching.
    """
    # Guard first: indexing an empty argv would raise outside the try block.
    if not argv:
        return 1, "", "empty command"

    executable = argv[0]
    if shutil.which(executable) is None:
        return 127, "", f"{executable} not found in PATH"

    try:
        r = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return 124, "", f"timed out after {timeout}s"
    except Exception as e:
        # Deliberately broad: a probe must report a failure, not crash the CLI.
        return 1, "", f"{type(e).__name__}: {e}"
    return r.returncode, r.stdout.strip(), r.stderr.strip()
|
|
|
|
|
|
# -- 探测项 ------------------------------------------------
|
|
|
|
def check_docker_daemon() -> bool:
    """Probe 1: check that the docker CLI exists and the daemon answers.

    Runs ``docker version`` formatted to print only the server version.

    Returns:
        True when the command exits 0 with non-empty output; False otherwise,
        after printing an ``[err]`` line describing the cause.
    """
    rc, out, err = _run(["docker", "version", "--format", "{{.Server.Version}}"])
    if rc == 0 and out:
        _ok(f"docker daemon reachable (server={out})")
        return True

    if rc == 127:
        # 127 is the code _run returns when the executable is not on PATH.
        _err("docker CLI not found -- apt install docker.io / docker-ce")
    elif "permission denied" in err.lower():
        _err(f"docker daemon not reachable: {err} -- usermod -aG docker $USER + relogin")
    else:
        _err(f"docker daemon not reachable: {err or 'unknown'}")
    return False
|
|
|
|
|
|
def check_image_present() -> bool:
    """Probe 2: check that the sandbox image exists locally.

    The image name comes from ``ZCBOT_SANDBOX_IMAGE`` and falls back to
    ``DEFAULT_IMAGE``.

    Returns:
        True when ``docker image inspect`` exits 0; False otherwise, after
        printing an ``[err]`` line.
    """
    image = os.getenv("ZCBOT_SANDBOX_IMAGE", DEFAULT_IMAGE)
    rc, _, err = _run(["docker", "image", "inspect", image])
    if rc == 0:
        _ok(f"image present: {image}")
        return True

    if rc in (124, 127):
        # _run's synthetic codes (timeout / CLI missing): the inspect never
        # completed, so claiming "image not found" and suggesting a rebuild
        # would be misleading. Report the actual cause instead.
        _err(f"image check failed: {image} -- {err or 'unknown'}")
        return False

    _err(
        f"image not found: {image} -- "
        f"`docker build -f deploy/sandbox/Dockerfile "
        f"--build-arg HOST_UID=$(id -u) --build-arg HOST_GID=$(id -g) "
        f"-t {image} .`"
    )
    return False
|
|
|
|
|
|
def check_network_present() -> bool:
    """Probe 3: check that the sandbox docker network exists.

    Returns:
        Always True. A missing network only produces a ``[warn]`` line, so
        this probe never counts as a failure.
    """
    rc, _, _ = _run(["docker", "network", "inspect", NETWORK_NAME])
    if rc == 0:
        _ok(f"network present: {NETWORK_NAME}")
        return True

    _warn(
        f"network missing: {NETWORK_NAME} -- lifespan 启动会自动 ensure;"
        f"或手动 `docker network create --internal {NETWORK_NAME}`"
    )
    return True  # a warning is not a failure
|
|
|
|
|
|
def check_host_uid_alignment() -> bool:
    """Probe 4: check that the image's ``zcbot`` uid equals the host uid.

    The image uid is read by running ``id -u zcbot`` in a throwaway container
    (``docker run --rm --entrypoint id <image> -u zcbot``) and compared with
    ``os.getuid()``; this assumes the check is run as the account that runs
    the zcbot service. Per the original author's note, bind mounts expose host
    file ownership inside the container, so a mismatch makes writes to
    /workspace fail with EACCES.

    Returns:
        False only when both uids were obtained and they differ. Every
        inconclusive case (container could not run, non-integer output,
        Windows host) prints ``[warn]`` and returns True.
    """
    image = os.getenv("ZCBOT_SANDBOX_IMAGE", DEFAULT_IMAGE)
    rc, out, err = _run(
        ["docker", "run", "--rm", "--entrypoint", "id", image, "-u", "zcbot"]
    )
    if rc != 0:
        _warn(
            f"image uid check skipped: {err or 'unknown'} -- "
            "if image not built yet 先跑 build 再来"
        )
        return True

    try:
        image_uid = int(out)
    except ValueError:
        _warn(f"image uid unexpected output: {out!r}")
        return True

    if sys.platform == "win32":
        # os.getuid() does not exist on Windows, so the comparison is skipped.
        _warn(
            f"image zcbot uid={image_uid}; host uid check skipped on Windows "
            "(Linux 部署机上跑 check 才有意义)"
        )
        return True

    host_uid = os.getuid()  # type: ignore[attr-defined]
    if image_uid == host_uid:
        _ok(f"HOST_UID aligned: image zcbot uid={image_uid} == host uid={host_uid}")
        return True

    _err(
        f"HOST_UID mismatch: image zcbot uid={image_uid}, host uid={host_uid} -- "
        f"重 build 镜像 `docker build --build-arg HOST_UID={host_uid} ...`"
    )
    return False
|
|
|
|
|
|
def detect_fs_quota(target: Path) -> Tuple[str, str]:
    """Detect whether the filesystem holding *target* can enforce quotas.

    Shared by the CLI (which prints via ``_ok`` / ``_warn``) and, per the
    original author's note, by the web lifespan hook.

    Recognition rules, based on ``findmnt --target <target> -no FSTYPE,OPTIONS``:
      - xfs: mount options contain ``prjquota`` or ``pquota`` -> ok,
        otherwise warn (supported by the fs but not enabled)
      - ext4: mount options contain ``prjquota``, or both ``project`` and
        ``quota`` -> ok, otherwise warn
      - zfs: always ok (dataset quotas are configured via ``zfs set``)
      - btrfs: warn (qgroup configuration is considered too complex)
      - anything else (tmpfs, overlay, ...): warn

    Args:
        target: Path whose containing filesystem is inspected.

    Returns:
        ``(level, msg)`` where ``level`` is ``"ok"`` or ``"warn"``. This
        function never returns an error level, so it cannot block startup.
    """
    if sys.platform == "win32":
        return "warn", "fs quota check skipped on Windows (Linux 部署机才有意义)"

    # findmnt ships with util-linux on most Linux distributions.
    rc, out, err = _run([
        "findmnt", "--target", str(target), "-no", "FSTYPE,OPTIONS",
    ])
    if rc != 0 or not out:
        return "warn", (
            f"fs quota check skipped: cannot detect fs for {target} "
            f"({err or 'findmnt missing'})"
        )

    # Output is whitespace separated: first field FSTYPE, second OPTIONS.
    parts = out.split()
    fstype = parts[0].lower() if parts else ""
    options = parts[1] if len(parts) > 1 else ""
    opts = set(options.split(","))

    if fstype == "xfs":
        if "prjquota" in opts or "pquota" in opts:
            return "ok", f"fs quota: xfs with prjquota on {target}"
        return "warn", (
            f"fs quota: xfs on {target} but NO prjquota mount option -- "
            f"`sudo mount -o remount,prjquota <mountpoint>` + `xfs_quota -x ...`"
        )

    if fstype == "ext4":
        if "prjquota" in opts or ("project" in opts and "quota" in opts):
            return "ok", f"fs quota: ext4 with project quota on {target}"
        return "warn", (
            f"fs quota: ext4 on {target} but NO project quota option -- "
            f"`tune2fs -O project,quota <dev>` + remount + `quota -P`"
        )

    if fstype == "zfs":
        return "ok", f"fs quota: zfs on {target} (dataset quota via `zfs set quota=...`)"

    if fstype == "btrfs":
        return "warn", (
            f"fs quota: btrfs on {target} -- qgroup 配置复杂,生产部署"
            f"推荐 xfs prjquota;如必须用 btrfs 自行验 `btrfs qgroup`"
        )

    return "warn", (
        f"fs quota: {fstype or '<unknown>'} on {target} -- "
        f"非主流 quota-able 类型,外部用户开放前换 xfs/ext4/zfs 单独分区"
    )
|
|
|
|
|
|
def check_fs_quota_capable() -> bool:
    """Probe 5 (CLI entry): inspect the filesystem holding ``<workspace>/users``.

    Returns:
        Always True. The result is reported as ``[ok]`` or ``[warn]`` only;
        this probe never counts as a failure.
    """
    try:
        # Imported lazily and inside the try block: a failing import must
        # degrade to a warning like any other path-resolution problem,
        # otherwise the "always True" contract would be broken.
        from core.agent_builder import load_config, resolve_workspace

        cfg = load_config()
        workspace = resolve_workspace(None, cfg)
        target = (workspace / "users").resolve()
    except Exception as e:
        _warn(f"fs quota check: cannot resolve workspace path: {e}")
        return True

    level, msg = detect_fs_quota(target)
    if level == "ok":
        _ok(msg)
    else:
        _warn(msg)
    return True
|
|
|
|
|
|
# -- 汇总入口 ---------------------------------------------
|
|
|
|
# (label, function name) pairs in execution order. Functions are stored by
# name so run_sandbox_check resolves them on the module at call time.
CHECK_NAMES = [
    ("docker daemon", "check_docker_daemon"),
    ("image present", "check_image_present"),
    ("network present", "check_network_present"),
    ("HOST_UID alignment", "check_host_uid_alignment"),
    ("fs quota capable", "check_fs_quota_capable"),
]
|
|
|
|
|
|
def run_sandbox_check() -> int:
    """Run every probe in ``CHECK_NAMES`` and return a process exit code.

    Per the original author's note, the err/warn boundary is:
      - err  = root causes that make the docker backend fail fast at startup
               (daemon / image / HOST_UID)
      - warn = does not block startup but must be cleared before opening to
               external users (missing network / fs without quota)

    Probe functions are looked up by name on this module at call time rather
    than being stored as references, so that patching
    ``core.sandbox.check.check_xxx`` in unit tests affects this function.

    Returns:
        0 when every probe returned a truthy value (warnings included),
        1 when any probe returned a falsy value or raised.
    """
    print("--- sandbox deployment check ---\n")
    ok_count = 0
    module = sys.modules[__name__]
    for label, fn_name in CHECK_NAMES:
        fn = getattr(module, fn_name)
        try:
            if fn():
                ok_count += 1
        except Exception as e:
            # A crashing probe is reported and counted as failed, not fatal.
            _err(f"{label}: unexpected {type(e).__name__}: {e}")

    total = len(CHECK_NAMES)
    print()
    if ok_count == total:
        print(f"[summary] {ok_count}/{total} checks passed -- docker backend ready")
        return 0

    failed = total - ok_count
    print(
        f"[summary] {ok_count}/{total} passed, {failed} failed -- "
        f"修完上面的 [err] 项再启 docker backend"
    )
    return 1
|