diff --git a/PROGRESS.md b/PROGRESS.md index 17279a2..97e8ebb 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -21,6 +21,12 @@ ## 已完成关键能力 +### 2026-06-14 / sandbox 容器加 `--shm-size`:修 mmdc 渲 mermaid 挂超时 + +- 实测一个"生图测试"任务(`caoqianming@foxmail.com`)对话:模型裸调 `mmdc` 渲 mermaid,自造的 puppeteer config 漏了 `--disable-dev-shm-usage`,chromium 用 64MB 的 `/dev/shm` 起不来 → 连试 6 次全超时,烧约 120k token 才绕道 mermaid.ink 出了个 SVG。根因:`pool.py` 的 `docker run` 没传 `--shm-size`,容器 `/dev/shm` = docker 默认 64MB(镜像备的 `/sandbox/puppeteer-config.json` 虽有 `--disable-dev-shm-usage`,但模型不一定用那份;且 `mmdc` 不读 `MERMAID_PUPPETEER_CONFIG` env)。 +- 修法(只做最小 infra,不动模型侧):`docker run` 加 `--shm-size`(`DEFAULT_SHM_SIZE=512m`,env `ZCBOT_SANDBOX_SHM_SIZE` / yaml `sandbox.shm_size` 可配,优先级同 memory/cpus)。从根上让任何 chromium 路径都不再挂,连模型自造的漏 flag config 也能跑。已 running 旧容器需重启 web + idle 回收后新起才带。 +- 实测脚本 `deploy/sandbox/probe_mermaid.sh`(区分 chromium 缺包 vs 纯 shm 超时);诊断脚本 `scripts/diag_dump_task.py`(按 email+任务名 dump 对话)。改动文件:`core/sandbox/pool.py`、`config/agent.yaml`、`RUN.md`。bump 0.12.4 → 0.12.5。 + ### 2026-06-13 / 模型选择瘦身:对话模型常驻 + 生图/生视频收进 ⚙ 弹层 - `#chat-meta` 右侧原三个带标签下拉(模型/生图/生视频)占满整行。改为**高频的对话模型下拉常驻**(一眼可见当前模型、直接切),**低频的生图/生视频收进一个「⚙ 媒体」弹层**(fixed 定位逃出 pane overflow,点开才渲染 select)。meta 行从"3 下拉"降到"1 下拉 + 1 齿轮"。 diff --git a/RUN.md b/RUN.md index a8d6a81..2a78260 100644 --- a/RUN.md +++ b/RUN.md @@ -396,6 +396,7 @@ sudo -u zcbot docker network create --internal zcbot-sandbox-net # ZCBOT_SANDBOX_MEMORY=2g # ZCBOT_SANDBOX_CPUS=1.0 # ZCBOT_SANDBOX_PIDS_LIMIT=256 +# ZCBOT_SANDBOX_SHM_SIZE=512m # chromium/mmdc 渲 mermaid 的 /dev/shm(docker 默 64MB 不够会挂超时) # PG 实际 IP,逗号分隔。defense-in-depth ── 即便落内网三段(§7.5 #1), # init.sh 再加一遍 DROP 规则。生产部署必填。 ZCBOT_PG_IPS=10.1.2.3,10.1.2.4 @@ -694,6 +695,7 @@ sudo xfs_quota -x -c "limit -p bhard=10g zcbot_" /opt | 镜像 build apt 报 `OpenSSL error: ... 
unexpected eof while reading` | 某些 mirror HTTPS 端偶发 close_notify 缺失,OpenSSL 3 严格 fail(腾讯 / 阿里见过;清华一般不犯)。改用 http 形式:`--build-arg APT_MIRROR=http://mirrors.tuna.tsinghua.edu.cn`(apt 包 GPG 签名校验,无 HTTPS 安全收益)。Dockerfile 已配 apt retry=5 + 关 pipeline,重 build 一般直接过 | | 容器内 shell 写工作目录报 `Permission denied`(but `sandbox check` ⑤ HOST_UID aligned ok) | DockerExecutor 写死了 `--user 1000:1000` 不会自动跟 build 的 HOST_UID 同步(改 `--user zcbot` 后已修)。仍报错检查镜像内 `docker run --rm --entrypoint id zcbot-sandbox:latest zcbot` 输出 uid 是否 = `id -u $(whoami)` | | 模型用 run_python 跑 `render_diagrams.py` 报 `mmdc returncode=1: Failed to launch chromium` | 容器内 chromium 缺 puppeteer no-sandbox 配置。镜像已落 `/sandbox/puppeteer-config.json` + ENV `MERMAID_PUPPETEER_CONFIG`,render_diagrams.py 已读 env 自动 -p 注入;仍跪查 `docker exec ... env \| grep MERMAID` 看 env 是否在 | +| 模型裸调 `mmdc -i x.md -o x.png`(自造 puppeteer config / 不走 render_diagrams.py)卡到 **timeout** 而非报错 | chromium 默认用 `/dev/shm`,docker 不传 `--shm-size` 时只 64MB → 起不来一直挂。已在 `pool.py` 给 `docker run` 加 `--shm-size`(默 512m,env `ZCBOT_SANDBOX_SHM_SIZE` / yaml `sandbox.shm_size`)。**已 running 的旧容器不会自动生效**,重启 web + 等 idle 回收(或先用 `docker ps` 查出以 `zcbot-sandbox-` 开头的容器名,再 `docker rm -f <容器名>`)后新起的才带。注:`mmdc` 自身不读 `MERMAID_PUPPETEER_CONFIG` env(只认 `-p`),裸调要么靠 shm-size 兜底要么显式 `-p /sandbox/puppeteer-config.json`。实测脚本 `deploy/sandbox/probe_mermaid.sh` | | Export 报 "无可导出内容" | task 没 messages(只 system 不算);先发条消息再 export | | `NoSubtaskError: working_dir ... 
前缀嵌套` | §7.4 no-subtask:同 user 不允许 working_dir 嵌套(child / parent)。**同项目多对话**用**完全相同**的 working_dir;否则改成 sibling(平级) | | `main.py web` 启动后 curl 连不上 | 检查 proxy(`HTTP_PROXY` / `HTTPS_PROXY`):本地服务 127.0.0.1,系统 proxy 拦截会 502。临时 `unset HTTP_PROXY HTTPS_PROXY` 或 `curl --noproxy '*'`。验通:`curl --noproxy '*' http://127.0.0.1:8765/healthz` | diff --git a/config/agent.yaml b/config/agent.yaml index abaa54c..4250af1 100644 --- a/config/agent.yaml +++ b/config/agent.yaml @@ -35,6 +35,7 @@ sandbox: memory: 2g # --memory (env: ZCBOT_SANDBOX_MEMORY) cpus: 1.0 # --cpus (env: ZCBOT_SANDBOX_CPUS) pids_limit: 256 # --pids-limit (env: ZCBOT_SANDBOX_PIDS_LIMIT) + shm_size: 512m # --shm-size (env: ZCBOT_SANDBOX_SHM_SIZE);chromium/mmdc 渲 mermaid 的 /dev/shm,默 64MB 不够会挂 # 容器 DNS server 显式配置(docker run --dns,容器 /etc/resolv.conf 直接写, # 绕过 docker daemon 上游 DNS 探测路径;腾讯云轻量 / 部分云上 daemon 探测 # systemd-resolved 上游会失败,导致 embedded DNS 127.0.0.11 forward 出去也跪)。 diff --git a/core/__init__.py b/core/__init__.py index 2babc9a..b866157 100644 --- a/core/__init__.py +++ b/core/__init__.py @@ -1,3 +1,3 @@ # zcbot 版本号单一事实源:web/app.py 的 FastAPI version、/healthz 返回、前端展示都引这里。 # 改版本只动这一行。 -__version__ = "0.12.4" +__version__ = "0.12.5" diff --git a/core/sandbox/pool.py b/core/sandbox/pool.py index 0aec2fc..b09973d 100644 --- a/core/sandbox/pool.py +++ b/core/sandbox/pool.py @@ -49,6 +49,11 @@ DEFAULT_IDLE_TTL_SECONDS = 300 DEFAULT_MEMORY = "2g" DEFAULT_CPUS = "1.0" DEFAULT_PIDS_LIMIT = 256 +# chromium(mmdc 渲 mermaid / puppeteer)默认走 /dev/shm,docker 不传 --shm-size 时 +# 只给 64MB,起不来就一直挂到 timeout。镜像备的 puppeteer-config 有 --disable-dev-shm-usage, +# 但模型不一定用那份;这里从根上把 /dev/shm 撑到够用,任何 chromium 路径都不再挂。 +# 从 --memory(默 2g)里切,512m 是上限非占用(tmpfs 按需用)。 +DEFAULT_SHM_SIZE = "512m" def container_name(user_id: UUID) -> str: @@ -89,6 +94,7 @@ class SandboxPool: memory: Optional[str] = None, cpus: Optional[str] = None, pids_limit: Optional[int] = None, + shm_size: Optional[str] = None, dns: Optional[List[str]] = None, ) -> None: """ 
@@ -107,10 +113,11 @@ class SandboxPool: (env `ZCBOT_SANDBOX_IDLE_TTL`,默 300) pg_ips: 逗号分隔的 PG IP 串,塞容器 `ZCBOT_PG_IPS` env,init.sh 加 DROP 规则 (env `ZCBOT_PG_IPS`)。defense-in-depth ── 即便落内网三段。 - memory/cpus/pids_limit: - 容器资源限制,默 2g/1.0/256;env(`ZCBOT_SANDBOX_MEMORY` 等) + memory/cpus/pids_limit/shm_size: + 容器资源限制,默 2g/1.0/256/512m;env(`ZCBOT_SANDBOX_MEMORY` 等) override caller 参数 override 默认。改后重启 web 生效,新起的 容器用新值;已 running 不变(idle 5min 回收后下次起按新值)。 + shm_size 撑 chromium 的 /dev/shm(默 64MB 不够,mmdc 渲图会挂)。 """ self.user_root_base = user_root_base self.repo_root = repo_root @@ -123,6 +130,7 @@ class SandboxPool: # 资源限制:env > caller > 默 self.memory = os.getenv("ZCBOT_SANDBOX_MEMORY") or memory or DEFAULT_MEMORY self.cpus = os.getenv("ZCBOT_SANDBOX_CPUS") or cpus or DEFAULT_CPUS + self.shm_size = os.getenv("ZCBOT_SANDBOX_SHM_SIZE") or shm_size or DEFAULT_SHM_SIZE self.pids_limit = int( os.getenv("ZCBOT_SANDBOX_PIDS_LIMIT") or (pids_limit if pids_limit is not None else DEFAULT_PIDS_LIMIT) @@ -197,6 +205,7 @@ class SandboxPool: # §7.5 硬限制(任一缺失视为 hardening 未完成) "--read-only", # rootfs read-only "--tmpfs", "/tmp:exec,size=512m,mode=1777", # 可写临时区,exec 允许 (run_python 写脚本) + f"--shm-size={self.shm_size}", # chromium/mmdc 的 /dev/shm,默 64MB 不够会挂(DEFAULT_SHM_SIZE) "--cap-drop=ALL", # 默全丢 "--cap-add=NET_ADMIN", # init.sh 配 iptables 需要;exec 进来的 uid 1000 拿不到 "--security-opt=no-new-privileges", @@ -308,5 +317,6 @@ def setup_pool( memory=cfg.get("memory") if isinstance(cfg.get("memory"), str) else None, cpus=str(cfg["cpus"]) if cfg.get("cpus") is not None else None, pids_limit=int(cfg["pids_limit"]) if cfg.get("pids_limit") is not None else None, + shm_size=cfg.get("shm_size") if isinstance(cfg.get("shm_size"), str) else None, dns=[str(x) for x in dns_cfg], ) diff --git a/deploy/sandbox/probe_mermaid.sh b/deploy/sandbox/probe_mermaid.sh new file mode 100644 index 0000000..5ca1180 --- /dev/null +++ b/deploy/sandbox/probe_mermaid.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# 在 sandbox 容器里实测 
mmdc/chromium:区分「chromium 缺包」vs「纯启动超时(/dev/shm 64MB)」。 +# 用法(服务器上,任选其一): +# A) 进一个活着的 per-user 容器(最贴真,复用线上 64MB /dev/shm 默认): +# C=$(docker ps --filter "label=zcbot.product=sandbox" --format '{{.Names}}' | head -1) +# docker cp deploy/sandbox/probe_mermaid.sh "$C":/tmp/probe.sh +# docker exec "$C" bash /tmp/probe.sh +# B) 没有活容器时,起一个临时的(显式 NOT 传 --shm-size,复现线上 64MB;必须带 -i,否则 stdin 不会接进容器,bash 读到空脚本直接退出): +# docker run --rm -i --read-only --tmpfs /tmp:exec,size=512m,mode=1777 \ +# --cap-drop=ALL --security-opt=no-new-privileges \ +# --entrypoint bash zcbot-sandbox:latest /dev/stdin < deploy/sandbox/probe_mermaid.sh +set -u +echo "===== /dev/shm size (期望线上 64M) ====="; df -h /dev/shm +echo "===== chromium 是否在 (缺包则这里就失败) =====" +command -v chromium && chromium --version 2>&1 | head -1 || echo "[FAIL] chromium 缺包/不可执行" +cd /tmp; printf 'flowchart TB\n A[甲]-->B[乙]\n' > d.mmd + +echo; echo "===== A: 模型自造 config(漏 --disable-dev-shm-usage)→ 预期挂起/超时 =====" +printf '{"args":["--no-sandbox","--disable-setuid-sandbox"]}' > bad.json +ts=$SECONDS; timeout 60 mmdc -i d.mmd -o a.png -p bad.json >a.log 2>&1; rc=$? +echo "rc=$rc 用时=$((SECONDS-ts))s"; tail -3 a.log; ls -l a.png 2>/dev/null && echo "[A 出图]" || echo "[A 无图]" + +echo; echo "===== B: 镜像备好的 /sandbox/puppeteer-config.json(含 --disable-dev-shm-usage)→ 预期成功 =====" +ts=$SECONDS; timeout 60 mmdc -i d.mmd -o b.png -p /sandbox/puppeteer-config.json >b.log 2>&1; rc=$? 
+echo "rc=$rc 用时=$((SECONDS-ts))s"; tail -3 b.log; ls -l b.png 2>/dev/null && echo "[B 出图]" || echo "[B 无图]" + +echo; echo "===== 结论 =====" +echo "chromium 在 + A挂超时 + B出图 => 纯 /dev/shm 64MB 问题,fix=给 docker run 加 --shm-size 或强制用 B 的 config" +echo "chromium 缺/B 也失败 => 更深的环境问题,看上面 b.log" diff --git a/scripts/diag_dump_task.py b/scripts/diag_dump_task.py new file mode 100644 index 0000000..49efec6 --- /dev/null +++ b/scripts/diag_dump_task.py @@ -0,0 +1,77 @@ +"""按 email + 任务名片段列出匹配的 task,并把其中最后创建的一个的对话记录 dump 到本脚本同目录的 _task_dump.txt(不打到 stdout;超长字段会截断;ASCII 标签,Windows GBK 安全)。用法:python scripts/diag_dump_task.py [email] [任务名片段]。""" +import json +import os +import sys +from pathlib import Path + +env = Path(__file__).resolve().parent.parent / ".env" +for line in env.read_text(encoding="utf-8").splitlines(): + if line.strip().startswith("ZCBOT_DB_URL="): + os.environ["ZCBOT_DB_URL"] = line.split("=", 1)[1].strip() +from sqlalchemy import create_engine, text # noqa: E402 + +import builtins # noqa: E402 + +_out = open(Path(__file__).resolve().parent / "_task_dump.txt", "w", encoding="utf-8") + + +def print(*a, **k): # noqa: A001 - redirect to utf-8 file + builtins.print(*a, **k, file=_out) + + +engine = create_engine(os.environ["ZCBOT_DB_URL"]) +email = sys.argv[1] if len(sys.argv) > 1 else "caoqianming@foxmail.com" +name_like = sys.argv[2] if len(sys.argv) > 2 else "生图测试" + + +def s(x, n=4000): + t = str(x or "") + return t if len(t) <= n else t[:n] + f"...[+{len(t)-n} chars]" + + +with engine.connect() as conn: + uid = conn.execute(text("select user_id from users where email=:e"), {"e": email}).fetchone() + if not uid: + print("[NO USER]", email) + sys.exit(1) + uid = uid[0] + rows = conn.execute( + text("select task_id,name,skill,model,model_profile,status,run_status,run_error," + "tokens_prompt,tokens_completion,created_at,updated_at from tasks " + "where user_id=:u and name like :n order by created_at"), + {"u": uid, "n": "%" + name_like + "%"}, + ).fetchall() + print(f"[USER] {email} matched tasks: {len(rows)}") + for r in rows: + print(f" task={r[0]} name={r[1]!r} 
skill={r[2]!r} model={r[3]}/{r[4]} " + f"status={r[5]} run={r[6]} tok={r[8]}/{r[9]} created={r[10]}") + if r[7]: + print(f" run_error: {s(r[7], 500)}") + if not rows: + sys.exit(0) + + tid = rows[-1][0] + print(f"\n========== DUMP task {tid} ==========") + msgs = conn.execute( + text("select idx,payload,model_profile,tokens_in,tokens_out from messages " + "where task_id=:t order by idx"), + {"t": tid}, + ).fetchall() + print(f"messages: {len(msgs)}\n") + for idx, p, mp, ti, to in msgs: + role = p.get("role") + head = f"[{idx}] {role}" + if mp: + head += f" ({mp})" + if ti or to: + head += f" tok={ti}/{to}" + print(head) + content = p.get("content") + if content: + print(" content:", s(content, 3000)) + for tc in p.get("tool_calls") or []: + fn = tc.get("function") or {} + print(f" CALL {fn.get('name')}({s(fn.get('arguments'), 1500)})") + if role == "tool": + print(f" TOOL[{p.get('name')}]:", s(content, 2000)) + print()