# zcbot/scripts/smoke_look_at_image.py

"""Smoke test: look_at_image (Doubao Seed 2.0 Lite vision) runs end to end + OCR verification.

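Usage: .venv/Scripts/python.exe scripts/smoke_look_at_image.py
Requires ARK_API_KEY / ZCBOT_DB_URL in .env. **This really calls the Doubao vision API and costs < ¥0.01.**

Checks:
  1. ArkConfig.load() succeeds and the yaml has a vision section (otherwise the run is skipped)
  2. A synthesized PNG containing the known text "ZCBOT-VISION-8848" → LookAtImageTool.execute
     can OCR that string (a miss is reported as [WARN], not as a failure)
  3. The first line of the returned string is a banner with model/tokens/cost
     (the result is printed for manual inspection; this script does not assert it)
  4. usage_events gains one row with kind="vision"; its units hold tokens_in/out + a unit-price snapshot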
"""
from __future__ import annotations

import os
import sys
import uuid
from pathlib import Path

# Two directories above this script; put on sys.path so the project-local
# imports further down the file resolve.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

# Windows consoles default to GBK, where printing ¥ / Chinese results crashes,
# so force UTF-8 on stdout (this only affects what this script prints).
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
except Exception:
    # Deliberately best-effort: stdout may be a stream that cannot be reconfigured.
    pass


def load_env_file(path: Path) -> None:
    """Load ``KEY=VALUE`` pairs from *path* into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Keys and values are whitespace-stripped, and one pair of matching
    surrounding quotes (``"..."`` or ``'...'``) is removed from the value so a
    quoted entry does not end up with literal quote characters. Variables that
    are already set in the environment are left untouched. A missing file is
    silently ignored.
    """
    if not path.is_file():
        return
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        # Split on the first "=" only, so values may themselves contain "=".
        key, _, value = entry.partition("=")
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        os.environ.setdefault(key.strip(), value)


# Read .env before the project imports below are executed.
env_file = ROOT / ".env"
load_env_file(env_file)

from PIL import Image, ImageDraw
from sqlalchemy import text

from core.ark_client import ArkConfig
from core.storage import session_scope
from core.storage.models import Task, User
from tools.look_at_image import LookAtImageTool

# Known marker text drawn into the synthetic test image; main() looks for it
# in the tool's returned text.
MAGIC = "ZCBOT-VISION-8848"


def make_text_png(dest: Path) -> None:
    """Write a white PNG with MAGIC in black text to *dest*.

    The text is drawn on a small canvas and then enlarged 4x with
    nearest-neighbour resampling, so the default bitmap font is still crisp
    enough to OCR. Missing parent directories of *dest* are created.
    """
    base_w, base_h, scale = 260, 60, 4
    white, black = (255, 255, 255), (0, 0, 0)

    canvas = Image.new("RGB", (base_w, base_h), white)
    ImageDraw.Draw(canvas).text((10, 22), MAGIC, fill=black)
    enlarged = canvas.resize((base_w * scale, base_h * scale), Image.NEAREST)

    dest.parent.mkdir(parents=True, exist_ok=True)
    enlarged.save(dest)


def main() -> int:
    """Run the look_at_image smoke test end to end.

    Loads the Ark config and takes the first entry of its ``vision`` section,
    writes a PNG containing MAGIC under a fresh per-user workspace directory,
    inserts a User row and a Task row, calls ``LookAtImageTool.execute`` on
    the image, and then checks the ``usage_events`` rows recorded for the task.

    Returns:
        0 when every check passes, and also when the run is skipped because
        no Ark config or no ``vision`` section is available; 2 when the tool
        result starts with ``"[Error]"``.

    Raises:
        AssertionError: if the task does not have exactly one usage_events
            row, or that row's kind / model_profile / units are not as
            expected. Raised explicitly rather than via ``assert`` so the
            checks still run under ``python -O``.
    """
    cfg = ArkConfig.load()
    if cfg is None:
        print("[SKIP] ARK_API_KEY 未设(或 doubao.yaml 缺失),无法测真接口")
        return 0
    vision_cfg = cfg.raw.get("vision") or {}
    if not vision_cfg:
        print("[SKIP] doubao.yaml 无 vision 段")
        return 0
    # Use the first variant listed under the vision section.
    variant_key, variant_cfg = next(iter(vision_cfg.items()))
    print(f"[setup] variant={variant_key} model={variant_cfg.get('model_id')} "
          f"price_in={variant_cfg.get('price_cny_per_mtoken_input')} "
          f"price_out={variant_cfg.get('price_cny_per_mtoken_output')}")

    # Fresh ids per run, so the usage_events query below only sees this run.
    uid = uuid.uuid4()
    tid = uuid.uuid4()
    ws_user = ROOT / "workspace" / "users" / str(uid)
    wd = ws_user / "smoke_vision"
    img = wd / "figures" / "magic.png"
    make_text_png(img)
    print(f"[setup] 合成测试图 {img.name}(含文字 {MAGIC!r})")

    # Persist the User in its own session first, then create the Task
    # (keeps the foreign-key insert order safe).
    with session_scope() as s:
        s.add(User(user_id=uid))
    with session_scope() as s:
        s.add(Task(task_id=tid, user_id=uid, name="smoke_vision", working_dir=str(wd)))

    tool = LookAtImageTool(
        ark_cfg=cfg,
        vision_variant_cfg=variant_cfg,
        variant_key=variant_key,
        working_dir=wd,
        task_id=tid,
        user_id=uid,
        base_dir=wd,
        user_root=ws_user,
    )

    print("[call] question='把图中的文字逐字 OCR 出来'")
    result = tool.execute(image="figures/magic.png", question="把图中的文字逐字 OCR 出来。")
    # The full result is printed so it can be inspected by hand.
    print(f"[tool result]\n{result}\n")
    if result.startswith("[Error]"):
        print("[FAIL] tool 返回错误")
        return 2

    # OCR hit: tolerate spaces / newlines / case / hyphen differences added by
    # the model by comparing with those stripped. A miss is only a warning.
    norm = result.replace(" ", "").replace("\n", "").upper()
    if MAGIC.replace("-", "") in norm.replace("-", ""):
        print(f"[OK] OCR 命中魔术串 {MAGIC}")
    else:
        print(f"[WARN] 未在结果里精确匹配 {MAGIC} —— 人工核对上面 result(模型可能换了排版)")

    with session_scope() as s:
        rows = s.execute(text(
            "SELECT kind, model_profile, units, cost_cny FROM usage_events "
            "WHERE task_id = :tid"
        ), {"tid": str(tid)}).all()

    # Explicit raises instead of ``assert``: assert statements are removed
    # under ``python -O``, which would let the smoke test pass unchecked.
    if len(rows) != 1:
        raise AssertionError(f"usage_events 行数应 1,实际 {len(rows)}")
    row = rows[0]
    if row.kind != "vision":
        raise AssertionError(f"kind 应 vision,实际 {row.kind}")
    if row.model_profile != f"doubao.{variant_key}":
        raise AssertionError(f"model_profile={row.model_profile}")
    for k in ("tokens_in", "tokens_out", "input_cny_per_mtoken", "output_cny_per_mtoken"):
        if k not in row.units:
            raise AssertionError(f"units 缺 {k}")
    print(f"[OK] usage_events: kind={row.kind} model={row.model_profile} "
          f"cost_cny={row.cost_cny} units={row.units}")

    print("\n[PASS] smoke_look_at_image 全部通过")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())