# zcbot/tools/look_at_image.py

"""look_at_image: 给 DeepSeek V4 主模型(纯文本)借一双眼睛。

主模型无视觉,这个 tool 走豆包 Seed 2.0 Lite(全模态理解)读单图 —— OCR / 描述画面 /
读图表表格 / 识别物体。模型自决何时调(用户贴了图问"这写的啥" / "图里是什么")。

模型 ID + 单价全在 `config/media/doubao.yaml` 的 vision 段,本 tool 只装配。
计费:token 计费(同 chat),成功后写一行 usage_events(kind="vision")。
图片路径解析 + base64 复用 tools/image_ref(与 seedream i2i 同一套边界/校验)。
"""
from __future__ import annotations

from pathlib import Path
from typing import Any, Optional
from uuid import UUID

from core.ark_client import ArkClient, ArkConfig, ArkError
from core.storage.usage import record_vision_usage

from .base import Tool, compact_tool_output
from .image_ref import load_image_as_data_url

# Default prompt used when the caller supplies no question. It deliberately covers
# everything (scene description + OCR + chart/table readout) so the vision model
# returns all usable information from the image in a single pass.
_DEFAULT_QUESTION = (
    "请仔细看这张图并回答:"
    "1) 完整描述画面内容(主体/场景/关键元素);"
    "2) 如果图中有任何文字,逐字准确 OCR 出来,尽量保留原排版与换行;"
    "3) 如果是图表/表格/示意图,把其中的数据、坐标轴、图例、结构关系读出来。"
)


class LookAtImageTool(Tool):
    """Tool that lets the text-only main model read a single image via a vision model.

    ``execute`` loads the image as a data URL, posts it together with a question to
    the configured chat endpoint, records token usage, and returns the model's
    textual answer prefixed with a one-line banner.
    """

    name = "look_at_image"
    description = (
        "Read/understand an image using Doubao Seed 2.0 Lite vision (the main model is text-only). "
        "Use to OCR text, describe a picture, read charts/tables/diagrams, or identify objects in an "
        "image the user uploaded (look for a `[用户上传的参考图] <path>` line in their message) or that "
        "was generated/saved in the task. Pass the image path; optionally a specific question "
        "(default: describe + OCR everything). Cheap (token-based, usually < ¥0.01). "
        "Returns the model's textual reading of the image."
    )
    parameters = {
        "type": "object",
        "properties": {
            "image": {
                "type": "string",
                "description": (
                    "要看的图片相对路径(task_dir 内,如 'figures/xxx.png',或用户消息里 "
                    "`[用户上传的参考图]` 行给的路径,或某工具上次返回的 saved 路径)。"
                ),
            },
            "question": {
                "type": "string",
                "description": (
                    "想从图里知道什么(可选)。如「这张图里的表格数据是多少」「图中仪表读数」"
                    "「把这页文字 OCR 出来」。不传则默认全面描述 + OCR 全部文字。"
                ),
            },
        },
        "required": ["image"],
    }

    def __init__(
        self,
        *,
        ark_cfg: ArkConfig,
        vision_variant_cfg: dict,
        variant_key: str,
        working_dir: Path,
        task_id: UUID,
        user_id: UUID,
        base_dir: Optional[Path] = None,
        user_root: Optional[Path] = None,
    ) -> None:
        """Store the configuration needed to call the vision model and record usage.

        Args:
            ark_cfg: Configuration handed to ``ArkClient`` for every request.
            vision_variant_cfg: Settings read by ``execute``: ``model_id`` (required),
                and optionally ``endpoint``, ``request_timeout_s``, ``max_image_mb``,
                ``price_cny_per_mtoken_input`` and ``price_cny_per_mtoken_output``.
            variant_key: Variant name; usage is recorded under the model profile
                ``"doubao.<variant_key>"``.
            working_dir: Directory passed to the image loader for resolving ``image``.
            task_id: Task identifier attached to the usage record.
            user_id: User identifier attached to the usage record.
            base_dir: Forwarded to ``Tool.__init__``.
            user_root: Forwarded to ``Tool.__init__`` and to the image loader.
        """
        super().__init__(base_dir, user_root=user_root)
        self.ark_cfg = ark_cfg
        self.cfg = vision_variant_cfg
        # e.g. 'seed_2_lite' -> usage_events.model_profile = "doubao.seed_2_lite"
        self.variant_key = variant_key
        self.working_dir = Path(working_dir)
        self.task_id = task_id
        self.user_id = user_id

    def execute(self, image: str, question: Optional[str] = None) -> str:
        """Ask the vision model about one image and return its textual answer.

        Args:
            image: Path of the image to read; blank values are rejected.
            question: What to find out from the image. When empty or omitted,
                ``_DEFAULT_QUESTION`` is used.

        Returns:
            On success, a banner line (model, image, token counts, cost) followed by
            a blank line and the compacted answer. On failure, an ``"[Error] ..."``
            string (or whatever error string the image loader produced).

        Raises:
            KeyError: If the variant config has no ``model_id``.
        """
        if not (image or "").strip():
            return "[Error] image(图片路径)不能为空"

        cfg = self.cfg
        max_bytes = int(float(cfg.get("max_image_mb", 10)) * 1024 * 1024)
        data_url, disp, err = load_image_as_data_url(
            image.strip(),
            working_dir=self.working_dir,
            user_root=self.user_root,
            display_fn=self._display,
            max_bytes=max_bytes,
        )
        if err:
            return err

        q = (question or "").strip() or _DEFAULT_QUESTION
        model_id = cfg["model_id"]
        timeout_s = float(cfg.get("request_timeout_s", 60))
        endpoint = cfg.get("endpoint", "/chat/completions")

        # Chat-completions style multimodal message: one text part + one image part.
        body: dict[str, Any] = {
            "model": model_id,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": q},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
        }

        try:
            with ArkClient(self.ark_cfg, timeout_s=timeout_s) as client:
                resp = client.post_json(endpoint, body, timeout_s=timeout_s)
        except ArkError as e:
            return f"[Error] look_at_image API: {e}"

        # _extract_answer returns "" for any malformed response (including a
        # non-dict body), so past this point ``resp`` is known to be a dict.
        answer = self._extract_answer(resp)
        if not answer:
            return (
                "[Error] vision 响应缺内容(模型未返回文本)。"
                "可能图片格式异常或模型暂不可用,稍后重试。"
            )

        # A malformed usage section must not discard an answer that was already
        # produced, so fall back to zero token counts instead of raising.
        usage = resp.get("usage")
        if not isinstance(usage, dict):
            usage = {}
        tin = self._token_count(usage.get("prompt_tokens"))
        tout = self._token_count(usage.get("completion_tokens"))

        # Usage recording is best-effort: a failure is logged and the answer is
        # still returned (cost is then reported as 0).
        cost_cny = 0.0
        try:
            cost = record_vision_usage(
                task_id=self.task_id,
                user_id=self.user_id,
                model_profile=f"doubao.{self.variant_key}",
                prompt_tokens=tin,
                completion_tokens=tout,
                input_cny_per_mtoken=float(cfg.get("price_cny_per_mtoken_input", 0)),
                output_cny_per_mtoken=float(cfg.get("price_cny_per_mtoken_output", 0)),
                extra_units={"image": disp},
            )
            cost_cny = float(cost)
        except Exception as e:
            print(f"[look_at_image] record_vision_usage failed: {type(e).__name__}: {e}", flush=True)

        # First line is a "key=value · key=value" banner for downstream parsing.
        banner = (
            f"[look_at_image] model={model_id} · image={disp} · "
            f"tokens={tin}+{tout} · cost=¥{cost_cny:.4f}"
        )
        # The reading can be long (e.g. a full-page OCR); compact it to protect
        # the caller's context window.
        return f"{banner}\n\n{compact_tool_output(answer)}"

    @staticmethod
    def _token_count(value: Any) -> int:
        """Coerce a usage counter to ``int``; missing or malformed values count as 0."""
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    @staticmethod
    def _extract_answer(resp: dict) -> str:
        """Return the text of ``choices[0].message.content`` from a chat response.

        ``content`` may be a plain string or a list of ``{"type": "text", "text": ...}``
        parts; list parts are joined with newlines. Any other shape, including a
        response that is not a dict or text parts that are not strings, yields ``""``
        rather than raising.
        """
        if not isinstance(resp, dict):
            return ""
        choices = resp.get("choices")
        if not (isinstance(choices, list) and choices):
            return ""
        first = choices[0]
        msg = first.get("message") if isinstance(first, dict) else None
        if not isinstance(msg, dict):
            return ""
        content = msg.get("content")
        if isinstance(content, str):
            return content.strip()
        if isinstance(content, list):
            parts = [
                part["text"]
                for part in content
                if isinstance(part, dict)
                and part.get("type") == "text"
                and isinstance(part.get("text"), str)
            ]
            return "\n".join(p for p in parts if p).strip()
        return ""