"""look_at_image: 给 DeepSeek V4 主模型(纯文本)借一双眼睛。主模型无视觉,这个 tool 走豆包 Seed 2.0 Lite(全模态理解)读单图 —— OCR / 描述画面 / 读图表表格 / 识别物体。模型自决何时调(用户贴了图问"这写的啥" / "图里是什么")。模型 ID + 单价全在 `config/media/doubao.yaml` 的 vision 段,本 tool 只装配。计费:token 计费(同 chat),成功后写一行 usage_events(kind="vision")。图片路径解析 + base64 复用 tools/image_ref(与 seedream i2i 同一套边界/校验)。 """ from __future__ import annotations from pathlib import Path from typing import Any, Optional from uuid import UUID import time from core.ark_client import ArkClient, ArkConfig, ArkError, ArkTimeoutError from core.storage.usage import record_vision_usage from .base import Tool, compact_tool_output from .image_ref import load_image_as_data_url # 不带 question 时的默认提问:全覆盖(描述 + OCR + 图表读数),让模型一次把图里能用的信息都吐出来 _DEFAULT_QUESTION = ( "请仔细看这张图并回答:" "1) 完整描述画面内容(主体/场景/关键元素);" "2) 如果图中有任何文字,逐字准确 OCR 出来,尽量保留原排版与换行;" "3) 如果是图表/表格/示意图,把其中的数据、坐标轴、图例、结构关系读出来。" ) class LookAtImageTool(Tool): name = "look_at_image" description = ( "Read/understand an image using Doubao Seed 2.0 Lite vision (the main model is text-only). " "Use to OCR text, describe a picture, read charts/tables/diagrams, or identify objects in an " "image the user uploaded (look for a `[用户上传的参考图] ` line in their message) or that " "was generated/saved in the task. Pass the image path; optionally a specific question " "(default: describe + OCR everything). Cheap (token-based, usually < ¥0.01). " "Returns the model's textual reading of the image." ) parameters = { "type": "object", "properties": { "image": { "type": "string", "description": ( "要看的图片相对路径(task_dir 内,如 'figures/xxx.png',或用户消息里 " "`[用户上传的参考图]` 行给的路径,或某工具上次返回的 saved 路径)。" ), }, "question": { "type": "string", "description": ( "想从图里知道什么(可选)。如「这张图里的表格数据是多少」「图中仪表读数」" "「把这页文字 OCR 出来」。不传则默认全面描述 + OCR 全部文字。" ), }, }, "required": ["image"], } def __init__( self, *, ark_cfg: ArkConfig, vision_variant_cfg: dict, variant_key: str, working_dir: Path, task_id: UUID, user_id: UUID, base_dir: Optional[Path] = None, user_root: Optional[Path] = None, ) -> None: super().__init__(base_dir, user_root=user_root) self.ark_cfg = ark_cfg self.cfg = vision_variant_cfg self.variant_key = variant_key # 'seed_2_lite' → usage_events.model_profile = "doubao.seed_2_lite" self.working_dir = Path(working_dir) self.task_id = task_id self.user_id = user_id def execute(self, image: str, question: Optional[str] = None) -> str: if not (image or "").strip(): return "[Error] image(图片路径)不能为空" cfg = self.cfg max_bytes = int(float(cfg.get("max_image_mb", 10)) * 1024 * 1024) data_url, disp, err = load_image_as_data_url( image.strip(), working_dir=self.working_dir, user_root=self.user_root, display_fn=self._display, max_bytes=max_bytes, ) if err: return err q = (question or "").strip() or _DEFAULT_QUESTION model_id = cfg["model_id"] timeout_s = float(cfg.get("request_timeout_s", 60)) endpoint = cfg.get("endpoint", "/chat/completions") body: dict[str, Any] = { "model": model_id, "messages": [ { "role": "user", "content": [ {"type": "text", "text": q}, {"type": "image_url", "image_url": {"url": data_url}}, ], } ], } # 透明重试:Seed 2.0 Lite 非流式,长 OCR 偶发超时/网络抖动。tool 内消化掉, # 不把 [Error] 抛给主模型 —— 否则主模型会重发整个 tool call(图 base64 重传、 # 输入 token 再付一次)。仅 ArkTimeoutError(超时/网络)重试;HTTP 业务错误不重试。 max_attempts = int(cfg.get("timeout_retries", 1)) + 1 resp = None for attempt in range(max_attempts): try: with ArkClient(self.ark_cfg, timeout_s=timeout_s) as client: resp = client.post_json(endpoint, body, timeout_s=timeout_s) break except ArkTimeoutError as e: if attempt == max_attempts - 1: return f"[Error] look_at_image API: {e}(已重试 {attempt} 次仍超时)" print( f"[look_at_image] timeout, retrying ({attempt + 1}/{max_attempts - 1}): {e}", flush=True, ) time.sleep(2 ** attempt) except ArkError as e: return f"[Error] look_at_image API: {e}" answer = self._extract_answer(resp) if not answer: return ( "[Error] vision 响应缺内容(模型未返回文本)。" "可能图片格式异常或模型暂不可用,稍后重试。" ) usage = resp.get("usage") or {} tin = int(usage.get("prompt_tokens", 0) or 0) tout = int(usage.get("completion_tokens", 0) or 0) # 记账;失败不阻塞 tool 返回(沿用 seedream 兜底) cost_cny = 0.0 try: cost = record_vision_usage( task_id=self.task_id, user_id=self.user_id, model_profile=f"doubao.{self.variant_key}", prompt_tokens=tin, completion_tokens=tout, input_cny_per_mtoken=float(cfg.get("price_cny_per_mtoken_input", 0)), output_cny_per_mtoken=float(cfg.get("price_cny_per_mtoken_output", 0)), extra_units={"image": disp}, ) cost_cny = float(cost) except Exception as e: print(f"[look_at_image] record_vision_usage failed: {type(e).__name__}: {e}", flush=True) # 第一行 banner(key=value · 分隔,与 seedream/seedance 同协议,便于前端/对账) banner = ( f"[look_at_image] model={model_id} · image={disp} · " f"tokens={tin}+{tout} · cost=¥{cost_cny:.4f}" ) # 图片解读正文可能很长(整页 OCR),压一下防爆上下文(保头尾) return f"{banner}\n\n{compact_tool_output(answer)}" @staticmethod def _extract_answer(resp: dict) -> str: """OpenAI 兼容 chat 响应取文本: choices[0].message.content。 content 可能是 str,也可能是 list[{type:text,text:...}](多模态返回形态),都兜住。 """ choices = resp.get("choices") if not (isinstance(choices, list) and choices): return "" msg = choices[0].get("message") if isinstance(choices[0], dict) else None if not isinstance(msg, dict): return "" content = msg.get("content") if isinstance(content, str): return content.strip() if isinstance(content, list): parts = [ c.get("text", "") for c in content if isinstance(c, dict) and c.get("type") == "text" ] return "\n".join(p for p in parts if p).strip() return ""