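"""look_at_image: lend the text-only DeepSeek V4 main model a pair of eyes.

The main model has no vision, so this tool sends a single image to Doubao Seed 2.0 Lite
(omni-modal understanding) to OCR it, describe the scene, read charts/tables, or identify
objects. The model decides on its own when to call it (e.g. the user pasted an image and
asked "what does this say" / "what is in the picture").

The model ID and unit prices all live in the vision section of `config/media/doubao.yaml`;
this tool only wires things together.
Billing: token-based (same as chat); on success one usage_events row is written (kind="vision").
Image path resolution and base64 encoding reuse tools/image_ref (same boundaries/validation
as seedream i2i).
"""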
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from uuid import UUID
|
|
|
|
from core.ark_client import ArkClient, ArkConfig, ArkError
|
|
from core.storage.usage import record_vision_usage
|
|
|
|
from .base import Tool, compact_tool_output
|
|
from .image_ref import load_image_as_data_url
|
|
|
|
# 不带 question 时的默认提问:全覆盖(描述 + OCR + 图表读数),让模型一次把图里能用的信息都吐出来
|
|
_DEFAULT_QUESTION = (
|
|
"请仔细看这张图并回答:"
|
|
"1) 完整描述画面内容(主体/场景/关键元素);"
|
|
"2) 如果图中有任何文字,逐字准确 OCR 出来,尽量保留原排版与换行;"
|
|
"3) 如果是图表/表格/示意图,把其中的数据、坐标轴、图例、结构关系读出来。"
|
|
)
|
|
|
|
|
|
class LookAtImageTool(Tool):
    """Tool that sends one image plus a question to a vision model and returns its text reply.

    The tool loads the image through ``load_image_as_data_url``, posts an
    OpenAI-style chat request (one user message holding a text part and an
    ``image_url`` part) through ``ArkClient``, records token usage with
    ``record_vision_usage`` and returns a one-line banner followed by the
    (compacted) answer. Every failure that is handled here is reported as a
    string starting with ``[Error]`` instead of an exception.
    """

    name = "look_at_image"
    description = (
        "Read/understand an image using Doubao Seed 2.0 Lite vision (the main model is text-only). "
        "Use to OCR text, describe a picture, read charts/tables/diagrams, or identify objects in an "
        "image the user uploaded (look for a `[用户上传的参考图] <path>` line in their message) or that "
        "was generated/saved in the task. Pass the image path; optionally a specific question "
        "(default: describe + OCR everything). Cheap (token-based, usually < ¥0.01). "
        "Returns the model's textual reading of the image."
    )
    parameters = {
        "type": "object",
        "properties": {
            "image": {
                "type": "string",
                "description": (
                    "要看的图片相对路径(task_dir 内,如 'figures/xxx.png',或用户消息里 "
                    "`[用户上传的参考图]` 行给的路径,或某工具上次返回的 saved 路径)。"
                ),
            },
            "question": {
                "type": "string",
                "description": (
                    "想从图里知道什么(可选)。如「这张图里的表格数据是多少」「图中仪表读数」"
                    "「把这页文字 OCR 出来」。不传则默认全面描述 + OCR 全部文字。"
                ),
            },
        },
        "required": ["image"],
    }

    def __init__(
        self,
        *,
        ark_cfg: ArkConfig,
        vision_variant_cfg: dict,
        variant_key: str,
        working_dir: Path,
        task_id: UUID,
        user_id: UUID,
        base_dir: Optional[Path] = None,
        user_root: Optional[Path] = None,
    ) -> None:
        """Store the client configuration and the identifiers used for usage accounting.

        Args:
            ark_cfg: Configuration handed to ``ArkClient`` for every request.
            vision_variant_cfg: Settings of the selected vision variant. Keys read
                by this class: ``model_id`` (required), ``max_image_mb``,
                ``request_timeout_s``, ``endpoint``,
                ``price_cny_per_mtoken_input``, ``price_cny_per_mtoken_output``.
            variant_key: Variant name, e.g. ``'seed_2_lite'``; recorded as
                ``model_profile = "doubao.<variant_key>"``.
            working_dir: Directory passed to the image loader for path resolution.
            task_id: Task the usage record is attributed to.
            user_id: User the usage record is attributed to.
            base_dir: Forwarded to ``Tool.__init__``.
            user_root: Forwarded to ``Tool.__init__``.
        """
        super().__init__(base_dir, user_root=user_root)
        self.ark_cfg = ark_cfg
        self.cfg = vision_variant_cfg
        self.variant_key = variant_key
        self.working_dir = Path(working_dir)
        self.task_id = task_id
        self.user_id = user_id

    def execute(self, image: str, question: Optional[str] = None) -> str:
        """Ask the vision model about one image and return its textual reading.

        Args:
            image: Path of the image to read; resolved by ``load_image_as_data_url``.
            question: What to find out from the image. Blank, missing or
                non-string values fall back to ``_DEFAULT_QUESTION``.

        Returns:
            On success, a banner line (model, image, token counts, cost), a blank
            line, then the answer passed through ``compact_tool_output``. On a
            handled failure, a string starting with ``[Error]`` (or whatever error
            string the image loader returned).

        Raises:
            KeyError: If the variant configuration has no ``model_id``.
        """
        # Arguments come from model-generated JSON, so a non-string value is
        # rejected the same way as an empty one instead of crashing on .strip().
        if not isinstance(image, str) or not image.strip():
            return "[Error] image(图片路径)不能为空"

        cfg = self.cfg
        max_bytes = int(float(cfg.get("max_image_mb", 10)) * 1024 * 1024)
        data_url, disp, err = load_image_as_data_url(
            image.strip(),
            working_dir=self.working_dir,
            user_root=self.user_root,
            display_fn=self._display,
            max_bytes=max_bytes,
        )
        if err:
            return err

        asked = question.strip() if isinstance(question, str) else ""
        q = asked or _DEFAULT_QUESTION
        model_id = cfg["model_id"]
        timeout_s = float(cfg.get("request_timeout_s", 60))
        endpoint = cfg.get("endpoint", "/chat/completions")

        body: dict[str, Any] = {
            "model": model_id,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": q},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
        }

        try:
            with ArkClient(self.ark_cfg, timeout_s=timeout_s) as client:
                resp = client.post_json(endpoint, body, timeout_s=timeout_s)
        except ArkError as e:
            return f"[Error] look_at_image API: {e}"

        answer = self._extract_answer(resp)
        if not answer:
            return (
                "[Error] vision 响应缺内容(模型未返回文本)。"
                "可能图片格式异常或模型暂不可用,稍后重试。"
            )

        # A non-empty answer implies resp is a dict (see _extract_answer). Token
        # counts are parsed defensively so a malformed usage section cannot
        # discard an answer that has already been produced.
        usage = resp.get("usage")
        tin = self._token_count(usage, "prompt_tokens")
        tout = self._token_count(usage, "completion_tokens")

        # Accounting is best-effort: a failure is printed and the cost shown in
        # the banner stays 0, but the answer is still returned.
        cost_cny = 0.0
        try:
            cost = record_vision_usage(
                task_id=self.task_id,
                user_id=self.user_id,
                model_profile=f"doubao.{self.variant_key}",
                prompt_tokens=tin,
                completion_tokens=tout,
                input_cny_per_mtoken=float(cfg.get("price_cny_per_mtoken_input", 0)),
                output_cny_per_mtoken=float(cfg.get("price_cny_per_mtoken_output", 0)),
                extra_units={"image": disp},
            )
            cost_cny = float(cost)
        except Exception as e:
            print(f"[look_at_image] record_vision_usage failed: {type(e).__name__}: {e}", flush=True)

        # First line is a "key=value · key=value" banner; per the original note it
        # follows the same protocol as the seedream/seedance tools so the frontend
        # and reconciliation can parse it.
        banner = (
            f"[look_at_image] model={model_id} · image={disp} · "
            f"tokens={tin}+{tout} · cost=¥{cost_cny:.4f}"
        )
        # The reading can be long (e.g. a full-page OCR), so it is compacted
        # before being returned to keep the context small.
        return f"{banner}\n\n{compact_tool_output(answer)}"

    @staticmethod
    def _token_count(usage: Any, key: str) -> int:
        """Return ``usage[key]`` as an int, or 0 when it is missing or malformed."""
        if not isinstance(usage, dict):
            return 0
        try:
            return int(usage.get(key) or 0)
        except (TypeError, ValueError):
            return 0

    @staticmethod
    def _extract_answer(resp: dict) -> str:
        """Pull the reply text out of an OpenAI-compatible chat response.

        Reads ``choices[0].message.content``. The content may be a plain string
        or a list of parts shaped like ``{"type": "text", "text": ...}``; text
        parts are joined with newlines. Returns ``""`` whenever the expected
        structure is absent, including when ``resp`` itself is not a dict.
        """
        if not isinstance(resp, dict):
            return ""
        choices = resp.get("choices")
        if not (isinstance(choices, list) and choices):
            return ""
        msg = choices[0].get("message") if isinstance(choices[0], dict) else None
        if not isinstance(msg, dict):
            return ""
        content = msg.get("content")
        if isinstance(content, str):
            return content.strip()
        if isinstance(content, list):
            # Keep only non-empty string text parts; anything else would make
            # str.join raise.
            parts = [
                c["text"]
                for c in content
                if isinstance(c, dict)
                and c.get("type") == "text"
                and isinstance(c.get("text"), str)
                and c["text"]
            ]
            return "\n".join(parts).strip()
        return ""
|