# zcbot/tools/documents.py

"""Host-side document_search tools.

These tools intentionally keep DOCUMENT_SEARCH_API_KEY on the host side. The
sandbox receives only business arguments and trimmed results / saved paths.
"""
from __future__ import annotations

from pathlib import Path
from typing import Optional

from skills.documents import client as doc_client

from .base import Tool


def _clip(text: str, max_chars: int) -> tuple[str, bool]:
    max_chars = max(0, int(max_chars))
    if len(text) <= max_chars:
        return text, False
    return text[:max_chars], True


class DocumentListKbTool(Tool):
    """Tool that lists the knowledge bases reported by ``doc_client.list_kb()``.

    The result is rendered as plain text, one line per knowledge base.
    """

    name = "document_list_kb"
    description = (
        "List internal materials knowledge bases available in document_search. "
        "Use before document_search when the user did not specify a materials domain."
    )
    # The tool takes no arguments.
    parameters = {"type": "object", "properties": {}}

    def execute(self) -> str:
        """Return a text listing of knowledge bases, or an ``[Error]`` string.

        Any exception raised by the client call is converted into an
        ``[Error] ...`` message instead of propagating.
        """
        try:
            knowledge_bases = doc_client.list_kb()
        except Exception as exc:
            return f"[Error] document_list_kb failed: {type(exc).__name__}: {exc}"

        if not knowledge_bases:
            return "(no knowledge bases returned)"

        # Each entry is read with .get(), so a missing field renders as empty.
        rows = [
            f"- id={kb.get('id', '')} kb_name={kb.get('kb_name', '')} "
            f"ch_name={kb.get('ch_name', '')} file_count={kb.get('file_count', '')}"
            for kb in knowledge_bases
        ]
        return "\n".join(["Knowledge bases:", *rows])


class DocumentSearchTool(Tool):
    """Tool that queries ``doc_client.search`` and renders trimmed results.

    Each hit is rendered as a numbered text entry with file metadata and a
    markdown snippet clipped to ``content_chars_per_doc`` characters.
    """

    name = "document_search"
    description = (
        "Search the internal materials document knowledge base. "
        "Returns file metadata and truncated markdown content; increase content_chars_per_doc only when needed."
    )
    parameters = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "Search query, Chinese or English; technical terms are usually better in English."},
            "kb_names": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Optional knowledge-base names from document_list_kb.",
            },
            "classification_ids": {
                "type": "array",
                "items": {"type": "integer"},
                "description": "Optional materials domain ids, 1-7.",
            },
            "max_documents": {
                "type": "integer",
                "default": 6,
                "description": "Number of documents to return, 1-20.",
            },
            "content_chars_per_doc": {
                "type": "integer",
                "default": 1200,
                "description": "Maximum markdown characters returned per document, 0-5000.",
            },
        },
        "required": ["query"],
    }

    @staticmethod
    def _bounded_int(arg_name: str, value: object, default: int, low: int, high: int) -> int:
        """Coerce *value* to an int clamped into ``[low, high]``.

        Args:
            arg_name: Argument name, used only in the error message.
            value: Raw value from the tool call; ``None`` selects *default*.
            default: Fallback used when *value* is ``None``.
            low: Inclusive lower bound.
            high: Inclusive upper bound.

        Raises:
            ValueError: If *value* cannot be converted with ``int()``.
        """
        if value is None:
            value = default
        try:
            number = int(value)
        except (TypeError, ValueError, OverflowError) as exc:
            # OverflowError covers float infinity; ValueError covers NaN / bad text.
            raise ValueError(f"{arg_name} must be an integer, got {value!r}") from exc
        return min(max(number, low), high)

    def execute(
        self,
        query: str,
        kb_names: Optional[list[str]] = None,
        classification_ids: Optional[list[int]] = None,
        max_documents: int = 6,
        content_chars_per_doc: int = 1200,
    ) -> str:
        """Run a search and return a text report, or an ``[Error]`` string.

        Args:
            query: Search text; surrounding whitespace is stripped and an empty
                query is rejected.
            kb_names: Optional knowledge-base names; an empty list is sent as
                ``None``.
            classification_ids: Optional domain ids; an empty list is sent as
                ``None``.
            max_documents: Requested number of documents, clamped to 1-20.
            content_chars_per_doc: Per-document snippet budget, clamped to
                0-5000.

        Returns:
            The formatted result listing, a "no documents" notice, or an
            ``[Error] ...`` message. Invalid numeric arguments and client
            failures are reported as strings rather than raised.
        """
        query = (query or "").strip()
        if not query:
            return "[Error] query 不能为空"

        # Tool-call arguments may arrive malformed (null, text, inf); report
        # that in the same "[Error]" style as every other failure path instead
        # of letting int() raise out of the tool.
        try:
            max_documents = self._bounded_int("max_documents", max_documents, 6, 1, 20)
            content_chars_per_doc = self._bounded_int(
                "content_chars_per_doc", content_chars_per_doc, 1200, 0, 5000
            )
        except ValueError as e:
            return f"[Error] document_search invalid argument: {e}"

        try:
            docs = doc_client.search(
                query=query,
                kb_names=kb_names or None,
                classification_ids=classification_ids or None,
                max_documents=max_documents,
            )
        except Exception as e:
            return f"[Error] document_search failed: {type(e).__name__}: {e}"
        if not docs:
            return f"(no documents found for query: {query!r})"

        lines = [f"Document search results for: {query!r}"]
        for i, d in enumerate(docs, 1):
            content = d.get("md_content") or ""
            snippet, truncated = _clip(str(content), content_chars_per_doc)
            lines.append("")
            lines.append(f"{i}. file_name={d.get('file_name') or ''}")
            lines.append(f"   kb_name={d.get('kb_name') or ''}")
            lines.append(f"   character_count={d.get('character_count') or 0}")
            if d.get("md_filename"):
                lines.append(f"   md_filename={d.get('md_filename')}")
            # An empty snippet (no content, or a zero budget) is omitted entirely.
            if snippet:
                suffix = " ...(truncated)" if truncated else ""
                lines.append(f"   md_content[:{content_chars_per_doc}]={snippet}{suffix}")
        return "\n".join(lines)


class DocumentDownloadTool(Tool):
    """Tool that downloads a document through ``doc_client.download``.

    The download is directed at the ``working_dir`` given at construction, and
    the tool reports where the file was saved.
    """

    name = "document_download"
    description = (
        "Download an original document from document_search into the current task_dir/documents/. "
        "Use file_name and kb_name returned by document_search."
    )
    parameters = {
        "type": "object",
        "properties": {
            "file_name": {
                "type": "string",
                "description": "Original file_name or md_filename returned by document_search.",
            },
            "kb_name": {
                "type": "string",
                "description": "Knowledge-base name returned by document_search.",
            },
            "preview": {
                "type": "boolean",
                "default": False,
                "description": "Request inline preview disposition from the upstream API. Usually false.",
            },
        },
        "required": ["file_name", "kb_name"],
    }

    def __init__(
        self,
        *,
        working_dir: Path,
        base_dir: Optional[Path] = None,
        user_root: Optional[Path] = None,
    ) -> None:
        """Store the download directory and forward the rest to ``Tool``.

        Args:
            working_dir: Directory handed to the client as the download
                target; normalized to a ``Path``.
            base_dir: Passed through to the base-class initializer.
            user_root: Passed through to the base-class initializer.
        """
        super().__init__(base_dir=base_dir, user_root=user_root)
        self.working_dir = Path(working_dir)

    def execute(self, file_name: str, kb_name: str, preview: bool = False) -> str:
        """Download one document and return ``saved: <path>`` or an error.

        Args:
            file_name: Name of the document to fetch; must not be blank.
            kb_name: Knowledge base holding the document; must not be blank.
            preview: Forwarded to the client as a boolean.

        Returns:
            ``saved: ...`` with the displayed path on success, otherwise an
            ``[Error] ...`` message. Client exceptions are reported as
            strings rather than raised.
        """
        # Both identifiers must contain something other than whitespace.
        if not ((file_name or "").strip() and (kb_name or "").strip()):
            return "[Error] file_name / kb_name 不可为空"

        try:
            saved_rel = doc_client.download(
                file_name=file_name,
                kb_name=kb_name,
                working_dir=str(self.working_dir),
                preview=bool(preview),
            )
        except Exception as exc:
            return f"[Error] document_download failed: {type(exc).__name__}: {exc}"
        else:
            # The client's return value is joined onto working_dir for display;
            # _display is not defined in this class (presumably provided by Tool).
            return f"saved: {self._display(self.working_dir / saved_rel)}"