zcbot/core/export_docx.py

380 lines
13 KiB
Python

"""把 task 的 PG messages 表 + state.json 渲染为 .docx 对话稿。
布局:
- 文档开头 meta 表(task_id / 模式 / 描述 / 模型 / 创建时间 / 消息数 / tokens / 导出时间)
- 主体每条消息一组段落,全部左排,小字号,角色用不同颜色加粗区分
- assistant 的 reasoning_content 默认带,灰色斜体
- tool 结果保留前 head + 中间省略 + 后 tail 三段
- tool_calls 把 function 名 + 参数 JSON 单列展示
调用入口:
- 顶层函数 export_chat_to_docx(task_dir, out_path=None, ...)
- CLI 子命令 `python cli.py export <task_id>` 与 REPL `/export [<task_id>]` 都走它
§7 B Step 2 后:messages 从 PG 读(按 task_id);state.json 还在 task_dir 下(Step 3 删)。
"""
from __future__ import annotations
import json
from datetime import datetime
from pathlib import Path
from typing import Optional
from uuid import UUID
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor
# ───────────────────────── Palette ─────────────────────────
# GitHub-like colours picked for high contrast on Word's light page
# background: easy to tell apart without being harsh on the eyes.
COLOR_USER = RGBColor.from_string("6F42C1")  # purple
COLOR_ASSISTANT = RGBColor.from_string("1F6FEB")  # blue
COLOR_TOOL_CALL = RGBColor.from_string("BF6310")  # orange (darkened for legibility)
COLOR_TOOL_RESULT = RGBColor.from_string("1A7F37")  # green
COLOR_REASONING = RGBColor.from_string("6E7681")  # mid gray, used with italics
COLOR_SYSTEM = RGBColor.from_string("57606A")  # dark gray
COLOR_META_LABEL = RGBColor.from_string("57606A")
# ───────────────────────── 字体辅助 ─────────────────────────
def _set_run_fonts(run, *, cn_font: str = "宋体", en_font: str = "Times New Roman") -> None:
    """Give ``run`` separate East Asian and Latin typefaces.

    Writes the ``w:eastAsia``, ``w:ascii`` and ``w:hAnsi`` attributes on the
    run's ``w:rFonts`` element, creating and appending that element to
    ``w:rPr`` when it does not exist yet.

    Args:
        run: python-docx run whose underlying XML element is edited in place.
        cn_font: Typeface name stored in ``w:eastAsia``.
        en_font: Typeface name stored in both ``w:ascii`` and ``w:hAnsi``.
    """
    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.find(qn("w:rFonts"))
    if fonts_el is None:
        fonts_el = OxmlElement("w:rFonts")
        run_props.append(fonts_el)
    attr_to_face = (
        ("w:eastAsia", cn_font),
        ("w:ascii", en_font),
        ("w:hAnsi", en_font),
    )
    for attr_name, face in attr_to_face:
        fonts_el.set(qn(attr_name), face)
def _preserve_spaces(run) -> None:
    """Mark every ``w:t`` element of ``run`` with ``xml:space="preserve"``.

    Keeps consecutive spaces from being collapsed, which code blocks and
    indented JSON rely on.
    """
    space_attr = qn("xml:space")
    for text_el in run._element.iter(qn("w:t")):
        text_el.set(space_attr, "preserve")
# ───────────────────────── 文档骨架 ─────────────────────────
def _init_doc() -> Document:
    """Create an empty document with the page geometry and base style set.

    The first section becomes 21 cm x 29.7 cm with 2 cm top/bottom/right
    margins and a 2.5 cm left margin. The ``Normal`` style is set to
    9.5 pt Times New Roman, 1.3 line spacing, no paragraph spacing and no
    first-line indent.

    Returns:
        The freshly created python-docx document.
    """
    document = Document()

    # Applied in this exact order to the first (only) section.
    page_setup = (
        ("page_height", Cm(29.7)),
        ("page_width", Cm(21)),
        ("top_margin", Cm(2.0)),
        ("bottom_margin", Cm(2.0)),
        ("left_margin", Cm(2.5)),
        ("right_margin", Cm(2.0)),
    )
    first_section = document.sections[0]
    for attr_name, length in page_setup:
        setattr(first_section, attr_name, length)

    base_style = document.styles["Normal"]
    base_style.font.name = "Times New Roman"
    base_style.font.size = Pt(9.5)

    base_format = base_style.paragraph_format
    base_format.line_spacing = 1.3
    base_format.space_before = Pt(0)
    base_format.space_after = Pt(0)
    base_format.first_line_indent = None

    return document
# ───────────────────────── 段落原语 ─────────────────────────
def _add_role_header(doc: Document, label: str, color: RGBColor) -> None:
    """Append a bold, coloured heading line that introduces one message.

    Args:
        doc: Document receiving the new paragraph.
        label: Heading text, e.g. ``"[user]"``.
        color: Font colour applied to the heading run.
    """
    heading = doc.add_paragraph()

    layout = heading.paragraph_format
    layout.first_line_indent = None
    layout.space_before = Pt(8)
    layout.space_after = Pt(2)
    heading.alignment = WD_ALIGN_PARAGRAPH.LEFT

    label_run = heading.add_run(label)
    label_font = label_run.font
    label_font.size = Pt(10.5)
    label_font.bold = True
    label_font.color.rgb = color
    # Fonts are applied last, after the size/bold/colour properties exist.
    _set_run_fonts(label_run, cn_font="黑体", en_font="Consolas")
# Code points that XML 1.0 does not allow in character data: C0 controls other
# than TAB/LF/CR, lone surrogates, and the U+FFFE / U+FFFF non-characters.
# Used with str.translate(); a value of None deletes the character.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [
        *range(0x00, 0x09),
        0x0B,
        0x0C,
        *range(0x0E, 0x20),
        *range(0xD800, 0xE000),
        0xFFFE,
        0xFFFF,
    ]
)


def _add_text(
    doc: Document,
    text: str,
    *,
    color: Optional[RGBColor] = None,
    italic: bool = False,
    mono: bool = False,
    size: Pt = Pt(9.5),
    indent_left: Optional[Pt] = None,
) -> None:
    """Append ``text`` to ``doc`` as a single left-aligned paragraph.

    Newlines in ``text`` become line breaks inside the paragraph. Nothing is
    added when ``text`` is empty (before or after sanitising).

    Args:
        doc: Document receiving the paragraph.
        text: Text to render. Non-string values are converted with ``str()``.
        color: Optional font colour for every run.
        italic: Render the text in italics.
        mono: Use the monospaced font pair (新宋体 + Consolas) and mark the
            runs so consecutive spaces are preserved.
        size: Font size of every run.
        indent_left: Optional left indent of the paragraph.
    """
    if not text:
        return
    if not isinstance(text, str):
        text = str(text)

    # Characters such as NUL or ESC (e.g. ANSI colour codes in captured tool
    # output) are rejected by lxml with ValueError and would abort the whole
    # export, so drop them before they reach the XML layer.
    text = text.translate(_XML_ILLEGAL_CHARS)
    # python-docx turns both "\r" and "\n" in run text into line breaks, so a
    # CRLF pair split only on "\n" would otherwise produce a doubled break.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    if not text:
        return

    p = doc.add_paragraph()
    pf = p.paragraph_format
    pf.first_line_indent = None
    pf.line_spacing = 1.25
    pf.space_before = Pt(0)
    pf.space_after = Pt(2)
    if indent_left is not None:
        pf.left_indent = indent_left
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT

    cn_font = "新宋体" if mono else "宋体"
    en_font = "Consolas" if mono else "Times New Roman"

    for i, line in enumerate(text.split("\n")):
        if i > 0:
            # Separate the lines with an explicit break carried by its own run.
            p.add_run().add_break()
        run = p.add_run(line)
        run.font.size = size
        if color is not None:
            run.font.color.rgb = color
        if italic:
            run.italic = True
        _set_run_fonts(run, cn_font=cn_font, en_font=en_font)
        if mono:
            _preserve_spaces(run)
# ───────────────────────── 工具结果裁剪 ─────────────────────────
def _truncate_with_ellipsis(text: str, head: int, tail: int) -> str:
"""前 head + 省略 + 后 tail。整体短于阈值则原样返回。"""
if text is None:
return ""
if len(text) <= head + tail + 80:
return text
omitted = len(text) - head - tail
return f"{text[:head]}\n\n... [omitted {omitted} chars] ...\n\n{text[-tail:]}"
def _format_args(args_str: str) -> str:
"""tool_call 参数若是合法 JSON 就 pretty,否则原样返回。"""
if not args_str:
return ""
try:
parsed = json.loads(args_str)
return json.dumps(parsed, ensure_ascii=False, indent=2)
except Exception:
return args_str
# ───────────────────────── Meta 区块 ─────────────────────────
def _add_meta_block(
    doc: Document, meta: dict, task_state: dict, n_msgs: int, task_dir: Path
) -> None:
    """Append the title line and the two-column metadata table.

    Values are taken from ``meta`` first and fall back to ``task_state``
    where both may carry the field. Empty values are shown as ``-``.

    Args:
        doc: Document receiving the title paragraph and the table.
        meta: Export-level metadata (``id``, ``model``, ``model_profile``,
            ``cwd``, ``created_at``).
        task_state: Task state mapping (description, mode, status, token
            counters, timestamps, ...).
        n_msgs: Number of messages, shown in the table.
        task_dir: Task directory, shown in the table.
    """
    task_id = meta.get("id") or task_state.get("task_id") or "?"

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    p.paragraph_format.first_line_indent = None
    p.paragraph_format.space_before = Pt(0)
    p.paragraph_format.space_after = Pt(4)
    run = p.add_run(f"Task 对话记录 - {task_id}")
    run.font.size = Pt(14)
    run.font.bold = True
    _set_run_fonts(run, cn_font="黑体", en_font="Consolas")

    desc = task_state.get("description") or ""
    mode = task_state.get("mode") or ""
    status = task_state.get("status") or ""
    model = meta.get("model") or task_state.get("model") or ""
    profile = meta.get("model_profile") or task_state.get("model_profile") or ""
    cwd = meta.get("cwd") or task_state.get("cwd") or ""
    created = meta.get("created_at") or task_state.get("created_at") or ""
    updated = task_state.get("updated_at") or ""
    # "or 0" also covers a key that is present with a null value, which would
    # otherwise break the tp + tc sum below.
    tp = task_state.get("tokens_prompt") or 0
    tc = task_state.get("tokens_completion") or 0

    rows = [
        ("Task ID", task_id),
        ("模式", mode),
        ("描述", desc),
        ("状态", status),
        ("模型", model),
        ("Profile", profile),
        ("CWD", cwd),
        ("创建时间", created),
        ("更新时间", updated),
        ("消息数", str(n_msgs)),
        ("Tokens", f"{tp} prompt / {tc} completion / {tp + tc} total"),
        ("Task dir", str(task_dir)),
        ("导出时间", datetime.now().isoformat(timespec="seconds")),
    ]

    table = doc.add_table(rows=len(rows), cols=2)
    try:
        table.style = "Light Grid Accent 1"
    except KeyError:
        # Named style not available in this document: keep the default look.
        pass

    def _fill_cell(cell, text: str, *, is_label: bool) -> None:
        """Replace the cell content with one small run of ``text``."""
        cell.text = ""
        para = cell.paragraphs[0]
        para.paragraph_format.first_line_indent = None
        para.paragraph_format.line_spacing = 1.15
        cell_run = para.add_run(text)
        cell_run.font.size = Pt(9)
        if is_label:
            cell_run.font.bold = True
            cell_run.font.color.rgb = COLOR_META_LABEL
        _set_run_fonts(cell_run, cn_font="宋体", en_font="Times New Roman")

    for table_row, (label, value) in zip(table.rows, rows):
        cells = table_row.cells
        _fill_cell(cells[0], label, is_label=True)
        _fill_cell(cells[1], str(value) if value else "-", is_label=False)
# ───────────────────────── 单条消息渲染 ─────────────────────────
def _content_as_text(content) -> str:
    """Flatten a message ``content`` value into one string.

    Strings are returned unchanged and empty/missing content becomes ``""``.
    NOTE(review): list handling assumes chat-style content parts (dicts with
    a ``text`` and/or ``type`` key) — confirm against the stored payloads.
    Parts without text are shown as a ``[type]`` placeholder; anything else
    is dumped as JSON.
    """
    if not content:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
            elif isinstance(part, dict) and part.get("type"):
                pieces.append(f"[{part['type']}]")
            else:
                pieces.append(json.dumps(part, ensure_ascii=False, default=str))
        return "\n".join(pieces)
    return json.dumps(content, ensure_ascii=False, default=str)


def _render_message(
    doc: Document,
    msg: dict,
    *,
    include_reasoning: bool,
    tool_head: int,
    tool_tail: int,
) -> None:
    """Render one chat message as a role header plus body paragraphs.

    Args:
        doc: Document receiving the paragraphs.
        msg: Message mapping; ``role`` selects the layout.
        include_reasoning: For assistant messages, also render
            ``reasoning_content`` (looked up on the message itself, then in
            ``provider_specific_fields``).
        tool_head: Leading characters kept from a tool result.
        tool_tail: Trailing characters kept from a tool result.
    """
    role = msg.get("role")
    # Content is normalised to text first so that non-string payloads cannot
    # break the string-based truncation and rendering below.
    content = _content_as_text(msg.get("content"))

    if role == "system":
        _add_role_header(doc, "[system]", COLOR_SYSTEM)
        # System prompts are typically 2-5 KB, so they are trimmed as well.
        content = _truncate_with_ellipsis(content, 1500, 500)
        _add_text(doc, content, color=COLOR_SYSTEM, size=Pt(8.5), mono=True)
        return

    if role == "user":
        _add_role_header(doc, "[user]", COLOR_USER)
        _add_text(doc, content, size=Pt(10))
        return

    if role == "assistant":
        _add_role_header(doc, "[assistant]", COLOR_ASSISTANT)
        if include_reasoning:
            rc = msg.get("reasoning_content") or ""
            if not rc:
                psf = msg.get("provider_specific_fields") or {}
                rc = psf.get("reasoning_content") or ""
            if rc:
                _add_text(
                    doc, "▎reasoning",
                    color=COLOR_REASONING, size=Pt(8.5), italic=True,
                )
                _add_text(
                    doc, rc,
                    color=COLOR_REASONING, size=Pt(9), italic=True,
                    indent_left=Pt(12),
                )
        if content:
            _add_text(doc, content, size=Pt(10))
        for call in msg.get("tool_calls") or []:
            fn_obj = call.get("function") or {}
            # "or" (not a .get default) so an explicit null does not print "None".
            fn = fn_obj.get("name") or "?"
            args = fn_obj.get("arguments", "")
            cid = call.get("id") or ""
            _add_text(
                doc, f"▎tool_call -> {fn} ({cid})",
                color=COLOR_TOOL_CALL, size=Pt(9), italic=True,
            )
            _add_text(
                doc, _format_args(args),
                color=COLOR_TOOL_CALL, size=Pt(8.5), mono=True,
                indent_left=Pt(12),
            )
        return

    if role == "tool":
        cid = msg.get("tool_call_id") or ""
        _add_role_header(doc, f"[tool result] ({cid})", COLOR_TOOL_RESULT)
        truncated = _truncate_with_ellipsis(content, tool_head, tool_tail)
        _add_text(
            doc, truncated,
            color=COLOR_TOOL_RESULT, size=Pt(8.5), mono=True,
            indent_left=Pt(12),
        )
        return

    # Fallback for any role not handled above.
    _add_role_header(doc, f"[{role or 'unknown'}]", COLOR_SYSTEM)
    _add_text(doc, content, size=Pt(9.5))
# ───────────────────────── 顶层入口 ─────────────────────────
def export_chat_to_docx(
    task_dir: Path,
    out_path: Optional[Path] = None,
    *,
    include_system: bool = False,
    include_reasoning: bool = True,
    tool_head: int = 1000,
    tool_tail: int = 500,
) -> Path:
    """Render a task's conversation to a .docx file and return its path.

    Messages are read from the database by task id (the name of
    ``task_dir``, which must be a UUID) ordered by ``idx``. ``state.json``
    inside ``task_dir`` is read on a best-effort basis for the metadata
    table; a missing, unreadable or malformed file yields empty metadata.

    Args:
        task_dir: Task directory; a ``str`` path is accepted too.
        out_path: Destination file. Defaults to ``chat_<task_id>.docx``
            inside ``task_dir``.
        include_system: Also render messages whose role is ``system``.
        include_reasoning: Render assistant reasoning content.
        tool_head: Leading characters kept from each tool result.
        tool_tail: Trailing characters kept from each tool result.

    Returns:
        The path the document was written to.

    Raises:
        ValueError: If the name of ``task_dir`` is not a valid UUID.
    """
    task_dir = Path(task_dir)
    try:
        tid = UUID(task_dir.name)
    except ValueError as err:
        raise ValueError(f"task_dir name 不是有效 UUID: {task_dir.name}") from err

    # Storage imports stay function-local, as in the original code.
    from sqlalchemy import select
    from core.storage import session_scope
    from core.storage.models import Message as MessageRow

    with session_scope() as s:
        rows = s.execute(
            select(MessageRow).where(MessageRow.task_id == tid).order_by(MessageRow.idx)
        ).scalars().all()
        # Copy the payloads while the session is still open.
        messages = [dict(r.payload) for r in rows]

    task_state: dict = {}
    try:
        loaded = json.loads((task_dir / "state.json").read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Missing/unreadable file or invalid JSON/encoding: export without meta.
        loaded = None
    # Only a JSON object is usable; every later access goes through .get().
    if isinstance(loaded, dict):
        task_state = loaded

    out_path = task_dir / f"chat_{tid}.docx" if out_path is None else Path(out_path)

    meta = {
        "id": str(tid),
        "model": task_state.get("model", ""),
        "model_profile": task_state.get("model_profile", ""),
        "cwd": task_state.get("cwd", ""),
        "created_at": task_state.get("created_at", ""),
    }

    doc = _init_doc()
    _add_meta_block(doc, meta, task_state, len(messages), task_dir)
    doc.add_paragraph()  # one blank line between the meta table and the body

    for msg in messages:
        if msg.get("role") == "system" and not include_system:
            continue
        _render_message(
            doc, msg,
            include_reasoning=include_reasoning,
            tool_head=tool_head,
            tool_tail=tool_tail,
        )

    out_path.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(out_path))
    return out_path