diff --git a/PROGRESS.md b/PROGRESS.md index 078400b..f80b2be 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -22,7 +22,7 @@ ## 已完成关键能力 -**Phase 1-3**(2026 早期):骨架 + skill 系统 + run_python。所有工具基目录是用户当前 cwd(不是 zcbot 仓库本身),agent 操作的是用户项目。`tools/fs.py` 的 `edit` 用 CoreCoder 风格唯一匹配。`tools/run_python.py` 过滤 `*API_KEY *TOKEN *SECRET *PASSWORD *PRIVATE_KEY` 环境变量。三个 skill 中 `ppt/` 最完整(v3:商务红硬约束 + apply_brand 品牌条 + Iconify 图标库 + scripts:fetch_icon / quality_check / source_to_md / render_icon)。 +**Phase 1-3**(2026 早期):骨架 + skill 系统 + run_python。所有工具基目录是用户当前 cwd(不是 zcbot 仓库本身),agent 操作的是用户项目。`tools/fs.py` 的 `edit` 用 CoreCoder 风格唯一匹配。`tools/run_python.py` 过滤 `*API_KEY *TOKEN *SECRET *PASSWORD *PRIVATE_KEY` 环境变量。三个 skill 中 `ppt/` 最完整(v3:商务红硬约束 + apply_brand 品牌条 + Iconify 图标库 + scripts:fetch_icon / quality_check / render_icon;素材摄取改用 markitdown CLI)。 **Phase 4**(2026-05-06): - `core/probe.py` + `cli.py probe` —— basic_chat / parallel_tools / thinking_mode / long_context 四项探测 diff --git a/requirements.txt b/requirements.txt index 78e9f5d..50d5740 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,6 @@ rich>=13.7.0 python-pptx>=0.6.21 python-docx>=1.1.0 matplotlib>=3.8.0 + +# 素材摄取: PDF/DOCX/PPTX/XLSX/HTML/URL → Markdown (ppt 阶段零 + proposal 阶段零) +markitdown[pdf,docx,pptx,xlsx]>=0.1.0 diff --git a/skills/ppt/SKILL.md b/skills/ppt/SKILL.md index 41191e8..7effffe 100644 --- a/skills/ppt/SKILL.md +++ b/skills/ppt/SKILL.md @@ -12,7 +12,7 @@ description: 生成 PowerPoint 演示文稿 (.pptx)。当用户要求做汇报 P - `references/layouts.md` —— 9 种版式的 python-pptx 起手代码 + 安全区/越界保护 + `apply_brand` 品牌条 - `references/icons.md` —— 业务图标两层:Iconify (在线/本地缓存) / unicode 字形兜底 - `assets/icons/` —— 本地图标缓存 (Iconify 拉过的图存这,见 `INDEX.md` 推荐清单) -- `scripts/source_to_md.py` —— PDF/DOCX/PPTX/URL → 干净 Markdown +- 素材摄取: 直接用 `markitdown` CLI (PDF/DOCX/PPTX/XLSX/HTML/URL → 干净 Markdown) - `scripts/fetch_icon.py` —— 从 Iconify CDN 拉 SVG/PNG (染主题色,缓存本地) - `scripts/render_icon.py` —— unicode 字形 → 透明 PNG (Iconify 没有时兜底) - 
`scripts/quality_check.py` —— 产物 .pptx 验收 (越界 / 文本溢出 / 颜色一致) @@ -94,7 +94,7 @@ python /scripts/quality_check.py / --spec / -├── source.md # source_to_md.py 转出的素材 +├── source.md # markitdown 转出的素材 ├── spec_lock.md # 八条对齐落定 ├── slides/ │ └── chart_p3.png # 各页用到的图片素材 diff --git a/skills/ppt/scripts/source_to_md.py b/skills/ppt/scripts/source_to_md.py deleted file mode 100644 index ffd1285..0000000 --- a/skills/ppt/scripts/source_to_md.py +++ /dev/null @@ -1,157 +0,0 @@ -"""source_to_md.py: 把素材转成干净 Markdown,作为后续策略阶段的输入。 - -用法: - python source_to_md.py # 自动按扩展名识别 - python source_to_md.py # http/https 走 web 抓 - python source_to_md.py file.pdf -o source.md - -支持: - .pdf → pypdf 提取文本 - .docx → python-docx 段落 - .pptx → python-pptx 提取每页文字 - .txt/.md → 直读 - URL → requests + 简易 HTML 剥离 - -设计原则:模型在策略阶段只看 Markdown,不读二进制 / 不爬复杂排版。 -""" -from __future__ import annotations - -import argparse -import re -import sys -from pathlib import Path -from urllib.parse import urlparse - - -def from_pdf(path: Path) -> str: - try: - from pypdf import PdfReader - except ImportError: - return "[error] pip install pypdf" - reader = PdfReader(str(path)) - parts = [f"# {path.stem}\n"] - for i, page in enumerate(reader.pages, 1): - text = (page.extract_text() or "").strip() - if text: - parts.append(f"\n## Page {i}\n\n{text}\n") - return "\n".join(parts) - - -def from_docx(path: Path) -> str: - try: - from docx import Document - except ImportError: - return "[error] pip install python-docx" - doc = Document(str(path)) - parts = [f"# {path.stem}\n"] - for para in doc.paragraphs: - text = para.text.strip() - if not text: - continue - style = (para.style.name or "").lower() if para.style else "" - if "heading 1" in style: - parts.append(f"\n## {text}\n") - elif "heading 2" in style: - parts.append(f"\n### {text}\n") - elif "heading 3" in style: - parts.append(f"\n#### {text}\n") - else: - parts.append(f"\n{text}\n") - return "".join(parts) - - -def from_pptx(path: Path) -> str: - try: - from pptx import 
Presentation - except ImportError: - return "[error] pip install python-pptx" - prs = Presentation(str(path)) - parts = [f"# {path.stem}\n"] - for i, slide in enumerate(prs.slides, 1): - parts.append(f"\n## Slide {i}\n") - for shape in slide.shapes: - if shape.has_text_frame: - txt = shape.text_frame.text.strip() - if txt: - parts.append(f"\n{txt}\n") - return "".join(parts) - - -def from_text(path: Path) -> str: - return path.read_text(encoding="utf-8", errors="replace") - - -_TAG_RE = re.compile(r"<[^>]+>") -_WS_RE = re.compile(r"\n{3,}") - - -def from_url(url: str) -> str: - try: - import requests - except ImportError: - return "[error] pip install requests" - r = requests.get(url, timeout=30, headers={ - "User-Agent": "Mozilla/5.0 (compatible; ppt-source-to-md/1.0)" - }) - r.raise_for_status() - html = r.text - - # 极简剥离:script/style 删,标签去除 - html = re.sub(r"", "", html, flags=re.I) - html = re.sub(r"", "", html, flags=re.I) - - title_m = re.search(r"]*>([^<]+)", html, re.I) - title = title_m.group(1).strip() if title_m else url - - # 块级标签转换行 - html = re.sub(r"]*>", "\n", html, flags=re.I) - text = _TAG_RE.sub("", html) - text = re.sub(r" ", " ", text) - text = re.sub(r"&", "&", text) - text = re.sub(r"<", "<", text) - text = re.sub(r">", ">", text) - text = re.sub(r""", '"', text) - text = "\n".join(line.strip() for line in text.splitlines()) - text = _WS_RE.sub("\n\n", text).strip() - - return f"# {title}\n\nSource: {url}\n\n{text}\n" - - -def dispatch(src: str) -> str: - parsed = urlparse(src) - if parsed.scheme in ("http", "https"): - return from_url(src) - - path = Path(src) - if not path.exists(): - return f"[error] not found: {src}" - - ext = path.suffix.lower() - if ext == ".pdf": - return from_pdf(path) - if ext == ".docx": - return from_docx(path) - if ext == ".pptx": - return from_pptx(path) - if ext in (".txt", ".md"): - return from_text(path) - return f"[error] unsupported extension: {ext}" - - -def main(): - ap = argparse.ArgumentParser() - 
ap.add_argument("src", help="文件路径或 http(s) URL") - ap.add_argument("-o", "--output", type=Path, default=None, - help="写到文件;默认打印到 stdout") - args = ap.parse_args() - - md = dispatch(args.src) - if args.output: - args.output.write_text(md, encoding="utf-8") - print(f"[ok] {args.output} ({len(md)} chars)") - else: - sys.stdout.write(md) - - -if __name__ == "__main__": - main() diff --git a/skills/proposal/SKILL.md b/skills/proposal/SKILL.md index 67061e9..25391a9 100644 --- a/skills/proposal/SKILL.md +++ b/skills/proposal/SKILL.md @@ -21,29 +21,15 @@ description: 撰写中国科研项目申报书 / 课题任务书 (国家重点 - `/scripts/word_count.py` —— 章节字数 vs 预算 - `/scripts/quality_check.py` —— 结构完整性 / 假大空话术 / 占位符未替换 / 指南覆盖度 (--spec 选项) -## 阶段零: 摄取素材 (有 PDF/DOCX 时才走) +## 阶段零: 摄取素材 (有 PDF/DOCX/XLSX/URL 时才走) -用户给指南 PDF / 团队介绍 DOCX / 预算 XLSX → 先转成 `/source/.md`,后续阶段一才能读。用 `run_python` 即可,不需要新工具: +用户给指南 PDF / 团队介绍 DOCX / 预算 XLSX / 政策网页 URL → 先转成 `/source/.md`,后续阶段一才能读。统一用 `markitdown` CLI,表格 / 列表 / 标题层级会自动保留: -```python -# PDF (指南文件) -from pypdf import PdfReader -text = "\n\n".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages) -Path("/source/guide.md").write_text(text, encoding="utf-8") - -# DOCX (团队/前期成果) -from docx import Document -doc = Document(docx_path) -md = "\n".join(p.text for p in doc.paragraphs if p.text.strip()) -# 表格 -for t in doc.tables: - for row in t.rows: md += "\n| " + " | ".join(c.text.strip() for c in row.cells) + " |" - -# XLSX (预算) -from openpyxl import load_workbook -wb = load_workbook(xlsx_path) -for ws in wb.worksheets: - for row in ws.iter_rows(values_only=True): print(row) +```bash +markitdown /guide.pdf -o /source/guide.md +markitdown /team.docx -o /source/team.md +markitdown /budget.xlsx -o /source/budget.md +markitdown https://example.com/x -o /source/policy.md ``` 转完后 spec_lock 阶段直接 `read /source/*.md` 拿事实,不要凭印象写。