"""source_to_md.py: 把素材转成干净 Markdown,作为后续策略阶段的输入。
用法:
python source_to_md.py <path> # 自动按扩展名识别
python source_to_md.py <url> # http/https 走 web 抓
python source_to_md.py file.pdf -o source.md
支持:
.pdf → pypdf 提取文本
.docx → python-docx 段落
.pptx → python-pptx 提取每页文字
.txt/.md → 直读
URL → requests + 简易 HTML 剥离
设计原则:模型在策略阶段只看 Markdown,不读二进制 / 不爬复杂排版。
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
from urllib.parse import urlparse
def from_pdf(path: Path) -> str:
    """Convert a PDF into Markdown with one ``## Page N`` section per page.

    The document title is the file stem. Pages whose extracted text is empty
    after stripping are omitted, but page numbers are 1-based positions in the
    PDF, so numbering may skip over omitted pages.

    Args:
        path: Location of the PDF file.

    Returns:
        The Markdown text, or the string ``"[error] pip install pypdf"`` when
        the ``pypdf`` package cannot be imported.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        return "[error] pip install pypdf"

    pdf = PdfReader(str(path))
    # extract_text() may yield nothing for a page; treat that as "".
    numbered_text = (
        (number, (page.extract_text() or "").strip())
        for number, page in enumerate(pdf.pages, start=1)
    )
    sections = [f"# {path.stem}\n"]
    sections.extend(
        f"\n## Page {number}\n\n{body}\n"
        for number, body in numbered_text
        if body
    )
    return "\n".join(sections)
def from_docx(path: Path) -> str:
    """Convert the paragraphs of a .docx file into Markdown.

    The document title is the file stem. Each non-blank paragraph in
    ``document.paragraphs`` becomes its own block: paragraphs whose style name
    contains "heading 1", "heading 2" or "heading 3" (case-insensitive) are
    emitted as ``##``, ``###`` and ``####`` headings respectively; every other
    paragraph is emitted as plain text.

    Args:
        path: Location of the .docx file.

    Returns:
        The Markdown text, or the string ``"[error] pip install python-docx"``
        when the ``docx`` package cannot be imported.
    """
    try:
        from docx import Document
    except ImportError:
        return "[error] pip install python-docx"

    # Style-name fragment -> Markdown marker; the first matching entry wins.
    # Levels are shifted down by one because "#" is taken by the file title.
    heading_markers = (
        ("heading 1", "##"),
        ("heading 2", "###"),
        ("heading 3", "####"),
    )

    document = Document(str(path))
    chunks = [f"# {path.stem}\n"]
    for paragraph in document.paragraphs:
        body = paragraph.text.strip()
        if not body:
            continue

        # A paragraph may have no style, and a style may have no name.
        style_name = ""
        if paragraph.style:
            style_name = (paragraph.style.name or "").lower()

        marker = next(
            (md for fragment, md in heading_markers if fragment in style_name),
            None,
        )
        if marker is None:
            chunks.append(f"\n{body}\n")
        else:
            chunks.append(f"\n{marker} {body}\n")
    return "".join(chunks)
def from_pptx(path: Path) -> str:
    """Convert the text of a .pptx deck into Markdown, one section per slide.

    The document title is the file stem. Every slide gets a ``## Slide N``
    heading (1-based), even when it contributes no text. Under it, the
    stripped text of each shape that has a text frame is emitted as its own
    block, in the order the shapes are iterated; shapes with empty text are
    skipped.

    Args:
        path: Location of the .pptx file.

    Returns:
        The Markdown text, or the string ``"[error] pip install python-pptx"``
        when the ``pptx`` package cannot be imported.
    """
    try:
        from pptx import Presentation
    except ImportError:
        return "[error] pip install python-pptx"

    deck = Presentation(str(path))
    chunks = [f"# {path.stem}\n"]
    for number, slide in enumerate(deck.slides, start=1):
        chunks.append(f"\n## Slide {number}\n")
        # Only shapes that report a text frame are read for text.
        shape_texts = (
            shape.text_frame.text.strip()
            for shape in slide.shapes
            if shape.has_text_frame
        )
        chunks.extend(f"\n{body}\n" for body in shape_texts if body)
    return "".join(chunks)
def from_text(path: Path) -> str:
    """Read a .txt/.md file and return its contents as text.

    The file is decoded as UTF-8. Bytes that cannot be decoded are replaced
    with U+FFFD instead of raising, and a leading UTF-8 byte-order mark is
    dropped so it does not end up glued to the first Markdown line.

    Args:
        path: Location of the text file.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # "utf-8-sig" decodes exactly like "utf-8" except that it strips a leading
    # BOM when one is present; files without a BOM are read unchanged.
    return path.read_text(encoding="utf-8-sig", errors="replace")
# Matches one tag-like span: "<", one or more characters other than ">",
# then ">".
_TAG_RE = re.compile(r"<[^>]+>")
# Matches a run of three or more consecutive newlines.
# NOTE(review): the name suggests it is used to collapse excess blank lines;
# the call site is not visible in this part of the file — confirm.
_WS_RE = re.compile(r"\n{3,}")
def from_url(url: str) -> str:
try:
import requests
except ImportError:
return "[error] pip install requests"
r = requests.get(url, timeout=30, headers={
"User-Agent": "Mozilla/5.0 (compatible; ppt-source-to-md/1.0)"
})
r.raise_for_status()
html = r.text
# 极简剥离:script/style 删,标签去除
html = re.sub(r"