zcbot/skills/ppt/scripts/total_md_split.py

373 lines
11 KiB
Python

#!/usr/bin/env python3
"""
PPT Master - Speaker Notes Splitting Tool
Splits the total.md speaker notes file into multiple individual notes files,
each corresponding to one SVG page.
Usage:
python3 scripts/total_md_split.py <project_path>
python3 scripts/total_md_split.py <project_path> -o output_dir
Examples:
python3 scripts/total_md_split.py projects/<svg_title>_ppt169_YYYYMMDD
python3 scripts/total_md_split.py projects/<svg_title>_ppt169_YYYYMMDD -o notes
Dependencies:
None (only uses standard library)
Notes:
- Checks the one-to-one mapping between SVG files and speaker notes
- Outputs a notice if any SVG file has no corresponding notes
- Split documents do not include the level-1 heading
- Split document names match the SVG filenames with .md extension
"""
import sys
# zcbot: Windows GBK console compatibility - force UTF-8 output with
# replacement so emoji, "©" and similar characters cannot trigger
# UnicodeEncodeError when printed.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Deliberately best effort: the stream may not support reconfigure()
        # (e.g. it was replaced or is missing). Each stream is handled on its
        # own so a failure on one does not skip the other.
        pass
import argparse
import re
from pathlib import Path
# Markdown heading line: one to six '#' characters, optional whitespace, then
# the title. Group 1 is the '#' run, group 2 the title text. The whitespace
# after the '#' run is optional, so "#title" also counts as a heading.
HEADING_RE = re.compile(r'^(#{1,6})\s*(.+?)\s*$')
# Horizontal-rule line: three or more '-' / '*' characters (they may be mixed),
# optionally surrounded by whitespace.
HR_RE = re.compile(r'^\s*[-*]{3,}\s*$')
def normalize_title(title: str) -> str:
    """Return a lowercase, underscore-joined key used for fuzzy title matching.

    The title is cut at every run of characters that are not ASCII letters,
    ASCII digits or CJK ideographs (U+4E00-U+9FFF); the remaining pieces are
    joined with single underscores and lowercased. A falsy title yields ''.
    """
    if not title:
        return ''
    # Splitting on separator runs and re-joining the non-empty pieces gives the
    # same result as replacing runs with '_' and trimming the ends.
    pieces = re.split(r'[^0-9A-Za-z\u4e00-\u9fff]+', title.strip())
    return '_'.join(piece for piece in pieces if piece).lower()
def extract_leading_number(text: str) -> int | None:
"""Extract leading slide number if present."""
if not text:
return None
# Try 1: Start with digits (standard)
m = re.match(r'^(\d{1,3})', text.strip())
if m:
return int(m.group(1))
# Try 2: Common prefixes (Slide X, Page X, 第X页)
# Case insensitive for English
text_lower = text.lower().strip()
# Slide/Page X
m = re.match(r'^(?:slide|page|p)\s*[-_:]?\s*(\d{1,3})', text_lower)
if m:
return int(m.group(1))
# 第X页/张
m = re.match(r'^第\s*(\d{1,3})\s*[页张]', text_lower)
if m:
return int(m.group(1))
return None
def build_match_maps(svg_stems: list[str]) -> tuple[set[str], dict[str, list[str]], dict[int, list[str]]]:
    """Index SVG stems three ways for heading lookup.

    Returns:
        A tuple ``(exact, by_norm, by_number)``:
        - the set of stems as given,
        - normalized title -> stems sharing that normalized form,
        - leading slide number -> stems starting with that number.
        Stems whose normalized form is empty, or that carry no leading number,
        are left out of the respective map.
    """
    by_norm: dict[str, list[str]] = {}
    by_number: dict[int, list[str]] = {}
    for name in svg_stems:
        if key := normalize_title(name):
            by_norm.setdefault(key, []).append(name)
        if (slide_no := extract_leading_number(name)) is not None:
            by_number.setdefault(slide_no, []).append(name)
    return set(svg_stems), by_norm, by_number
def match_title(
raw_title: str,
exact: set[str],
norm_map: dict[str, list[str]],
num_map: dict[int, list[str]],
svg_stems: list[str] | None = None,
) -> str | None:
"""Match a note heading to its corresponding SVG stem."""
if raw_title in exact:
return raw_title
norm = normalize_title(raw_title)
if norm in norm_map and len(norm_map[norm]) == 1:
return norm_map[norm][0]
num = extract_leading_number(raw_title)
if num is not None and num in num_map and len(num_map[num]) == 1:
return num_map[num][0]
if norm and svg_stems:
candidates = [s for s in svg_stems if norm in normalize_title(s)]
if len(candidates) == 1:
return candidates[0]
return None
def find_svg_files(project_path: Path) -> list[Path]:
    """
    Collect the SVG pages of a project.

    Args:
        project_path: Project directory path

    Returns:
        The ``*.svg`` files directly inside ``<project_path>/svg_output``,
        sorted; an empty list (after printing an error) when that directory
        does not exist.
    """
    svg_dir = project_path / 'svg_output'
    if svg_dir.exists():
        found = list(svg_dir.glob('*.svg'))
        found.sort()
        return found
    print(f"Error: {svg_dir} directory does not exist")
    return []
def _store_section(notes: dict[str, str], key: str | None, lines: list[str]) -> None:
    """Flush the lines collected for *key* into *notes*, merging repeated keys."""
    if key is None:
        return
    text = '\n'.join(lines).strip()
    if key not in notes:
        notes[key] = text
    elif text:
        # The same slide was matched by more than one heading: append the new
        # text to the existing notes, separated by a blank line.
        notes[key] = (notes[key].rstrip() + "\n\n" + text).strip()


def parse_total_md(
    md_path: Path,
    svg_stems: list[str] | None = None,
    verbose: bool = True,
) -> dict[str, str]:
    """
    Parse total.md and collect the speaker notes belonging to each SVG page.

    A heading line (any level from # to ######) that can be matched to an SVG
    stem starts a new section. Headings that match no stem do not start a
    section: inside a section they stay part of that section's text, before the
    first section they are dropped. Horizontal-rule lines are skipped. The
    file is decoded as UTF-8 with an optional byte order mark, so a BOM written
    by an editor does not hide the first heading.

    Args:
        md_path: Path to total.md file
        svg_stems: SVG file stems that headings are matched against
        verbose: Whether to print a notice listing unmatched headings

    Returns:
        Dictionary where key is the matched SVG stem and value is the notes
        content of that section (heading line excluded). Empty when the file is
        missing or unreadable (an error is printed in that case).
    """
    if not md_path.exists():
        print(f"Error: {md_path} file does not exist")
        return {}
    try:
        # utf-8-sig decodes plain UTF-8 too, but strips a leading BOM that
        # would otherwise keep the first line from matching HEADING_RE.
        content = md_path.read_text(encoding='utf-8-sig')
    except (OSError, UnicodeError) as e:
        print(f"Error: Unable to read file {md_path}: {e}")
        return {}

    svg_stems = svg_stems or []
    exact, norm_map, num_map = build_match_maps(svg_stems)

    notes: dict[str, str] = {}
    current_key: str | None = None
    current_lines: list[str] = []
    unmatched_headings: list[str] = []

    for line in content.splitlines():
        heading = HEADING_RE.match(line)
        if heading:
            raw_title = heading.group(2).strip()
            matched = match_title(raw_title, exact, norm_map, num_map, svg_stems)
            if matched:
                _store_section(notes, current_key, current_lines)
                current_key = matched
                current_lines = []
                continue
            # Not a slide heading: remember it for the notice and fall through
            # so it is treated like any other line.
            unmatched_headings.append(raw_title)
        if HR_RE.match(line):
            continue
        if current_key is not None:
            current_lines.append(line)

    # Flush the section that was still open at end of file.
    _store_section(notes, current_key, current_lines)

    if verbose and unmatched_headings:
        print("\n[Notice] Found unmatched headings (ignored):")
        for t in unmatched_headings[:10]:
            print(f" - {t}")
        if len(unmatched_headings) > 10:
            print(f" ... and {len(unmatched_headings) - 10} more")
    return notes
def check_svg_note_mapping(svg_files: list[Path], notes: dict[str, str]) -> tuple[bool, list[str]]:
    """
    Verify that every SVG file has an entry in the notes dictionary.

    Args:
        svg_files: List of SVG files
        notes: Notes dictionary, looked up by SVG file stem

    Returns:
        (whether every SVG has notes, stems of the SVG files without notes)
    """
    # A stem counts as covered as soon as it is a key, even with empty text.
    missing_notes = [svg.stem for svg in svg_files if svg.stem not in notes]
    return not missing_notes, missing_notes
def split_notes(notes: dict[str, str], output_dir: Path, verbose: bool = True) -> bool:
    """
    Write every notes entry to its own ``<key>.md`` file in the output directory.

    Args:
        notes: Notes dictionary (key becomes the file name, value is the content)
        output_dir: Output directory (created, with parents, if missing)
        verbose: Whether to output progress information; errors are printed
            regardless so that quiet mode never hides a failure

    Returns:
        True only when every entry was written. False when *notes* is empty,
        the output directory cannot be created, or any file fails to write.
    """
    if not notes:
        print("Error: No notes content found")
        return False

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # e.g. a path component is an existing file, or permission is denied.
        print(f"Error: Unable to create directory {output_dir}: {e}")
        return False

    success_count = 0
    for title, content in notes.items():
        # Same name as the SVG file, with .md extension
        output_path = output_dir / f"{title}.md"
        try:
            output_path.write_text(content, encoding='utf-8')
        except (OSError, UnicodeError) as e:
            print(f" Error: Unable to write file {output_path}: {e}")
            continue
        success_count += 1
        if verbose:
            print(f" Generated: {output_path.name}")

    if verbose:
        print(f"\n[Done] Successfully generated {success_count}/{len(notes)} file(s)")
    return success_count == len(notes)
def main() -> None:
    """Run the CLI: locate SVGs, parse total.md, verify coverage, write notes.

    Always terminates through ``sys.exit``: status 0 when all notes files were
    written, status 1 when the project path, SVG files or notes are missing,
    an SVG has no notes, or writing fails.
    """
    cli = argparse.ArgumentParser(
        description='PPT Master - Speaker Notes Splitting Tool',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
%(prog)s projects/<svg_title>_ppt169_YYYYMMDD
%(prog)s projects/<svg_title>_ppt169_YYYYMMDD -o notes
%(prog)s projects/<svg_title>_ppt169_YYYYMMDD -q
Features:
- Reads the total.md speaker notes file
- Checks the mapping between SVG files and notes
- Splits notes into multiple individual files
- Output filenames match SVG filenames
''',
    )
    cli.add_argument('project_path', type=str, help='Project directory path')
    cli.add_argument(
        '-o',
        '--output',
        type=str,
        default=None,
        help='Output directory path (default: notes directory under project)',
    )
    cli.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    options = cli.parse_args()

    project_path = Path(options.project_path)
    if not project_path.exists():
        print(f"Error: Path does not exist: {project_path}")
        sys.exit(1)

    # Notes land next to total.md unless -o points somewhere else.
    output_dir = Path(options.output) if options.output else project_path / 'notes'
    verbose = not options.quiet

    if verbose:
        for banner_line in (
            "PPT Master - Speaker Notes Splitting Tool",
            "=" * 50,
            f" Project path: {project_path}",
            f" Output directory: {output_dir}",
            "",
        ):
            print(banner_line)

    # Step 1: the SVG pages define which notes sections are expected.
    svg_files = find_svg_files(project_path)
    if not svg_files:
        print("Error: No SVG files found")
        sys.exit(1)
    if verbose:
        print(f" Found {len(svg_files)} SVG file(s)")

    # Step 2: parse total.md against the SVG stems.
    stems = [svg.stem for svg in svg_files]
    notes = parse_total_md(project_path / 'notes' / 'total.md', stems, verbose)
    if not notes:
        print("Error: No notes content found")
        sys.exit(1)
    if verbose:
        print(f" Found {len(notes)} notes section(s)")
        print()

    # Step 3: refuse to write anything unless every SVG has notes.
    all_match, missing_notes = check_svg_note_mapping(svg_files, notes)
    if not all_match:
        print("Error: SVG files and notes do not match")
        print(f" Missing notes: {', '.join(missing_notes)}")
        print("\nPlease regenerate the notes file to ensure every SVG has corresponding notes.")
        sys.exit(1)
    if verbose:
        print("[OK] SVG files and notes have one-to-one correspondence")
        print()

    # Step 4: write one file per section.
    if not split_notes(notes, output_dir, verbose):
        print("\n[Failed] Notes splitting failed")
        sys.exit(1)
    if verbose:
        print("\n[Done] Notes splitting complete")
    sys.exit(0)


if __name__ == '__main__':
    main()