# zcbot/skills/ppt/scripts/total_md_split.py

#!/usr/bin/env python3
"""
PPT Master - Speaker Notes Splitting Tool

Splits the total.md speaker notes file into multiple individual notes files,
each corresponding to one SVG page.

Usage:
    python3 scripts/total_md_split.py <project_path>
    python3 scripts/total_md_split.py <project_path> -o output_dir

Examples:
    python3 scripts/total_md_split.py projects/<svg_title>_ppt169_YYYYMMDD
    python3 scripts/total_md_split.py projects/<svg_title>_ppt169_YYYYMMDD -o notes

Dependencies:
    None (only uses standard library)

Notes:
    - Checks that every SVG file has a matching speaker-notes section
    - Reports an error and exits if any SVG file has no corresponding notes
    - Split documents do not include the matched slide heading (any level, # to ######)
    - Split document names match the SVG filenames with the .md extension
"""

import sys
try:  # zcbot: Windows GBK console compatibility; avoids UnicodeEncodeError from emoji/© and similar characters
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    sys.stderr.reconfigure(encoding="utf-8", errors="replace")
except Exception:
    # Best effort: any failure here is ignored and the streams keep their existing configuration.
    pass
import argparse
import re
from pathlib import Path

# Markdown heading line: 1-6 '#' characters (group 1), optional whitespace,
# then the title text (group 2) with trailing whitespace excluded.
HEADING_RE = re.compile(r'^(#{1,6})\s*(.+?)\s*$')
# Separator line made up solely of three or more '-' / '*' characters,
# optionally surrounded by whitespace.
HR_RE = re.compile(r'^\s*[-*]{3,}\s*$')


def normalize_title(title: str) -> str:
    """Return a lowercase, underscore-joined form of *title* for fuzzy matching with SVG stems.

    Every run of characters other than ASCII letters, ASCII digits and CJK
    ideographs (U+4E00-U+9FFF) acts as a separator; the remaining pieces are
    joined with single underscores. An empty title yields ''.
    """
    if not title:
        return ''
    # Splitting on separator runs leaves empty strings at the edges when the
    # title starts or ends with a separator; those are dropped before joining.
    pieces = re.split(r'[^0-9A-Za-z\u4e00-\u9fff]+', title.strip())
    return '_'.join(piece for piece in pieces if piece).lower()


def extract_leading_number(text: str) -> int | None:
    """Extract leading slide number if present."""
    if not text:
        return None

    # Try 1: Start with digits (standard)
    m = re.match(r'^(\d{1,3})', text.strip())
    if m:
        return int(m.group(1))

    # Try 2: Common prefixes (Slide X, Page X, 第X页)
    # Case insensitive for English
    text_lower = text.lower().strip()

    # Slide/Page X
    m = re.match(r'^(?:slide|page|p)\s*[-_:]?\s*(\d{1,3})', text_lower)
    if m:
        return int(m.group(1))

    # 第X页/张
    m = re.match(r'^第\s*(\d{1,3})\s*[页张]', text_lower)
    if m:
        return int(m.group(1))

    return None


def build_match_maps(svg_stems: list[str]) -> tuple[set[str], dict[str, list[str]], dict[int, list[str]]]:
    """Index SVG stems three ways for heading lookup.

    Returns:
        A tuple (exact, norm_map, num_map):
            exact:    the set of stems exactly as given
            norm_map: normalized stem -> stems sharing that normalized form
                      (stems that normalize to '' are left out)
            num_map:  leading slide number -> stems starting with that number
    """
    exact_stems = set(svg_stems)
    by_normalized: dict[str, list[str]] = {}
    by_number: dict[int, list[str]] = {}

    for stem in svg_stems:
        normalized = normalize_title(stem)
        number = extract_leading_number(stem)
        if normalized:
            by_normalized.setdefault(normalized, []).append(stem)
        if number is not None:
            by_number.setdefault(number, []).append(stem)

    return exact_stems, by_normalized, by_number


def match_title(
    raw_title: str,
    exact: set[str],
    norm_map: dict[str, list[str]],
    num_map: dict[int, list[str]],
    svg_stems: list[str] | None = None,
) -> str | None:
    """Resolve a notes heading to the SVG stem it belongs to.

    Strategies are tried in order and the first unambiguous hit wins:
        1. the heading equals a stem verbatim
        2. exactly one stem has the same normalized form
        3. exactly one stem has the same leading slide number
        4. exactly one stem's normalized form contains the normalized heading
           (only when svg_stems is supplied)

    Returns:
        The matching stem, or None when nothing matches or a match is ambiguous
    """
    # 1. Verbatim match
    if raw_title in exact:
        return raw_title

    # 2. Unique normalized match
    norm = normalize_title(raw_title)
    same_norm = norm_map.get(norm, [])
    if len(same_norm) == 1:
        return same_norm[0]

    # 3. Unique slide-number match
    num = extract_leading_number(raw_title)
    if num is not None:
        same_number = num_map.get(num, [])
        if len(same_number) == 1:
            return same_number[0]

    # 4. Unique substring match on normalized forms
    if norm and svg_stems:
        containing = [stem for stem in svg_stems if norm in normalize_title(stem)]
        if len(containing) == 1:
            return containing[0]

    return None


def find_svg_files(project_path: Path) -> list[Path]:
    """
    Collect the SVG pages of a project

    Args:
        project_path: Project directory path

    Returns:
        Paths matching svg_output/*.svg in sorted order, or an empty list
        (after printing an error) when svg_output does not exist
    """
    svg_dir = project_path / 'svg_output'

    if svg_dir.exists():
        matches = list(svg_dir.glob('*.svg'))
        matches.sort()
        return matches

    print(f"Error: {svg_dir} directory does not exist")
    return []


def _store_section(notes: dict[str, str], key: str | None, lines: list[str]) -> None:
    """Flush the buffered lines of one section into *notes* under *key*.

    A key seen for the first time is stored even when its text is empty; text
    for an already-stored key is appended after a blank line.
    """
    if key is None:
        return
    text = '\n'.join(lines).strip()
    if key not in notes:
        notes[key] = text
    elif text:
        notes[key] = (notes[key].rstrip() + "\n\n" + text).strip()


def parse_total_md(
    md_path: Path,
    svg_stems: list[str] | None = None,
    verbose: bool = True,
) -> dict[str, str]:
    """
    Parse total.md and collect the speaker notes belonging to each SVG page

    A heading of any level (# through ######) that can be matched to an SVG
    stem starts a new section; the heading line itself is not kept. Headings
    that match no stem are reported (when verbose) and stay part of the
    current section's text, or are dropped if no section has started yet.
    Horizontal-rule lines are removed. Sections that resolve to the same stem
    are concatenated, separated by a blank line.

    Args:
        md_path: Path to total.md file
        svg_stems: SVG filenames without extension; headings are matched
            against these. With no stems, no heading can match.
        verbose: Whether to print the list of unmatched headings

    Returns:
        Dictionary where key is the matched SVG stem and value is the notes
        content. Empty when the file is missing or unreadable (an error is
        printed) or when no heading matched.
    """
    if not md_path.exists():
        print(f"Error: {md_path} file does not exist")
        return {}

    try:
        # utf-8-sig drops a leading BOM (common in files saved by Windows
        # editors); with plain utf-8 the BOM would sit in front of the first
        # '#' and that heading would never be recognized.
        with open(md_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error: Unable to read file {md_path}: {e}")
        return {}

    svg_stems = svg_stems or []
    exact, norm_map, num_map = build_match_maps(svg_stems)

    notes: dict[str, str] = {}
    current_key: str | None = None
    current_lines: list[str] = []
    unmatched_headings: list[str] = []

    for line in content.splitlines():
        m = HEADING_RE.match(line)
        if m:
            raw_title = m.group(2).strip()
            matched = match_title(raw_title, exact, norm_map, num_map, svg_stems)
            if matched:
                # A new slide section begins: store what was buffered so far.
                _store_section(notes, current_key, current_lines)
                current_key = matched
                current_lines = []
                continue
            # Not a slide heading; it falls through and is kept as body text.
            unmatched_headings.append(raw_title)

        if HR_RE.match(line):
            continue
        if current_key is not None:
            current_lines.append(line)

    # Store the final section, which has no following heading to trigger it.
    _store_section(notes, current_key, current_lines)

    if verbose and unmatched_headings:
        print("\n[Notice] Found unmatched headings (ignored):")
        for t in unmatched_headings[:10]:
            print(f"  - {t}")
        if len(unmatched_headings) > 10:
            print(f"  ... and {len(unmatched_headings) - 10} more")

    return notes


def check_svg_note_mapping(svg_files: list[Path], notes: dict[str, str]) -> tuple[bool, list[str]]:
    """
    Verify that every SVG file has an entry in the notes dictionary

    Args:
        svg_files: List of SVG files
        notes: Notes dictionary (key is the SVG filename without extension)

    Returns:
        (True when nothing is missing, stems of SVG files without notes in input order)
    """
    # A page counts as covered when its stem is a key of the notes dictionary.
    uncovered = [svg.stem for svg in svg_files if svg.stem not in notes]
    return not uncovered, uncovered


def split_notes(notes: dict[str, str], output_dir: Path, verbose: bool = True) -> bool:
    """
    Split and save notes dictionary into multiple files

    Each entry is written to <output_dir>/<key>.md as UTF-8. The output
    directory is created if needed. A failure on one file does not stop the
    remaining files from being written.

    Args:
        notes: Notes dictionary (key is the target filename stem, value is content)
        output_dir: Output directory
        verbose: Whether to output per-file progress and the final summary.
            Error messages are printed regardless of this flag.

    Returns:
        True when every entry was written; False when notes is empty or any
        write failed
    """
    if not notes:
        print("Error: No notes content found")
        return False

    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0

    for title, content in notes.items():
        # Output filename is the same name as the SVG file, with .md extension
        output_path = output_dir / f"{title}.md"

        try:
            output_path.write_text(content, encoding='utf-8')
        except (OSError, UnicodeError) as e:
            # Always report the failure: in quiet mode the caller would
            # otherwise only learn that "something" failed, not which file or why.
            print(f"  Error: Unable to write file {output_path}: {e}")
            continue

        success_count += 1
        if verbose:
            print(f"  Generated: {output_path.name}")

    if verbose:
        print(f"\n[Done] Successfully generated {success_count}/{len(notes)} file(s)")

    return success_count == len(notes)


def main() -> None:
    """Parse the command line, validate the project, and split total.md into per-page notes.

    Steps: locate the SVG files under the project, parse notes/total.md against
    their stems, confirm every SVG has notes, then write one .md file per page.
    The process always terminates through sys.exit: status 0 on success,
    status 1 on any failure.
    """
    cli = argparse.ArgumentParser(
        description='PPT Master - Speaker Notes Splitting Tool',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
    %(prog)s projects/<svg_title>_ppt169_YYYYMMDD
    %(prog)s projects/<svg_title>_ppt169_YYYYMMDD -o notes
    %(prog)s projects/<svg_title>_ppt169_YYYYMMDD -q

Features:
    - Reads the total.md speaker notes file
    - Checks the mapping between SVG files and notes
    - Splits notes into multiple individual files
    - Output filenames match SVG filenames
'''
    )
    cli.add_argument('project_path', type=str, help='Project directory path')
    cli.add_argument('-o', '--output', type=str, default=None, help='Output directory path (default: notes directory under project)')
    cli.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    args = cli.parse_args()

    verbose = not args.quiet

    project_path = Path(args.project_path)
    if not project_path.exists():
        print(f"Error: Path does not exist: {project_path}")
        sys.exit(1)

    # Explicit -o wins; otherwise notes are written next to total.md
    output_dir = Path(args.output) if args.output else project_path / 'notes'

    if verbose:
        banner = (
            "PPT Master - Speaker Notes Splitting Tool",
            "=" * 50,
            f"  Project path: {project_path}",
            f"  Output directory: {output_dir}",
            "",
        )
        for row in banner:
            print(row)

    # Step 1: locate the SVG pages
    svg_files = find_svg_files(project_path)
    if not svg_files:
        print("Error: No SVG files found")
        sys.exit(1)
    if verbose:
        print(f"  Found {len(svg_files)} SVG file(s)")

    # Step 2: parse total.md against the SVG stems
    stems = [svg.stem for svg in svg_files]
    notes = parse_total_md(project_path / 'notes' / 'total.md', stems, verbose)
    if not notes:
        print("Error: No notes content found")
        sys.exit(1)
    if verbose:
        print(f"  Found {len(notes)} notes section(s)")
        print()

    # Step 3: every SVG must have notes before anything is written
    all_match, missing_notes = check_svg_note_mapping(svg_files, notes)
    if not all_match:
        print("Error: SVG files and notes do not match")
        print(f"  Missing notes: {', '.join(missing_notes)}")
        print("\nPlease regenerate the notes file to ensure every SVG has corresponding notes.")
        sys.exit(1)
    if verbose:
        print("[OK] SVG files and notes have one-to-one correspondence")
        print()

    # Step 4: write the per-page files
    if not split_notes(notes, output_dir, verbose):
        print(f"\n[Failed] Notes splitting failed")
        sys.exit(1)

    if verbose:
        print(f"\n[Done] Notes splitting complete")
    sys.exit(0)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()