# zcbot/skills/paper/scripts/word_count.py

"""核算各章节篇幅, 对照论文类型 + 语言的预算, 输出表格。

计量: CJK 字符按 1 字; 连续 ASCII 串 (英文单词 / 数字) 按 1 计 —— 对中文稿近似"字数",
对英文稿近似"词数"。预算按 (paper_type, lang) 取, 两套不同口径。

用法:
  python word_count.py <sections_dir> --type original --lang en
  python word_count.py <sections_dir> --type original --lang zh
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path


# BUDGETS[type][section] = {"en": (lo, hi), "zh": (lo, hi), "desc": str}
# "en" ranges are measured in words, "zh" ranges in characters. The thematic
# body sections of a review have no fixed budget (their number varies).
# Section keys are matched as filename-stem prefixes by main().
BUDGETS: dict[str, dict[str, dict]] = {
    "original": {
        "00_title_abstract": {"en": (150, 320), "zh": (200, 400), "desc": "Title + Abstract + Keywords"},
        "01_introduction":   {"en": (600, 1000), "zh": (1000, 1800), "desc": "Introduction"},
        "02_methods":        {"en": (800, 1600), "zh": (1200, 2400), "desc": "Materials & Methods"},
        "03_results":        {"en": (1000, 1900), "zh": (1500, 2900), "desc": "Results"},
        "04_discussion":     {"en": (800, 1600), "zh": (1200, 2400), "desc": "Discussion"},
        "05_conclusion":     {"en": (180, 400), "zh": (300, 600), "desc": "Conclusion"},
    },
    "review": {
        "00_title_abstract": {"en": (180, 350), "zh": (250, 450), "desc": "Title + Abstract + Keywords"},
        "01_introduction":   {"en": (700, 1300), "zh": (1200, 2200), "desc": "Introduction"},
        # 02_..NN_ thematic body sections have no budget (their number varies;
        # word_count reports them as "no budget")
        "98_outlook":        {"en": (500, 1100), "zh": (800, 1800), "desc": "Challenges & Outlook"},
        "99_conclusion":     {"en": (180, 450), "zh": (300, 700), "desc": "Conclusion"},
    },
    "letter": {
        "00_title_abstract": {"en": (120, 250), "zh": (180, 350), "desc": "Title + Abstract"},
        "01_main":           {"en": (1500, 3000), "zh": (2000, 4000), "desc": "Main text (condensed)"},
    },
}


# Line filters: markdown headings, blockquotes and table rows are not body text.
_HEADING_RE = re.compile(r"^#+\s+")
_BLOCKQUOTE_RE = re.compile(r"^>")
_TABLE_LINE_RE = re.compile(r"^\s*\|")

# A run of ASCII letters/digits (an English word or a number) counts as one unit.
_ASCII_TOKEN_RE = re.compile(r"[A-Za-z0-9]+")

# One CJK ideograph counts as one unit. Besides the basic block this also
# covers Extension A, the compatibility ideographs and the supplementary
# ideographic planes, so rare characters (e.g. in personal names) are not
# silently dropped from the count.
_CJK_IDEOGRAPH_RE = re.compile(
    "["
    "\u3400-\u4dbf"          # CJK Unified Ideographs Extension A
    "\u4e00-\u9fff"          # CJK Unified Ideographs (basic block)
    "\uf900-\ufaff"          # CJK Compatibility Ideographs
    "\U00020000-\U0003ffff"  # planes 2 and 3 (Extension B and later)
    "]"
)


def count_chars(text: str) -> int:
    """Return the approximate length of markdown *text*.

    Each CJK ideograph counts as 1 and each run of ASCII letters/digits
    counts as 1, so the result approximates a character count for Chinese
    text and a word count for English text. Punctuation and whitespace are
    not counted.

    Lines are examined after stripping surrounding whitespace. Blank lines,
    headings (``# ...``), blockquotes (``> ...``), table rows (``| ...``) and
    lines that consist solely of a ``<TODO ...>`` placeholder are skipped.

    Args:
        text: The markdown source of one section.

    Returns:
        The number of counted units; 0 for empty input.
    """
    total = 0
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if (
            _HEADING_RE.match(stripped)
            or _BLOCKQUOTE_RE.match(stripped)
            or _TABLE_LINE_RE.match(stripped)
        ):
            continue
        # A whole-line placeholder is not content yet.
        if stripped.startswith("<TODO") and stripped.endswith(">"):
            continue
        total += len(_CJK_IDEOGRAPH_RE.findall(stripped))
        total += len(_ASCII_TOKEN_RE.findall(stripped))
    return total


def _lookup_budget(stem: str, budget: dict) -> "dict | None":
    """Return the first budget entry whose key is a prefix of *stem*, else None."""
    for key, val in budget.items():
        if stem.startswith(key):
            return val
    return None


def main() -> None:
    """Command-line entry point: print a per-section length table.

    Parses ``sections_dir``, ``--type`` and ``--lang``, measures every
    ``*.md`` file in the directory (sorted by path) with ``count_chars``, and
    prints one row per file with its budget range and status. Files whose
    stem matches no budget key are listed without a range and never warn.

    Exit status: 2 when ``sections_dir`` is not a directory or contains no
    ``.md`` file; 1 when at least one budgeted section is above or below its
    range; otherwise the function returns normally.
    """
    ap = argparse.ArgumentParser(description="论文章节篇幅核算")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--type", required=True, choices=list(BUDGETS.keys()))
    ap.add_argument("--lang", required=True, choices=["zh", "en"])
    args = ap.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    budget = BUDGETS[args.type]
    files = sorted(args.sections_dir.glob("*.md"))
    if not files:
        print(f"[ERR] no .md found in {args.sections_dir}", file=sys.stderr)
        sys.exit(2)

    unit = "词" if args.lang == "en" else "字"
    print(f"\n[篇幅核算] type={args.type} lang={args.lang} (口径: {unit})\n")
    header = f"{'章节':<26} {'篇幅':>8} {'下限':>6} {'上限':>6} 状态"
    print(header)
    print("-" * len(header))

    total = 0
    overflow = 0
    underflow = 0
    for f in files:
        # "utf-8-sig" reads plain UTF-8 too, but drops a leading BOM. With
        # plain "utf-8" the BOM stays attached to the first line, so a
        # first-line "# heading" is not recognised as a heading and gets
        # counted as body text.
        text = f.read_text(encoding="utf-8-sig")
        n = count_chars(text)
        total += n
        stem = f.stem
        bud = _lookup_budget(stem, budget)
        if bud is None:
            print(f"{stem:<26} {n:>8}      -      -  (no budget; thematic/aux section)")
            continue
        lo, hi = bud[args.lang]
        status = "OK"
        if n > hi:
            status = f"WARN over {n - hi}"
            overflow += 1
        elif n < lo:
            status = f"WARN under {lo - n}"
            underflow += 1
        print(f"{stem:<26} {n:>8} {lo:>6} {hi:>6}  {status}")

    print("-" * len(header))
    print(f"{'合计':<26} {total:>8}")
    if overflow or underflow:
        print(f"\n[WARN] {overflow} 项超出 / {underflow} 项不足 (含摘要/正文)。回头调整。")
        # Non-zero exit lets callers detect out-of-budget sections.
        sys.exit(1)
    print("\n[OK] 全部章节篇幅合规。")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()