# zcbot/skills/proposal/scripts/word_count.py

"""核算各章节字数, 对照基金类型的字数预算, 输出表格。

中文字数统计: CJK 字符按 1 个字数算; 数字/英文连续片段按 1 个字算 (近似 NSFC/科技部口径)。

用法:
  python word_count.py <sections_dir> --fund-type key_rd
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path


# Word-count budget table, keyed first by fund type (the value accepted by
# --fund-type) and then by section key. Each entry is a tuple of
# (minimum word count, maximum word count, section description).
# A section file is matched to an entry when its filename stem starts with
# the section key; entries are tried in definition order and the first match
# wins. The numbers come from fund_types.md.
BUDGETS: dict[str, dict[str, tuple[int, int, str]]] = {
    "key_rd": {
        "01_summary":         (1200, 1500, "申报项目简介"),
        "02_background":      (1600, 2000, "国内外现状"),
        "03_objectives":      (3500, 4500, "项目目标 + 考核指标 + 预期成果 (合三节)"),
        "04_content":         (5500, 7000, "研究内容 + 研究方法 + 技术路线 (合三节)"),
        "05_decomposition":   (1600, 2000, "课题分解情况 (各课题正文另算)"),
        "06_innovation":      (1000, 2000, "主要创新点 (3-5 条 × 500)"),
        "07_benefit":         (1200, 1500, "预期经济社会效益"),
        "08_basis":           (4000, 6000, "申报与参与单位研究基础"),
        "09_schedule":        (1500, 2000, "进度安排"),
        "10_organization":    (1500, 3000, "组织实施 + 保障 + 知识产权 + 风险"),
    },
    "major_project": {
        "01_objectives":      (500, 2500, "课题目标 (5 项以内 × 500)"),
        "02_content":         (5500, 7000, "研究内容 3000 + 研究方法 4000"),
        "03_innovation":      (500, 2500, "创新点 (≤5 × 500)"),
        "04_benefit":         (800, 1000, "预期经济社会效益"),
        "05_schedule":        (1500, 3000, "年度计划 + 里程碑"),
        "06_organization":    (1500, 2500, "组织实施 (1000+1000+500)"),
        "07_ip":              (300, 500, "知识产权对策"),
    },
    "nsfc_joint_fund": {
        "01_research_content": (5000, 10000, "(一) 立项依据与研究内容"),
        "02_basis":            (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other":            (200, 500, "(三) 其他需要说明的情况"),
    },
    "nsfc_general": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis":            (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other":            (200, 500, "(三) 其他需要说明的情况"),
    },
    "nsfc_youth": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis":            (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other":            (200, 500, "(三) 其他需要说明的情况"),
    },
    "provincial": {
        "01_research_content": (3500, 6000, "立项依据与研究内容"),
        "02_basis":            (1000, 2000, "研究基础与工作条件"),
    },
    "enterprise": {
        "00_overview":         (500, 1500, "项目背景与合作"),
        "01_objectives":       (500, 1000, "技术目标与考核指标"),
        "02_content":          (1500, 4000, "技术方案"),
        "03_schedule":         (300, 800, "进度与交付物"),
        "04_budget":           (200, 500, "经费"),
    },
}


# Line classifiers, applied to a line after surrounding whitespace is removed.
_HEADING_RE = re.compile(r"^#+\s+")      # Markdown heading: "# ", "## ", ...
_BLOCKQUOTE_RE = re.compile(r"^>")       # blockquote line (template hints)
_TABLE_LINE_RE = re.compile(r"^\s*\|")   # table row starting with a pipe

# Han ideographs that count as one word each: ideographic zero (U+3007),
# Extension A, the main unified block, compatibility ideographs, and the
# supplementary/tertiary ideographic planes (Extension B and later).
_CJK_RE = re.compile(
    r"[\u3007\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\U00020000-\U0003FFFF]"
)
# A maximal run of ASCII letters/digits counts as a single word.
_ASCII_WORD_RE = re.compile(r"[A-Za-z0-9]+")


def count_chars(text: str) -> int:
    """Return the approximate word count of a Markdown section.

    Each CJK ideograph counts as one word and each maximal run of ASCII
    letters/digits counts as one word. Punctuation and other characters are
    not counted.

    Lines are skipped entirely when, after stripping surrounding whitespace,
    they are empty, are a Markdown heading (``#`` followed by whitespace),
    start with ``>`` (template hints), start with ``|`` (table rows), or are
    a pure placeholder of the form ``<TODO ...>``.

    Args:
        text: Full text of one section file.

    Returns:
        The number of counted words; 0 for empty input.
    """
    # A UTF-8 byte-order mark decoded with plain "utf-8" stays in the text as
    # U+FEFF. str.strip() does not treat it as whitespace, so without this a
    # heading/hint/table row on the first line would be counted as body text.
    if text.startswith("\ufeff"):
        text = text[1:]

    n = 0
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if (
            _HEADING_RE.match(stripped)
            or _BLOCKQUOTE_RE.match(stripped)
            or _TABLE_LINE_RE.match(stripped)
        ):
            continue
        # Skip lines that consist solely of a template placeholder.
        if stripped.startswith("<TODO") and stripped.endswith(">"):
            continue
        n += len(_CJK_RE.findall(stripped))
        n += len(_ASCII_WORD_RE.findall(stripped))
    return n


def main() -> None:
    """Command-line entry point: print a per-section word-count report.

    Parses ``sections_dir`` and ``--fund-type``, counts the words of every
    ``*.md`` file in the directory (sorted by path), matches each file to a
    budget entry by filename-stem prefix, and prints one table row per file
    followed by the grand total.

    Exit status: 2 when the directory is missing or contains no ``.md``
    files, 1 when any budgeted section is above its maximum or below its
    minimum, and a normal return otherwise.
    """
    parser = argparse.ArgumentParser(description="章节字数核算")
    parser.add_argument("sections_dir", type=Path)
    parser.add_argument("--fund-type", required=True, choices=list(BUDGETS))
    opts = parser.parse_args()
    sections_dir = opts.sections_dir

    if not sections_dir.is_dir():
        print(f"[ERR] {sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    section_budgets = BUDGETS[opts.fund_type]
    md_paths = sorted(sections_dir.glob("*.md"))
    if not md_paths:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    print(f"\n[字数核算] fund_type={opts.fund_type}\n")
    header = f"{'章节':<28} {'字数':>8} {'下限':>6} {'上限':>6} 状态"
    rule = "-" * len(header)
    print(header)
    print(rule)

    total = 0
    over_count = 0
    under_count = 0
    for path in md_paths:
        n = count_chars(path.read_text(encoding="utf-8"))
        total += n
        stem = path.stem
        # First budget entry whose key is a prefix of the file stem
        # (e.g. "01_summary" matches "01_summary_v2").
        limits = next(
            (val for key, val in section_budgets.items() if stem.startswith(key)),
            None,
        )
        if limits is None:
            # Unbudgeted files still contribute to the total above.
            print(f"{stem:<28} {n:>8}      -      -  (no budget defined)")
            continue

        lo, hi = limits[0], limits[1]
        if n > hi:
            over_count += 1
            status = f"WARN 超出 {n - hi}"
        elif n < lo:
            under_count += 1
            status = f"WARN 不足 {lo - n}"
        else:
            status = "OK"
        print(f"{stem:<28} {n:>8} {lo:>6} {hi:>6}  {status}")

    print(rule)
    print(f"{'合计':<28} {total:>8}")
    if not (over_count or under_count):
        print("\n[OK] 全部章节字数合规。")
        return
    print(f"\n[WARN] {over_count} 项超出 / {under_count} 项不足。回头调整。")
    sys.exit(1)


if __name__ == "__main__":
    main()