"""Tally per-section word counts against a fund type's budget and print a table.

Counting rule: every CJK ideograph counts as one word, and every contiguous
run of ASCII letters/digits counts as one word (an approximation of the
NSFC / Ministry of Science and Technology convention).

Usage:
    python word_count.py <sections_dir> --fund-type key_rd
"""
from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path

# Word-count budgets: section key -> (lower bound, upper bound, description).
# Sourced from fund_types.md.
BUDGETS: dict[str, dict[str, tuple[int, int, str]]] = {
    "key_rd": {
        "01_summary": (1200, 1500, "申报项目简介"),
        "02_background": (1600, 2000, "国内外现状"),
        "03_objectives": (3500, 4500, "项目目标 + 考核指标 + 预期成果 (合三节)"),
        "04_content": (5500, 7000, "研究内容 + 研究方法 + 技术路线 (合三节)"),
        "05_decomposition": (1600, 2000, "课题分解情况 (各课题正文另算)"),
        "06_innovation": (1000, 2000, "主要创新点 (3-5 条 × 500)"),
        "07_benefit": (1200, 1500, "预期经济社会效益"),
        "08_basis": (4000, 6000, "申报与参与单位研究基础"),
        "09_schedule": (1500, 2000, "进度安排"),
        "10_organization": (1500, 3000, "组织实施 + 保障 + 知识产权 + 风险"),
    },
    "major_project": {
        "01_objectives": (500, 2500, "课题目标 (5 项以内 × 500)"),
        "02_content": (5500, 7000, "研究内容 3000 + 研究方法 4000"),
        "03_innovation": (500, 2500, "创新点 (≤5 × 500)"),
        "04_benefit": (800, 1000, "预期经济社会效益"),
        "05_schedule": (1500, 3000, "年度计划 + 里程碑"),
        "06_organization": (1500, 2500, "组织实施 (1000+1000+500)"),
        "07_ip": (300, 500, "知识产权对策"),
    },
    "nsfc_joint_fund": {
        "01_research_content": (5000, 10000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "nsfc_general": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "nsfc_youth": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "provincial": {
        "01_research_content": (3500, 6000, "立项依据与研究内容"),
        "02_basis": (1000, 2000, "研究基础与工作条件"),
    },
    "enterprise": {
        "00_overview": (500, 1500, "项目背景与合作"),
        "01_objectives": (500, 1000, "技术目标与考核指标"),
        "02_content": (1500, 4000, "技术方案"),
        "03_schedule": (300, 800, "进度与交付物"),
        "04_budget": (200, 500, "经费"),
    },
}

# Line classifiers, all applied to the already-stripped line.
_HEADING_RE = re.compile(r"^#+\s+")
_BLOCKQUOTE_RE = re.compile(r"^>")
_TABLE_LINE_RE = re.compile(r"^\s*\|")
# A contiguous run of ASCII letters/digits is counted as a single word.
_ASCII_WORD_RE = re.compile(r"[A-Za-z0-9]+")


def count_chars(text: str) -> int:
    """Return the word count of *text* under the funding-agency convention.

    Each character in U+4E00..U+9FFF counts as one word, and each contiguous
    run of ASCII letters/digits counts as one word.  The following lines are
    excluded from the count: blank lines, Markdown headings (``#``),
    blockquotes used as template hints (``>``), table rows (``|``), and lines
    that consist solely of an HTML comment (template placeholders).
    """
    n = 0
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if (
            _HEADING_RE.match(stripped)
            or _BLOCKQUOTE_RE.match(stripped)
            or _TABLE_LINE_RE.match(stripped)
        ):
            continue
        # Skip pure template-placeholder lines (a whole-line HTML comment).
        # NOTE: this guard used to be ``stripped.startswith("")``, which is
        # true for every string, so every line was skipped and the function
        # always returned 0.
        if stripped.startswith("<!--") and stripped.endswith("-->"):
            continue
        # CJK Unified Ideographs: one word per character.
        n += sum(1 for c in stripped if "\u4e00" <= c <= "\u9fff")
        # ASCII runs: one word per run of letters/digits.
        n += len(_ASCII_WORD_RE.findall(stripped))
    return n


def _find_budget(
    stem: str, budget: dict[str, tuple[int, int, str]]
) -> tuple[int, int, str] | None:
    """Return the first budget entry whose key is a prefix of *stem*, else None."""
    for key, val in budget.items():
        if stem.startswith(key):
            return val
    return None


def main(argv: list[str] | None = None) -> None:
    """Count every ``*.md`` file in a directory and print a budget report.

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Exits with status 2 when the directory is missing or contains no ``.md``
    files, and with status 1 when at least one section is over or under its
    budget.  Returns normally when every budgeted section is within range.
    """
    ap = argparse.ArgumentParser(description="章节字数核算")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--fund-type", required=True, choices=list(BUDGETS))
    args = ap.parse_args(argv)

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    budget = BUDGETS[args.fund_type]
    files = sorted(args.sections_dir.glob("*.md"))
    if not files:
        print(f"[ERR] no .md found in {args.sections_dir}", file=sys.stderr)
        sys.exit(2)

    print(f"\n[字数核算] fund_type={args.fund_type}\n")
    header = f"{'章节':<28} {'字数':>8} {'下限':>6} {'上限':>6} 状态"
    print(header)
    print("-" * len(header))

    total = 0
    overflow = 0
    underflow = 0
    for f in files:
        n = count_chars(f.read_text(encoding="utf-8"))
        # Files without a budget still contribute to the grand total.
        total += n
        # Match by stem prefix (e.g. "01_summary_v2" matches "01_summary").
        stem = f.stem
        bud = _find_budget(stem, budget)
        if bud is None:
            print(f"{stem:<28} {n:>8} - - (no budget defined)")
            continue
        lo, hi, _desc = bud
        status = "OK"
        if n > hi:
            status = f"WARN 超出 {n - hi}"
            overflow += 1
        elif n < lo:
            status = f"WARN 不足 {lo - n}"
            underflow += 1
        print(f"{stem:<28} {n:>8} {lo:>6} {hi:>6} {status}")

    print("-" * len(header))
    print(f"{'合计':<28} {total:>8}")
    if overflow or underflow:
        print(f"\n[WARN] {overflow} 项超出 / {underflow} 项不足。回头调整。")
        sys.exit(1)
    print("\n[OK] 全部章节字数合规。")


if __name__ == "__main__":
    main()