zcbot/skills/proposal/scripts/word_count.py

152 lines
5.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""核算各章节字数, 对照基金类型的字数预算, 输出表格。
中文字数统计: CJK 字符按 1 个字数算; 数字/英文连续片段按 1 个字算 (近似 NSFC/科技部口径)。
用法:
python word_count.py <sections_dir> --fund-type key_rd
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
# Word-count budget table, sourced from fund_types.md.
# Layout: fund type -> section-file stem prefix -> (lower bound, upper bound,
# section description). The description strings are kept in Chinese because
# they are runtime data, not comments.
BUDGETS: dict[str, dict[str, tuple[int, int, str]]] = {
    "key_rd": {
        "01_summary": (1200, 1500, "申报项目简介"),
        "02_background": (1600, 2000, "国内外现状"),
        "03_objectives": (3500, 4500, "项目目标 + 考核指标 + 预期成果 (合三节)"),
        "04_content": (5500, 7000, "研究内容 + 研究方法 + 技术路线 (合三节)"),
        "05_decomposition": (1600, 2000, "课题分解情况 (各课题正文另算)"),
        "06_innovation": (1000, 2000, "主要创新点 (3-5 条 × 500)"),
        "07_benefit": (1200, 1500, "预期经济社会效益"),
        "08_basis": (4000, 6000, "申报与参与单位研究基础"),
        "09_schedule": (1500, 2000, "进度安排"),
        "10_organization": (1500, 3000, "组织实施 + 保障 + 知识产权 + 风险"),
    },
    "major_project": {
        "01_objectives": (500, 2500, "课题目标 (5 项以内 × 500)"),
        "02_content": (5500, 7000, "研究内容 3000 + 研究方法 4000"),
        "03_innovation": (500, 2500, "创新点 (≤5 × 500)"),
        "04_benefit": (800, 1000, "预期经济社会效益"),
        "05_schedule": (1500, 3000, "年度计划 + 里程碑"),
        "06_organization": (1500, 2500, "组织实施 (1000+1000+500)"),
        "07_ip": (300, 500, "知识产权对策"),
    },
    "nsfc_joint_fund": {
        "01_research_content": (5000, 10000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "nsfc_general": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "nsfc_youth": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "provincial": {
        "01_research_content": (3500, 6000, "立项依据与研究内容"),
        "02_basis": (1000, 2000, "研究基础与工作条件"),
    },
    "enterprise": {
        "00_overview": (500, 1500, "项目背景与合作"),
        "01_objectives": (500, 1000, "技术目标与考核指标"),
        "02_content": (1500, 4000, "技术方案"),
        "03_schedule": (300, 800, "进度与交付物"),
        "04_budget": (200, 500, "经费"),
    },
}
# Line filters: Markdown headings, blockquote lines (used for template hints)
# and table rows are excluded from the count.
_HEADING_RE = re.compile(r"^#+\s+")
_BLOCKQUOTE_RE = re.compile(r"^>")
_TABLE_LINE_RE = re.compile(r"^\s*\|")
# A maximal run of ASCII letters/digits counts as a single word.
_ASCII_WORD_RE = re.compile(r"[A-Za-z0-9]+")
# Bounds of the CJK Unified Ideographs block, written as escapes so the
# literals cannot be lost or confused with look-alike characters.
# NOTE(review): the lower-bound literal did not survive in the reviewed copy
# (it read as an empty string, which made the range test true for every
# character up to U+9FFF, ASCII included). U+4E00 is assumed here; confirm
# whether CJK punctuation (U+3000-U+303F) was meant to be counted as well.
_CJK_FIRST = "\u4e00"
_CJK_LAST = "\u9fff"


def count_chars(text: str) -> int:
    """Return the approximate word count of a Markdown section.

    Each CJK ideograph counts as one word, and each maximal run of ASCII
    letters/digits counts as one word. Everything else (whitespace,
    punctuation, symbols) contributes nothing.

    Lines are skipped entirely when, after stripping surrounding whitespace,
    they are empty, a Markdown heading (``#`` followed by whitespace), a
    blockquote (``>``, used for template hints), a table row (``|``), or a
    pure template placeholder of the form ``<TODO ...>``.

    Args:
        text: Full text of one section file.

    Returns:
        The number of counted words; ``0`` for empty input.
    """
    total = 0
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if (
            _HEADING_RE.match(stripped)
            or _BLOCKQUOTE_RE.match(stripped)
            or _TABLE_LINE_RE.match(stripped)
        ):
            continue
        # Skip lines that consist solely of a template placeholder.
        if stripped.startswith("<TODO") and stripped.endswith(">"):
            continue
        # One word per CJK ideograph.
        total += sum(1 for c in stripped if _CJK_FIRST <= c <= _CJK_LAST)
        # One word per run of ASCII letters/digits; these characters are not
        # in the CJK range above, so nothing is counted twice.
        total += len(_ASCII_WORD_RE.findall(stripped))
    return total
def main() -> None:
    """Command-line entry point: print a word-count report for a sections directory.

    Parses ``sections_dir`` and the required ``--fund-type`` option, counts
    every ``*.md`` file in the directory with ``count_chars`` and prints one
    table row per file against the budget of the chosen fund type. A budget
    entry applies to a file when the file stem starts with the budget key;
    files with no matching entry are listed without limits but still add to
    the total.

    Exits with status 2 when the directory is missing or holds no ``.md``
    files, and with status 1 when any section is over or under its budget.
    Returns normally when every budgeted section is within its limits.
    """
    parser = argparse.ArgumentParser(description="章节字数核算")
    parser.add_argument("sections_dir", type=Path)
    parser.add_argument("--fund-type", required=True, choices=list(BUDGETS.keys()))
    args = parser.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    section_files = sorted(args.sections_dir.glob("*.md"))
    if not section_files:
        print(f"[ERR] no .md found in {args.sections_dir}", file=sys.stderr)
        sys.exit(2)

    budget = BUDGETS[args.fund_type]
    header = f"{'章节':<28} {'字数':>8} {'下限':>6} {'上限':>6} 状态"
    rule = "-" * len(header)

    print(f"\n[字数核算] fund_type={args.fund_type}\n")
    print(header)
    print(rule)

    total = overflow = underflow = 0
    for path in section_files:
        n = count_chars(path.read_text(encoding="utf-8"))
        total += n

        # The first budget key that prefixes the file stem wins
        # (e.g. stem "01_summary_v2" matches key "01_summary").
        stem = path.stem
        limits = next(
            (bounds for prefix, bounds in budget.items() if stem.startswith(prefix)),
            None,
        )
        if limits is None:
            print(f"{stem:<28} {n:>8} - - (no budget defined)")
            continue

        lo, hi, _ = limits
        if n > hi:
            status = f"WARN 超出 {n - hi}"
            overflow += 1
        elif n < lo:
            status = f"WARN 不足 {lo - n}"
            underflow += 1
        else:
            status = "OK"
        print(f"{stem:<28} {n:>8} {lo:>6} {hi:>6} {status}")

    print(rule)
    print(f"{'合计':<28} {total:>8}")

    if not (overflow or underflow):
        print("\n[OK] 全部章节字数合规。")
        return

    print(f"\n[WARN] {overflow} 项超出 / {underflow} 项不足。回头调整。")
    sys.exit(1)


if __name__ == "__main__":
    main()