152 lines
5.8 KiB
Python
152 lines
5.8 KiB
Python
"""核算各章节字数, 对照基金类型的字数预算, 输出表格。
|
||
|
||
中文字数统计: CJK 字符按 1 个字数算; 数字/英文连续片段按 1 个字算 (近似 NSFC/科技部口径)。
|
||
|
||
用法:
|
||
python word_count.py <sections_dir> --fund-type key_rd
|
||
"""
|
||
from __future__ import annotations
|
||
import argparse
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
|
||
# Word-count budget table: (lower bound, upper bound, section description).
# Taken from fund_types.md.
#
# Outer keys are the fund types accepted by the --fund-type option.
# Inner keys are section identifiers; main() matches a section file to a
# budget entry when the file's stem starts with the key (e.g. "01_summary").
BUDGETS: dict[str, dict[str, tuple[int, int, str]]] = {
    "key_rd": {
        "01_summary": (1200, 1500, "申报项目简介"),
        "02_background": (1600, 2000, "国内外现状"),
        "03_objectives": (3500, 4500, "项目目标 + 考核指标 + 预期成果 (合三节)"),
        "04_content": (5500, 7000, "研究内容 + 研究方法 + 技术路线 (合三节)"),
        "05_decomposition": (1600, 2000, "课题分解情况 (各课题正文另算)"),
        "06_innovation": (1000, 2000, "主要创新点 (3-5 条 × 500)"),
        "07_benefit": (1200, 1500, "预期经济社会效益"),
        "08_basis": (4000, 6000, "申报与参与单位研究基础"),
        "09_schedule": (1500, 2000, "进度安排"),
        "10_organization": (1500, 3000, "组织实施 + 保障 + 知识产权 + 风险"),
    },
    "major_project": {
        "01_objectives": (500, 2500, "课题目标 (5 项以内 × 500)"),
        "02_content": (5500, 7000, "研究内容 3000 + 研究方法 4000"),
        "03_innovation": (500, 2500, "创新点 (≤5 × 500)"),
        "04_benefit": (800, 1000, "预期经济社会效益"),
        "05_schedule": (1500, 3000, "年度计划 + 里程碑"),
        "06_organization": (1500, 2500, "组织实施 (1000+1000+500)"),
        "07_ip": (300, 500, "知识产权对策"),
    },
    "nsfc_joint_fund": {
        "01_research_content": (5000, 10000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    # nsfc_general and nsfc_youth currently carry identical budgets.
    "nsfc_general": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "nsfc_youth": {
        "01_research_content": (5000, 8000, "(一) 立项依据与研究内容"),
        "02_basis": (1500, 3000, "(二) 研究基础与工作条件"),
        "03_other": (200, 500, "(三) 其他需要说明的情况"),
    },
    "provincial": {
        "01_research_content": (3500, 6000, "立项依据与研究内容"),
        "02_basis": (1000, 2000, "研究基础与工作条件"),
    },
    "enterprise": {
        "00_overview": (500, 1500, "项目背景与合作"),
        "01_objectives": (500, 1000, "技术目标与考核指标"),
        "02_content": (1500, 4000, "技术方案"),
        "03_schedule": (300, 800, "进度与交付物"),
        "04_budget": (200, 500, "经费"),
    },
}
|
||
|
||
|
||
# Line classifiers, applied to the whitespace-stripped line.
_HEADING_RE = re.compile(r"^#+\s+")
_BLOCKQUOTE_RE = re.compile(r"^>")
_TABLE_LINE_RE = re.compile(r"^\s*\|")

# One match per Han ideograph. Besides the basic block (U+4E00-U+9FFF) this
# covers the ideographic zero (U+3007), Extension A (U+3400-U+4DBF),
# the compatibility ideographs (U+F900-U+FAFF) and the supplementary-plane
# extensions (U+20000-U+323AF), so rare characters in names and terms count.
_CJK_RE = re.compile(
    r"[\u3007\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U000323af]"
)
# A consecutive run of ASCII letters/digits counts as a single word.
_ASCII_WORD_RE = re.compile(r"[A-Za-z0-9]+")


def count_chars(text: str) -> int:
    """Return the approximate word count of a Markdown section.

    Each CJK ideograph counts as 1 and each consecutive run of ASCII
    letters/digits counts as 1. Punctuation and whitespace are not counted.

    Lines that are skipped entirely (after stripping surrounding whitespace):
    blank lines, Markdown headings (``#``), blockquotes (``>``, used for
    template hints), table rows (``|``) and pure placeholder lines of the
    form ``<TODO ...>``.

    Args:
        text: Full text of one section file.

    Returns:
        The number of counted words; 0 for empty input.
    """
    total = 0
    # A leading UTF-8 BOM survives str.strip() and would stop the first line
    # from being recognised as a heading / blockquote / table row.
    for raw_line in text.lstrip("\ufeff").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if (
            _HEADING_RE.match(line)
            or _BLOCKQUOTE_RE.match(line)
            or _TABLE_LINE_RE.match(line)
        ):
            continue
        # Skip lines that consist solely of a template placeholder.
        if line.startswith("<TODO") and line.endswith(">"):
            continue
        total += len(_CJK_RE.findall(line))
        total += len(_ASCII_WORD_RE.findall(line))
    return total
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: print a per-section word-count report.

    Reads every ``*.md`` file in the given sections directory (sorted by
    path), counts its words with ``count_chars`` and compares the result
    with the budget entry of the selected fund type whose key is a prefix of
    the file stem. Files without a matching entry are listed but not checked.

    Exit status:
        2 -- the sections directory is not a directory or holds no ``.md``
             files (message written to stderr).
        1 -- at least one section is above its upper or below its lower bound.
        Otherwise the function returns normally.
    """
    parser = argparse.ArgumentParser(description="章节字数核算")
    parser.add_argument("sections_dir", type=Path)
    parser.add_argument("--fund-type", required=True, choices=list(BUDGETS))
    opts = parser.parse_args()

    sections_dir = opts.sections_dir
    if not sections_dir.is_dir():
        print(f"[ERR] {sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    limits = BUDGETS[opts.fund_type]
    md_files = sorted(sections_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] no .md found in {sections_dir}", file=sys.stderr)
        sys.exit(2)

    print(f"\n[字数核算] fund_type={opts.fund_type}\n")
    header = f"{'章节':<28} {'字数':>8} {'下限':>6} {'上限':>6} 状态"
    rule = "-" * len(header)
    print(header)
    print(rule)

    grand_total = 0
    too_long = 0
    too_short = 0
    for path in md_files:
        words = count_chars(path.read_text(encoding="utf-8"))
        grand_total += words

        # The first budget key that is a prefix of the file stem wins
        # (e.g. "01_summary" matches "01_summary_v2.md").
        name = path.stem
        entry = next(
            (spec for prefix, spec in limits.items() if name.startswith(prefix)),
            None,
        )
        if entry is None:
            print(f"{name:<28} {words:>8} - - (no budget defined)")
            continue

        floor, ceiling, _ = entry
        if words > ceiling:
            verdict = f"WARN 超出 {words - ceiling}"
            too_long += 1
        elif words < floor:
            verdict = f"WARN 不足 {floor - words}"
            too_short += 1
        else:
            verdict = "OK"
        print(f"{name:<28} {words:>8} {floor:>6} {ceiling:>6} {verdict}")

    print(rule)
    print(f"{'合计':<28} {grand_total:>8}")
    if too_long or too_short:
        print(f"\n[WARN] {too_long} 项超出 / {too_short} 项不足。回头调整。")
        sys.exit(1)
    print("\n[OK] 全部章节字数合规。")
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|