zcbot/skills/paper/scripts/word_count.py

123 lines
4.6 KiB
Python

"""核算各章节篇幅, 对照论文类型 + 语言的预算, 输出表格。
计量: CJK 字符按 1 字; 连续 ASCII 串 (英文单词 / 数字) 按 1 计 —— 对中文稿近似"字数",
对英文稿近似"词数"。预算按 (paper_type, lang) 取, 两套不同口径。
用法:
python word_count.py <sections_dir> --type original --lang en
python word_count.py <sections_dir> --type original --lang zh
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
def _entry(en: tuple[int, int], zh: tuple[int, int], desc: str) -> dict:
    """Bundle one section's (lo, hi) bounds per language with its label."""
    return {"en": en, "zh": zh, "desc": desc}


# BUDGETS[paper_type][section_prefix] -> {"en": (lo, hi), "zh": (lo, hi), "desc": str}
# "en" bounds are word counts; "zh" bounds are character counts.
# Thematic body sections of a review get no fixed budget (their number varies).
BUDGETS: dict[str, dict[str, dict]] = {
    "original": {
        "00_title_abstract": _entry((150, 320), (200, 400), "Title + Abstract + Keywords"),
        "01_introduction": _entry((600, 1000), (1000, 1800), "Introduction"),
        "02_methods": _entry((800, 1600), (1200, 2400), "Materials & Methods"),
        "03_results": _entry((1000, 1900), (1500, 2900), "Results"),
        "04_discussion": _entry((800, 1600), (1200, 2400), "Discussion"),
        "05_conclusion": _entry((180, 400), (300, 600), "Conclusion"),
    },
    "review": {
        "00_title_abstract": _entry((180, 350), (250, 450), "Title + Abstract + Keywords"),
        "01_introduction": _entry((700, 1300), (1200, 2200), "Introduction"),
        # 02_..NN_ thematic body sections carry no budget (their number
        # varies); the report marks them as "no budget".
        "98_outlook": _entry((500, 1100), (800, 1800), "Challenges & Outlook"),
        "99_conclusion": _entry((180, 450), (300, 700), "Conclusion"),
    },
    "letter": {
        "00_title_abstract": _entry((120, 250), (180, 350), "Title + Abstract"),
        "01_main": _entry((1500, 3000), (2000, 4000), "Main text (condensed)"),
    },
}
# Markdown lines that are structure rather than prose; they never count
# toward a section's length.
_HEADING_RE = re.compile(r"^#+\s+")
_BLOCKQUOTE_RE = re.compile(r"^>")
_TABLE_LINE_RE = re.compile(r"^\s*\|")
# A run of ASCII letters/digits (an English word or a number) is one unit.
_ASCII_RUN_RE = re.compile(r"[A-Za-z0-9]+")


def count_chars(text: str) -> int:
    """Return the length of *text* in counting units.

    Each CJK unified ideograph (U+4E00..U+9FFF) counts as 1, and each run of
    consecutive ASCII letters/digits counts as 1. For Chinese text this
    approximates a character count; for English text, a word count.

    Blank lines, Markdown headings, blockquotes, table rows and whole-line
    ``<TODO ...>`` placeholders are skipped entirely.
    """
    n = 0
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if (
            _HEADING_RE.match(stripped)
            or _BLOCKQUOTE_RE.match(stripped)
            or _TABLE_LINE_RE.match(stripped)
        ):
            continue
        if stripped.startswith("<TODO") and stripped.endswith(">"):
            continue
        # The lower bound must be explicit: comparing against an empty string
        # is always true and would count every ASCII character as CJK.
        n += sum(1 for c in stripped if "\u4e00" <= c <= "\u9fff")
        n += len(_ASCII_RUN_RE.findall(stripped))
    return n
def main() -> None:
    """Print a per-section length report for a directory of ``*.md`` files.

    Command-line arguments:
        sections_dir: directory holding the section Markdown files.
        --type: paper type; one of the keys of ``BUDGETS``.
        --lang: ``en`` or ``zh``; selects which (lo, hi) bounds apply.

    Each file is measured with ``count_chars`` and matched to a budget by
    filename-stem prefix. Files without a matching budget are listed but
    not checked; they still contribute to the total.

    Exit status:
        2 -- ``sections_dir`` is not a directory or contains no ``.md`` file.
        1 -- at least one section is over or under its budget.
        Otherwise the function returns normally.
    """
    ap = argparse.ArgumentParser(description="论文章节篇幅核算")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--type", required=True, choices=list(BUDGETS))
    ap.add_argument("--lang", required=True, choices=["zh", "en"])
    args = ap.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    budget = BUDGETS[args.type]
    files = sorted(args.sections_dir.glob("*.md"))
    if not files:
        print(f"[ERR] no .md found in {args.sections_dir}", file=sys.stderr)
        sys.exit(2)

    # English budgets are in words, Chinese budgets in characters; label the
    # report accordingly (previously both branches produced an empty label).
    unit = "词数" if args.lang == "en" else "字数"
    print(f"\n[篇幅核算] type={args.type} lang={args.lang} (口径: {unit})\n")
    header = f"{'章节':<26} {'篇幅':>8} {'下限':>6} {'上限':>6} 状态"
    print(header)
    print("-" * len(header))

    total = 0
    overflow = 0
    underflow = 0
    for f in files:
        n = count_chars(f.read_text(encoding="utf-8"))
        total += n
        stem = f.stem
        # First budget whose key is a prefix of the file stem, in the
        # declaration order of BUDGETS; None when nothing matches.
        bud = next(
            (val for key, val in budget.items() if stem.startswith(key)),
            None,
        )
        if bud is None:
            print(f"{stem:<26} {n:>8} - - (no budget; thematic/aux section)")
            continue
        lo, hi = bud[args.lang]
        status = "OK"
        if n > hi:
            status = f"WARN over {n - hi}"
            overflow += 1
        elif n < lo:
            status = f"WARN under {lo - n}"
            underflow += 1
        print(f"{stem:<26} {n:>8} {lo:>6} {hi:>6} {status}")

    print("-" * len(header))
    print(f"{'合计':<26} {total:>8}")
    if overflow or underflow:
        print(f"\n[WARN] {overflow} 项超出 / {underflow} 项不足 (含摘要/正文)。回头调整。")
        sys.exit(1)
    print("\n[OK] 全部章节篇幅合规。")
# Script entry point: run the CLI only on direct execution, not on import.
if __name__ == "__main__":
    main()