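# File-viewer metadata captured with the source: 123 lines, 4.6 KiB, Python.
"""Tally the length of each manuscript section and print it as a table,
checked against the budget for the chosen paper type and language.

Counting rule: every CJK character counts as 1, and every contiguous ASCII
run (an English word or a number) counts as 1. For a Chinese manuscript this
approximates a character count; for an English manuscript it approximates a
word count. Budgets are looked up by (paper_type, lang), and the two
languages use different units.

Usage:
    python word_count.py <sections_dir> --type original --lang en
    python word_count.py <sections_dir> --type original --lang zh
"""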
|
|
from __future__ import annotations
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
# BUDGETS[paper_type][section_prefix] = {"en": (lo, hi), "zh": (lo, hi), "desc": str}
# The "en" bounds are measured in words and the "zh" bounds in characters.
# Section keys are matched as prefixes of the markdown file stem.
# The thematic body sections of a review have no fixed budget because their
# number varies from paper to paper.
BUDGETS: dict[str, dict[str, dict]] = {
    "original": {
        "00_title_abstract": {"en": (150, 320), "zh": (200, 400), "desc": "Title + Abstract + Keywords"},
        "01_introduction": {"en": (600, 1000), "zh": (1000, 1800), "desc": "Introduction"},
        "02_methods": {"en": (800, 1600), "zh": (1200, 2400), "desc": "Materials & Methods"},
        "03_results": {"en": (1000, 1900), "zh": (1500, 2900), "desc": "Results"},
        "04_discussion": {"en": (800, 1600), "zh": (1200, 2400), "desc": "Discussion"},
        "05_conclusion": {"en": (180, 400), "zh": (300, 600), "desc": "Conclusion"},
    },
    "review": {
        "00_title_abstract": {"en": (180, 350), "zh": (250, 450), "desc": "Title + Abstract + Keywords"},
        "01_introduction": {"en": (700, 1300), "zh": (1200, 2200), "desc": "Introduction"},
        # Thematic body sections (02_ .. NN_) are intentionally absent: their
        # count varies, so the report lists them as "no budget".
        "98_outlook": {"en": (500, 1100), "zh": (800, 1800), "desc": "Challenges & Outlook"},
        "99_conclusion": {"en": (180, 450), "zh": (300, 700), "desc": "Conclusion"},
    },
    "letter": {
        "00_title_abstract": {"en": (120, 250), "zh": (180, 350), "desc": "Title + Abstract"},
        "01_main": {"en": (1500, 3000), "zh": (2000, 4000), "desc": "Main text (condensed)"},
    },
}
|
|
|
|
|
|
# Markdown lines that are structure rather than prose; count_chars skips them.
_HEADING_RE = re.compile(r"^#+\s+")
_BLOCKQUOTE_RE = re.compile(r"^>")
_TABLE_LINE_RE = re.compile(r"^\s*\|")

# One countable unit: a single CJK ideograph, or a run of ASCII letters/digits.
# Besides the basic block (U+4E00-U+9FFF) the ideograph class also covers
# Extension A, the compatibility ideographs and the supplementary-plane
# extensions, so rare characters are not silently counted as zero.
# Compiled once here instead of being rebuilt for every line.
_UNIT_RE = re.compile(
    r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U000323af]"
    r"|[A-Za-z0-9]+"
)


def count_chars(text: str) -> int:
    """Return the length of *text* in counting units.

    Each CJK ideograph counts as 1 and each contiguous run of ASCII letters
    or digits counts as 1. Punctuation, whitespace and any other characters
    contribute nothing.

    Lines are stripped first, and the following lines are ignored entirely:
    blank lines, markdown headings (``#`` followed by whitespace),
    blockquotes (starting with ``>``), table rows (starting with ``|``), and
    placeholder lines that start with ``<TODO`` and end with ``>``.

    Args:
        text: The full content of one section file.

    Returns:
        The number of counting units found on the lines that are not skipped.
    """
    total = 0
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if (
            _HEADING_RE.match(stripped)
            or _BLOCKQUOTE_RE.match(stripped)
            or _TABLE_LINE_RE.match(stripped)
        ):
            continue
        if stripped.startswith("<TODO") and stripped.endswith(">"):
            continue
        # Ideographs and ASCII alphanumerics never overlap, so one pass over
        # the alternation equals "CJK characters + ASCII runs".
        total += len(_UNIT_RE.findall(stripped))
    return total
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: print a per-section length report.

    Reads every ``*.md`` file in ``sections_dir`` (sorted by name), measures
    it with ``count_chars`` and compares the result with the budget whose key
    is a prefix of the file stem, for the selected ``--type`` and ``--lang``.
    Files without a matching budget are listed but never flagged.

    Raises:
        SystemExit: with status 2 when ``sections_dir`` is not a directory or
            holds no ``.md`` file, and with status 1 when at least one
            section is over or under its budget. argparse also exits on
            invalid arguments.
    """
    ap = argparse.ArgumentParser(description="论文章节篇幅核算")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--type", required=True, choices=list(BUDGETS))
    ap.add_argument("--lang", required=True, choices=["zh", "en"])
    args = ap.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    budget = BUDGETS[args.type]
    files = sorted(args.sections_dir.glob("*.md"))
    if not files:
        print(f"[ERR] no .md found in {args.sections_dir}", file=sys.stderr)
        sys.exit(2)

    unit = "词" if args.lang == "en" else "字"
    print(f"\n[篇幅核算] type={args.type} lang={args.lang} (口径: {unit})\n")
    header = f"{'章节':<26} {'篇幅':>8} {'下限':>6} {'上限':>6} 状态"
    print(header)
    print("-" * len(header))

    total = 0
    overflow = 0
    underflow = 0
    for f in files:
        n = count_chars(f.read_text(encoding="utf-8"))
        total += n
        stem = f.stem
        # The first budget key that prefixes the file stem wins.
        bud = next(
            (val for key, val in budget.items() if stem.startswith(key)),
            None,
        )
        if bud is None:
            # Pad the two placeholders to the same width as the bound columns
            # so that this row lines up with the header and budgeted rows.
            print(
                f"{stem:<26} {n:>8} {'-':>6} {'-':>6} "
                "(no budget; thematic/aux section)"
            )
            continue
        lo, hi = bud[args.lang]
        status = "OK"
        if n > hi:
            status = f"WARN over {n - hi}"
            overflow += 1
        elif n < lo:
            status = f"WARN under {lo - n}"
            underflow += 1
        print(f"{stem:<26} {n:>8} {lo:>6} {hi:>6} {status}")

    print("-" * len(header))
    print(f"{'合计':<26} {total:>8}")
    if overflow or underflow:
        print(f"\n[WARN] {overflow} 项超出 / {underflow} 项不足 (含摘要/正文)。回头调整。")
        # A non-zero status lets callers detect out-of-budget sections.
        sys.exit(1)
    print("\n[OK] 全部章节篇幅合规。")
|
|
|
|
|
|
# Run the report only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|