"""核算各章节篇幅, 对照论文类型 + 语言的预算, 输出表格。 计量: CJK 字符按 1 字; 连续 ASCII 串 (英文单词 / 数字) 按 1 计 —— 对中文稿近似"字数", 对英文稿近似"词数"。预算按 (paper_type, lang) 取, 两套不同口径。 用法: python word_count.py --type original --lang en python word_count.py --type original --lang zh """ from __future__ import annotations import argparse import re import sys from pathlib import Path # BUDGETS[type][section] = {"en": (lo, hi), "zh": (lo, hi), "desc": str} # en 口径=词数, zh 口径=字数。综述的 thematic 主体章节不设固定预算 (篇数可变)。 BUDGETS: dict[str, dict[str, dict]] = { "original": { "00_title_abstract": {"en": (150, 320), "zh": (200, 400), "desc": "Title + Abstract + Keywords"}, "01_introduction": {"en": (600, 1000), "zh": (1000, 1800), "desc": "Introduction"}, "02_methods": {"en": (800, 1600), "zh": (1200, 2400), "desc": "Materials & Methods"}, "03_results": {"en": (1000, 1900), "zh": (1500, 2900), "desc": "Results"}, "04_discussion": {"en": (800, 1600), "zh": (1200, 2400), "desc": "Discussion"}, "05_conclusion": {"en": (180, 400), "zh": (300, 600), "desc": "Conclusion"}, }, "review": { "00_title_abstract": {"en": (180, 350), "zh": (250, 450), "desc": "Title + Abstract + Keywords"}, "01_introduction": {"en": (700, 1300), "zh": (1200, 2200), "desc": "Introduction"}, # 02_..NN_ thematic 主体不设预算 (篇数可变, word_count 标 no budget) "98_outlook": {"en": (500, 1100), "zh": (800, 1800), "desc": "Challenges & Outlook"}, "99_conclusion": {"en": (180, 450), "zh": (300, 700), "desc": "Conclusion"}, }, "letter": { "00_title_abstract": {"en": (120, 250), "zh": (180, 350), "desc": "Title + Abstract"}, "01_main": {"en": (1500, 3000), "zh": (2000, 4000), "desc": "Main text (condensed)"}, }, } _HEADING_RE = re.compile(r"^#+\s+") _BLOCKQUOTE_RE = re.compile(r"^>") _TABLE_LINE_RE = re.compile(r"^\s*\|") def count_chars(text: str) -> int: n = 0 for line in text.splitlines(): stripped = line.strip() if not stripped: continue if _HEADING_RE.match(stripped) or _BLOCKQUOTE_RE.match(stripped) or _TABLE_LINE_RE.match(stripped): continue if stripped.startswith(""): continue for c in stripped: if "一" <= c <= "鿿": n += 1 n += len(re.findall(r"[A-Za-z0-9]+", stripped)) return n def main() -> None: ap = argparse.ArgumentParser(description="论文章节篇幅核算") ap.add_argument("sections_dir", type=Path) ap.add_argument("--type", required=True, choices=list(BUDGETS.keys())) ap.add_argument("--lang", required=True, choices=["zh", "en"]) args = ap.parse_args() if not args.sections_dir.is_dir(): print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr) sys.exit(2) budget = BUDGETS[args.type] files = sorted(args.sections_dir.glob("*.md")) if not files: print(f"[ERR] no .md found in {args.sections_dir}", file=sys.stderr) sys.exit(2) unit = "词" if args.lang == "en" else "字" print(f"\n[篇幅核算] type={args.type} lang={args.lang} (口径: {unit})\n") header = f"{'章节':<26} {'篇幅':>8} {'下限':>6} {'上限':>6} 状态" print(header) print("-" * len(header)) total = 0 overflow = 0 underflow = 0 for f in files: text = f.read_text(encoding="utf-8") n = count_chars(text) total += n stem = f.stem bud = None for key, val in budget.items(): if stem.startswith(key): bud = val break if bud is None: print(f"{stem:<26} {n:>8} - - (no budget; thematic/aux section)") continue lo, hi = bud[args.lang] status = "OK" if n > hi: status = f"WARN over {n - hi}" overflow += 1 elif n < lo: status = f"WARN under {lo - n}" underflow += 1 print(f"{stem:<26} {n:>8} {lo:>6} {hi:>6} {status}") print("-" * len(header)) print(f"{'合计':<26} {total:>8}") if overflow or underflow: print(f"\n[WARN] {overflow} 项超出 / {underflow} 项不足 (含摘要/正文)。回头调整。") sys.exit(1) print("\n[OK] 全部章节篇幅合规。") if __name__ == "__main__": main()