zcbot/skills/proposal/scripts/quality_check.py

"""申报书质量检查 — 在交付前跑一遍。

检查项 (来自 review_redlines.md):
- 结构完整性: 必备章节是否都有
- 假大空话术: "国际领先 / 首次提出 / 填补空白" 等敏感词
- 指标可考核性: 是否有"显著 / 大幅 / 优异"等不可量化词
- 引文真实性: 占位符 [REF-xx] / [Smith et al., 2023] / <TODO> 是否还在
- 经费表占位符: 总预算 / 中央财政 等是否还是空格

用法:
  python quality_check.py <sections_dir> --fund-type key_rd
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path


# Required sections per fund type.
# Keys are the accepted values of the --fund-type CLI option (used as argparse
# `choices` in main). Each value lists section-name prefixes: check_structure
# considers a section present when some `*.md` file stem in the sections
# directory starts with that prefix.
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "key_rd": [
        "00_basic_info", "01_summary", "02_background", "03_objectives",
        "04_content", "05_decomposition", "06_innovation", "07_benefit",
        "08_basis", "09_schedule", "10_organization",
    ],
    "major_project": [
        "00_basic_info", "01_objectives", "02_content", "03_innovation",
        "04_benefit", "05_schedule", "06_organization", "07_ip",
        "08_terms", "09_personnel", "10_budget", "11_appendix",
    ],
    "nsfc_joint_fund": ["01_research_content", "02_basis", "03_other"],
    "nsfc_general":    ["01_research_content", "02_basis", "03_other"],
    "nsfc_youth":      ["01_research_content", "02_basis", "03_other"],
    "provincial":      ["01_research_content", "02_basis"],
    "enterprise":      ["00_overview", "01_objectives", "02_content", "03_schedule", "04_budget"],
}


# Hollow / grandiose claims and non-assessable wording.
# Both lists are matched by check_phrases with a plain substring test
# (`phrase in text`), so a hit is reported at most once per phrase per file
# and surrounding context is not considered.

# Grandiose claims, reported under the "假大空" label.
HOLLOW_PHRASES = [
    "国际领先", "国际一流", "填补空白", "首次提出", "重大突破",
    "立足国际前沿", "聚焦关键核心", "世界一流",
    "深远影响", "独树一帜", "重要意义", "划时代",
]
# Vague quantity / degree words, reported under the "不可考核" label with a
# hint to replace them with concrete numbers.
UNQUANTIFIABLE_WORDS = [
    "显著提升", "大幅提升", "明显改善", "性能优异", "体验优良",
    "极大", "大大", "若干", "大量", "多种",
]
# Regular expressions for template placeholders that must be replaced before
# delivery. check_placeholders runs re.findall with each pattern in turn and
# reports every match, so the patterns must not overlap: one placeholder in
# the text should produce exactly one finding. None of the patterns may
# contain capturing groups, otherwise re.findall would return the group
# instead of the full match.
PLACEHOLDER_PATTERNS = [
    r"<TODO[^>]*>",
    # Reference placeholders. Lowercase is accepted as well because the
    # module docstring names the literal "[REF-xx]" as a placeholder.
    r"\[REF-[A-Za-z0-9]+\]",
    r"\[Smith et al",  # demo citation placeholder
    r"XXX 万元",
    # The lookbehind keeps this from matching the tail of "XXX 万元", which
    # the previous pattern already reports.
    r"(?<!X)XX 万元",
    r"X 年 X 月",
]


def check_structure(sections_dir: Path, fund_type: str) -> list[str]:
    """Report required sections that have no matching Markdown file.

    A required section counts as present when the stem of at least one
    ``*.md`` file directly inside ``sections_dir`` starts with the required
    name.

    Args:
        sections_dir: Directory holding the per-section ``*.md`` files.
        fund_type: Key into ``REQUIRED_SECTIONS``; an unknown key yields no
            requirements and therefore an empty result.

    Returns:
        One ``"缺章节: <name>"`` message per missing section, in the order the
        sections are listed in ``REQUIRED_SECTIONS``.
    """
    stems = {md.stem for md in sections_dir.glob("*.md")}
    return [
        f"缺章节: {prefix}"
        for prefix in REQUIRED_SECTIONS.get(fund_type, [])
        if not any(stem.startswith(prefix) for stem in stems)
    ]


def check_phrases(text: str, file_label: str) -> list[str]:
    """Flag grandiose claims and non-assessable wording found in ``text``.

    Matching is a plain substring test with no regard for context, so each
    listed phrase is reported at most once per call.

    Args:
        text: Full content of one section.
        file_label: Label put in the ``[...]`` prefix of every message.

    Returns:
        Messages for ``HOLLOW_PHRASES`` hits first, then for
        ``UNQUANTIFIABLE_WORDS`` hits, each group in list order.
    """
    hollow_hits = [
        f"[{file_label}] 假大空: '{phrase}'"
        for phrase in HOLLOW_PHRASES
        if phrase in text
    ]
    vague_hits = [
        f"[{file_label}] 不可考核: '{word}' — 改成具体数字"
        for word in UNQUANTIFIABLE_WORDS
        if word in text
    ]
    return hollow_hits + vague_hits


def check_placeholders(text: str, file_label: str) -> list[str]:
    """Flag template placeholders that are still present in ``text``.

    Args:
        text: Full content of one section.
        file_label: Label put in the ``[...]`` prefix of every message.

    Returns:
        One message per ``re.findall`` result, grouped by pattern in
        ``PLACEHOLDER_PATTERNS`` order and, within a pattern, in order of
        appearance in ``text``.
    """
    return [
        f"[{file_label}] 占位符未替换: '{hit}'"
        for pattern in PLACEHOLDER_PATTERNS
        for hit in re.findall(pattern, text)
    ]


def main() -> None:
    """Command-line entry point: run all checks over a sections directory.

    Parses ``sections_dir``, ``--fund-type`` and ``--strict``, prints the
    structure report, then the per-file phrase / placeholder findings, then
    a summary with remediation hints.

    Exit status:
        2 -- ``sections_dir`` is not a directory (argparse also exits 2 on
             invalid arguments).
        1 -- issues were found and ``--strict`` was given, or at least one
             section file could not be read (the scan was incomplete).
        0 -- otherwise (the function simply returns).
    """
    ap = argparse.ArgumentParser(description="申报书质量检查")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--fund-type", required=True, choices=list(REQUIRED_SECTIONS.keys()))
    ap.add_argument("--strict", action="store_true",
                    help="严格模式: 任何检查项失败均退出 1")
    args = ap.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    print(f"\n[质量检查]申报书质量检查 ({args.fund_type})\n")

    all_issues: list[str] = []

    # 1. Structure: are all required sections present?
    struct = check_structure(args.sections_dir, args.fund_type)
    if struct:
        print("[ERR] 结构问题:")
        for s in struct:
            print(f"   -{s}")
        all_issues.extend(struct)
    else:
        print("[OK] 结构完整")

    # 2-4. Content: hollow phrases, unquantifiable words, placeholders.
    files = sorted(args.sections_dir.glob("*.md"))
    print(f"\n共 {len(files)} 个章节, 逐章扫描...\n")
    unreadable = False
    for f in files:
        label = f.stem
        prefix = f"[{label}] "
        try:
            text = f.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # Report the file and keep scanning the rest instead of aborting
            # the whole run with a traceback.
            unreadable = True
            sub_issues = [f"{prefix}无法读取文件: {exc}"]
        else:
            sub_issues = check_phrases(text, label) + check_placeholders(text, label)
        if sub_issues:
            print(f"[WARN] {label}:")
            for s in sub_issues:
                # The label is already shown in the heading, so drop the
                # "[label] " prefix. Strip it by length rather than by
                # searching for "] ", which would cut at the wrong place when
                # the file stem itself contains "] ".
                shown = s[len(prefix):] if s.startswith(prefix) else s
                print(f"   -{shown}")
            all_issues.extend(sub_issues)

    print("\n" + "=" * 60)
    if all_issues:
        print(f"[WARN] 共发现 {len(all_issues)} 个问题。")
        print("\n建议:")
        print("  - 假大空词组 → 换成具体数字 / 对比")
        print("  - 不可考核词 → 量化指标 (TPS / 准确率 / 万元 / N 篇)")
        print("  - 占位符未替换 → 找用户提供真实数据 / 替换 <TODO>")
        # An unreadable file means the scan was incomplete, so the run must
        # not look successful even without --strict.
        if args.strict or unreadable:
            sys.exit(1)
    else:
        print("[OK] 全部检查通过, 可以渲染 docx 了。")

if __name__ == "__main__":
    main()