"""申报书质量检查 — 在交付前跑一遍。 检查项 (来自 review_redlines.md): - 结构完整性: 必备章节是否都有 - 假大空话术: "国际领先 / 首次提出 / 填补空白" 等敏感词 - 指标可考核性: 是否有"显著 / 大幅 / 优异"等不可量化词 - 引文真实性: 占位符 [REF-xx] / [Smith et al., 2023] / 是否还在 - 经费表占位符: 总预算 / 中央财政 等是否还是空格 用法: python quality_check.py --fund-type key_rd """ from __future__ import annotations import argparse import re import sys from pathlib import Path REQUIRED_SECTIONS: dict[str, list[str]] = { "key_rd": [ "00_basic_info", "01_summary", "02_background", "03_objectives", "04_content", "05_decomposition", "06_innovation", "07_benefit", "08_basis", "09_schedule", "10_organization", "11_team", "12_budget", "13_appendix", ], "major_project": [ "00_basic_info", "01_objectives", "02_content", "03_innovation", "04_benefit", "05_schedule", "06_organization", "07_ip", "08_terms", "09_personnel", "10_budget", "11_appendix", ], "nsfc_joint_fund": ["01_research_content", "02_basis", "03_other"], "nsfc_general": ["01_research_content", "02_basis", "03_other"], "nsfc_youth": ["01_research_content", "02_basis", "03_other"], "provincial": ["01_research_content", "02_basis"], "enterprise": ["00_overview", "01_objectives", "02_content", "03_schedule", "04_budget"], } # 假大空 / 不可考核词组 HOLLOW_PHRASES = [ "国际领先", "国际一流", "填补空白", "首次提出", "重大突破", "立足国际前沿", "聚焦关键核心", "世界一流", "深远影响", "独树一帜", "重要意义", "划时代", ] UNQUANTIFIABLE_WORDS = [ "显著提升", "大幅提升", "明显改善", "性能优异", "体验优良", "极大", "大大", "若干", "大量", "多种", ] PLACEHOLDER_PATTERNS = [ r"]*>", r"\[REF-[A-Z0-9]+\]", r"\[Smith et al", # 演示用占位 r"XXX 万元", r"XX 万元", r"X 年 X 月", ] def check_structure(sections_dir: Path, fund_type: str) -> list[str]: required = REQUIRED_SECTIONS.get(fund_type, []) existing = {f.stem for f in sections_dir.glob("*.md")} issues = [] for req in required: if not any(s.startswith(req) for s in existing): issues.append(f"缺章节: {req}") return issues def check_phrases(text: str, file_label: str) -> list[str]: issues = [] for phrase in HOLLOW_PHRASES: if phrase in text: issues.append(f"[{file_label}] 假大空: '{phrase}'") for word in UNQUANTIFIABLE_WORDS: # 简单包含检查 (不区分上下文) if word in text: issues.append(f"[{file_label}] 不可考核: '{word}' — 改成具体数字") return issues def check_placeholders(text: str, file_label: str) -> list[str]: issues = [] for pat in PLACEHOLDER_PATTERNS: matches = re.findall(pat, text) for m in matches: issues.append(f"[{file_label}] 占位符未替换: '{m}'") return issues def parse_spec_metrics(spec_path: Path) -> list[str]: """从 spec_lock.md 的"7. 考核指标矩阵"段抽出"指南考核指标"那列。 寻找形如 `| 1 | 指南指标 | ... |` 的表行(序号 = 数字),取第 2 列。 返回每条指南指标的关键短语列表 (用于在 sections 中模糊匹配)。 """ if not spec_path.exists(): return [] txt = spec_path.read_text(encoding="utf-8") # 截取 "考核指标矩阵" 段到下一节标题 m = re.search(r"考核指标矩阵.*?(?=\n##\s|\Z)", txt, re.DOTALL) if not m: return [] block = m.group(0) out: list[str] = [] for line in block.splitlines(): if not line.strip().startswith("|"): continue cells = [c.strip() for c in line.strip().strip("|").split("|")] if len(cells) < 3: continue # 表头行 / 分隔行 跳过 if not cells[0].isdigit(): continue guide_metric = cells[1] if guide_metric and not guide_metric.startswith(" list[str]: """每条指南考核指标必须在某个章节里以**关键词**形式出现 (>=2 个核心词命中)。""" metrics = parse_spec_metrics(spec_path) if not metrics: return [] # 把 sections 全文拼起来 full = "\n".join(f.read_text(encoding="utf-8") for f in sections_dir.glob("*.md")) issues = [] for metric in metrics: # 提关键词: 取长度 >=2 的中文片段 / 数字 / 字母组合 tokens = re.findall(r"[一-鿿]{2,}|[A-Za-z][\w]*|\d+\.?\d*", metric) if not tokens: continue hits = sum(1 for t in tokens if t in full) # 至少命中 2 个 token, 且至少 30% 的 token 出现 if hits < 2 or hits / len(tokens) < 0.3: issues.append(f"指南指标可能未在正文覆盖: '{metric[:50]}...' (命中 {hits}/{len(tokens)} 关键词)") return issues def main() -> None: ap = argparse.ArgumentParser(description="申报书质量检查") ap.add_argument("sections_dir", type=Path) ap.add_argument("--fund-type", required=True, choices=list(REQUIRED_SECTIONS.keys())) ap.add_argument("--spec", type=Path, default=None, help="spec_lock.md 路径; 提供后会做指南考核指标覆盖度检查") ap.add_argument("--strict", action="store_true", help="严格模式: 任何检查项失败均退出 1") args = ap.parse_args() if not args.sections_dir.is_dir(): print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr) sys.exit(2) print(f"\n[质量检查] fund_type={args.fund_type}\n") all_issues: list[str] = [] # 1. 结构 struct = check_structure(args.sections_dir, args.fund_type) if struct: print("[ERR] 结构问题:") for s in struct: print(f" -{s}") all_issues.extend(struct) else: print("[OK] 结构完整") # 2-4. 内容 (假大空 / 不可考核词 / 占位符) files = sorted(args.sections_dir.glob("*.md")) print(f"\n共 {len(files)} 个章节, 逐章扫描...\n") for f in files: text = f.read_text(encoding="utf-8") label = f.stem ph = check_phrases(text, label) ph_holders = check_placeholders(text, label) sub_issues = ph + ph_holders if sub_issues: print(f"[WARN] {label}:") for s in sub_issues: print(f" -{s.split('] ', 1)[1]}") all_issues.extend(sub_issues) # 5. 指南覆盖度 (--spec 提供时) if args.spec: if not args.spec.exists(): print(f"\n[ERR] spec 文件不存在: {args.spec}") all_issues.append("spec 文件不存在") else: print(f"\n[指南覆盖度] 对照 {args.spec.name}") cov_issues = check_spec_coverage(args.sections_dir, args.spec) if cov_issues: print("[WARN] 部分指南指标可能在正文未充分覆盖:") for s in cov_issues: print(f" -{s}") all_issues.extend(cov_issues) else: print("[OK] 指南考核指标在正文均有体现") print("\n" + "=" * 60) if all_issues: print(f"[WARN] 共发现 {len(all_issues)} 个问题。") print("\n建议:") print(" - 假大空词组 -> 换成具体数字 / 对比") print(" - 不可考核词 -> 量化指标 (TPS / 准确率 / 万元 / N 篇)") print(" - 占位符未替换 -> 找用户提供真实数据 / 替换 ") print(" - 未覆盖指南指标 -> 在对应章节明确写出该指标的实现方式") if args.strict: sys.exit(1) else: print("[OK] 全部检查通过, 可以渲染 docx 了。") if __name__ == "__main__": main()