"""科研方向简报质量检查 — 渲染前跑一遍。 检查项: - 结构完整性: 按深度(flash/standard/deep)必备段落是否齐全 - 占位符泄漏: / [CITE-xx] 占位是否还在 - 过度宣称: "国际领先 / 首次 / 颠覆 / unprecedented" 等无证据夸张词(简报要有判断但别吹) - 无源句式: "据报道 / 有研究表明 / 业内普遍认为" 等不挂引文的论断(简报每条论断要可溯源) - 引文交叉核对: 文中学术引 [n] 与参考文献清单 [n] 互查(orphan / uncited / 编号连续) - web 来源计数: [W1].. 单独统计,提醒和学术引文分开 用法: python quality_check.py --depth standard python quality_check.py --depth flash --strict """ from __future__ import annotations import argparse import re import sys from pathlib import Path # 各深度必备段落(stem 前缀匹配;references 单独判) REQUIRED_SECTIONS: dict[str, list[str]] = { "flash": ["00_tldr", "02_clusters", "references"], "standard": ["00_tldr", "01_overview", "02_clusters", "05_gaps", "references"], "deep": ["00_tldr", "01_overview", "02_clusters", "04_progress", "05_gaps", "references"], } # 各深度热点簇数预算(02_clusters 里 '### 簇' 计数) CLUSTER_BUDGET: dict[str, tuple[int, int]] = { "flash": (2, 3), "standard": (3, 5), "deep": (5, 7), } OVERCLAIM_PHRASES = [ "国际领先", "国际一流", "世界领先", "世界一流", "填补空白", "重大突破", "划时代", "前所未有", "颠覆性", "革命性", "world-first", "world-leading", "unprecedented", "groundbreaking", "revolutionary", "state of the art", ] # 无源句式: 出现这些但同段没有 [n]/[CITE-] 引文 → 论断悬空 UNSOURCED_PHRASES = [ "据报道", "有研究表明", "研究显示", "业内普遍认为", "众所周知", "大量研究", "普遍认为", "据悉", ] PLACEHOLDER_PATTERNS = [ r"]*>", r"\[CITE-[A-Za-z0-9_\-]+\]", r"\bXX+\b", ] _INTEXT_CITE_RE = re.compile(r"\[(\d[\d,\s\-]*)\]") # 学术引 [7] / [7-9] / [7,9] _REF_ENTRY_RE = re.compile(r"^\s*\[(\d+)\]") # 参考文献条目 [n] _WEB_CITE_RE = re.compile(r"\[W\d+\]") # web 来源 [W1] _CITE_TOKEN_RE = re.compile(r"\[(?:\d[\d,\s\-]*|CITE-[A-Za-z0-9_\-]+)\]") def _is_references_file(stem: str) -> bool: s = stem.lower() return "reference" in s or s.endswith("_refs") or "参考文献" in stem or "08_" in s def check_structure(sections_dir: Path, depth: str) -> list[str]: required = REQUIRED_SECTIONS.get(depth, []) existing = {f.stem for f in sections_dir.glob("*.md")} issues = [] for req in required: if req == "references": if not any(_is_references_file(s) for s in existing): issues.append("缺段落: 参考文献 (08_references)") continue if not any(s.startswith(req) for s in existing): issues.append(f"缺段落: {req}") return issues def check_clusters(sections_dir: Path, depth: str) -> list[str]: lo, hi = CLUSTER_BUDGET.get(depth, (0, 999)) n = 0 for md in sections_dir.glob("*.md"): if md.stem.startswith("02_clusters"): n += len(re.findall(r"^#{2,4}\s*簇", md.read_text(encoding="utf-8"), re.MULTILINE)) if n == 0: return ["02_clusters 里没找到 '### 簇N' 小节 — 热点聚类是简报主体"] if n < lo: return [f"热点簇 {n} 个, 少于 {depth} 档预算 {lo}-{hi} (簇太少, 方向覆盖可能不足)"] if n > hi: return [f"热点簇 {n} 个, 多于 {depth} 档预算 {lo}-{hi} (簇太多, 考虑合并 — 重要性优先于完整性)"] return [] def check_phrases(text: str, label: str) -> list[str]: issues = [] low = text.lower() for phrase in OVERCLAIM_PHRASES: if phrase in text or phrase.lower() in low: issues.append(f"[{label}] 过度宣称: '{phrase}' — 换成可被数据支撑的具体表述") return issues def check_unsourced(text: str, label: str) -> list[str]: """无源句式: 整段出现却无任何引文标记 → 悬空论断。按段落(空行分隔)判。""" issues = [] for para in re.split(r"\n\s*\n", text): if _CITE_TOKEN_RE.search(para) or _WEB_CITE_RE.search(para): continue # 本段有引文, 放过 for phrase in UNSOURCED_PHRASES: if phrase in para: snippet = para.strip().replace("\n", " ")[:40] issues.append(f"[{label}] 无源论断: '{phrase}' 所在段无引文标记 — 挂 [n] 或删 (\"{snippet}...\")") break return issues def check_placeholders(text: str, label: str) -> list[str]: issues = [] for pat in PLACEHOLDER_PATTERNS: for m in re.findall(pat, text): issues.append(f"[{label}] 占位符未替换: '{m}'") return issues def _expand_cite_group(grp: str) -> set[int]: out: set[int] = set() for part in grp.split(","): part = part.strip() if not part: continue if "-" in part: a, _, b = part.partition("-") try: lo, hi = int(a), int(b) except ValueError: continue if 0 < lo <= hi <= 999: out.update(range(lo, hi + 1)) else: try: out.add(int(part)) except ValueError: continue return out def check_citations(sections_dir: Path) -> list[str]: issues: list[str] = [] cited: set[int] = set() ref_nums: list[int] = [] web_in_text = 0 web_refs = 0 for md in sorted(sections_dir.glob("*.md")): text = md.read_text(encoding="utf-8") if _is_references_file(md.stem): for ln in text.splitlines(): m = _REF_ENTRY_RE.match(ln) if m: ref_nums.append(int(m.group(1))) if re.match(r"^\s*\[W\d+\]", ln): web_refs += 1 else: for grp in _INTEXT_CITE_RE.findall(text): cited.update(_expand_cite_group(grp)) web_in_text += len(_WEB_CITE_RE.findall(text)) if not ref_nums and not cited: return ["未发现任何学术引文 (文中 [n] 和参考文献清单都为空) — 简报论断需文献支撑"] ref_set = set(ref_nums) orphan = sorted(cited - ref_set) if orphan: issues.append(f"orphan cite — 文中引了 {orphan} 但参考文献清单缺对应条目 (编造/漏排, 走 citation_verify)") uncited = sorted(ref_set - cited) if uncited: issues.append(f"uncited ref — 参考文献第 {uncited} 条正文从未引用 (删除或补引)") dups = sorted({n for n in ref_nums if ref_nums.count(n) > 1}) if dups: issues.append(f"参考文献编号重复: {dups}") if ref_set: gaps = sorted(set(range(1, max(ref_set) + 1)) - ref_set) if gaps: issues.append(f"参考文献编号不连续, 缺号: {gaps} (顺序编码制需 1..N 连续)") if 1 not in ref_set: issues.append("参考文献编号未从 [1] 起") # web 来源只提示, 不判错(它们和学术引文分开计数) if web_in_text and not web_refs: issues.append(f"文中有 {web_in_text} 处 [W..] web 引用但参考文献无 [W..] 条目 — 补 web 来源(URL+日期)") return issues def main() -> None: ap = argparse.ArgumentParser(description="科研方向简报质量检查") ap.add_argument("sections_dir", type=Path) ap.add_argument("--depth", required=True, choices=list(REQUIRED_SECTIONS.keys())) ap.add_argument("--strict", action="store_true", help="严格模式: 任何问题退出 1") args = ap.parse_args() if not args.sections_dir.is_dir(): print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr) sys.exit(2) print(f"\n[简报质量检查] depth={args.depth}\n") all_issues: list[str] = [] struct = check_structure(args.sections_dir, args.depth) if struct: print("[ERR] 结构问题:") for s in struct: print(f" - {s}") all_issues.extend(struct) else: print("[OK] 结构完整") cl = check_clusters(args.sections_dir, args.depth) if cl: print("\n[WARN] 热点簇数:") for s in cl: print(f" - {s}") all_issues.extend(cl) else: print("[OK] 热点簇数在预算内") files = sorted(args.sections_dir.glob("*.md")) print(f"\n共 {len(files)} 个段落, 逐段扫描 (过度宣称 / 无源论断 / 占位符)...\n") for f in files: text = f.read_text(encoding="utf-8") sub = (check_phrases(text, f.stem) + check_unsourced(text, f.stem) + check_placeholders(text, f.stem)) if sub: print(f"[WARN] {f.stem}:") for s in sub: print(f" - {s.split('] ', 1)[1] if '] ' in s else s}") all_issues.extend(sub) cite_issues = check_citations(args.sections_dir) if cite_issues: print("\n[ERR] 引文交叉核对:") for s in cite_issues: print(f" - {s}") all_issues.extend(cite_issues) else: print("\n[OK] 学术引文 [n] 与参考文献清单一致 (无 orphan / uncited, 编号连续)") print("\n" + "=" * 60) if all_issues: print(f"[WARN] 共发现 {len(all_issues)} 个问题。") print("\n建议:") print(" - 过度宣称 -> 换成数据支撑的具体表述") print(" - 无源论断 -> 挂 [n] 引文或删该句") print(" - 占位符未替换 -> 走 citation_verify 把 [CITE-] 映射成真实引文") print(" - orphan cite -> 大概率编造, 走 citation_verify 三角核验") print(" - uncited ref -> 删条目或正文补引") if args.strict: sys.exit(1) else: print("[OK] 未发现问题。") if __name__ == "__main__": main()