# zcbot/skills/brief/scripts/quality_check.py

"""科研方向简报质量检查 — 渲染前跑一遍。

检查项:
- 结构完整性: 按深度(flash/standard/deep)必备段落是否齐全
- 占位符泄漏: <TODO> / [CITE-xx] 占位是否还在
- 过度宣称: "国际领先 / 首次 / 颠覆 / unprecedented" 等无证据夸张词(简报要有判断但别吹)
- 无源句式: "据报道 / 有研究表明 / 业内普遍认为" 等不挂引文的论断(简报每条论断要可溯源)
- 引文交叉核对: 文中学术引 [n] 与参考文献清单 [n] 互查(orphan / uncited / 编号连续)
- web 来源计数: [W1].. 单独统计,提醒和学术引文分开

用法:
  python quality_check.py <sections_dir> --depth standard
  python quality_check.py <sections_dir> --depth flash --strict
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path


# Sections each depth level must contain. Entries are matched as filename-stem
# prefixes, except "references", which is detected separately.
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "flash": [
        "00_tldr",
        "02_clusters",
        "references",
    ],
    "standard": [
        "00_tldr",
        "01_overview",
        "02_clusters",
        "05_gaps",
        "references",
    ],
    "deep": [
        "00_tldr",
        "01_overview",
        "02_clusters",
        "04_progress",
        "05_gaps",
        "references",
    ],
}

# Inclusive (min, max) number of hot-topic clusters allowed per depth level,
# counted from the cluster headings found in the 02_clusters section.
CLUSTER_BUDGET: dict[str, tuple[int, int]] = {
    "flash": (2, 3),
    "standard": (3, 5),
    "deep": (5, 7),
}

# Hype wording that a brief should not use without evidence
# (Chinese entries first, then English ones).
OVERCLAIM_PHRASES = [
    "国际领先",
    "国际一流",
    "世界领先",
    "世界一流",
    "填补空白",
    "重大突破",
    "划时代",
    "前所未有",
    "颠覆性",
    "革命性",
    "world-first",
    "world-leading",
    "unprecedented",
    "groundbreaking",
    "revolutionary",
    "state of the art",
]

# Hearsay-style openers: when one appears in a paragraph that carries no
# citation marker, the claim is reported as unsourced.
UNSOURCED_PHRASES = [
    "据报道",
    "有研究表明",
    "研究显示",
    "业内普遍认为",
    "众所周知",
    "大量研究",
    "普遍认为",
    "据悉",
]

# Regexes for template placeholders that must not survive into a rendered brief.
# None of the patterns contains a capture group, so re.findall() yields the
# full matched text for each hit.
PLACEHOLDER_PATTERNS = [
    # <TODO> or <TODO ...anything up to the closing bracket...>
    r"<TODO[^>]*>",
    # Unresolved citation key such as [CITE-smith_2020]
    r"\[CITE-[A-Za-z0-9_\-]+\]",
    # A run of two or more 'X' used as a stand-in value. The run is delimited
    # with ASCII-only lookarounds instead of \b: for str patterns \b is
    # Unicode-aware and treats CJK characters as word characters, so a
    # placeholder glued to Chinese text (e.g. "提升了XX%") would never match.
    r"(?<![A-Za-z0-9_])XX+(?![A-Za-z0-9_])",
]

# In-text academic citation such as [7], [7-9] or [7,9]; group 1 is the text
# between the brackets.
_INTEXT_CITE_RE = re.compile(r"\[(\d[\d,\s\-]*)\]")
# Reference-list entry: a line beginning (after optional whitespace) with [n].
_REF_ENTRY_RE = re.compile(r"^\s*\[(\d+)\]")
# Web-source marker such as [W1].
_WEB_CITE_RE = re.compile(r"\[W\d+\]")
# Any citation marker: a numeric [n] group or a [CITE-key] placeholder.
_CITE_TOKEN_RE = re.compile(r"\[(?:\d[\d,\s\-]*|CITE-[A-Za-z0-9_\-]+)\]")


def _is_references_file(stem: str) -> bool:
    """Return True when a section filename stem denotes the reference list.

    A stem qualifies when it contains "reference" (case-insensitive), ends
    with "_refs" (case-insensitive), contains "参考文献", or starts with the
    "08_" section-number prefix.

    The "08_" marker is tested as a prefix only. Matching it anywhere in the
    stem would misclassify sections such as "04_progress_2008_2024" as the
    reference list.
    """
    lowered = stem.lower()
    return (
        "reference" in lowered
        or lowered.endswith("_refs")
        or "参考文献" in stem
        or lowered.startswith("08_")
    )


def check_structure(sections_dir: Path, depth: str) -> list[str]:
    """Report required sections that are missing for the given depth.

    The stems of all ``*.md`` files in ``sections_dir`` are compared with the
    ``REQUIRED_SECTIONS`` entry for ``depth``. Ordinary entries are matched as
    stem prefixes; the special entry "references" is satisfied by any stem
    that ``_is_references_file`` accepts. An unknown depth requires nothing.

    Returns one message per missing section, in the order they are required.
    """
    stems = {path.stem for path in sections_dir.glob("*.md")}
    problems = []
    for wanted in REQUIRED_SECTIONS.get(depth, []):
        if wanted == "references":
            present = any(_is_references_file(stem) for stem in stems)
            message = "缺段落: 参考文献 (08_references)"
        else:
            present = any(stem.startswith(wanted) for stem in stems)
            message = f"缺段落: {wanted}"
        if not present:
            problems.append(message)
    return problems


def check_clusters(sections_dir: Path, depth: str) -> list[str]:
    """Check the number of cluster headings against the budget for ``depth``.

    Headings are lines made of two to four ``#`` characters followed by
    optional whitespace and the character "簇". They are counted across every
    ``*.md`` file whose stem starts with "02_clusters".

    Returns a single-message list when no heading is found or the count falls
    outside the inclusive budget, otherwise an empty list. A depth without a
    ``CLUSTER_BUDGET`` entry falls back to the range (0, 999).
    """
    lo, hi = CLUSTER_BUDGET.get(depth, (0, 999))
    heading = re.compile(r"^#{2,4}\s*簇", re.MULTILINE)
    count = sum(
        len(heading.findall(path.read_text(encoding="utf-8")))
        for path in sections_dir.glob("*.md")
        if path.stem.startswith("02_clusters")
    )

    if count == 0:
        return ["02_clusters 里没找到 '### 簇N' 小节 — 热点聚类是简报主体"]
    if count < lo:
        return [f"热点簇 {count} 个, 少于 {depth} 档预算 {lo}-{hi} (簇太少, 方向覆盖可能不足)"]
    if count > hi:
        return [f"热点簇 {count} 个, 多于 {depth} 档预算 {lo}-{hi} (簇太多, 考虑合并 — 重要性优先于完整性)"]
    return []


def check_phrases(text: str, label: str) -> list[str]:
    """Flag over-claiming phrases that occur in ``text``.

    Each entry of ``OVERCLAIM_PHRASES`` is looked up both verbatim and
    case-insensitively. Every phrase found produces exactly one message,
    prefixed with ``[label]``, in list order.
    """
    lowered = text.lower()
    return [
        f"[{label}] 过度宣称: '{phrase}' — 换成可被数据支撑的具体表述"
        for phrase in OVERCLAIM_PHRASES
        if phrase.lower() in lowered or phrase in text
    ]


def check_unsourced(text: str, label: str) -> list[str]:
    """Flag paragraphs that use a hearsay phrase without any citation marker.

    ``text`` is split into paragraphs on blank lines. A paragraph containing
    a numeric/[CITE-] citation token or a [W..] web marker is skipped.
    Otherwise the first ``UNSOURCED_PHRASES`` entry found in it yields one
    message (at most one per paragraph) that quotes the first 40 characters
    of the paragraph with newlines replaced by spaces.
    """
    findings = []
    for paragraph in re.split(r"\n\s*\n", text):
        has_citation = bool(
            _CITE_TOKEN_RE.search(paragraph) or _WEB_CITE_RE.search(paragraph)
        )
        if has_citation:
            continue
        # Only the first matching phrase is reported for a paragraph.
        hit = next((p for p in UNSOURCED_PHRASES if p in paragraph), None)
        if hit is None:
            continue
        preview = paragraph.strip().replace("\n", " ")[:40]
        findings.append(f"[{label}] 无源论断: '{hit}' 所在段无引文标记 — 挂 [n] 或删 (\"{preview}...\")")
    return findings


def check_placeholders(text: str, label: str) -> list[str]:
    """List every leftover placeholder in ``text``.

    Each regex in ``PLACEHOLDER_PATTERNS`` is applied in turn. Every
    occurrence produces its own message prefixed with ``[label]``, grouped by
    pattern and in order of appearance within each pattern.
    """
    return [
        f"[{label}] 占位符未替换: '{found}'"
        for pattern in PLACEHOLDER_PATTERNS
        for found in re.findall(pattern, text)
    ]


def _expand_cite_group(grp: str) -> set[int]:
    """Expand the inside of a citation bracket into the set of cited numbers.

    ``grp`` is a comma-separated list whose items are single numbers ("7") or
    hyphenated ranges ("7-9"). Empty items and items that do not parse as
    integers are ignored. A range contributes all of its numbers only when
    ``0 < start <= end <= 999``; any other range contributes nothing.
    """
    numbers: set[int] = set()
    for token in (piece.strip() for piece in grp.split(",")):
        if not token:
            continue
        start_txt, dash, end_txt = token.partition("-")
        try:
            if dash:
                first, last = int(start_txt), int(end_txt)
            else:
                first = last = int(token)
        except ValueError:
            # Malformed item (e.g. "7-", "1-2-3", "7 8"): skip it silently.
            continue
        if not dash:
            numbers.add(first)
        elif 0 < first <= last <= 999:
            numbers.update(range(first, last + 1))
    return numbers


def check_citations(sections_dir: Path) -> list[str]:
    """Cross-check in-text academic citations against the reference list.

    Every ``*.md`` file in ``sections_dir`` is read as UTF-8. Files whose stem
    ``_is_references_file`` accepts supply the reference entries (lines that
    start with ``[n]``) and the web entries (lines that start with ``[W..]``).
    All other files supply the in-text ``[n]`` citations and ``[W..]`` markers.

    Reported problems, in this order:
      * no academic citation found anywhere (neither in text nor in the list)
      * orphan cites: numbers cited in text with no reference entry
      * uncited refs: reference entries never cited in text
      * duplicated reference numbers
      * gaps in the 1..N numbering / numbering not starting at [1]
      * web markers in text while the reference list has no web entry

    Returns:
        The list of problem messages; empty when nothing was found.
    """
    issues: list[str] = []
    cited: set[int] = set()
    ref_nums: list[int] = []
    web_in_text = 0
    web_refs = 0

    # Compiled once instead of being looked up for every reference line.
    web_entry_re = re.compile(r"^\s*\[W\d+\]")

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        if _is_references_file(md.stem):
            for ln in text.splitlines():
                m = _REF_ENTRY_RE.match(ln)
                if m:
                    ref_nums.append(int(m.group(1)))
                if web_entry_re.match(ln):
                    web_refs += 1
        else:
            for grp in _INTEXT_CITE_RE.findall(text):
                cited.update(_expand_cite_group(grp))
            web_in_text += len(_WEB_CITE_RE.findall(text))

    ref_set = set(ref_nums)

    # No early return here: a brief without academic citations can still use
    # [W..] markers, and the web-source check at the end must run for it too.
    if not ref_set and not cited:
        issues.append("未发现任何学术引文 (文中 [n] 和参考文献清单都为空) — 简报论断需文献支撑")

    orphan = sorted(cited - ref_set)
    if orphan:
        issues.append(f"orphan cite — 文中引了 {orphan} 但参考文献清单缺对应条目 (编造/漏排, 走 citation_verify)")

    uncited = sorted(ref_set - cited)
    if uncited:
        issues.append(f"uncited ref — 参考文献第 {uncited} 条正文从未引用 (删除或补引)")

    # Single pass over the entries: a number seen a second time is a duplicate.
    seen: set[int] = set()
    repeated: set[int] = set()
    for num in ref_nums:
        if num in seen:
            repeated.add(num)
        else:
            seen.add(num)
    dups = sorted(repeated)
    if dups:
        issues.append(f"参考文献编号重复: {dups}")

    if ref_set:
        gaps = sorted(set(range(1, max(ref_set) + 1)) - ref_set)
        if gaps:
            issues.append(f"参考文献编号不连续, 缺号: {gaps} (顺序编码制需 1..N 连续)")
        if 1 not in ref_set:
            issues.append("参考文献编号未从 [1] 起")

    # Web sources are counted separately from academic citations; the only
    # check is that in-text [W..] markers have at least one [W..] entry.
    if web_in_text and not web_refs:
        issues.append(f"文中有 {web_in_text} 处 [W..] web 引用但参考文献无 [W..] 条目 — 补 web 来源(URL+日期)")

    return issues


def main() -> None:
    """Command-line entry point: run every check and print a report.

    Exits with status 2 when ``sections_dir`` is not a directory, and with
    status 1 when ``--strict`` is given and at least one problem was found.
    Otherwise the function returns normally.
    """
    parser = argparse.ArgumentParser(description="科研方向简报质量检查")
    parser.add_argument("sections_dir", type=Path)
    parser.add_argument("--depth", required=True, choices=list(REQUIRED_SECTIONS.keys()))
    parser.add_argument("--strict", action="store_true", help="严格模式: 任何问题退出 1")
    opts = parser.parse_args()
    root = opts.sections_dir
    depth = opts.depth

    if not root.is_dir():
        print(f"[ERR] {root} not a directory", file=sys.stderr)
        sys.exit(2)

    print(f"\n[简报质量检查] depth={depth}\n")
    collected: list[str] = []

    def report(found: list[str], failure_header: str, success_line: str) -> None:
        # Print one directory-level check: the success line when nothing was
        # found, otherwise the header plus a bullet per problem, and record
        # the problems for the final summary.
        if not found:
            print(success_line)
            return
        print(failure_header)
        for item in found:
            print(f"   - {item}")
        collected.extend(found)

    report(check_structure(root, depth), "[ERR] 结构问题:", "[OK] 结构完整")
    report(check_clusters(root, depth), "\n[WARN] 热点簇数:", "[OK] 热点簇数在预算内")

    section_files = sorted(root.glob("*.md"))
    print(f"\n共 {len(section_files)} 个段落, 逐段扫描 (过度宣称 / 无源论断 / 占位符)...\n")
    for section in section_files:
        body = section.read_text(encoding="utf-8")
        name = section.stem
        findings = [
            *check_phrases(body, name),
            *check_unsourced(body, name),
            *check_placeholders(body, name),
        ]
        if not findings:
            continue
        print(f"[WARN] {name}:")
        for entry in findings:
            # The file name is already in the header, so drop the leading
            # "[label] " prefix from each bullet when it is present.
            _, sep, tail = entry.partition("] ")
            print(f"   - {tail if sep else entry}")
        collected.extend(findings)

    report(
        check_citations(root),
        "\n[ERR] 引文交叉核对:",
        "\n[OK] 学术引文 [n] 与参考文献清单一致 (无 orphan / uncited, 编号连续)",
    )

    print("\n" + "=" * 60)
    if not collected:
        print("[OK] 未发现问题。")
        return

    print(f"[WARN] 共发现 {len(collected)} 个问题。")
    print("\n建议:")
    for hint in (
        "  - 过度宣称 -> 换成数据支撑的具体表述",
        "  - 无源论断 -> 挂 [n] 引文或删该句",
        "  - 占位符未替换 -> 走 citation_verify 把 [CITE-] 映射成真实引文",
        "  - orphan cite -> 大概率编造, 走 citation_verify 三角核验",
        "  - uncited ref -> 删条目或正文补引",
    ):
        print(hint)
    if opts.strict:
        sys.exit(1)


# Run the checker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()