# zcbot/skills/paper/scripts/quality_check.py

"""论文投稿稿质量检查 — 渲染 docx 前跑一遍。

检查项:
- 结构完整性: 论文类型必备章节是否齐全
- 占位符泄漏: <TODO> / [REF-xx] / [CITE-xx] / (Author, year) 占位是否还在
- 过度宣称: "国际领先 / 首次 / world-first / unprecedented" 等无证据夸张词
- 插图: figures/ 有 png 但 sections 无 ![]() 引用; 代码块 ASCII 字符画; mermaid 缺 caption / 撞名
- **引文交叉核对** (论文版核心): 文中 [n] 与文末参考文献清单互查
    · orphan cite: 文中引了 [7] 但参考文献列表没有第 7 条
    · uncited ref: 参考文献列了第 9 条但正文从没引用
    · 编号不连续 / 不从 1 起 (顺序编码制要求按首次出现顺序连续编号)

用法:
  python quality_check.py <sections_dir> --type original
  python quality_check.py <sections_dir> --type original --strict
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path


# Section file-name stems each paper type must provide, keyed by paper type.
# check_structure matches every entry as a stem prefix, except the special
# entry "references", which it resolves through _is_references_file instead.
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "original": [
        "00_title_abstract", "01_introduction", "02_methods",
        "03_results", "04_discussion", "05_conclusion", "06_references",
    ],
    # Review: title/abstract + intro + (>= 1 thematic body section, naming not
    # enforced) + outlook/conclusion + references
    "review": ["00_title_abstract", "01_introduction", "99_conclusion", "references"],
    "letter": ["00_title_abstract", "01_main", "references"],
}


# Overclaiming / unsupported hype phrases (Chinese and English).
# check_phrases looks for each one as a plain substring of the section text.
OVERCLAIM_PHRASES = [
    "国际领先", "国际一流", "世界领先", "世界一流", "填补空白", "首次提出",
    "重大突破", "划时代", "前所未有",
    "world-first", "world-leading", "unprecedented", "groundbreaking",
    "revolutionary", "first-ever", "state of the art", "best-in-class",
]
# Regex sources for placeholder text that must not survive into a submission.
# check_placeholders passes each one to re.findall.
PLACEHOLDER_PATTERNS = [
    r"<TODO[^>]*>",
    r"\[REF-[A-Za-z0-9]+\]",
    r"\[CITE-[A-Za-z0-9]+\]",
    r"\[Smith et al",
    r"\(Author,?\s*\d{4}\)",      # APA-style placeholder, e.g. (Author, 2024)
    r"\bXX+\b",                    # XX / XXX placeholder
]


# Figure-related patterns (same as in proposal)
# Any box-drawing / arrow glyph that indicates hand-drawn ASCII art.
_BOX_DRAWING_RE = re.compile(r"[┌┐└┘├┤┬┴┼─│╔╗╚╝╠╣╦╩╬═║▲▼◀▶]")
# Markdown image reference: ![alt](target)
_IMAGE_REF_RE = re.compile(r"!\[[^\]]*\]\([^)\s]+\)")
# Code-fence line: group 1 is the fence run (``` or ~~~, 3+ chars),
# group 2 is the optional single-token info string (e.g. "mermaid").
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")
# Mermaid caption comment line: "%% caption: <title>" (case-insensitive).
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)

# In-text citation marker: [7] / [7-9] / [7, 9] / [7,9-11]
_INTEXT_CITE_RE = re.compile(r"\[(\d[\d,\s\-]*)\]")
# Reference-list entry line: starts with [n]
_REF_ENTRY_RE = re.compile(r"^\s*\[(\d+)\]")


def _is_references_file(stem: str) -> bool:
    s = stem.lower()
    return "reference" in s or s.endswith("_refs") or "参考文献" in stem


def _extract_mermaid_caption(block_lines: list[str]) -> str | None:
    """Return the caption of a mermaid block, or None when it has none.

    The caption is taken from the first line matching ``%% caption: <text>``;
    surrounding whitespace is stripped from the captured text.
    """
    candidates = (_MERMAID_CAPTION_RE.match(line) for line in block_lines)
    first_hit = next((hit for hit in candidates if hit), None)
    return first_hit.group(1).strip() if first_hit else None


def check_structure(sections_dir: Path, ptype: str) -> list[str]:
    """List the required sections that are missing for a paper type.

    Args:
        sections_dir: Directory holding the ``*.md`` section files.
        ptype: Paper type key into ``REQUIRED_SECTIONS``; an unknown key
            yields no requirements.

    Returns:
        One message per missing section; empty when everything is present.
    """
    stems = {md.stem for md in sections_dir.glob("*.md")}
    problems: list[str] = []
    for needed in REQUIRED_SECTIONS.get(ptype, []):
        if needed == "references":
            # The reference list may be named freely, so match it by heuristic
            # instead of by prefix.
            present = any(map(_is_references_file, stems))
            message = "缺章节: references (参考文献)"
        else:
            present = any(stem.startswith(needed) for stem in stems)
            message = f"缺章节: {needed}"
        if not present:
            problems.append(message)
    return problems


def check_phrases(text: str, label: str) -> list[str]:
    """Report every overclaiming phrase found in *text*.

    Matching is a plain substring test, tried both verbatim and
    case-insensitively. Each phrase is reported at most once, in the order
    the phrases are listed in ``OVERCLAIM_PHRASES``.

    Args:
        text: Section content to scan.
        label: Tag placed in square brackets at the start of each message.

    Returns:
        One message per phrase that occurs in the text.
    """
    lowered = text.lower()
    return [
        f"[{label}] 过度宣称: '{phrase}' — 换成可被数据支撑的具体表述"
        for phrase in OVERCLAIM_PHRASES
        if phrase in text or phrase.lower() in lowered
    ]


def check_placeholders(text: str, label: str) -> list[str]:
    """Report every unreplaced placeholder found in *text*.

    Args:
        text: Section content to scan.
        label: Tag placed in square brackets at the start of each message.

    Returns:
        One message per match of each pattern in ``PLACEHOLDER_PATTERNS``,
        grouped by pattern in list order.
    """
    found: list[str] = []
    for pattern in PLACEHOLDER_PATTERNS:
        found.extend(
            f"[{label}] 占位符未替换: '{hit}'" for hit in re.findall(pattern, text)
        )
    return found


def _expand_cite_group(grp: str) -> set[int]:
    """'7, 9-11' -> {7,9,10,11}。非法片段忽略。"""
    out: set[int] = set()
    for part in grp.split(","):
        part = part.strip()
        if not part:
            continue
        if "-" in part:
            a, _, b = part.partition("-")
            try:
                lo, hi = int(a), int(b)
            except ValueError:
                continue
            if 0 < lo <= hi <= 999:
                out.update(range(lo, hi + 1))
        else:
            try:
                out.add(int(part))
            except ValueError:
                continue
    return out


def check_citations(sections_dir: Path) -> list[str]:
    """Cross-check in-text ``[n]`` citations against the reference list.

    Every ``*.md`` file in *sections_dir* is read. Files recognised by
    ``_is_references_file`` contribute reference entry numbers (lines that
    start with ``[n]``); all other files contribute cited numbers.

    Args:
        sections_dir: Directory holding the ``*.md`` section files.

    Returns:
        Issue messages, empty when citations and references agree. Reported
        problems: cited numbers with no reference entry, reference entries
        never cited, duplicated entry numbers, gaps in the 1..N numbering,
        and numbering that does not start at 1. When neither citations nor
        reference entries exist, a single message saying so is returned.
    """
    from collections import Counter  # local import: only this check needs it

    issues: list[str] = []
    cited: set[int] = set()
    ref_nums: list[int] = []

    for md in sorted(sections_dir.glob("*.md")):
        # utf-8-sig drops a leading BOM; with plain utf-8 the BOM stays in
        # front of a first-line "[1]" entry and the line-start match misses it.
        text = md.read_text(encoding="utf-8-sig")
        if _is_references_file(md.stem):
            for ln in text.splitlines():
                m = _REF_ENTRY_RE.match(ln)
                if m:
                    ref_nums.append(int(m.group(1)))
        else:
            for grp in _INTEXT_CITE_RE.findall(text):
                cited.update(_expand_cite_group(grp))

    if not ref_nums and not cited:
        return ["未发现任何引文 (文中 [n] 和参考文献清单都为空) — 论文一般需要引用支撑"]

    # One counting pass serves both the distinct set and duplicate detection.
    ref_counts = Counter(ref_nums)
    ref_set = set(ref_counts)

    # Orphan cite: cited in the text but absent from the reference list.
    orphan = sorted(cited - ref_set)
    if orphan:
        issues.append(f"orphan cite — 文中引了 {orphan} 但参考文献清单缺对应条目 (编造/漏排)")

    # Uncited ref: listed in the references but never cited in the text.
    uncited = sorted(ref_set - cited)
    if uncited:
        issues.append(f"uncited ref — 参考文献第 {uncited} 条正文从未引用 (删除或在正文补引)")

    # Duplicated entry numbers.
    dups = sorted(n for n, count in ref_counts.items() if count > 1)
    if dups:
        issues.append(f"参考文献编号重复: {dups}")

    # Continuity: entries should be numbered 1..N without gaps.
    if ref_set:
        expected = set(range(1, max(ref_set) + 1))
        gaps = sorted(expected - ref_set)
        if gaps:
            issues.append(f"参考文献编号不连续, 缺号: {gaps} (顺序编码制需 1..N 连续)")
        if 1 not in ref_set:
            issues.append("参考文献编号未从 [1] 起")

    return issues


def _split_fenced_blocks(lines: list[str]) -> list[tuple[str, int, list[str]]]:
    """Split Markdown lines into their fenced code blocks.

    Returns one ``(lang, start_line, body)`` tuple per block: *lang* is the
    lower-cased info string of the opening fence, *start_line* the 1-based
    line number of the opening fence, and *body* the lines between the
    fences. A block that is never closed runs to the end of *lines*.
    """
    blocks: list[tuple[str, int, list[str]]] = []
    i = 0
    while i < len(lines):
        opening = _FENCE_RE.match(lines[i])
        i += 1
        if not opening:
            continue
        fence = opening.group(1)
        lang = (opening.group(2) or "").lower()
        start_line = i  # i was already advanced, so this is the 1-based fence line
        body: list[str] = []
        while i < len(lines):
            line = lines[i]
            i += 1
            closing = _FENCE_RE.match(line)
            # A closing fence uses the same fence character, is at least as
            # long as the opening one, and carries no info string; a line such
            # as "```python" inside an open block is content, not a terminator.
            if (
                closing
                and not closing.group(2)
                and closing.group(1)[0] == fence[0]
                and len(closing.group(1)) >= len(fence)
            ):
                break
            body.append(line)
        blocks.append((lang, start_line, body))
    return blocks


def check_figures(sections_dir: Path) -> list[str]:
    """Check figure usage across all section files.

    Args:
        sections_dir: Directory holding the ``*.md`` section files. PNG
            figures are looked up in the sibling directory ``figures``.

    Returns:
        Issue messages, empty when nothing is wrong. Reported problems:
        PNG files exist but no section contains a Markdown image reference;
        a non-mermaid fenced block contains box-drawing characters; a mermaid
        block has no ``%% caption:`` line; several mermaid blocks share one
        caption.
    """
    issues: list[str] = []
    figures_dir = sections_dir.parent / "figures"
    # Sorted so the file names quoted in the message are deterministic.
    pngs = sorted(figures_dir.glob("*.png")) if figures_dir.is_dir() else []

    total_img_refs = 0
    ascii_art_blocks: list[tuple[str, int]] = []
    mermaid_no_caption: list[tuple[str, int]] = []
    mermaid_captions: dict[str, list[str]] = {}

    for md in sorted(sections_dir.glob("*.md")):
        # utf-8-sig drops a leading BOM that would otherwise hide a fence on
        # the first line from the line-start match.
        text = md.read_text(encoding="utf-8-sig")
        total_img_refs += len(_IMAGE_REF_RE.findall(text))
        for lang, block_line, buf in _split_fenced_blocks(text.splitlines()):
            if lang == "mermaid":
                cap = _extract_mermaid_caption(buf)
                if not cap:
                    mermaid_no_caption.append((md.name, block_line))
                else:
                    mermaid_captions.setdefault(cap, []).append(f"{md.name}:{block_line}")
            elif any(_BOX_DRAWING_RE.search(ln) for ln in buf):
                ascii_art_blocks.append((md.name, block_line))

    if pngs and total_img_refs == 0:
        names = ", ".join(p.name for p in pngs[:4])
        more = f" ... +{len(pngs) - 4}" if len(pngs) > 4 else ""
        issues.append(f"figures/ 有 {len(pngs)} 张 png ({names}{more}) 但 sections 里 0 个 ![](...) 引用")
    for fname, lineno in ascii_art_blocks:
        issues.append(f"[{fname}:~{lineno}] 代码块里有 ASCII 字符画 — Word 必错位, 改 ```mermaid 块或 ![](figures/x.png)")
    for fname, lineno in mermaid_no_caption:
        issues.append(f"[{fname}:~{lineno}] mermaid 块缺首行 '%% caption: <图题>'")
    for cap, locs in mermaid_captions.items():
        if len(locs) > 1:
            issues.append(f"mermaid caption 撞名: {cap!r} 出现在 {', '.join(locs)}")
    return issues


def main() -> None:
    """CLI entry point: run every check on a sections directory and print a report.

    Exits with status 2 when ``sections_dir`` is not a directory, and with
    status 1 when ``--strict`` is given and at least one issue was found.
    """
    parser = argparse.ArgumentParser(description="论文质量检查")
    parser.add_argument("sections_dir", type=Path)
    parser.add_argument("--type", required=True, choices=list(REQUIRED_SECTIONS.keys()))
    parser.add_argument("--strict", action="store_true", help="严格模式: 任何问题退出 1")
    opts = parser.parse_args()
    root = opts.sections_dir

    if not root.is_dir():
        print(f"[ERR] {root} not a directory", file=sys.stderr)
        sys.exit(2)

    collected: list[str] = []

    def report(found: list[str], fail_header: str, ok_line: str) -> None:
        # Print one check's outcome and keep its issues for the final summary.
        if not found:
            print(ok_line)
            return
        print(fail_header)
        for item in found:
            print(f"   - {item}")
        collected.extend(found)

    print(f"\n[质量检查] type={opts.type}\n")
    report(check_structure(root, opts.type), "[ERR] 结构问题:", "[OK] 结构完整")

    chapters = sorted(root.glob("*.md"))
    print(f"\n共 {len(chapters)} 个章节, 逐章扫描 (过度宣称 / 占位符)...\n")
    for chapter in chapters:
        body = chapter.read_text(encoding="utf-8")
        label = chapter.stem
        findings = check_phrases(body, label) + check_placeholders(body, label)
        if not findings:
            continue
        print(f"[WARN] {label}:")
        for finding in findings:
            # Drop the leading "[label] " tag; the header line already names the file.
            _, sep, tail = finding.partition("] ")
            print(f"   - {tail if sep else finding}")
        collected.extend(findings)

    report(
        check_citations(root),
        "\n[ERR] 引文交叉核对:",
        "\n[OK] 引文 [n] 与参考文献清单一致 (无 orphan / uncited, 编号连续)",
    )
    report(
        check_figures(root),
        "\n[ERR] 插图问题:",
        "\n[OK] 插图引用 / 无 ASCII 字符画",
    )

    print("\n" + "=" * 60)
    if not collected:
        print("[OK] 全部检查通过, 可以渲染 docx 了。")
        return

    print(f"[WARN] 共发现 {len(collected)} 个问题。")
    advice_lines = (
        "\n建议:",
        "  - 过度宣称 -> 换成数据支撑的具体表述",
        "  - 占位符未替换 -> 补真实数据 / 真实引文",
        "  - orphan cite -> 核对参考文献清单 (大概率编造引文, 走 citation_verify 三角核验)",
        "  - uncited ref -> 删条目或在正文补引",
        "  - 插图未挂 / ASCII 字符画 -> ```mermaid 块或 ![](figures/x.png)",
    )
    for advice in advice_lines:
        print(advice)
    if opts.strict:
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()