# zcbot/skills/paper/scripts/quality_check.py
#
# 306 lines
# 11 KiB
# Python

"""论文投稿稿质量检查 — 渲染 docx 前跑一遍。
检查项:
- 结构完整性: 论文类型必备章节是否齐全
- 占位符泄漏: <TODO> / [REF-xx] / [CITE-xx] / (Author, year) 占位是否还在
- 过度宣称: "国际领先 / 首次 / world-first / unprecedented" 等无证据夸张词
- 插图: figures/ 有 png 但 sections 无 ![]() 引用; 代码块 ASCII 字符画; mermaid 缺 caption / 撞名
- **引文交叉核对** (论文版核心): 文中 [n] 与文末参考文献清单互查
· orphan cite: 文中引了 [7] 但参考文献列表没有第 7 条
· uncited ref: 参考文献列了第 9 条但正文从没引用
· 编号不连续 / 不从 1 起 (顺序编码制要求按首次出现顺序连续编号)
用法:
python quality_check.py <sections_dir> --type original
python quality_check.py <sections_dir> --type original --strict
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
# Section files each paper type must provide, keyed by the --type value.
# check_structure treats every entry as a filename-stem prefix, except the
# bare "references" entry, which it resolves through _is_references_file.
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "original": [
        "00_title_abstract", "01_introduction", "02_methods",
        "03_results", "04_discussion", "05_conclusion", "06_references",
    ],
    # Review: title/abstract + intro + (>=1 thematic body section, naming not
    # enforced) + outlook/conclusion + references
    "review": ["00_title_abstract", "01_introduction", "99_conclusion", "references"],
    "letter": ["00_title_abstract", "01_main", "references"],
}
# Overclaiming / unsupported hype phrases (Chinese and English).
# check_phrases looks for each one as a substring, also comparing lowercased
# phrase against lowercased text.
OVERCLAIM_PHRASES = [
    "国际领先", "国际一流", "世界领先", "世界一流", "填补空白", "首次提出",
    "重大突破", "划时代", "前所未有",
    "world-first", "world-leading", "unprecedented", "groundbreaking",
    "revolutionary", "first-ever", "state of the art", "best-in-class",
]
# Regular expressions for template placeholders that must not survive into the
# final manuscript. check_placeholders reports every match of every pattern.
PLACEHOLDER_PATTERNS = [
    r"<TODO[^>]*>",
    r"\[REF-[A-Za-z0-9]+\]",
    r"\[CITE-[A-Za-z0-9]+\]",
    r"\[Smith et al",
    r"\(Author,?\s*\d{4}\)", # APA-style placeholder, e.g. (Author, 2024)
    r"\bXX+\b", # XX / XXX filler placeholder
]
# Figure-related patterns (same as proposal)
# Any box-drawing / arrow glyph that signals hand-drawn ASCII art.
_BOX_DRAWING_RE = re.compile(r"[┌┐└┘├┤┬┴┼─│╔╗╚╝╠╣╦╩╬═║▲▼◀▶]")
# Markdown image reference: ![alt](target)
_IMAGE_REF_RE = re.compile(r"!\[[^\]]*\]\([^)\s]+\)")
# A fence line: group 1 is the run of >=3 backticks or tildes, group 2 the
# (possibly empty) single-word info string.
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")
# Mermaid comment line of the form "%% caption: <title>"; group 1 is the title.
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
# In-text citation markers: [7] / [7-9] / [7, 9] / [7,9-11]
_INTEXT_CITE_RE = re.compile(r"\[(\d[\d,\s\-]*)\]")
# Reference-list entry line: starts with [n]
_REF_ENTRY_RE = re.compile(r"^\s*\[(\d+)\]")
def _is_references_file(stem: str) -> bool:
    """Tell whether a section file stem names the reference list.

    A stem qualifies when it contains the literal text ``参考文献``, or when,
    ignoring case, it ends with ``_refs`` or contains ``reference``.
    """
    if "参考文献" in stem:
        return True
    lowered = stem.lower()
    return lowered.endswith("_refs") or "reference" in lowered
def _extract_mermaid_caption(block_lines: list[str]) -> str | None:
    """Return the caption declared inside a mermaid block, if any.

    Scans the lines in order and returns the stripped title of the first line
    matching ``%% caption: <title>``; returns ``None`` when no line matches.
    """
    candidates = (_MERMAID_CAPTION_RE.match(line) for line in block_lines)
    return next((hit.group(1).strip() for hit in candidates if hit), None)
def check_structure(sections_dir: Path, ptype: str) -> list[str]:
    """List the required sections that are missing for the given paper type.

    Args:
        sections_dir: Directory holding the per-section ``*.md`` files.
        ptype: Key into ``REQUIRED_SECTIONS``; an unknown key requires nothing.

    Returns:
        One message per missing section, in the order the requirements are
        declared. Empty when every requirement is met.
    """
    stems = {path.stem for path in sections_dir.glob("*.md")}
    problems: list[str] = []
    for name in REQUIRED_SECTIONS.get(ptype, []):
        if name == "references":
            # The reference list may use any recognised name, not a fixed prefix.
            if not any(map(_is_references_file, stems)):
                problems.append("缺章节: references (参考文献)")
        elif not any(stem.startswith(name) for stem in stems):
            problems.append(f"缺章节: {name}")
    return problems
def check_phrases(text: str, label: str) -> list[str]:
    """Flag overclaiming phrases that occur in *text*.

    Each phrase from ``OVERCLAIM_PHRASES`` is reported at most once, in list
    order, when it appears verbatim or when its lowercased form appears in the
    lowercased text.

    Args:
        text: Section content to scan.
        label: Tag placed in square brackets at the start of every message.

    Returns:
        One message per phrase that was found.
    """
    lowered_text = text.lower()
    return [
        f"[{label}] 过度宣称: '{phrase}' — 换成可被数据支撑的具体表述"
        for phrase in OVERCLAIM_PHRASES
        if phrase in text or phrase.lower() in lowered_text
    ]
def check_placeholders(text: str, label: str) -> list[str]:
    """Report every leftover template placeholder found in *text*.

    Patterns from ``PLACEHOLDER_PATTERNS`` are tried in list order; each
    non-overlapping match yields its own message, so a placeholder repeated
    N times is reported N times.

    Args:
        text: Section content to scan.
        label: Tag placed in square brackets at the start of every message.

    Returns:
        One message per match, quoting the full matched text.
    """
    issues: list[str] = []
    for pat in PLACEHOLDER_PATTERNS:
        # finditer + group(0) always yields the whole matched text. re.findall
        # would instead return the captured group(s) as soon as a pattern
        # contains a capture group, corrupting the quoted placeholder.
        for m in re.finditer(pat, text):
            issues.append(f"[{label}] 占位符未替换: '{m.group(0)}'")
    return issues
def _expand_cite_group(grp: str) -> set[int]:
    """Expand the inside of a citation bracket into citation numbers.

    ``'7, 9-11'`` becomes ``{7, 9, 10, 11}``. Comma-separated pieces are
    handled independently; a piece that cannot be parsed is skipped. A range
    ``a-b`` contributes only when ``0 < a <= b <= 999``; a single number is
    accepted as-is.
    """
    numbers: set[int] = set()
    for piece in (raw.strip() for raw in grp.split(",")):
        if not piece:
            continue
        first, dash, last = piece.partition("-")
        try:
            if dash:
                start, stop = int(first), int(last)
            else:
                start = stop = int(piece)
        except ValueError:
            # Malformed piece such as "5-", "1-2-3" or "4 5".
            continue
        if not dash:
            numbers.add(start)
        elif 0 < start <= stop <= 999:
            numbers.update(range(start, stop + 1))
    return numbers
def check_citations(sections_dir: Path) -> list[str]:
    """Cross-check in-text ``[n]`` citations against the numbered reference list.

    Every ``*.md`` file in *sections_dir* is read in sorted order. Files that
    ``_is_references_file`` recognises contribute reference numbers (lines
    starting with ``[n]``); all other files contribute cited numbers.

    Args:
        sections_dir: Directory holding the per-section ``*.md`` files.

    Returns:
        Messages for, in this order: citations with no reference entry,
        reference entries never cited, duplicated reference numbers, gaps in
        the 1..max numbering, and a list that does not start at ``[1]``.
        When neither citations nor reference entries exist at all, a single
        "no citations found" message is returned instead.
    """
    issues: list[str] = []
    cited: set[int] = set()
    ref_nums: list[int] = []
    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        if _is_references_file(md.stem):
            for ln in text.splitlines():
                m = _REF_ENTRY_RE.match(ln)
                if m:
                    ref_nums.append(int(m.group(1)))
        else:
            # NOTE(review): the in-text pattern accepts any bracketed run of
            # digits, so bracketed numbers that are not citations are counted
            # as well — confirm that is acceptable for the section content.
            for grp in _INTEXT_CITE_RE.findall(text):
                cited.update(_expand_cite_group(grp))
    if not ref_nums and not cited:
        return ["未发现任何引文 (文中 [n] 和参考文献清单都为空) — 论文一般需要引用支撑"]

    # Single pass: collect the distinct numbers and those seen more than once.
    # (Calling list.count() per element would rescan the list every time.)
    ref_set: set[int] = set()
    repeated: set[int] = set()
    for n in ref_nums:
        if n in ref_set:
            repeated.add(n)
        else:
            ref_set.add(n)

    # Orphan cite: cited in the text but absent from the reference list.
    orphan = sorted(cited - ref_set)
    if orphan:
        issues.append(f"orphan cite — 文中引了 {orphan} 但参考文献清单缺对应条目 (编造/漏排)")
    # Uncited ref: listed but never cited in the text.
    uncited = sorted(ref_set - cited)
    if uncited:
        issues.append(f"uncited ref — 参考文献第 {uncited} 条正文从未引用 (删除或在正文补引)")
    # Duplicated reference numbers.
    dups = sorted(repeated)
    if dups:
        issues.append(f"参考文献编号重复: {dups}")
    # Continuity: numbering should run 1..N without gaps.
    if ref_set:
        expected = set(range(1, max(ref_set) + 1))
        gaps = sorted(expected - ref_set)
        if gaps:
            issues.append(f"参考文献编号不连续, 缺号: {gaps} (顺序编码制需 1..N 连续)")
        if 1 not in ref_set:
            issues.append("参考文献编号未从 [1] 起")
    return issues
def check_figures(sections_dir: Path) -> list[str]:
    """Check figure usage across all section files.

    Reports, in this order:
      * ``figures/`` (sibling of *sections_dir*) holds PNG files but no section
        contains a single markdown image reference;
      * a non-mermaid fenced block containing box-drawing characters;
      * a mermaid block without a ``%% caption:`` line;
      * the same mermaid caption used by more than one block.

    Args:
        sections_dir: Directory holding the per-section ``*.md`` files.

    Returns:
        The list of issue messages (empty when nothing is wrong).
    """
    figures_dir = sections_dir.parent / "figures"
    pngs = list(figures_dir.glob("*.png")) if figures_dir.is_dir() else []

    image_ref_count = 0
    box_art: list[tuple[str, int]] = []
    uncaptioned: list[tuple[str, int]] = []
    caption_sites: dict[str, list[str]] = {}

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        image_ref_count += len(_IMAGE_REF_RE.findall(text))

        # One shared iterator: the inner loop consumes the body of a fenced
        # block (and its closing fence), the outer loop resumes after it.
        numbered = iter(enumerate(text.splitlines(), start=1))
        for opened_at, line in numbered:
            opener = _FENCE_RE.match(line)
            if opener is None:
                continue
            fence = opener.group(1)
            lang = (opener.group(2) or "").lower()

            body: list[str] = []
            for _, inner in numbered:
                closer = _FENCE_RE.match(inner)
                # Closes on a fence of the same character that is at least as
                # long as the opener; an unterminated block runs to end of file.
                if closer and closer.group(1)[0] == fence[0] and len(closer.group(1)) >= len(fence):
                    break
                body.append(inner)

            if lang == "mermaid":
                caption = _extract_mermaid_caption(body)
                if caption:
                    caption_sites.setdefault(caption, []).append(f"{md.name}:{opened_at}")
                else:
                    uncaptioned.append((md.name, opened_at))
            elif any(_BOX_DRAWING_RE.search(row) for row in body):
                box_art.append((md.name, opened_at))

    issues: list[str] = []
    if pngs and image_ref_count == 0:
        names = ", ".join(p.name for p in pngs[:4])
        more = f" ... +{len(pngs) - 4}" if len(pngs) > 4 else ""
        issues.append(f"figures/ 有 {len(pngs)} 张 png ({names}{more}) 但 sections 里 0 个 ![](...) 引用")
    issues.extend(
        f"[{fname}:~{lineno}] 代码块里有 ASCII 字符画 — Word 必错位, 改 ```mermaid 块或 ![](figures/x.png)"
        for fname, lineno in box_art
    )
    issues.extend(
        f"[{fname}:~{lineno}] mermaid 块缺首行 '%% caption: <图题>'"
        for fname, lineno in uncaptioned
    )
    issues.extend(
        f"mermaid caption 撞名: {cap!r} 出现在 {', '.join(locs)}"
        for cap, locs in caption_sites.items()
        if len(locs) > 1
    )
    return issues
def main() -> None:
    """Command-line entry point: run every check and print a report.

    Exits with status 2 when the given sections path is not a directory, and
    with status 1 when ``--strict`` is set and at least one issue was found.
    """
    parser = argparse.ArgumentParser(description="论文质量检查")
    parser.add_argument("sections_dir", type=Path)
    parser.add_argument("--type", required=True, choices=list(REQUIRED_SECTIONS))
    parser.add_argument("--strict", action="store_true", help="严格模式: 任何问题退出 1")
    opts = parser.parse_args()

    sections_dir = opts.sections_dir
    if not sections_dir.is_dir():
        print(f"[ERR] {sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    print(f"\n[质量检查] type={opts.type}\n")
    collected: list[str] = []

    def report(found: list[str], on_fail: str, on_pass: str) -> None:
        # Print one check's outcome and accumulate its issues.
        if not found:
            print(on_pass)
            return
        print(on_fail)
        for item in found:
            print(f" - {item}")
        collected.extend(found)

    report(check_structure(sections_dir, opts.type), "[ERR] 结构问题:", "[OK] 结构完整")

    md_files = sorted(sections_dir.glob("*.md"))
    print(f"\n{len(md_files)} 个章节, 逐章扫描 (过度宣称 / 占位符)...\n")
    for md in md_files:
        text = md.read_text(encoding="utf-8")
        found = check_phrases(text, md.stem) + check_placeholders(text, md.stem)
        if not found:
            continue
        print(f"[WARN] {md.stem}:")
        for item in found:
            # Drop the leading "[label] " tag; the file name was just printed.
            _, sep, rest = item.partition("] ")
            print(f" - {rest if sep else item}")
        collected.extend(found)

    report(
        check_citations(sections_dir),
        "\n[ERR] 引文交叉核对:",
        "\n[OK] 引文 [n] 与参考文献清单一致 (无 orphan / uncited, 编号连续)",
    )
    report(
        check_figures(sections_dir),
        "\n[ERR] 插图问题:",
        "\n[OK] 插图引用 / 无 ASCII 字符画",
    )

    print("\n" + "=" * 60)
    if not collected:
        print("[OK] 全部检查通过, 可以渲染 docx 了。")
        return

    print(f"[WARN] 共发现 {len(collected)} 个问题。")
    print("\n建议:")
    for hint in (
        " - 过度宣称 -> 换成数据支撑的具体表述",
        " - 占位符未替换 -> 补真实数据 / 真实引文",
        " - orphan cite -> 核对参考文献清单 (大概率编造引文, 走 citation_verify 三角核验)",
        " - uncited ref -> 删条目或在正文补引",
        " - 插图未挂 / ASCII 字符画 -> ```mermaid 块或 ![](figures/x.png)",
    ):
        print(hint)
    if opts.strict:
        sys.exit(1)


if __name__ == "__main__":
    main()