"""Quality check for a manuscript submission draft -- run before rendering docx.

Checks performed:

- Structure: every section required by the paper type is present.
- Placeholder leakage: leftover markers such as [REF-xx] / [CITE-xx] /
  "[Smith et al" / (Author, year) / XX (see ``PLACEHOLDER_PATTERNS``).
- Overclaiming: unsupported superlatives such as "world-first" or
  "unprecedented" (see ``OVERCLAIM_PHRASES``).
- Figures: ``figures/`` holds png files but no section contains a
  ``![](...)`` reference; fenced code blocks containing box-drawing
  "ASCII art"; mermaid blocks with a missing or duplicated caption.
- Citation cross-check: in-text ``[n]`` markers versus the reference list.
    * orphan cite: the text cites [7] but the list has no entry 7
    * uncited ref: the list has entry 9 but the text never cites it
    * numbering has gaps, duplicates, or does not start at 1

Usage:
    python quality_check.py <sections_dir> --type original
    python quality_check.py <sections_dir> --type original --strict
"""

from __future__ import annotations

import argparse
import re
import sys
from collections import Counter
from collections.abc import Iterator
from pathlib import Path

# Required section file-name prefixes per paper type. The special entry
# "references" is matched by _is_references_file() instead of by prefix.
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "original": [
        "00_title_abstract",
        "01_introduction",
        "02_methods",
        "03_results",
        "04_discussion",
        "05_conclusion",
        "06_references",
    ],
    # Review: title/abstract + intro + (one or more thematic body sections,
    # names not enforced) + outlook/conclusion + references.
    "review": ["00_title_abstract", "01_introduction", "99_conclusion", "references"],
    "letter": ["00_title_abstract", "01_main", "references"],
}

# Overclaiming / unsupported superlatives (Chinese and English).
OVERCLAIM_PHRASES = [
    "国际领先",
    "国际一流",
    "世界领先",
    "世界一流",
    "填补空白",
    "首次提出",
    "重大突破",
    "划时代",
    "前所未有",
    "world-first",
    "world-leading",
    "unprecedented",
    "groundbreaking",
    "revolutionary",
    "first-ever",
    "state of the art",
    "best-in-class",
]

PLACEHOLDER_PATTERNS = [
    # NOTE(review): as written this matches every ">" (optionally preceded by
    # "]" characters), so ordinary text such as "->" or a blockquote marker is
    # reported. It looks like the tail of an angle-bracket placeholder pattern
    # whose opening part was lost -- confirm the intended pattern.
    r"]*>",
    r"\[REF-[A-Za-z0-9]+\]",
    r"\[CITE-[A-Za-z0-9]+\]",
    r"\[Smith et al",
    r"\(Author,?\s*\d{4}\)",  # APA-style placeholder, e.g. (Author, 2024)
    r"\bXX+\b",  # XX / XXX placeholder
]

# Figure-related patterns.
_BOX_DRAWING_RE = re.compile(r"[┌┐└┘├┤┬┴┼─│╔╗╚╝╠╣╦╩╬═║▲▼◀▶]")
_IMAGE_REF_RE = re.compile(r"!\[[^\]]*\]\([^)\s]+\)")
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)

# In-text citation marker: [7] / [7-9] / [7, 9] / [7,9-11]
_INTEXT_CITE_RE = re.compile(r"\[(\d[\d,\s\-]*)\]")
# Reference-list entry line: starts with [n]
_REF_ENTRY_RE = re.compile(r"^\s*\[(\d+)\]")


def _is_references_file(stem: str) -> bool:
    """Return True when a section file stem names the reference list."""
    s = stem.lower()
    return "reference" in s or s.endswith("_refs") or "参考文献" in stem


def _extract_mermaid_caption(block_lines: list[str]) -> str | None:
    """Return the first ``%% caption: ...`` value in a mermaid block, if any."""
    for ln in block_lines:
        m = _MERMAID_CAPTION_RE.match(ln)
        if m:
            return m.group(1).strip()
    return None


def check_structure(sections_dir: Path, ptype: str) -> list[str]:
    """Report required sections that have no matching ``*.md`` file.

    Args:
        sections_dir: Directory holding the section markdown files.
        ptype: Paper type, a key of ``REQUIRED_SECTIONS``; an unknown type
            has no requirements.

    Returns:
        One message per missing section (empty when complete).
    """
    required = REQUIRED_SECTIONS.get(ptype, [])
    existing = {f.stem for f in sections_dir.glob("*.md")}
    issues: list[str] = []
    for req in required:
        if req == "references":
            # The reference list may be named freely; detect it by keyword.
            if not any(_is_references_file(s) for s in existing):
                issues.append("缺章节: references (参考文献)")
            continue
        if not any(s.startswith(req) for s in existing):
            issues.append(f"缺章节: {req}")
    return issues


def check_phrases(text: str, label: str) -> list[str]:
    """Report every overclaim phrase found in *text* (case-insensitive).

    Args:
        text: Section content.
        label: Prefix used in the messages (the section name).

    Returns:
        One message per phrase of ``OVERCLAIM_PHRASES`` that occurs.
    """
    low = text.lower()
    return [
        f"[{label}] 过度宣称: '{phrase}' — 换成可被数据支撑的具体表述"
        for phrase in OVERCLAIM_PHRASES
        if phrase.lower() in low
    ]


def check_placeholders(text: str, label: str) -> list[str]:
    """Report every placeholder match in *text*, one message per occurrence.

    Args:
        text: Section content.
        label: Prefix used in the messages (the section name).

    Returns:
        Messages in ``PLACEHOLDER_PATTERNS`` order, then in text order.
    """
    issues: list[str] = []
    for pat in PLACEHOLDER_PATTERNS:
        for m in re.findall(pat, text):
            issues.append(f"[{label}] 占位符未替换: '{m}'")
    return issues


def _expand_cite_group(grp: str) -> set[int]:
    """Expand the inside of a citation marker: '7, 9-11' -> {7, 9, 10, 11}.

    Fragments that do not parse as integers are ignored. A range is only
    expanded when ``0 < lo <= hi <= 999``; single numbers are taken as-is.
    """
    out: set[int] = set()
    for part in grp.split(","):
        part = part.strip()
        if not part:
            continue
        if "-" in part:
            a, _, b = part.partition("-")
            try:
                lo, hi = int(a), int(b)
            except ValueError:
                continue
            if 0 < lo <= hi <= 999:
                out.update(range(lo, hi + 1))
        else:
            try:
                out.add(int(part))
            except ValueError:
                continue
    return out


def check_citations(sections_dir: Path) -> list[str]:
    """Cross-check in-text ``[n]`` markers against the reference list.

    Files recognised by ``_is_references_file`` contribute reference entry
    numbers (lines starting with ``[n]``); every other ``*.md`` file
    contributes cited numbers.

    Args:
        sections_dir: Directory holding the section markdown files.

    Returns:
        Messages for orphan cites, uncited references, duplicate entry
        numbers and numbering that has gaps or does not start at 1. A single
        message is returned when neither citations nor entries exist.
    """
    issues: list[str] = []
    cited: set[int] = set()
    ref_nums: list[int] = []

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        if _is_references_file(md.stem):
            for ln in text.splitlines():
                m = _REF_ENTRY_RE.match(ln)
                if m:
                    ref_nums.append(int(m.group(1)))
        else:
            for grp in _INTEXT_CITE_RE.findall(text):
                cited.update(_expand_cite_group(grp))

    if not ref_nums and not cited:
        return ["未发现任何引文 (文中 [n] 和参考文献清单都为空) — 论文一般需要引用支撑"]

    ref_set = set(ref_nums)

    # Orphan cite: cited in the text but absent from the reference list.
    orphan = sorted(cited - ref_set)
    if orphan:
        issues.append(f"orphan cite — 文中引了 {orphan} 但参考文献清单缺对应条目 (编造/漏排)")

    # Uncited ref: listed but never cited in the text.
    uncited = sorted(ref_set - cited)
    if uncited:
        issues.append(f"uncited ref — 参考文献第 {uncited} 条正文从未引用 (删除或在正文补引)")

    # Duplicate entry numbers; Counter keeps this linear in the list length.
    dups = sorted(n for n, seen in Counter(ref_nums).items() if seen > 1)
    if dups:
        issues.append(f"参考文献编号重复: {dups}")

    # Continuity: entries should be numbered 1..N without gaps.
    if ref_set:
        expected = set(range(1, max(ref_set) + 1))
        gaps = sorted(expected - ref_set)
        if gaps:
            issues.append(f"参考文献编号不连续, 缺号: {gaps} (顺序编码制需 1..N 连续)")
        if 1 not in ref_set:
            issues.append("参考文献编号未从 [1] 起")

    return issues


def _iter_fenced_blocks(lines: list[str]) -> Iterator[tuple[str, int, list[str]]]:
    """Yield ``(lang, start_line, body)`` for each fenced code block.

    ``lang`` is the lower-cased info string of the opening fence,
    ``start_line`` its 1-based line number and ``body`` the lines between the
    fences. An unclosed block runs to the end of *lines*.
    """
    i = 0
    total = len(lines)
    while i < total:
        opener = _FENCE_RE.match(lines[i])
        if not opener:
            i += 1
            continue
        fence = opener.group(1)
        lang = (opener.group(2) or "").lower()
        start_line = i + 1
        i += 1
        body: list[str] = []
        while i < total:
            closer = _FENCE_RE.match(lines[i])
            # A closing fence uses the same character, is at least as long as
            # the opener and carries no info string; a line such as
            # "```mermaid" inside an open block is content, not a closer.
            if (
                closer
                and not closer.group(2)
                and closer.group(1)[0] == fence[0]
                and len(closer.group(1)) >= len(fence)
            ):
                i += 1
                break
            body.append(lines[i])
            i += 1
        yield lang, start_line, body


def check_figures(sections_dir: Path) -> list[str]:
    """Check figure usage across all section files.

    Reports: png files under the sibling ``figures/`` directory while no
    section contains an image reference; non-mermaid fenced blocks that
    contain box-drawing characters; mermaid blocks without a
    ``%% caption:`` line; and captions shared by several mermaid blocks.

    Args:
        sections_dir: Directory holding the section markdown files; figures
            are looked up in ``sections_dir.parent / "figures"``.

    Returns:
        A list of issue messages (empty when nothing is wrong).
    """
    issues: list[str] = []
    figures_dir = sections_dir.parent / "figures"
    pngs = list(figures_dir.glob("*.png")) if figures_dir.is_dir() else []

    total_img_refs = 0
    ascii_art_blocks: list[tuple[str, int]] = []
    mermaid_no_caption: list[tuple[str, int]] = []
    mermaid_captions: dict[str, list[str]] = {}

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        total_img_refs += len(_IMAGE_REF_RE.findall(text))
        for lang, block_line, buf in _iter_fenced_blocks(text.splitlines()):
            if lang == "mermaid":
                cap = _extract_mermaid_caption(buf)
                if not cap:
                    mermaid_no_caption.append((md.name, block_line))
                else:
                    mermaid_captions.setdefault(cap, []).append(f"{md.name}:{block_line}")
                continue
            if any(_BOX_DRAWING_RE.search(ln) for ln in buf):
                ascii_art_blocks.append((md.name, block_line))

    if pngs and total_img_refs == 0:
        names = ", ".join(p.name for p in pngs[:4])
        more = f" ... +{len(pngs) - 4}" if len(pngs) > 4 else ""
        issues.append(f"figures/ 有 {len(pngs)} 张 png ({names}{more}) 但 sections 里 0 个 ![](...) 引用")
    for fname, lineno in ascii_art_blocks:
        issues.append(f"[{fname}:~{lineno}] 代码块里有 ASCII 字符画 — Word 必错位, 改 ```mermaid 块或 ![](figures/x.png)")
    for fname, lineno in mermaid_no_caption:
        issues.append(f"[{fname}:~{lineno}] mermaid 块缺首行 '%% caption: <图题>'")
    for cap, locs in mermaid_captions.items():
        if len(locs) > 1:
            issues.append(f"mermaid caption 撞名: {cap!r} 出现在 {', '.join(locs)}")
    return issues


def main() -> None:
    """Parse the command line, run every check and print a report.

    Exits with status 2 when ``sections_dir`` is not a directory, and with
    status 1 when ``--strict`` is given and any issue was found.
    """
    ap = argparse.ArgumentParser(description="论文质量检查")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--type", required=True, choices=list(REQUIRED_SECTIONS.keys()))
    ap.add_argument("--strict", action="store_true", help="严格模式: 任何问题退出 1")
    args = ap.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    print(f"\n[质量检查] type={args.type}\n")
    all_issues: list[str] = []

    struct = check_structure(args.sections_dir, args.type)
    if struct:
        print("[ERR] 结构问题:")
        for s in struct:
            print(f" - {s}")
        all_issues.extend(struct)
    else:
        print("[OK] 结构完整")

    files = sorted(args.sections_dir.glob("*.md"))
    print(f"\n共 {len(files)} 个章节, 逐章扫描 (过度宣称 / 占位符)...\n")
    for f in files:
        text = f.read_text(encoding="utf-8")
        sub = check_phrases(text, f.stem) + check_placeholders(text, f.stem)
        if sub:
            print(f"[WARN] {f.stem}:")
            for s in sub:
                # The section name is already printed above; drop the
                # leading "[label] " from each message.
                print(f" - {s.split('] ', 1)[1] if '] ' in s else s}")
            all_issues.extend(sub)

    cite_issues = check_citations(args.sections_dir)
    if cite_issues:
        print("\n[ERR] 引文交叉核对:")
        for s in cite_issues:
            print(f" - {s}")
        all_issues.extend(cite_issues)
    else:
        print("\n[OK] 引文 [n] 与参考文献清单一致 (无 orphan / uncited, 编号连续)")

    fig_issues = check_figures(args.sections_dir)
    if fig_issues:
        print("\n[ERR] 插图问题:")
        for s in fig_issues:
            print(f" - {s}")
        all_issues.extend(fig_issues)
    else:
        print("\n[OK] 插图引用 / 无 ASCII 字符画")

    print("\n" + "=" * 60)
    if all_issues:
        print(f"[WARN] 共发现 {len(all_issues)} 个问题。")
        print("\n建议:")
        print(" - 过度宣称 -> 换成数据支撑的具体表述")
        print(" - 占位符未替换 -> 补真实数据 / 真实引文")
        print(" - orphan cite -> 核对参考文献清单 (大概率编造引文, 走 citation_verify 三角核验)")
        print(" - uncited ref -> 删条目或在正文补引")
        print(" - 插图未挂 / ASCII 字符画 -> ```mermaid 块或 ![](figures/x.png)")
        if args.strict:
            sys.exit(1)
    else:
        print("[OK] 全部检查通过, 可以渲染 docx 了。")


if __name__ == "__main__":
    main()