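"""Quality check for a paper submission draft; run it before rendering the docx.

Checks:
- Structural completeness: every section required for the paper type is present
- Placeholder leakage: leftover <TODO> / [REF-xx] / [CITE-xx] / (Author, year) placeholders
- Overclaiming: unsupported hype such as "国际领先 / 首次 / world-first / unprecedented"
- Figures: figures/ contains png files but no section has a ![]() reference;
  ASCII art inside code blocks; mermaid blocks with a missing or duplicated caption
- **Citation cross-check** (the core check for papers): in-text [n] markers are
  checked against the reference list at the end of the paper
    · orphan cite: the text cites [7] but the reference list has no entry 7
    · uncited ref: the reference list has entry 9 but the text never cites it
    · numbering has gaps / does not start at 1 (the sequential numeric citation
      system requires consecutive numbers in order of first appearance)

Usage:
    python quality_check.py <sections_dir> --type original
    python quality_check.py <sections_dir> --type original --strict
"""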
|
|
from __future__ import annotations
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
# Section-file stems that each paper type must provide.
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "original": [
        "00_title_abstract",
        "01_introduction",
        "02_methods",
        "03_results",
        "04_discussion",
        "05_conclusion",
        "06_references",
    ],
    # Review: title/abstract + intro + (>= 1 thematic body section, naming not
    # enforced) + outlook/conclusion + references
    "review": [
        "00_title_abstract",
        "01_introduction",
        "99_conclusion",
        "references",
    ],
    "letter": [
        "00_title_abstract",
        "01_main",
        "references",
    ],
}
|
|
|
|
|
|
# Overclaiming / unsupported hype (Chinese and English)
OVERCLAIM_PHRASES = [
    # Chinese
    "国际领先",
    "国际一流",
    "世界领先",
    "世界一流",
    "填补空白",
    "首次提出",
    "重大突破",
    "划时代",
    "前所未有",
    # English
    "world-first",
    "world-leading",
    "unprecedented",
    "groundbreaking",
    "revolutionary",
    "first-ever",
    "state of the art",
    "best-in-class",
]
|
|
# Regex sources for placeholder text that must not survive into a submission.
PLACEHOLDER_PATTERNS = [
    r"<TODO[^>]*>",             # <TODO> / <TODO: ...>
    r"\[REF-[A-Za-z0-9]+\]",    # [REF-xx]
    r"\[CITE-[A-Za-z0-9]+\]",   # [CITE-xx]
    r"\[Smith et al",           # opening of a "[Smith et al ..." sample citation
    r"\(Author,?\s*\d{4}\)",    # APA-style placeholder, e.g. (Author, 2024)
    r"\bXX+\b",                 # XX / XXX placeholder
]
|
|
|
|
|
|
# --- Figure-related patterns (same as proposal) ---------------------------

# Any single box-drawing / arrow glyph.
_BOX_DRAWING_RE = re.compile(r"[┌┐└┘├┤┬┴┼─│╔╗╚╝╠╣╦╩╬═║▲▼◀▶]")

# Markdown image reference: ![alt](target), target without spaces.
_IMAGE_REF_RE = re.compile(r"!\[[^\]]*\]\([^)\s]+\)")

# Code-fence line: group 1 is the fence run, group 2 the info word (may be "").
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")

# "%% caption: <text>" line inside a mermaid block; group 1 is the caption.
_MERMAID_CAPTION_RE = re.compile(
    r"^\s*%%\s*caption\s*:\s*(.+?)\s*$",
    re.IGNORECASE,
)

# --- Citation patterns ----------------------------------------------------

# In-text citation marker: [7] / [7-9] / [7, 9] / [7,9-11]
_INTEXT_CITE_RE = re.compile(r"\[(\d[\d,\s\-]*)\]")

# Reference-list entry line: starts with [n]
_REF_ENTRY_RE = re.compile(r"^\s*\[(\d+)\]")
|
|
|
|
|
|
def _is_references_file(stem: str) -> bool:
    """Tell whether a section file stem names the reference list.

    A stem qualifies when, ignoring case, it contains "reference" or ends
    with "_refs", or when it contains the Chinese word "参考文献".
    """
    lowered = stem.lower()
    if "reference" in lowered:
        return True
    if lowered.endswith("_refs"):
        return True
    return "参考文献" in stem
|
|
|
|
|
|
def _extract_mermaid_caption(block_lines: list[str]) -> str | None:
    """Return the caption declared inside a mermaid block, if any.

    The lines are scanned in order; the first one matching
    _MERMAID_CAPTION_RE supplies the caption (stripped).  None is returned
    when no line matches.
    """
    matches = (_MERMAID_CAPTION_RE.match(line) for line in block_lines)
    return next((hit.group(1).strip() for hit in matches if hit), None)
|
|
|
|
|
|
def check_structure(sections_dir: Path, ptype: str) -> list[str]:
    """List the required sections that have no matching ``*.md`` file.

    Args:
        sections_dir: Directory holding the section markdown files.
        ptype: Paper type, used as the key into REQUIRED_SECTIONS; an unknown
            type has no requirements.

    Returns:
        One message per missing section, in REQUIRED_SECTIONS order.
    """
    stems = {path.stem for path in sections_dir.glob("*.md")}
    problems = []
    for needed in REQUIRED_SECTIONS.get(ptype, []):
        if needed == "references":
            # The bare "references" requirement accepts any reference-list name.
            present = any(_is_references_file(stem) for stem in stems)
            message = "缺章节: references (参考文献)"
        else:
            # Every other requirement is matched as a filename prefix.
            present = any(stem.startswith(needed) for stem in stems)
            message = f"缺章节: {needed}"
        if not present:
            problems.append(message)
    return problems
|
|
|
|
|
|
def check_phrases(text: str, label: str) -> list[str]:
    """Report every overclaim phrase from OVERCLAIM_PHRASES found in text.

    Matching is case-insensitive.  In addition, hyphens and runs of
    whitespace are treated as interchangeable, so the listed phrase
    "state of the art" also catches "state-of-the-art", and a phrase split
    by a line wrap is still found.  Each phrase is reported at most once,
    however often it occurs.

    Args:
        text: Section content to scan.
        label: Section name, used as the "[label]" prefix of each message.

    Returns:
        One message per matched phrase, in OVERCLAIM_PHRASES order.
    """

    def _fold(s: str) -> str:
        # Lower-case and collapse every hyphen/whitespace run to one space.
        return re.sub(r"[\s\-]+", " ", s.lower())

    low = text.lower()
    folded = _fold(text)
    issues = []
    for phrase in OVERCLAIM_PHRASES:
        # The first two tests are the original exact / case-insensitive
        # checks; the folded test only ever adds matches.
        hit = phrase in text or phrase.lower() in low or _fold(phrase) in folded
        if hit:
            issues.append(f"[{label}] 过度宣称: '{phrase}' — 换成可被数据支撑的具体表述")
    return issues
|
|
|
|
|
|
def check_placeholders(text: str, label: str) -> list[str]:
    """Report every leftover placeholder found in text.

    Each pattern in PLACEHOLDER_PATTERNS is searched in turn, and every
    occurrence produces its own message prefixed with "[label]".
    """
    return [
        f"[{label}] 占位符未替换: '{found}'"
        for pattern in PLACEHOLDER_PATTERNS
        for found in re.findall(pattern, text)
    ]
|
|
|
|
|
|
def _expand_cite_group(grp: str) -> set[int]:
    """Expand the inside of one bracketed citation marker into reference numbers.

    Example: ``'7, 9-11'`` -> ``{7, 9, 10, 11}``.

    Fragments that are malformed (``'1-'``, ``'7-9-11'``), reversed
    (``'9-7'``) or outside 1..999 are ignored.  The bound applies to single
    numbers as well as ranges, so bracketed non-citations such as ``[0]`` or
    ``[2024]`` are not counted as citations.

    Args:
        grp: Text between the brackets, e.g. ``"7, 9-11"``.

    Returns:
        The set of reference numbers the marker refers to.
    """
    max_ref = 999  # same upper bound for single numbers and ranges
    out: set[int] = set()
    for part in grp.split(","):
        part = part.strip()
        if not part:
            continue
        lo_txt, dash, hi_txt = part.partition("-")
        try:
            lo = int(lo_txt)
            # A fragment without "-" is a one-element range.
            hi = int(hi_txt) if dash else lo
        except ValueError:
            continue
        if 0 < lo <= hi <= max_ref:
            out.update(range(lo, hi + 1))
    return out
|
|
|
|
|
|
def check_citations(sections_dir: Path) -> list[str]:
    """Cross-check in-text [n] citations against the numbered reference list.

    Every ``*.md`` file in sections_dir is read.  Files recognised by
    _is_references_file contribute reference entries (lines starting with
    ``[n]``); all other files contribute in-text citation markers.

    Args:
        sections_dir: Directory holding the section markdown files.

    Returns:
        A list of issue messages: orphan cites, uncited references,
        duplicated reference numbers, gaps in numbering, and numbering that
        does not start at [1].  When neither citations nor reference entries
        exist, a single "no citations found" message is returned.
    """
    issues: list[str] = []
    cited: set[int] = set()
    ref_nums: list[int] = []

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        if _is_references_file(md.stem):
            for ln in text.splitlines():
                m = _REF_ENTRY_RE.match(ln)
                if m:
                    ref_nums.append(int(m.group(1)))
        else:
            for grp in _INTEXT_CITE_RE.findall(text):
                cited.update(_expand_cite_group(grp))

    if not ref_nums and not cited:
        return ["未发现任何引文 (文中 [n] 和参考文献清单都为空) — 论文一般需要引用支撑"]

    # One pass builds both the set of reference numbers and the set of
    # numbers seen more than once (avoids calling list.count per element).
    ref_set: set[int] = set()
    repeated: set[int] = set()
    for num in ref_nums:
        if num in ref_set:
            repeated.add(num)
        else:
            ref_set.add(num)

    # orphan cite: cited in the text but missing from the reference list
    orphan = sorted(cited - ref_set)
    if orphan:
        issues.append(f"orphan cite — 文中引了 {orphan} 但参考文献清单缺对应条目 (编造/漏排)")

    # uncited ref: listed but never cited in the text
    uncited = sorted(ref_set - cited)
    if uncited:
        issues.append(f"uncited ref — 参考文献第 {uncited} 条正文从未引用 (删除或在正文补引)")

    # duplicated reference numbers
    dups = sorted(repeated)
    if dups:
        issues.append(f"参考文献编号重复: {dups}")

    # continuity: numbering should run 1..N without gaps
    if ref_set:
        expected = set(range(1, max(ref_set) + 1))
        gaps = sorted(expected - ref_set)
        if gaps:
            issues.append(f"参考文献编号不连续, 缺号: {gaps} (顺序编码制需 1..N 连续)")
        if 1 not in ref_set:
            issues.append("参考文献编号未从 [1] 起")

    return issues
|
|
|
|
|
|
def _split_fenced_blocks(lines: list[str]) -> list[tuple[str, int, list[str]]]:
    """Collect every fenced code block found in lines.

    Returns:
        ``(lang, line_no, body)`` tuples: the lower-cased info word of the
        opening fence, the 1-based line number of the opening fence, and the
        lines between the fences.  A block whose closing fence is missing
        runs to the end of lines.
    """
    blocks: list[tuple[str, int, list[str]]] = []
    i = 0
    total = len(lines)
    while i < total:
        opener = _FENCE_RE.match(lines[i])
        if not opener:
            i += 1
            continue
        fence = opener.group(1)
        lang = (opener.group(2) or "").lower()
        line_no = i + 1
        i += 1
        body: list[str] = []
        while i < total:
            closer = _FENCE_RE.match(lines[i])
            # A closing fence uses the same character, is at least as long as
            # the opener, and carries no info string (CommonMark), so a line
            # like "```python" inside a block does not end it.
            if (
                closer
                and not closer.group(2)
                and closer.group(1)[0] == fence[0]
                and len(closer.group(1)) >= len(fence)
            ):
                i += 1
                break
            body.append(lines[i])
            i += 1
        blocks.append((lang, line_no, body))
    return blocks


def check_figures(sections_dir: Path) -> list[str]:
    """Check figure usage across all section files.

    Reports four kinds of problems:
      * ``figures/`` (a sibling of sections_dir) holds png files but no
        section contains a markdown image reference;
      * a non-mermaid fenced code block contains box-drawing characters;
      * a mermaid block has no ``%% caption: ...`` line;
      * the same mermaid caption is used by more than one block.

    Args:
        sections_dir: Directory holding the section markdown files.

    Returns:
        A list of issue messages, empty when nothing is wrong.
    """
    issues: list[str] = []
    figures_dir = sections_dir.parent / "figures"
    pngs = list(figures_dir.glob("*.png")) if figures_dir.is_dir() else []

    total_img_refs = 0
    ascii_art_blocks: list[tuple[str, int]] = []
    mermaid_no_caption: list[tuple[str, int]] = []
    mermaid_captions: dict[str, list[str]] = {}

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        total_img_refs += len(_IMAGE_REF_RE.findall(text))
        for lang, block_line, buf in _split_fenced_blocks(text.splitlines()):
            if lang == "mermaid":
                cap = _extract_mermaid_caption(buf)
                if not cap:
                    mermaid_no_caption.append((md.name, block_line))
                else:
                    mermaid_captions.setdefault(cap, []).append(f"{md.name}:{block_line}")
                # Mermaid blocks are exempt from the ASCII-art check.
                continue
            if any(_BOX_DRAWING_RE.search(ln) for ln in buf):
                ascii_art_blocks.append((md.name, block_line))

    if pngs and total_img_refs == 0:
        names = ", ".join(p.name for p in pngs[:4])
        more = f" ... +{len(pngs) - 4}" if len(pngs) > 4 else ""
        issues.append(f"figures/ 有 {len(pngs)} 张 png ({names}{more}) 但 sections 里 0 个 ![]() 引用")
    for fname, lineno in ascii_art_blocks:
        issues.append(
            f"[{fname}:~{lineno}] 代码块里有 ASCII 字符画 — Word 必错位, "
            "改 ```mermaid 块或 ![](figures/xxx.png) 图片引用"
        )
    for fname, lineno in mermaid_no_caption:
        issues.append(f"[{fname}:~{lineno}] mermaid 块缺首行 '%% caption: <图题>'")
    for cap, locs in mermaid_captions.items():
        if len(locs) > 1:
            issues.append(f"mermaid caption 撞名: {cap!r} 出现在 {', '.join(locs)}")
    return issues
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: run every check and print a report.

    Arguments are read from the command line: the sections directory,
    ``--type`` (one of the REQUIRED_SECTIONS keys) and the optional
    ``--strict`` flag.

    Exit status: 2 when sections_dir is not a directory; 1 when ``--strict``
    is given and at least one issue was found; otherwise the function
    returns normally.
    """
    ap = argparse.ArgumentParser(description="论文质量检查")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--type", required=True, choices=list(REQUIRED_SECTIONS.keys()))
    ap.add_argument("--strict", action="store_true", help="严格模式: 任何问题退出 1")
    args = ap.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    print(f"\n[质量检查] type={args.type}\n")
    all_issues: list[str] = []

    struct = check_structure(args.sections_dir, args.type)
    if struct:
        print("[ERR] 结构问题:")
        for s in struct:
            print(f" - {s}")
        all_issues.extend(struct)
    else:
        print("[OK] 结构完整")

    files = sorted(args.sections_dir.glob("*.md"))
    print(f"\n共 {len(files)} 个章节, 逐章扫描 (过度宣称 / 占位符)...\n")
    for f in files:
        text = f.read_text(encoding="utf-8")
        # The reference list quotes titles of cited works verbatim; hype
        # words there are not the author's claims and cannot be reworded,
        # so only the placeholder check applies to it.
        if _is_references_file(f.stem):
            phrase_hits: list[str] = []
        else:
            phrase_hits = check_phrases(text, f.stem)
        sub = phrase_hits + check_placeholders(text, f.stem)
        if sub:
            print(f"[WARN] {f.stem}:")
            # Messages start with "[<stem>] "; drop exactly that prefix so a
            # stem that itself contains "] " is not cut in the wrong place.
            prefix = f"[{f.stem}] "
            for s in sub:
                shown = s[len(prefix):] if s.startswith(prefix) else s
                print(f" - {shown}")
            all_issues.extend(sub)

    cite_issues = check_citations(args.sections_dir)
    if cite_issues:
        print("\n[ERR] 引文交叉核对:")
        for s in cite_issues:
            print(f" - {s}")
        all_issues.extend(cite_issues)
    else:
        print("\n[OK] 引文 [n] 与参考文献清单一致 (无 orphan / uncited, 编号连续)")

    fig_issues = check_figures(args.sections_dir)
    if fig_issues:
        print("\n[ERR] 插图问题:")
        for s in fig_issues:
            print(f" - {s}")
        all_issues.extend(fig_issues)
    else:
        print("\n[OK] 插图引用 / 无 ASCII 字符画")

    print("\n" + "=" * 60)
    if all_issues:
        print(f"[WARN] 共发现 {len(all_issues)} 个问题。")
        print("\n建议:")
        print(" - 过度宣称 -> 换成数据支撑的具体表述")
        print(" - 占位符未替换 -> 补真实数据 / 真实引文")
        print(" - orphan cite -> 核对参考文献清单 (大概率编造引文, 走 citation_verify 三角核验)")
        print(" - uncited ref -> 删条目或在正文补引")
        print(" - 插图未挂 / ASCII 字符画 -> ```mermaid 块或 ![](figures/xxx.png) 图片引用")
        # Without --strict the report is advisory and the exit status stays 0.
        if args.strict:
            sys.exit(1)
    else:
        print("[OK] 全部检查通过, 可以渲染 docx 了。")


if __name__ == "__main__":
    main()
|