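"""Quality check for research-direction briefings — run once before rendering.

Checks:
- Structural completeness: whether the sections required for the chosen depth (flash/standard/deep) are all present
- Placeholder leakage: whether <TODO> / [CITE-xx] placeholders are still in the text
- Overclaiming: unsupported hype such as "国际领先 / 首次 / 颠覆 / unprecedented" (a briefing should take a position, but not boast)
- Unsourced phrasing: claims such as "据报道 / 有研究表明 / 业内普遍认为" with no citation attached (every claim in a briefing must be traceable)
- Citation cross-check: academic citations [n] in the text are checked against the reference-list entries [n] (orphan / uncited / consecutive numbering)
- Web source count: [W1].. markers are counted separately, as a reminder to keep them apart from academic citations

Usage:
    python quality_check.py <sections_dir> --depth standard
    python quality_check.py <sections_dir> --depth flash --strict
"""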
|
|
from __future__ import annotations

import argparse
import re
import sys
from collections import Counter
from pathlib import Path
|
|
|
|
|
|
# Sections required at each depth. Entries are matched as file-stem prefixes,
# except "references", which is checked separately.
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "flash": ["00_tldr", "02_clusters", "references"],
    "standard": ["00_tldr", "01_overview", "02_clusters", "05_gaps", "references"],
    "deep": ["00_tldr", "01_overview", "02_clusters", "04_progress", "05_gaps", "references"],
}
|
|
|
|
# Hot-topic cluster budget per depth as (min, max), compared against the
# number of cluster headings ('### 簇') counted in the 02_clusters files.
CLUSTER_BUDGET: dict[str, tuple[int, int]] = {
    "flash": (2, 3),
    "standard": (3, 5),
    "deep": (5, 7),
}
|
|
|
|
# Hype phrases (Chinese and English) that check_phrases reports as
# overclaiming when they appear in a section.
OVERCLAIM_PHRASES = [
    "国际领先", "国际一流", "世界领先", "世界一流", "填补空白", "重大突破",
    "划时代", "前所未有", "颠覆性", "革命性",
    "world-first", "world-leading", "unprecedented", "groundbreaking",
    "revolutionary", "state of the art",
]
|
|
|
|
# Unsourced phrasing: when one of these appears in a paragraph that carries no
# [n] / [CITE-] citation, the claim is left dangling.
UNSOURCED_PHRASES = [
    "据报道", "有研究表明", "研究显示", "业内普遍认为", "众所周知",
    "大量研究", "普遍认为", "据悉",
]
|
|
|
|
# Regex patterns for placeholders that must not survive into the final text:
# <TODO ...> tags, [CITE-xx] markers, and standalone runs of two or more "X".
PLACEHOLDER_PATTERNS = [
    r"<TODO[^>]*>",
    r"\[CITE-[A-Za-z0-9_\-]+\]",
    r"\bXX+\b",
]
|
|
|
|
_INTEXT_CITE_RE = re.compile(r"\[(\d[\d,\s\-]*)\]")  # in-text academic citation: [7] / [7-9] / [7,9]
_REF_ENTRY_RE = re.compile(r"^\s*\[(\d+)\]")  # reference-list entry: line starting with [n]
_WEB_CITE_RE = re.compile(r"\[W\d+\]")  # web source marker: [W1]
# Any citation token: a numeric citation or a [CITE-xx] placeholder.
_CITE_TOKEN_RE = re.compile(r"\[(?:\d[\d,\s\-]*|CITE-[A-Za-z0-9_\-]+)\]")
|
|
|
|
|
|
def _is_references_file(stem: str) -> bool:
|
|
s = stem.lower()
|
|
return "reference" in s or s.endswith("_refs") or "参考文献" in stem or "08_" in s
|
|
|
|
|
|
def check_structure(sections_dir: Path, depth: str) -> list[str]:
    """Report which sections required for ``depth`` are missing.

    The stems of all ``*.md`` files in ``sections_dir`` are collected. Each
    required name must be the prefix of at least one stem; the special name
    "references" is satisfied by any stem accepted by ``_is_references_file``.
    An unknown ``depth`` has no requirements.

    Returns one message per missing section (empty list when complete).
    """
    stems = {md.stem for md in sections_dir.glob("*.md")}
    has_references = any(_is_references_file(stem) for stem in stems)

    problems = []
    for wanted in REQUIRED_SECTIONS.get(depth, []):
        if wanted == "references":
            present = has_references
            message = "缺段落: 参考文献 (08_references)"
        else:
            present = any(stem.startswith(wanted) for stem in stems)
            message = f"缺段落: {wanted}"
        if not present:
            problems.append(message)
    return problems
|
|
|
|
|
|
def check_clusters(sections_dir: Path, depth: str) -> list[str]:
    """Compare the number of cluster headings with the budget for ``depth``.

    Cluster headings are lines that start with two to four ``#`` characters
    followed (after optional whitespace) by "簇". They are counted across every
    ``*.md`` file whose stem starts with "02_clusters". An unknown ``depth``
    falls back to the budget (0, 999).

    Returns a single-message list when no heading is found or the count is
    outside the budget; otherwise an empty list.
    """
    heading = re.compile(r"^#{2,4}\s*簇", re.MULTILINE)
    count = sum(
        len(heading.findall(path.read_text(encoding="utf-8")))
        for path in sections_dir.glob("*.md")
        if path.stem.startswith("02_clusters")
    )
    minimum, maximum = CLUSTER_BUDGET.get(depth, (0, 999))

    if count == 0:
        verdict = "02_clusters 里没找到 '### 簇N' 小节 — 热点聚类是简报主体"
    elif count < minimum:
        verdict = f"热点簇 {count} 个, 少于 {depth} 档预算 {minimum}-{maximum} (簇太少, 方向覆盖可能不足)"
    elif count > maximum:
        verdict = f"热点簇 {count} 个, 多于 {depth} 档预算 {minimum}-{maximum} (簇太多, 考虑合并 — 重要性优先于完整性)"
    else:
        return []
    return [verdict]
|
|
|
|
|
|
def check_phrases(text: str, label: str) -> list[str]:
    """Report every phrase from ``OVERCLAIM_PHRASES`` that occurs in ``text``.

    Each phrase is looked up as a plain substring, both as written and
    case-insensitively. ``label`` is placed in brackets at the start of every
    message. At most one message is produced per phrase.
    """
    folded = text.lower()
    return [
        f"[{label}] 过度宣称: '{phrase}' — 换成可被数据支撑的具体表述"
        for phrase in OVERCLAIM_PHRASES
        if phrase in text or phrase.lower() in folded
    ]
|
|
|
|
|
|
def check_unsourced(text: str, label: str) -> list[str]:
    """Flag paragraphs that use an unsourced phrase without any citation marker.

    ``text`` is split into paragraphs on blank lines. A paragraph containing a
    numeric citation, a [CITE-xx] placeholder or a [Wn] web marker is skipped.
    Otherwise the first phrase from ``UNSOURCED_PHRASES`` found in it yields
    one message, which quotes the first 40 characters of the paragraph.
    """
    findings = []
    for paragraph in re.split(r"\n\s*\n", text):
        if _CITE_TOKEN_RE.search(paragraph) or _WEB_CITE_RE.search(paragraph):
            continue  # the paragraph carries a citation, let it pass
        hit = next((p for p in UNSOURCED_PHRASES if p in paragraph), None)
        if hit is None:
            continue
        preview = paragraph.strip().replace("\n", " ")[:40]
        findings.append(
            f"[{label}] 无源论断: '{hit}' 所在段无引文标记 — 挂 [n] 或删 (\"{preview}...\")"
        )
    return findings
|
|
|
|
|
|
def check_placeholders(text: str, label: str) -> list[str]:
    """Report every leftover placeholder found in ``text``.

    Each pattern in ``PLACEHOLDER_PATTERNS`` is searched in turn, and every
    match produces one message prefixed with ``label`` in brackets.
    """
    return [
        f"[{label}] 占位符未替换: '{found}'"
        for pattern in PLACEHOLDER_PATTERNS
        for found in re.findall(pattern, text)
    ]
|
|
|
|
|
|
def _expand_cite_group(grp: str) -> set[int]:
|
|
out: set[int] = set()
|
|
for part in grp.split(","):
|
|
part = part.strip()
|
|
if not part:
|
|
continue
|
|
if "-" in part:
|
|
a, _, b = part.partition("-")
|
|
try:
|
|
lo, hi = int(a), int(b)
|
|
except ValueError:
|
|
continue
|
|
if 0 < lo <= hi <= 999:
|
|
out.update(range(lo, hi + 1))
|
|
else:
|
|
try:
|
|
out.add(int(part))
|
|
except ValueError:
|
|
continue
|
|
return out
|
|
|
|
|
|
def check_citations(sections_dir: Path) -> list[str]:
    """Cross-check in-text citations against the reference list.

    Every ``*.md`` file in ``sections_dir`` is read in sorted order. Files
    whose stem is accepted by ``_is_references_file`` supply reference entries
    (lines starting with ``[n]`` or ``[Wn]``); all other files supply in-text
    citations.

    Reported problems:
    - no academic citation anywhere (returned alone, as the only message);
    - numbers cited in the text with no reference entry (orphan);
    - reference entries never cited in the text (uncited);
    - duplicated reference numbers;
    - gaps in the reference numbering, or numbering that does not include 1;
    - [Wn] markers in the text while the reference list has no [Wn] entry.

    Returns the list of messages (empty when everything is consistent).
    """
    issues: list[str] = []
    cited: set[int] = set()
    ref_nums: list[int] = []  # duplicates are kept so they can be reported
    web_in_text = 0
    web_refs = 0

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        if _is_references_file(md.stem):
            for ln in text.splitlines():
                m = _REF_ENTRY_RE.match(ln)
                if m:
                    ref_nums.append(int(m.group(1)))
                if re.match(r"^\s*\[W\d+\]", ln):
                    web_refs += 1
        else:
            for grp in _INTEXT_CITE_RE.findall(text):
                cited.update(_expand_cite_group(grp))
            web_in_text += len(_WEB_CITE_RE.findall(text))

    if not ref_nums and not cited:
        return ["未发现任何学术引文 (文中 [n] 和参考文献清单都为空) — 简报论断需文献支撑"]

    # One counting pass gives both the distinct numbers and the duplicates,
    # instead of calling list.count() once per entry.
    ref_counts = Counter(ref_nums)
    ref_set = set(ref_counts)

    orphan = sorted(cited - ref_set)
    if orphan:
        issues.append(f"orphan cite — 文中引了 {orphan} 但参考文献清单缺对应条目 (编造/漏排, 走 citation_verify)")

    uncited = sorted(ref_set - cited)
    if uncited:
        issues.append(f"uncited ref — 参考文献第 {uncited} 条正文从未引用 (删除或补引)")

    dups = sorted(num for num, count in ref_counts.items() if count > 1)
    if dups:
        issues.append(f"参考文献编号重复: {dups}")

    if ref_set:
        gaps = sorted(set(range(1, max(ref_set) + 1)) - ref_set)
        if gaps:
            issues.append(f"参考文献编号不连续, 缺号: {gaps} (顺序编码制需 1..N 连续)")
        if 1 not in ref_set:
            issues.append("参考文献编号未从 [1] 起")

    # Web sources are counted separately from academic citations; the only
    # check is that [W..] markers in the text have some [W..] entry to point to.
    if web_in_text and not web_refs:
        issues.append(f"文中有 {web_in_text} 处 [W..] web 引用但参考文献无 [W..] 条目 — 补 web 来源(URL+日期)")

    return issues
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: run all checks and print a report.

    Exits with status 2 when ``sections_dir`` is not a directory, and with
    status 1 when ``--strict`` is given and at least one issue was found.
    """
    parser = argparse.ArgumentParser(description="科研方向简报质量检查")
    parser.add_argument("sections_dir", type=Path)
    parser.add_argument("--depth", required=True, choices=list(REQUIRED_SECTIONS))
    parser.add_argument("--strict", action="store_true", help="严格模式: 任何问题退出 1")
    args = parser.parse_args()
    root = args.sections_dir

    if not root.is_dir():
        print(f"[ERR] {root} not a directory", file=sys.stderr)
        sys.exit(2)

    print(f"\n[简报质量检查] depth={args.depth}\n")
    collected: list[str] = []

    def report(found: list[str], failure_header: str, success_line: str) -> None:
        # Print one check's outcome and remember its issues for the summary.
        if not found:
            print(success_line)
            return
        print(failure_header)
        for item in found:
            print(f" - {item}")
        collected.extend(found)

    report(check_structure(root, args.depth), "[ERR] 结构问题:", "[OK] 结构完整")
    report(check_clusters(root, args.depth), "\n[WARN] 热点簇数:", "[OK] 热点簇数在预算内")

    section_files = sorted(root.glob("*.md"))
    print(f"\n共 {len(section_files)} 个段落, 逐段扫描 (过度宣称 / 无源论断 / 占位符)...\n")
    for path in section_files:
        body = path.read_text(encoding="utf-8")
        found = [
            *check_phrases(body, path.stem),
            *check_unsourced(body, path.stem),
            *check_placeholders(body, path.stem),
        ]
        if not found:
            continue
        print(f"[WARN] {path.stem}:")
        for item in found:
            # The file name is already in the header, so drop the "[label] " prefix.
            _, sep, tail = item.partition("] ")
            print(f" - {tail if sep else item}")
        collected.extend(found)

    report(
        check_citations(root),
        "\n[ERR] 引文交叉核对:",
        "\n[OK] 学术引文 [n] 与参考文献清单一致 (无 orphan / uncited, 编号连续)",
    )

    print("\n" + "=" * 60)
    if not collected:
        print("[OK] 未发现问题。")
        return

    print(f"[WARN] 共发现 {len(collected)} 个问题。")
    print("\n建议:")
    for tip in (
        " - 过度宣称 -> 换成数据支撑的具体表述",
        " - 无源论断 -> 挂 [n] 引文或删该句",
        " - 占位符未替换 -> 走 citation_verify 把 [CITE-] 映射成真实引文",
        " - orphan cite -> 大概率编造, 走 citation_verify 三角核验",
        " - uncited ref -> 删条目或正文补引",
    ):
        print(tip)
    if args.strict:
        sys.exit(1)
|
|
|
|
|
|
# Run the checker only when this file is executed as a script.
if __name__ == "__main__":
    main()
|