322 lines
12 KiB
Python
322 lines
12 KiB
Python
"""申报书质量检查 — 在交付前跑一遍。
|
|
|
|
检查项 (来自 review_redlines.md):
|
|
- 结构完整性: 必备章节是否都有
|
|
- 假大空话术: "国际领先 / 首次提出 / 填补空白" 等敏感词
|
|
- 指标可考核性: 是否有"显著 / 大幅 / 优异"等不可量化词
|
|
- 引文真实性: 占位符 [REF-xx] / [Smith et al., 2023] / <TODO> 是否还在
|
|
- 经费表占位符: 总预算 / 中央财政 等是否还是空格
|
|
- 插图: figures/ 有 png 但 sections 里没 ![]() 引用; 代码块里出现 ASCII 字符画
|
|
|
|
用法:
|
|
python quality_check.py <sections_dir> --fund-type key_rd
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
# Required section-file prefixes, keyed by the value accepted for --fund-type.
# check_structure() reports a prefix as missing when no *.md file stem in the
# sections directory starts with it, so "01_summary_v2.md" satisfies
# "01_summary".
REQUIRED_SECTIONS: dict[str, list[str]] = {
    "key_rd": [
        "00_basic_info", "01_summary", "02_background", "03_objectives",
        "04_content", "05_decomposition", "06_innovation", "07_benefit",
        "08_basis", "09_schedule", "10_organization",
        "11_team", "12_budget", "13_appendix",
    ],
    "major_project": [
        "00_basic_info", "01_objectives", "02_content", "03_innovation",
        "04_benefit", "05_schedule", "06_organization", "07_ip",
        "08_terms", "09_personnel", "10_budget", "11_appendix",
    ],
    "nsfc_joint_fund": ["01_research_content", "02_basis", "03_other"],
    "nsfc_general": ["01_research_content", "02_basis", "03_other"],
    "nsfc_youth": ["01_research_content", "02_basis", "03_other"],
    "provincial": ["01_research_content", "02_basis"],
    "enterprise": ["00_overview", "01_objectives", "02_content", "03_schedule", "04_budget"],
}
|
|
|
|
|
|
# Hollow / non-assessable wording.
# Grandiose claims: check_phrases() emits one issue per phrase that occurs
# anywhere in a section (plain substring test, no context analysis).
HOLLOW_PHRASES = [
    "国际领先", "国际一流", "填补空白", "首次提出", "重大突破",
    "立足国际前沿", "聚焦关键核心", "世界一流",
    "深远影响", "独树一帜", "重要意义", "划时代",
]
# Vague degree / quantity words: check_phrases() asks the author to replace
# each one found with a concrete number.
UNQUANTIFIABLE_WORDS = [
    "显著提升", "大幅提升", "明显改善", "性能优异", "体验优良",
    "极大", "大大", "若干", "大量", "多种",
]
|
|
# Regular expressions for template placeholders that must not survive into the
# delivered document. check_placeholders() runs re.findall() with each pattern
# and reports every match, so patterns must not overlap: the former separate
# "XXX 万元" and "XX 万元" entries reported a single "XXX 万元" twice.
PLACEHOLDER_PATTERNS = [
    r"<TODO[^>]*>",
    r"\[REF-[A-Z0-9]+\]",
    r"\[Smith et al",  # demo citation placeholder
    r"X{2,} 万元",  # "XX 万元", "XXX 万元", ... reported once per occurrence
    r"X 年 X 月",
]
|
|
|
|
|
|
# Figure-related patterns.
# Box-drawing characters (plus the ▲▼◀▶ arrows); one hit inside a fenced code
# block makes check_figures() treat the block as ASCII art.
_BOX_DRAWING_RE = re.compile(r"[┌┐└┘├┤┬┴┼─│╔╗╚╝╠╣╦╩╬═║▲▼◀▶]")
# Markdown image reference "![alt](target)"; the target must be non-empty and
# contain no whitespace, so a reference carrying a quoted title is not matched.
_IMAGE_REF_RE = re.compile(r"!\[[^\]]*\]\([^)\s]+\)")
# A fence line: optional indentation, a run of 3+ backticks or 3+ tildes
# (group 1), then at most one whitespace-free info word (group 2).
_FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})\s*(\S*)\s*$")
# A "%% caption: <text>" line, case-insensitive; group 1 is the caption text.
_MERMAID_CAPTION_RE = re.compile(r"^\s*%%\s*caption\s*:\s*(.+?)\s*$", re.IGNORECASE)
|
|
|
|
|
|
def _extract_mermaid_caption(block_lines: list[str]) -> str | None:
    """Return the caption declared inside a mermaid block, if there is one.

    Each line is tried in order against ``_MERMAID_CAPTION_RE``. The captured
    text of the first matching line is returned with surrounding whitespace
    removed; ``None`` is returned when no line matches.
    """
    hits = map(_MERMAID_CAPTION_RE.match, block_lines)
    return next((hit.group(1).strip() for hit in hits if hit), None)
|
|
|
|
|
|
def check_structure(sections_dir: Path, fund_type: str) -> list[str]:
    """List the required sections that have no matching ``*.md`` file.

    Args:
        sections_dir: Directory holding one markdown file per section.
        fund_type: Key into ``REQUIRED_SECTIONS``; an unknown key yields no
            requirements and therefore no issues.

    Returns:
        One message per required prefix that no file stem starts with, in the
        order the prefixes are declared.
    """
    stems = {md.stem for md in sections_dir.glob("*.md")}
    return [
        f"缺章节: {prefix}"
        for prefix in REQUIRED_SECTIONS.get(fund_type, [])
        if all(not stem.startswith(prefix) for stem in stems)
    ]
|
|
|
|
|
|
def check_phrases(text: str, file_label: str) -> list[str]:
    """Flag hollow claims and non-quantifiable wording found in *text*.

    Both checks are plain substring tests with no awareness of context.

    Args:
        text: Full content of one section.
        file_label: Label placed in the ``[...]`` prefix of every message.

    Returns:
        Messages for ``HOLLOW_PHRASES`` hits first, then messages for
        ``UNQUANTIFIABLE_WORDS`` hits, each group in list order.
    """
    hollow = [
        f"[{file_label}] 假大空: '{candidate}'"
        for candidate in HOLLOW_PHRASES
        if candidate in text
    ]
    vague = [
        f"[{file_label}] 不可考核: '{candidate}' — 改成具体数字"
        for candidate in UNQUANTIFIABLE_WORDS
        if candidate in text
    ]
    return hollow + vague
|
|
|
|
|
|
def check_placeholders(text: str, file_label: str) -> list[str]:
    """Report every template placeholder still present in *text*.

    Args:
        text: Full content of one section.
        file_label: Label placed in the ``[...]`` prefix of every message.

    Returns:
        One message per regex match, grouped by pattern in the order of
        ``PLACEHOLDER_PATTERNS`` and, within a pattern, in text order.
    """
    return [
        f"[{file_label}] 占位符未替换: '{found}'"
        for pattern in PLACEHOLDER_PATTERNS
        for found in re.findall(pattern, text)
    ]
|
|
|
|
|
|
def parse_spec_metrics(spec_path: Path) -> list[str]:
    """Extract the guideline metrics from the metric-matrix table of spec.md.

    The text from the first occurrence of "考核指标矩阵" up to the next
    ``## `` heading (or end of file) is scanned for table rows. A row counts
    when it has at least three cells and its first cell is all digits, which
    skips header and separator rows. The second cell is collected unless it
    is empty or starts with ``<TODO`` or a backtick.

    Args:
        spec_path: Path to the spec file; a missing file yields ``[]``.

    Returns:
        The collected second-column strings, in file order.
    """
    if not spec_path.exists():
        return []

    section = re.search(
        r"考核指标矩阵.*?(?=\n##\s|\Z)",
        spec_path.read_text(encoding="utf-8"),
        re.DOTALL,
    )
    if section is None:
        return []

    metrics: list[str] = []
    for raw in section.group(0).splitlines():
        row = raw.strip()
        if not row.startswith("|"):
            continue
        columns = [col.strip() for col in row.strip("|").split("|")]
        # Header and separator rows have a non-numeric first cell.
        if len(columns) < 3 or not columns[0].isdigit():
            continue
        candidate = columns[1]
        if candidate and not candidate.startswith(("<TODO", "`")):
            metrics.append(candidate)
    return metrics
|
|
|
|
|
|
def _iter_fenced_blocks(lines: list[str]) -> list[tuple[str, int, list[str]]]:
    """Split *lines* into fenced code blocks.

    Returns one ``(lang, line_no, body)`` tuple per block: the lower-cased
    info word of the opening fence, the 1-based line number of that fence,
    and the lines between the fences. A block left open at end of input runs
    to the last line.
    """
    blocks: list[tuple[str, int, list[str]]] = []
    total = len(lines)
    i = 0
    while i < total:
        opener = _FENCE_RE.match(lines[i])
        if not opener:
            i += 1
            continue
        fence = opener.group(1)
        lang = (opener.group(2) or "").lower()
        line_no = i + 1
        i += 1
        body: list[str] = []
        while i < total:
            closer = _FENCE_RE.match(lines[i])
            # A closing fence uses the same character, is at least as long as
            # the opener and carries no info string; a line such as
            # "```python" inside a block is content, not a terminator.
            if (
                closer
                and not closer.group(2)
                and closer.group(1)[0] == fence[0]
                and len(closer.group(1)) >= len(fence)
            ):
                i += 1
                break
            body.append(lines[i])
            i += 1
        blocks.append((lang, line_no, body))
    return blocks


def check_figures(sections_dir: Path) -> list[str]:
    """Apply the four figure rules to every ``*.md`` file in *sections_dir*.

    1. ``figures/`` (a sibling of *sections_dir*) contains png files but the
       sections contain no markdown image reference at all.
    2. A fenced code block other than mermaid contains box-drawing
       characters, i.e. ASCII art is being used as a figure.
    3. A mermaid block has no ``%% caption: <title>`` line.
    4. Two mermaid blocks share the same caption.

    Args:
        sections_dir: Directory holding one markdown file per section.

    Returns:
        Issue messages in rule order (1, 2, 3, 4); within rules 2-4 they
        follow sorted file name, then position in the file.
    """
    issues: list[str] = []
    figures_dir = sections_dir.parent / "figures"
    # Sorted so the file names quoted in the message are reproducible.
    pngs = sorted(figures_dir.glob("*.png")) if figures_dir.is_dir() else []

    total_img_refs = 0
    ascii_art_blocks: list[tuple[str, int]] = []
    mermaid_no_caption: list[tuple[str, int]] = []
    mermaid_captions: dict[str, list[str]] = {}  # caption -> ["file.md:line", ...]

    for md in sorted(sections_dir.glob("*.md")):
        text = md.read_text(encoding="utf-8")
        total_img_refs += len(_IMAGE_REF_RE.findall(text))

        for lang, block_line, buf in _iter_fenced_blocks(text.splitlines()):
            if lang == "mermaid":
                cap = _extract_mermaid_caption(buf)
                if not cap:
                    mermaid_no_caption.append((md.name, block_line))
                else:
                    mermaid_captions.setdefault(cap, []).append(f"{md.name}:{block_line}")
                # Mermaid source is rendered separately, so it is exempt from
                # the ASCII-art rule.
                continue
            if any(_BOX_DRAWING_RE.search(ln) for ln in buf):
                ascii_art_blocks.append((md.name, block_line))

    if pngs and total_img_refs == 0:
        names = ", ".join(p.name for p in pngs[:4])
        more = f" ... +{len(pngs) - 4}" if len(pngs) > 4 else ""
        issues.append(
            f"figures/ 有 {len(pngs)} 张 png ({names}{more}) 但 sections 里 0 个 ![]() 引用 — "
            f"图全没挂上, 在对应章节加 ![图题](../figures/xxx.png)"
        )

    for fname, lineno in ascii_art_blocks:
        issues.append(
            f"[{fname}:~{lineno}] 代码块里有 ASCII 字符画 (┌─┐│└─┘) — "
            f"中文 Word 必错位, 改 ```mermaid 块或 ![图题](../figures/xxx.png)"
        )

    for fname, lineno in mermaid_no_caption:
        issues.append(
            f"[{fname}:~{lineno}] mermaid 块缺首行 '%% caption: <图题>' — "
            f"render_diagrams 靠 caption 命名 png, 没 caption 不渲染"
        )

    for cap, locs in mermaid_captions.items():
        if len(locs) > 1:
            issues.append(
                f"mermaid caption 撞名: {cap!r} 出现在 {', '.join(locs)} — "
                f"caption 必须全 task 唯一, 改成更具体的题文"
            )

    return issues
|
|
|
|
|
|
def check_spec_coverage(sections_dir: Path, spec_path: Path) -> list[str]:
    """Check that every guideline metric from the spec shows up in the sections.

    Each metric returned by ``parse_spec_metrics`` is split into keyword
    tokens: runs of two or more CJK characters, ASCII identifiers, and
    numbers. A metric counts as covered when at least two tokens (or the only
    token, for a single-token metric) and at least 30% of all its tokens
    occur verbatim somewhere in the concatenated section text.

    Args:
        sections_dir: Directory holding one markdown file per section.
        spec_path: Path to spec.md.

    Returns:
        One message per metric that looks uncovered; ``[]`` when the spec
        yields no metrics.
    """
    metrics = parse_spec_metrics(spec_path)
    if not metrics:
        return []

    # Coverage is judged against all sections at once, not per file.
    full = "\n".join(f.read_text(encoding="utf-8") for f in sections_dir.glob("*.md"))

    issues: list[str] = []
    for metric in metrics:
        # The identifier branch is ASCII-only on purpose: ``\w`` is
        # Unicode-aware in Python 3, so "[A-Za-z]\w*" would glue a token such
        # as "TPS" to the CJK text and digits that follow it.
        tokens = re.findall(r"[一-鿿]{2,}|[A-Za-z][A-Za-z0-9_]*|\d+\.?\d*", metric)
        if not tokens:
            continue
        hits = sum(1 for t in tokens if t in full)
        # A metric with a single token cannot score two hits; requiring two
        # unconditionally would flag it even when it is quoted verbatim.
        required = min(2, len(tokens))
        if hits < required or hits / len(tokens) < 0.3:
            issues.append(f"指南指标可能未在正文覆盖: '{metric[:50]}...' (命中 {hits}/{len(tokens)} 关键词)")
    return issues
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: run every check and print a report.

    Checks run in this order: required sections, per-file phrase and
    placeholder scans, figure rules, and (only with ``--spec``) guideline
    metric coverage.

    Exit behaviour:
        * exits with status 2 when ``sections_dir`` is not a directory;
        * exits with status 1 when ``--strict`` is set and any issue was found;
        * otherwise returns normally, even if issues were reported.
    """
    ap = argparse.ArgumentParser(description="申报书质量检查")
    ap.add_argument("sections_dir", type=Path)
    ap.add_argument("--fund-type", required=True, choices=list(REQUIRED_SECTIONS.keys()))
    ap.add_argument("--spec", type=Path, default=None,
                    help="spec.md 路径; 提供后会做指南考核指标覆盖度检查")
    ap.add_argument("--strict", action="store_true",
                    help="严格模式: 任何检查项失败均退出 1")
    args = ap.parse_args()

    if not args.sections_dir.is_dir():
        print(f"[ERR] {args.sections_dir} not a directory", file=sys.stderr)
        sys.exit(2)

    print(f"\n[质量检查] fund_type={args.fund_type}\n")

    all_issues: list[str] = []

    # 1. Structure
    struct = check_structure(args.sections_dir, args.fund_type)
    if struct:
        print("[ERR] 结构问题:")
        for s in struct:
            print(f" -{s}")
        all_issues.extend(struct)
    else:
        print("[OK] 结构完整")

    # 2-4. Content (hollow phrases / non-quantifiable words / placeholders)
    files = sorted(args.sections_dir.glob("*.md"))
    print(f"\n共 {len(files)} 个章节, 逐章扫描...\n")
    for f in files:
        text = f.read_text(encoding="utf-8")
        label = f.stem
        sub_issues = check_phrases(text, label) + check_placeholders(text, label)
        if sub_issues:
            print(f"[WARN] {label}:")
            # The label is already printed as the group header, so drop the
            # exact "[label] " prefix. Splitting on the first "] " would cut
            # in the wrong place for a file stem that itself contains "] ".
            prefix = f"[{label}] "
            for s in sub_issues:
                shown = s[len(prefix):] if s.startswith(prefix) else s
                print(f" -{shown}")
            all_issues.extend(sub_issues)

    # 5. Figure references / ASCII art
    fig_issues = check_figures(args.sections_dir)
    if fig_issues:
        print("\n[ERR] 插图问题:")
        for s in fig_issues:
            print(f" -{s}")
        all_issues.extend(fig_issues)
    else:
        print("\n[OK] 插图引用 / 无 ASCII 字符画")

    # 6. Guideline coverage (only when --spec is given)
    if args.spec:
        if not args.spec.exists():
            print(f"\n[ERR] spec 文件不存在: {args.spec}")
            all_issues.append("spec 文件不存在")
        else:
            print(f"\n[指南覆盖度] 对照 {args.spec.name}")
            cov_issues = check_spec_coverage(args.sections_dir, args.spec)
            if cov_issues:
                print("[WARN] 部分指南指标可能在正文未充分覆盖:")
                for s in cov_issues:
                    print(f" -{s}")
                all_issues.extend(cov_issues)
            else:
                print("[OK] 指南考核指标在正文均有体现")

    print("\n" + "=" * 60)
    if all_issues:
        print(f"[WARN] 共发现 {len(all_issues)} 个问题。")
        print("\n建议:")
        print(" - 假大空词组 -> 换成具体数字 / 对比")
        print(" - 不可考核词 -> 量化指标 (TPS / 准确率 / 万元 / N 篇)")
        print(" - 占位符未替换 -> 找用户提供真实数据 / 替换 <TODO>")
        print(" - 插图未挂 / ASCII 字符画 -> ```mermaid 块或 ![图题](../figures/xxx.png)")
        print(" - 未覆盖指南指标 -> 在对应章节明确写出该指标的实现方式")
        if args.strict:
            sys.exit(1)
    else:
        print("[OK] 全部检查通过, 可以渲染 docx 了。")
|
|
|
|
|
|
# Run the checks only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|