zcbot/skills/ppt/scripts/quality_check.py

399 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""quality_check.py: 验收 .pptx,产出问题清单。
用法:
python quality_check.py <output.pptx> [--spec spec.md]
检查项:
- 文件存在且 > 10KB
- 总页数与 spec 一致 (如提供 spec.md)
- 每页有标题
- 每页 bullet ≤ 5 条
- 文字字号 ≥ 14pt (除页脚)
- 非灰阶(彩色)≤ 3 种 (三色制;文字色 + 形状填充色都计,灰阶/白不计)
- 出现 spec 之外的非灰阶色 (擅自换色 / 非主题色)
- 没有 untitled / output / placeholder 等占位文件名
- **形状不越出画布边界** (left+width / top+height 超界即报)
- **textbox 文本估算行数 > 框高度** —— 推断溢出
- **内容形状互相重叠** (文字压文字 / 文字压图标 / 图标压图标;装饰填充不计)
退出码:
0 = 全通过
1 = 有 warning
2 = 致命问题 (文件缺失等)
"""
from __future__ import annotations
import argparse
import colorsys
import re
import sys
from pathlib import Path
try:
from pptx import Presentation
from pptx.util import Pt
from pptx.enum.dml import MSO_FILL, MSO_COLOR_TYPE
from pptx.enum.shapes import MSO_SHAPE_TYPE
except ImportError:
print("[fatal] pip install python-pptx", file=sys.stderr)
sys.exit(2)
# ---- Overlap detection parameters ----
# Only "content shapes" (shapes carrying text, or pictures) are compared pairwise
# for overlap. Decorative shapes (text-less solid fills: brand bars, dividers,
# dots, colour-block tags, decorative stars/arrows) are not content and take no
# part, so "text sitting on a colour block" is never reported (the block has no
# text). The target is real defects: text over text, text over icon, icon over icon.
_OVERLAP_MIN_DIM = 0.08 # inches: overlap width AND height must both exceed this (filters edge contact / hairlines)
_OVERLAP_MIN_RATIO = 0.25 # overlap area / area of the smaller shape must reach this ratio to count as "covering"
# ---- 颜色辅助 ----
# 三色制按"色系数"判定,不是"hex 数":主/辅常同色系,主色的明暗阶(深红 #8A0000)、
# 浅底(wash/soft tint #F2CCCC)都从那三色派生,不该被算成"新色"。所以:
# - 低饱和的浅色/灰阶 → 中性(卡片底、wash 底),不计入彩色
# - 高饱和的算"彩色",但按色相(hue)归桶 —— 同色系(红的深浅)收敛成一个
# 这样"白底+红卡片+深红渐变+金强调"= 2 个色系,不会误报超 3 色。
def _hsv(hex6: str):
    """Convert a six-digit hex colour (no leading ``#``) into an HSV triple.

    Returns the ``(h, s, v)`` tuple produced by ``colorsys.rgb_to_hsv``; each
    component lies in [0, 1]. A string whose channel pairs are not valid hex
    raises ``ValueError``.
    """
    red, green, blue = (
        int(hex6[start:start + 2], 16) / 255 for start in (0, 2, 4)
    )
    return colorsys.rgb_to_hsv(red, green, blue)
def _is_chromatic(hex6: str) -> bool:
    """Tell whether *hex6* counts as a "colour" for the three-colour rule.

    Low-saturation values (light washes, greys) and near-black values are
    treated as neutral and yield ``False``. A string that cannot be parsed as
    hex also yields ``False``.
    """
    try:
        saturation, value = _hsv(hex6)[1:]
    except (ValueError, IndexError):
        return False
    if saturation < 0.30:
        return False
    return value >= 0.18
def _hue_family(hex6: str) -> int:
    """Map a hex colour to one of twelve 30-degree hue buckets (0-11).

    Shades of one hue are meant to collapse into a single bucket. Buckets are
    centred on multiples of 30 degrees (bucket 0 covers 345-15 degrees) rather
    than starting on them. With edges on the multiples, pure red (hue 0) sat
    exactly on the 0/360 wrap-around seam, so a red with the faintest blue
    component (e.g. ``E60012``, hue ~355) fell into bucket 11 while ``8A0000``
    (hue 0) fell into bucket 0 and the two were counted as different families.

    Args:
        hex6: Six hex digits without a leading ``#``.

    Returns:
        The bucket index in ``range(12)``.

    Raises:
        ValueError: If *hex6* cannot be parsed as hex (propagated from ``_hsv``).
    """
    hue, _sat, _val = _hsv(hex6)
    # Shift by half a bucket before flooring; the modulo folds 345-360 onto 0-15.
    return int(((hue * 360 + 15) % 360) // 30)
def _is_neutral(hex6: str) -> bool:
    """Legacy name kept for callers: neutral means "not chromatic".

    Neutral colours are the ones that do not count toward the three-colour rule.
    """
    if _is_chromatic(hex6):
        return False
    return True
# Label-like shape names: these naturally use small type (eyebrow / pill /
# footer / data source / KPI footnote). Shapes whose name matches are left out
# of the "font size < 14pt" and "bullets <= 5" tallies because they are not
# body bullets.
# NOTE(review): the pattern is applied with .search(), so "_sub" already
# matches any name containing "kpi_sub"; the latter alternative looks redundant.
_LABEL_NAME_RE = re.compile(
    r"(pill|eyebrow|footer|page_num|source|meta|_sub|kpi_sub|badge|tag|label)",
    re.IGNORECASE,
)
# Bullet-like shape names: only genuine bullet lists count toward the bullet total.
_BULLET_NAME_RE = re.compile(r"(bullet|_pt_|agenda|list|item)", re.IGNORECASE)
def _shape_fill_hex(shape) -> str | None:
    """Return the shape's solid fill colour as upper-case hex without ``#``.

    ``None`` is returned when the fill is not solid, when the fill colour is
    not a plain RGB value, or when any of the attribute lookups fails.
    """
    try:
        fill = shape.fill
        if fill.type == MSO_FILL.SOLID:
            color = fill.fore_color
            # Per the original note, reading .rgb on a theme colour raises,
            # so the colour type is checked before touching it.
            if color.type == MSO_COLOR_TYPE.RGB:
                return str(color.rgb).upper()
    except (TypeError, AttributeError, KeyError, ValueError):
        pass
    return None
# ---- spec 解析 (松散 markdown 解析,够用就行) ----
def parse_spec(spec_path: Path) -> dict:
    """Loosely parse a markdown spec file into a dict of expectations.

    The parse is deliberately forgiving: it searches the whole text for a few
    keywords instead of requiring any particular markdown structure.

    Args:
        spec_path: Path to the spec file; ``None`` is accepted.

    Returns:
        A dict containing whichever of these keys could be found:

        * ``page_count`` (int): the number following ``页数``.
        * ``canvas`` (str): the aspect ratio following ``画布``, one of
          ``16:9``, ``4:3``, ``9:16``, ``1:1``, ``3:4``.
        * ``colors`` (list[str]): the first five ``#RRGGBB`` values in order of
          appearance, upper-cased and without the ``#``.

        The dict is empty when *spec_path* is falsy or is not an existing file.
    """
    # is_file() rather than exists(): a directory would pass exists() and then
    # make read_text() raise.
    if not spec_path or not spec_path.is_file():
        return {}
    text = spec_path.read_text(encoding="utf-8")

    # Key/value separator: ASCII colon, full-width colon (U+FF1A, the usual
    # choice in Chinese text) and/or whitespace. Written as an escape so the
    # source holds no visually ambiguous character.
    sep = r"[:\uFF1A\s]*"

    spec: dict = {}
    page_match = re.search(r"页数" + sep + r"(\d+)", text)
    if page_match:
        spec["page_count"] = int(page_match.group(1))
    canvas_match = re.search(r"画布" + sep + r"(16:9|4:3|9:16|1:1|3:4)", text)
    if canvas_match:
        spec["canvas"] = canvas_match.group(1)
    hexes = re.findall(r"#([0-9A-Fa-f]{6})", text)
    if hexes:
        spec["colors"] = [h.upper() for h in hexes[:5]]
    return spec
# ---- 检查 ----
_EMU_PER_INCH = 914400  # shape and slide lengths are read in EMU and shown in inches


def _shape_box_in(shape) -> tuple:
    """Return the shape's (left, top, width, height) in inches; unknown values count as 0."""
    try:
        return tuple(
            (value / _EMU_PER_INCH if value is not None else 0)
            for value in (shape.left, shape.top, shape.width, shape.height)
        )
    except (AttributeError, TypeError):
        return (0, 0, 0, 0)


def _bounds_warnings(idx, label, box, slide_w_in, slide_h_in) -> list:
    """Return warnings for a shape that starts at a negative offset or leaves the canvas."""
    left_in, top_in, w_in, h_in = box
    tol = 0.02  # tolerance in inches (about 0.5 mm)
    found = []
    if left_in < -tol or top_in < -tol:
        found.append(
            f"{idx} 页 {label} 起点为负: "
            f"({left_in:.2f}, {top_in:.2f})"
        )
    right_in = left_in + w_in
    if right_in > slide_w_in + tol:
        found.append(
            f"{idx} 页 {label} 右越界 {right_in - slide_w_in:.2f}in "
            f"(画布 {slide_w_in:.2f},shape 右 {right_in:.2f})"
        )
    bottom_in = top_in + h_in
    if bottom_in > slide_h_in + tol:
        found.append(
            f"{idx} 页 {label} 下越界 {bottom_in - slide_h_in:.2f}in "
            f"(画布 {slide_h_in:.2f},shape 底 {bottom_in:.2f})"
        )
    return found


def _overflow_warning(idx, label, tf, text, w_in, h_in):
    """Return a "text may overflow" warning for one text frame, or ``None``.

    Estimate: at N pt, a line W inches wide holds about ``W * 72 / N``
    full-width characters; every non-empty paragraph contributes its wrapped
    line count; lines times ``N * 1.4 / 72`` inches is the height needed.
    Only the first explicitly set run size is used. The estimate is best
    effort, so attribute/type problems simply yield no warning.
    """
    try:
        first_size_pt = next(
            (
                run.font.size.pt
                for para in tf.paragraphs
                for run in para.runs
                if run.font.size
            ),
            None,
        )
        # Large display type (>= 40pt: titles, KPI numerals, section numbers)
        # is short single-line text; a per-line character estimate would give
        # false positives there, so it is skipped (title length has its own
        # check in the caller).
        if not first_size_pt or first_size_pt >= 40:
            return None
        if not (w_in > 0.5 and h_in > 0.2):
            return None
        chars_per_line = max(1, int(w_in * 72 / first_size_pt))
        est_lines = 0
        for para in tf.paragraphs:
            ptxt = (para.text or "").strip()
            if ptxt:
                # Ceiling division; at least 1 because ptxt is non-empty.
                est_lines += (len(ptxt) + chars_per_line - 1) // chars_per_line
        needed_h = est_lines * (first_size_pt * 1.4 / 72)
        if needed_h > h_in + 0.1:
            return (
                f"{idx} 页 {label} 文本可能溢出 "
                f"(估 {est_lines} 行,需 {needed_h:.2f}in,"
                f"框高 {h_in:.2f}in): {text[:25]}..."
            )
    except (AttributeError, TypeError, ValueError):
        pass
    return None


def _overlap_warnings(idx, content_shapes) -> list:
    """Return one warning per pair of content shapes that substantially overlap."""
    found = []
    for i, (ax, ay, aw, ah, alab, ahead) in enumerate(content_shapes):
        for bx, by, bw, bh, blab, bhead in content_shapes[i + 1:]:
            ix = min(ax + aw, bx + bw) - max(ax, bx)
            iy = min(ay + ah, by + bh) - max(ay, by)
            if ix <= _OVERLAP_MIN_DIM or iy <= _OVERLAP_MIN_DIM:
                continue
            min_area = min(aw * ah, bw * bh)
            if min_area <= 0:
                continue
            ratio = (ix * iy) / min_area
            if ratio >= _OVERLAP_MIN_RATIO:
                found.append(
                    f"{idx} 页 内容重叠 {ratio * 100:.0f}%: "
                    f'{alab}("{ahead}") × {blab}("{bhead}")'
                )
    return found


def _color_warnings(seen_colors, spec_colors) -> list:
    """Return deck-wide warnings for too many hue families or hues outside the spec.

    Colours are judged by hue family rather than by distinct hex value, and
    non-chromatic colours are ignored, so derived tints and shades of one
    brand colour are not reported as extra colours.
    """
    found = []
    chromatic = {c for c in seen_colors if _is_chromatic(c)}
    families = {_hue_family(c) for c in chromatic}
    if len(families) > 3:
        found.append(
            f"彩色色系 {len(families)} 个 (三色制上限 3): "
            f"{', '.join('#' + c for c in sorted(chromatic))};收敛到主/辅/强调三色系"
        )
    if spec_colors:
        spec_chromatic = {c for c in spec_colors if _is_chromatic(c)}
        spec_families = {_hue_family(c) for c in spec_chromatic}
        extra = {c for c in chromatic if _hue_family(c) not in spec_families}
        if extra:
            found.append(
                f"出现 spec 之外的色系 {', '.join('#' + c for c in sorted(extra))};"
                f"擅自换色 / 非主题色 (spec 定的是 "
                f"{', '.join('#' + c for c in sorted(spec_chromatic))})"
            )
    return found


def check_pptx(path: Path, spec: dict) -> tuple[list, list]:
    """Inspect a .pptx file and collect the problems found.

    Checks, in order: file exists and is at least 10 KB; file name is not a
    generic placeholder; the file can be opened; slide count matches
    ``spec["page_count"]`` when given; then per slide: shapes inside the
    canvas, estimated text overflow, a title is present and at most 30
    characters, at most 5 bullets per column, no text below 14pt outside label
    shapes, no substantial overlap between content shapes; finally, deck-wide
    colour-family limits and agreement with ``spec["colors"]``.

    Args:
        path: Location of the .pptx file.
        spec: Expectations as produced by ``parse_spec``; may be empty.

    Returns:
        ``(errors, warnings)``, two lists of human-readable messages. A missing
        or unreadable file is reported in ``errors`` rather than raised.
    """
    errors, warnings = [], []
    if not path.exists():
        errors.append(f"文件不存在: {path}")
        return errors, warnings

    size_kb = path.stat().st_size / 1024
    if size_kb < 10:
        errors.append(f"文件太小 ({size_kb:.1f}KB),python-pptx 可能没写完")

    name = path.stem.lower()
    if name in ("untitled", "output", "presentation", "untitled1", "new", "test"):
        warnings.append(
            f"文件名 '{path.name}' 太通用,建议按主题命名"
        )

    # A truncated or corrupt file can fail to open in several library-specific
    # ways. Whatever the cause, it is a fatal finding for this report and must
    # not escape as a traceback (which would exit with the "warning" status).
    try:
        prs = Presentation(path)
    except Exception as exc:
        errors.append(f"无法打开 pptx ({type(exc).__name__}): {exc}")
        return errors, warnings

    n_slides = len(prs.slides)
    slide_w_in = prs.slide_width / _EMU_PER_INCH
    slide_h_in = prs.slide_height / _EMU_PER_INCH
    print(
        f"[info] 文件: {path.name} 大小: {size_kb:.1f}KB "
        f"页数: {n_slides} 画布: {slide_w_in:.2f}×{slide_h_in:.2f} in"
    )

    expected = spec.get("page_count")
    if expected and n_slides != expected:
        warnings.append(f"页数 {n_slides} 与 spec 期望 {expected} 不符")

    spec_colors = set(spec.get("colors", []))
    seen_colors: set[str] = set()

    for idx, slide in enumerate(prs.slides, 1):
        title_text = None
        small_font_count = 0
        bullet_xs: list = []       # x centre of every bullet item, grouped by column below
        content_shapes: list = []  # (l, t, w, h, label, head) for shapes with text / pictures

        for s_i, shape in enumerate(slide.shapes):
            box = _shape_box_in(shape)
            left_in, top_in, w_in, h_in = box
            shape_label = (
                shape.name if hasattr(shape, "name") and shape.name
                else f"shape#{s_i}"
            )

            # ---- Canvas bounds (every shape) ----
            warnings.extend(
                _bounds_warnings(idx, shape_label, box, slide_w_in, slide_h_in)
            )

            # ---- Solid fill colour (brand bars / badges / dots / tags / blocks) ----
            fill_hex = _shape_fill_hex(shape)
            if fill_hex:
                seen_colors.add(fill_hex)

            # ---- Collect content shapes (text or picture) for overlap detection ----
            try:
                is_pic = shape.shape_type == MSO_SHAPE_TYPE.PICTURE
            except (AttributeError, ValueError, NotImplementedError):
                # python-pptx raises NotImplementedError for shapes it cannot classify.
                is_pic = False
            head = ""
            if shape.has_text_frame:
                head = (shape.text_frame.text or "").strip()
            if (head or is_pic) and w_in > 0.05 and h_in > 0.05:
                content_shapes.append(
                    (left_in, top_in, w_in, h_in, shape_label,
                     head[:18] if head else "[图片]")
                )

            if not shape.has_text_frame:
                continue
            tf = shape.text_frame
            text = head  # same stripped text as computed above
            if not text:
                continue

            # The first short single-line text on the slide is taken as its title.
            if title_text is None and len(text) <= 40 and "\n" not in text:
                title_text = text

            # ---- Text overflow estimate ----
            overflow_msg = _overflow_warning(
                idx, shape_label, tf, text, w_in, h_in
            )
            if overflow_msg:
                warnings.append(overflow_msg)

            is_label = bool(_LABEL_NAME_RE.search(shape_label))
            is_bullet_shape = bool(_BULLET_NAME_RE.search(shape_label))
            nonempty_paras = [
                p for p in tf.paragraphs
                if (p.text or "").strip()
                and (p.text or "").strip() != title_text
            ]
            # Only genuine bullet lists are counted: shapes named like bullets,
            # or frames holding two or more paragraphs. Structured short text
            # (KPI cards, card titles, pills: single paragraph, non-bullet
            # name) is excluded, otherwise four KPI cards would be reported as
            # a dozen bullets.
            if not is_label and (is_bullet_shape or len(nonempty_paras) >= 2):
                cx = left_in + w_in / 2  # x centre, used for column grouping
                bullet_xs.extend([cx] * len(nonempty_paras))

            for para in tf.paragraphs:
                if not (para.text or "").strip():
                    continue
                for run in para.runs:
                    # Label shapes are small by design and are not counted as
                    # unreadable text.
                    if run.font.size and not is_label:
                        if run.font.size < Pt(14):
                            small_font_count += 1
                    if run.font.color and run.font.color.type:
                        try:
                            rgb = run.font.color.rgb
                            if rgb is not None:
                                # Upper-cased to match the form used for fill colours.
                                seen_colors.add(str(rgb).upper())
                        except (AttributeError, KeyError, ValueError):
                            pass

        if title_text is None:
            warnings.append(f"{idx} 页缺标题")
        elif len(title_text) > 30:
            warnings.append(
                f"{idx} 页标题过长 ({len(title_text)} 字): {title_text[:20]}..."
            )

        # Bullets are limited per column: a two-column comparison naturally has
        # e.g. 3 left + 3 right and must not be reported as 6 on one slide.
        # Items are split at the slide's vertical centre line; a warning is
        # raised only when either side exceeds 5 (single-column lists take the
        # same path).
        mid = slide_w_in / 2
        left_n = sum(1 for x in bullet_xs if x < mid)
        right_n = len(bullet_xs) - left_n
        max_col = max(left_n, right_n)
        if max_col > 5:
            warnings.append(
                f"{idx} 页单列 bullet {max_col} 条 (上限 5),建议拆页或转图表"
            )

        if small_font_count > 0:
            warnings.append(
                f"{idx} 页有 {small_font_count} 处字号 < 14pt,投影看不清"
            )

        # ---- Pairwise overlap of content shapes ----
        warnings.extend(_overlap_warnings(idx, content_shapes))

    # ---- Deck-wide colour rules ----
    warnings.extend(_color_warnings(seen_colors, spec_colors))
    return errors, warnings
def main():
    """Command-line entry point.

    Parses the arguments, loads the optional spec, runs ``check_pptx`` and
    prints the findings. Exit status: 0 when nothing was found, 1 when there
    are only warnings, 2 when there is at least one error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pptx", type=Path)
    parser.add_argument("--spec", type=Path, default=None, help="spec.md 路径")
    opts = parser.parse_args()

    spec = {}
    if opts.spec:
        spec = parse_spec(opts.spec)
    if spec:
        print(f"[info] spec 已加载: {spec}")

    errors, warnings = check_pptx(opts.pptx, spec)

    if errors:
        print("\n[errors]")
        for message in errors:
            print(f"{message}")
    if warnings:
        print("\n[warnings]")
        for message in warnings:
            print(f" ! {message}")

    # Errors outrank warnings when choosing the exit status.
    if errors:
        sys.exit(2)
    if warnings:
        sys.exit(1)
    print("\n[ok] 全部通过")
    sys.exit(0)


if __name__ == "__main__":
    main()