zcbot/scripts/smoke_paper_skill.py

309 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Smoke: paper_server → zcbot research skill 三步链路。
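Run with: .venv/Scripts/python.exe scripts/smoke_paper_skill.py

Prerequisite: paper_server has been redeployed (list endpoint returns abstract /
retrieve endpoint is available / filterset_class is in effect).
Touches neither the DB nor the workspace. Downloaded PDF/XML files go to a
directory under the system temp dir; that directory is left in place after the
run so it can be inspected (its path is printed).

Steps (the "three-step" chain named in the title has since grown steps 0, 1b and 4):
0)  search(keyword="cement", limit=3) — unfiltered; checks latency, i.e. that the pg_trgm GIN index is in effect
1)  search(keyword="cement", has_pdf=True, limit=5) — checks the list shape + the abstract field
1b) search(keyword="hydration", has_pdf=True, limit=3) — checks that abstract text is actually populated in list results
2)  get_paper(<first doi from step 1>) — checks the retrieve endpoint (the original viewset lacked the mixin, a 404 bug)
3)  fetch_pdf(<a candidate that has a PDF>, tmp_dir) x2 — checks the file lands on disk + reuse (the second call should skip the download)
4)  fetch_xml(<a candidate that has XML>, tmp_dir) x2 — same check for the XML direct link
Any exception in a step is printed and the next step still runs, so the whole chain is always exercised.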
"""
from __future__ import annotations
import os
import sys
import tempfile
import time
from pathlib import Path
# Repository root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent
# Put the root first on sys.path so the ``skills`` package import below resolves.
sys.path.insert(0, str(ROOT))

# Inject variables from <ROOT>/.env into the process environment.
# PAPER_SERVER_URL can be overridden there (default http://paper.xxhhcty.xyz:8080).
# setdefault() is used, so variables already present in the environment win.
env_file = ROOT / ".env"
if env_file.exists():
    for raw in env_file.read_text(encoding="utf-8").splitlines():
        entry = raw.strip()
        # Only KEY=VALUE lines count; blanks and '#' comments are ignored.
        if entry and not entry.startswith("#") and "=" in entry:
            key, _sep, value = entry.partition("=")
            os.environ.setdefault(key.strip(), value.strip())
from skills.research.paper import _BASE_URL, fetch_pdf, fetch_xml, get_paper, search
def _hr(title: str) -> None:
print()
print("=" * 60)
print(f"[{title}]")
print("=" * 60)
def _truncate(s: str, n: int = 200) -> str:
if not s:
return "(empty)"
s = s.replace("\n", " ")
return s if len(s) <= n else s[:n] + "...(+%d chars)" % (len(s) - n)
def step0_trgm_speed() -> None:
    """Step 0: time an unfiltered keyword search as a latency check.

    Calls ``search(keyword="cement", limit=3)`` and prints the result count and
    elapsed milliseconds. Above 5 s a warning about the pg_trgm index is
    printed; below 1 s the index is reported as effective; in between only the
    timing line is shown. Any exception is printed as ``[FAIL]`` and swallowed
    so later steps still run.
    """
    _hr("step 0: search(keyword='cement', limit=3) 不带 filter — 验证 pg_trgm GIN 索引生效")
    print("note: 加 pg_trgm 索引前会 30s+ timeout(ILIKE 跨 3 列全表扫);加后应几百 ms 内返")
    started = time.time()
    try:
        hits = search(keyword="cement", limit=3)
        elapsed_ms = (time.time() - started) * 1000
        print(f"[OK] returned {len(hits)} in {elapsed_ms:.0f}ms")
        # The two thresholds are mutually exclusive; 1-5 s gets no extra line.
        if elapsed_ms < 1000:
            print("[OK] <1s,pg_trgm 索引生效")
        elif elapsed_ms > 5000:
            print("[WARN] >5s,pg_trgm 索引可能没建对 — 检查 migration 0006_pg_trgm_index 是否跑了")
    except Exception as exc:
        print(f"[FAIL] {type(exc).__name__}: {exc}")
        print(" (若是 ReadTimeout,pg_trgm 没生效;请确认 migration 跑了 + DB 有 superuser 装扩展)")
def step1_search() -> list[dict]:
    """Step 1: run a PDF-filtered keyword search and validate the list payload shape.

    Calls ``search(keyword="cement", has_pdf=True, limit=5)``, prints the
    latency, compares the keys of the first result against the expected field
    set, and prints a sample of up to three papers.

    Returns:
        The list returned by ``search`` (``main`` passes it on to steps 2 and
        3), or ``[]`` when the call raised or returned nothing.
    """
    _hr("step 1: search(keyword='cement', has_pdf=True, limit=5)")
    print(f"PAPER_SERVER_URL = {_BASE_URL}")
    print("note: 带 has_pdf=True 走 has_fulltext_pdf 索引先过滤,纯 keyword 大词会 ILIKE 全表扫导致 30s timeout (paper_server 既有问题)")
    t0 = time.time()
    try:
        results = search(keyword="cement", has_pdf=True, limit=5)
    except Exception as e:
        print(f"[FAIL] {type(e).__name__}: {e}")
        return []
    dt = (time.time() - t0) * 1000
    print(f"[OK] returned {len(results)} papers in {dt:.0f}ms")
    if not results:
        print("[WARN] 0 results — keyword 'cement' + has_pdf=True 居然没命中?后两步会用空集 fallback")
        return []

    # Shape check: the first result must carry exactly these fields.
    expected = {
        "id", "doi", "title", "first_author", "first_author_institution",
        "publication_year", "publication_date", "publication_name",
        "has_fulltext_pdf", "has_fulltext_xml", "has_abstract", "is_oa",
        "type", "abstract", "pdf_url", "xml_url",
    }
    actual = set(results[0].keys())
    missing = expected - actual
    extra = actual - expected
    if missing:
        print(f"[FAIL] 字段缺失:{missing}")
    if extra:
        print(f"[INFO] 字段冗余(应被 _LIST_FIELDS 过滤掉):{extra}")
    if not missing and not extra:
        # Count derived from the set so the message cannot drift from it.
        print(f"[OK] {len(expected)} 字段完整,无冗余")

    # Print a sample of the first three papers.
    # .get() on purpose: a field already reported as missing above must not
    # raise KeyError here. main() would catch that by replacing the results
    # with [], and steps 2/3 would be skipped although the search succeeded.
    print()
    for i, p in enumerate(results[:3], 1):
        print(f"--- paper {i} ---")
        print(f" id = {p.get('id')}")
        print(f" doi = {p.get('doi')}")
        print(f" title = {_truncate(p.get('title'), 100)}")
        print(f" year = {p.get('publication_year')} / date={p.get('publication_date')} / type={p.get('type')} / is_oa={p.get('is_oa')}")
        print(f" has_abstract = {p.get('has_abstract')}")
        print(f" has_fulltext_pdf = {p.get('has_fulltext_pdf')} / has_fulltext_xml = {p.get('has_fulltext_xml')}")
        print(f" pdf_url = {p.get('pdf_url') or '(empty)'}")
        print(f" xml_url = {p.get('xml_url') or '(empty)'}")
        print(f" abstract = {_truncate(p.get('abstract'), 150)}")
    return results
def step1b_abstract_filled() -> None:
    """Step 1b: check that list results can carry non-empty abstract text.

    Calls ``search(keyword="hydration", has_pdf=True, limit=3)``, prints each
    paper's title and abstract, then reports whether any paper flagged
    ``has_abstract`` has a non-empty ``abstract``. An all-empty batch is
    reported as ``[INFO]``, not as a failure. Any exception is printed as
    ``[FAIL]`` and swallowed.
    """
    # The banner names the call that is actually made. It previously advertised
    # search(has_abstract=True, limit=3), which this step never issues.
    _hr("step 1b: search(keyword='hydration', has_pdf=True, limit=3) — 验证 abstract 字段在 list 里确实能装文本")
    try:
        t0 = time.time()
        results = search(keyword="hydration", has_pdf=True, limit=3)
        dt = (time.time() - t0) * 1000
        print(f"[OK] returned {len(results)} in {dt:.0f}ms")
        if not results:
            print("[WARN] hydration+has_pdf 也是 0 结果,跳过")
            return
        for i, p in enumerate(results, 1):
            print(f"--- paper {i} --- has_abstract={p['has_abstract']}")
            print(f" title = {_truncate(p['title'], 80)}")
            print(f" abstract = {_truncate(p['abstract'], 200)}")
        # Only papers flagged has_abstract are considered.
        any_filled = any(p["abstract"] for p in results if p.get("has_abstract"))
        if any_filled:
            print("\n[OK] abstract 字段实际装载了文本(non-empty)— list 加 abstract 改动真实生效")
        else:
            print('\n[INFO] 本批候选 has_abstract 全为 False / abstract 全空;不算 fail,只说明本批走的 serializer 的 "无 PaperAbstract 行返空串" 分支')
    except Exception as e:
        print(f"[FAIL] {type(e).__name__}: {e}")
def step2_get_paper(papers: list[dict]) -> None:
    """Step 2: retrieve the first paper of *papers* by DOI and print its key fields.

    Args:
        papers: Result list from step 1; only the first entry is used. An empty
            list makes the step print ``[SKIP]`` and return.

    A failing ``get_paper`` call is printed as ``[FAIL]`` and ends the step.
    On success the id, title and abstract info are printed, plus a ``[WARN]``
    when the retrieved id differs from the id in the list entry.
    """
    _hr("step 2: get_paper(<step 1 第一条 doi>)")
    if not papers:
        print("[SKIP] step 1 无结果,无法挑 doi")
        return

    target_doi = papers[0]["doi"]
    print(f"querying doi = {target_doi}")
    try:
        started = time.time()
        detail = get_paper(target_doi)
        elapsed_ms = (time.time() - started) * 1000
        print(f"[OK] retrieve in {elapsed_ms:.0f}ms")
    except Exception as exc:
        print(f"[FAIL] {type(exc).__name__}: {exc}")
        print(" (若是 404,可能 redeploy 没生效 / retrieve mixin 没挂;原 bug 就是这个)")
        return

    retrieved_id = detail.get("id")
    print(f" id = {retrieved_id}")
    title_text = detail.get("title") or ""
    print(f" title = {_truncate(title_text, 100)}")
    print(f" has_abstract = {detail.get('has_abstract')}")
    abstract_text = detail.get("abstract") or ""
    print(f" abstract = {_truncate(abstract_text, 200)}")
    # Cross-check: the retrieve endpoint should return the record the list showed.
    if retrieved_id != papers[0]["id"]:
        print(f"[WARN] retrieve 返回的 id ({retrieved_id}) ≠ list 里的 id ({papers[0]['id']})")
def step3_fetch_pdf(papers: list[dict]) -> None:
    """Step 3: download a PDF twice into a temp dir to check the write and the reuse path.

    Args:
        papers: Result list from step 1. Only entries with a truthy
            ``has_fulltext_pdf`` are tried, in order, until one succeeds.

    For each candidate ``fetch_pdf`` is called twice with the same working
    directory. The first call is checked by stat-ing the returned relative
    path; the second is timed and its returned path compared with the first.
    An HTTP 404 or a ``RuntimeError`` moves on to the next candidate. Any other
    error stops the loop. The temp directory is left in place and its path is
    printed.
    """
    _hr("step 3: fetch_pdf(轮询候选直到命中真实可下载, tmp_dir) ×2")
    candidates = [p for p in papers if p.get("has_fulltext_pdf")]
    if not candidates:
        print("[SKIP] step 1 候选里没有 has_fulltext_pdf=True,跳过(P1 timeout 也会落到这里)")
        return
    # Function-scope import: this step needs httpx's exception type directly.
    import httpx as _httpx

    tmp_dir = Path(tempfile.mkdtemp(prefix="zcbot_smoke_paper_"))
    print(f"tmp working_dir = {tmp_dir}")
    print(f"候选数: {len(candidates)}")
    success = False
    not_found = 0  # candidates rejected with HTTP 404
    for i, paper in enumerate(candidates, 1):
        pid = paper["id"]
        print(f"\n--- 尝试候选 {i}/{len(candidates)}: id={pid} doi={paper['doi']} ---")
        try:
            t0 = time.time()
            rel1 = fetch_pdf(pid, str(tmp_dir))
            dt1 = (time.time() - t0) * 1000
            # The returned path is treated as relative to the working directory.
            abs_path = tmp_dir / rel1
            size = abs_path.stat().st_size
            print(f"[OK] 第 1 次下载: rel={rel1} size={size/1024:.1f}KB in {dt1:.0f}ms")
            t0 = time.time()
            rel2 = fetch_pdf(pid, str(tmp_dir))
            dt2 = (time.time() - t0) * 1000
            print(f"[OK] 第 2 次复用: rel={rel2} in {dt2:.0f}ms (期望 <100ms)")
            if dt2 > 1000:
                print("[WARN] 第 2 次 >1s,可能没走复用分支?")
            if rel1 != rel2:
                print(f"[FAIL] 两次返回路径不一致:{rel1} vs {rel2}")
            success = True
            break
        except _httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                not_found += 1
                print("[SKIP] paper_pdf_view 404 — DB has_fulltext_pdf=True 但磁盘文件缺失(paper_server 数据一致性问题,继续下一个)")
                continue
            print(f"[FAIL] HTTPStatusError {e.response.status_code}: {e}")
            break
        except RuntimeError as e:
            # Treated as a per-candidate condition, not a chain failure.
            # NOTE(review): presumably raised by fetch_pdf for an unusable
            # candidate — confirm against skills.research.paper.
            print(f"[INFO] {e}")
            continue
        except Exception as e:
            print(f"[FAIL] {type(e).__name__}: {e}")
            break
    if not success:
        if not_found == len(candidates):
            # Every candidate really was a 404: keep the original diagnosis.
            print(f"\n[WARN] {len(candidates)} 个候选全部 404 — paper_server 侧 DB/disk 不一致问题严重,fetch_pdf 链路逻辑未真实验证(本地实现按 has_fulltext_pdf=True 是放行的,服务器侧最终又 404 拦下)")
        else:
            # The loop ended for another reason; do not blame it all on 404s.
            print(f"\n[WARN] fetch_pdf 未在任何候选上成功({not_found}/{len(candidates)} 个候选 404,其余原因见上方 [FAIL]/[INFO])— fetch_pdf 链路逻辑未真实验证")
    print(f"(tmp_dir 留着自查:{tmp_dir})")
def step4_fetch_xml() -> None:
    """Step 4: download an XML full text twice into a temp dir to check the write and the reuse path.

    Candidates are papers with a truthy ``has_fulltext_xml`` and a non-empty
    ``xml_url``. They are taken from ``search(keyword="cement", limit=20)``;
    if that yields none, the keywords "hydration", "concrete" and "polymer"
    are tried in turn. At most the first five candidates are attempted.

    For each candidate ``fetch_xml`` is called twice with the same working
    directory. An HTTP 404 moves on to the next candidate. Any other error
    stops the loop. The temp directory is left in place and its path is
    printed.
    """
    max_attempts = 5  # upper bound on candidates tried

    def _with_xml(items: list[dict]) -> list[dict]:
        # Keep only papers that advertise an XML full text and expose its URL.
        return [p for p in items if p.get("has_fulltext_xml") and p.get("xml_url")]

    _hr("step 4: fetch_xml(<has_fulltext_xml=True 候选>, tmp_dir) ×2 — 验证 XML 直链 + 复用")
    try:
        candidates = search(keyword="cement", limit=20)
    except Exception as e:
        print(f"[FAIL] 拉候选失败: {type(e).__name__}: {e}")
        return
    xml_candidates = _with_xml(candidates)
    print(f"候选 {len(candidates)} 条中,has_fulltext_xml=True 且 xml_url 非空: {len(xml_candidates)}")
    if not xml_candidates:
        print("[INFO] 库里 cement 主题没 has_fulltext_xml 候选,试别的关键词")
        for kw in ("hydration", "concrete", "polymer"):
            try:
                more = search(keyword=kw, limit=20)
            except Exception as e:
                # Best-effort fallback: keep trying other keywords, but report
                # the failure instead of hiding it.
                print(f" '{kw}' 拉候选失败: {type(e).__name__}: {e}")
                continue
            xml_candidates = _with_xml(more)
            if xml_candidates:
                print(f" '{kw}' 拉到 {len(xml_candidates)} 条 XML 候选")
                break
    if not xml_candidates:
        print("[SKIP] 多关键词试了都没 XML 候选;fetch_xml 链路未验证")
        return
    tmp_dir = Path(tempfile.mkdtemp(prefix="zcbot_smoke_xml_"))
    print(f"tmp working_dir = {tmp_dir}")
    # Function-scope import: this step needs httpx's exception type directly.
    import httpx as _httpx

    attempts = xml_candidates[:max_attempts]
    success = False
    not_found = 0  # candidates rejected with HTTP 404
    for i, paper in enumerate(attempts, 1):
        pid = paper["id"]
        print(f"\n--- 尝试候选 {i}: id={pid} doi={paper['doi']} xml_url={paper['xml_url']} ---")
        try:
            t0 = time.time()
            rel1 = fetch_xml(pid, str(tmp_dir))
            dt1 = (time.time() - t0) * 1000
            # The returned path is treated as relative to the working directory.
            size = (tmp_dir / rel1).stat().st_size
            print(f"[OK] 第 1 次下载: rel={rel1} size={size/1024:.1f}KB in {dt1:.0f}ms")
            t0 = time.time()
            rel2 = fetch_xml(pid, str(tmp_dir))
            dt2 = (time.time() - t0) * 1000
            print(f"[OK] 第 2 次复用: rel={rel2} in {dt2:.0f}ms (期望 <100ms)")
            # Same reuse-latency check as the PDF step.
            if dt2 > 1000:
                print("[WARN] 第 2 次 >1s,可能没走复用分支?")
            if rel1 != rel2:
                print(f"[FAIL] 两次路径不一致:{rel1} vs {rel2}")
            success = True
            break
        except _httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                not_found += 1
                print("[SKIP] media 静态 URL 404 — paper_server disk 文件缺失,继续下一个")
                continue
            print(f"[FAIL] HTTPStatusError {e.response.status_code}: {e}")
            break
        except Exception as e:
            print(f"[FAIL] {type(e).__name__}: {e}")
            break
    if not success:
        if not_found == len(attempts):
            # Every attempted candidate was a 404: report the real count.
            print(f"\n[WARN] {len(attempts)} 个候选全部 disk 缺失;fetch_xml 客户端代码本身简单(直链 stream + 复用),不阻塞改动验证")
        else:
            # The loop ended for another reason; do not blame missing files.
            print(f"\n[WARN] fetch_xml 未在任何候选上成功({not_found}/{len(attempts)} 个候选 404,其余原因见上方 [FAIL])— fetch_xml 链路未真实验证")
    print(f"(tmp_dir 留着自查:{tmp_dir})")
def main() -> int:
    """Run every smoke step in order, isolating failures, and return exit status 0.

    Each step runs inside its own ``try``; an escaping exception is printed as
    ``[FAIL <step>] ...`` and the next step still runs. The list returned by
    step 1 (or ``[]`` if step 1 raised) is passed to steps 2 and 3.

    Returns:
        Always ``0``.
    """
    rule = "=" * 60
    print(rule)
    print("zcbot research skill smoke")
    print(rule)

    def _guarded(label, thunk, fallback=None):
        # Run one step; report any exception and hand back the fallback value.
        try:
            return thunk()
        except Exception as exc:
            print(f"[FAIL {label}] {type(exc).__name__}: {exc}")
            return fallback

    # Lambdas keep each step's name lookup inside the guarded call.
    _guarded("step 0", lambda: step0_trgm_speed())
    papers = _guarded("step 1", lambda: step1_search(), fallback=[])
    _guarded("step 1b", lambda: step1b_abstract_filled())
    _guarded("step 2", lambda: step2_get_paper(papers))
    _guarded("step 3", lambda: step3_fetch_pdf(papers))
    _guarded("step 4", lambda: step4_fetch_xml())

    print()
    print(rule)
    print("smoke done")
    print(rule)
    return 0


if __name__ == "__main__":
    sys.exit(main())