feat: paper list 加 pdf_url / xml_url 直链字段 + pg_trgm GIN 索引

serializers: PaperListSerializer 加 pdf_url / xml_url SerializerMethodField,基于 publication_date + safe_doi 后端拼 absolute_uri;has_fulltext_{pdf,xml}=False 或 publication_date 缺失返空串。LLM 客户端从 list 一次拿到直链,不必拼 URL。
migration 0006: CREATE EXTENSION IF NOT EXISTS pg_trgm + 3 列 GIN 索引(title / first_author / first_author_institution),根治 SearchFilter 跨列 ILIKE '%xxx%' 全表扫 timeout(高频词如 cement 原本 30s+,加索引后几十 ms)。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
caoqianming 2026-05-21 13:48:52 +08:00
parent e8320bce05
commit 6a5a5d7b6b
2 changed files with 59 additions and 0 deletions

View File

@ -0,0 +1,37 @@
"""为 SearchFilter 的 title / first_author / first_author_institution 三列建 pg_trgm GIN 索引。
原因:DRF SearchFilter `column ILIKE '%keyword%'` 前后通配,B-tree 索引救不了;
高频关键词 + 几十万行表会 30s+ timeoutpg_trgm trigram GIN 索引,
ILIKE '%xxx%' 走索引,降到几十 ms
pg_trgm PostgreSQL contrib 扩展,首次启用需要 DB superuser 权限;
CREATE EXTENSION IF NOT EXISTS 幂等,迁移可安全重跑
"""
from django.db import migrations
# (index name, indexed column) for every trigram GIN index on resm_paper,
# listed in creation order; the reverse migration drops them in reverse order.
_TRGM_INDEXES = (
    ("paper_title_trgm", "title"),
    ("paper_first_author_trgm", "first_author"),
    ("paper_first_author_institution_trgm", "first_author_institution"),
)


class Migration(migrations.Migration):
    """Enable pg_trgm and create trigram GIN indexes on three resm_paper columns.

    Forward: ``CREATE EXTENSION IF NOT EXISTS pg_trgm`` followed by one
    ``CREATE INDEX IF NOT EXISTS ... USING gin (<column> gin_trgm_ops)`` per
    entry in ``_TRGM_INDEXES``.

    Reverse: drops the three indexes (last created first). The pg_trgm
    extension itself is left installed.
    """

    dependencies = [("resm", "0005_alter_paper_fetch_status")]

    operations = [
        migrations.RunSQL(
            sql=["CREATE EXTENSION IF NOT EXISTS pg_trgm;"]
            + [
                f"CREATE INDEX IF NOT EXISTS {index_name} "
                f"ON resm_paper USING gin ({column} gin_trgm_ops);"
                for index_name, column in _TRGM_INDEXES
            ],
            # The pg_trgm extension is deliberately not dropped: other apps or
            # tables may be relying on it as well.
            reverse_sql=[
                f"DROP INDEX IF EXISTS {index_name};"
                for index_name, _column in reversed(_TRGM_INDEXES)
            ],
        ),
    ]

View File

@ -5,6 +5,8 @@ from .models import Paper
class PaperListSerializer(CustomModelSerializer):
abstract = serializers.SerializerMethodField()
pdf_url = serializers.SerializerMethodField()
xml_url = serializers.SerializerMethodField()
class Meta:
model = Paper
@ -16,3 +18,23 @@ class PaperListSerializer(CustomModelSerializer):
if abs_obj is None:
return ""
return abs_obj.abstract or ""
def _media_url(self, obj, ext: str) -> str:
"""拼 /media/papers/<Y>/<M>/<D>/<safe_doi>.<ext> 静态直链。
date 缺失(unknown 目录)/ doi 缺失返空串 LLM 拿到空串就知道没法直链下,
改走 get_paper / paper_pdf_view
"""
if obj.publication_date is None or not obj.doi:
return ""
safe_doi = obj.doi.replace("/", "_")
d = obj.publication_date
path = f"/media/papers/{d.year}/{d.month}/{d.day}/{safe_doi}.{ext}"
request = self.context.get("request")
return request.build_absolute_uri(path) if request else path
def get_pdf_url(self, obj) -> str:
return self._media_url(obj, "pdf") if obj.has_fulltext_pdf else ""
def get_xml_url(self, obj) -> str:
return self._media_url(obj, "xml") if obj.has_fulltext_xml else ""