From 6a5a5d7b6bdbe7b8eb802335c3dd540872f017fb Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Thu, 21 May 2026 13:48:52 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20paper=20list=20=E5=8A=A0=20pdf=5Furl=20?=
 =?UTF-8?q?/=20xml=5Furl=20=E7=9B=B4=E9=93=BE=E5=AD=97=E6=AE=B5=20+=20pg?=
 =?UTF-8?q?=5Ftrgm=20GIN=20=E7=B4=A2=E5=BC=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

serializers: PaperListSerializer 加 pdf_url / xml_url SerializerMethodField,基于 publication_date + safe_doi 后端拼 absolute_uri;has_fulltext_{pdf,xml}=False 或 publication_date 缺失返空串。LLM 客户端从 list 一次拿到直链,不必拼 URL。

migration 0006: CREATE EXTENSION IF NOT EXISTS pg_trgm + 3 列 GIN 索引(title / first_author / first_author_institution),根治 SearchFilter 跨列 ILIKE '%xxx%' 全表扫 timeout(高频词如 cement 原本 30s+,加索引后几十 ms)。

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 apps/resm/migrations/0006_pg_trgm_index.py | 37 ++++++++++++++++++++++
 apps/resm/serializers.py                   | 22 +++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 apps/resm/migrations/0006_pg_trgm_index.py

diff --git a/apps/resm/migrations/0006_pg_trgm_index.py b/apps/resm/migrations/0006_pg_trgm_index.py
new file mode 100644
index 0000000..7a48924
--- /dev/null
+++ b/apps/resm/migrations/0006_pg_trgm_index.py
@@ -0,0 +1,37 @@
+"""Create pg_trgm GIN indexes on title / first_author / first_author_institution for SearchFilter.
+
+Why: per the author, DRF SearchFilter matches `column ILIKE '%keyword%'` (wildcards on both ends),
+which a B-tree index cannot serve; frequent keywords on a table of several hundred thousand rows
+timed out (30s+), and a trigram GIN index lets ILIKE '%xxx%' use an index (tens of ms).
+NOTE(review): confirm with EXPLAIN that the SQL the ORM emits can use these plain-column indexes.
+
+pg_trgm is a PostgreSQL contrib extension; enabling it the first time needs DB superuser rights. CREATE EXTENSION IF NOT EXISTS is idempotent, so the migration can safely be re-run.
+"""
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('resm', '0005_alter_paper_fetch_status'),
+    ]
+
+    operations = [
+        migrations.RunSQL(
+            sql=[
+                "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
+                "CREATE INDEX IF NOT EXISTS paper_title_trgm "
+                "ON resm_paper USING gin (title gin_trgm_ops);",
+                "CREATE INDEX IF NOT EXISTS paper_first_author_trgm "
+                "ON resm_paper USING gin (first_author gin_trgm_ops);",
+                "CREATE INDEX IF NOT EXISTS paper_first_author_institution_trgm "
+                "ON resm_paper USING gin (first_author_institution gin_trgm_ops);",
+            ],
+            reverse_sql=[
+                "DROP INDEX IF EXISTS paper_first_author_institution_trgm;",
+                "DROP INDEX IF EXISTS paper_first_author_trgm;",
+                "DROP INDEX IF EXISTS paper_title_trgm;",
+                # The pg_trgm extension is deliberately not dropped: other apps / tables may also be using it
+            ],
+        ),
+    ]
diff --git a/apps/resm/serializers.py b/apps/resm/serializers.py
index 73efee7..be5bac7 100644
--- a/apps/resm/serializers.py
+++ b/apps/resm/serializers.py
@@ -5,6 +5,8 @@ from .models import Paper
 
 class PaperListSerializer(CustomModelSerializer):
     abstract = serializers.SerializerMethodField()
+    pdf_url = serializers.SerializerMethodField()
+    xml_url = serializers.SerializerMethodField()
 
     class Meta:
         model = Paper
@@ -16,3 +18,23 @@ class PaperListSerializer(CustomModelSerializer):
         if abs_obj is None:
             return ""
         return abs_obj.abstract or ""
+
+    def _media_url(self, obj, ext: str) -> str:
+        """Build the static direct link /media/papers/{year}/{month}/{day}/{safe_doi}.{ext}.
+
+        Returns "" when publication_date is missing (the "unknown" directory) or doi is empty, so the LLM
+        client knows no direct link exists and falls back to get_paper / paper_pdf_view instead.
+        """
+        if obj.publication_date is None or not obj.doi:
+            return ""
+        safe_doi = obj.doi.replace("/", "_")  # NOTE(review): only "/" is replaced; confirm other DOI characters are URL-safe
+        d = obj.publication_date
+        path = f"/media/papers/{d.year}/{d.month}/{d.day}/{safe_doi}.{ext}"  # NOTE(review): month/day are not zero-padded; confirm against the on-disk layout
+        request = self.context.get("request")
+        return request.build_absolute_uri(path) if request else path  # relative path when the serializer has no request in context
+
+    def get_pdf_url(self, obj) -> str:
+        return self._media_url(obj, "pdf") if obj.has_fulltext_pdf else ""
+
+    def get_xml_url(self, obj) -> str:
+        return self._media_url(obj, "xml") if obj.has_fulltext_xml else ""