From c5636b5131ea918bc7a504a7c3109dbce4c4a881 Mon Sep 17 00:00:00 2001 From: caoqianming Date: Sun, 21 Jun 2026 23:43:58 +0800 Subject: [PATCH] =?UTF-8?q?feat(resm):=20=E6=9C=9F=E5=88=8A/=E5=85=B3?= =?UTF-8?q?=E9=94=AE=E8=AF=8D=E7=9B=91=E6=8E=A7=20PaperMonitor=20+=20?= =?UTF-8?q?=E7=A7=BB=E9=99=A4=E6=AF=8F=E6=97=A5=E5=A2=9E=E9=87=8F=E5=91=A8?= =?UTF-8?q?=E6=9C=9F=E4=BB=BB=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 PaperMonitor model(type=journal/search/keyword、value、name、note、is_active、days、last_run、last_count)+ admin 管理 - 新增 monitor_papers 任务:遍历启用监控,journal→primary_location.source.issn / search→title_and_abstract / keyword→keywords.id,复用 _crawl_openalex_query 入库去重,每天 05:00 调度 - 迁移 0008 建表;0009 种子(8 本无机非金属材料期刊 + 5 英文方向词,note=无机非金属材料)并注册监控周期任务 - 移除 0007:update_paper_meta_from_openalex/elsevier 不再注册为每日周期任务(只需一次性回补,用 backfill_paper_meta_from_openalex);两任务函数保留供手动/回补调用 Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/resm/admin.py | 11 +++ .../0007_auto_update_index_periodic_tasks.py | 71 ---------------- apps/resm/migrations/0008_papermonitor.py | 40 +++++++++ .../0009_seed_monitors_and_schedule.py | 85 +++++++++++++++++++ apps/resm/models.py | 31 ++++++- apps/resm/tasks.py | 39 ++++++++- 6 files changed, 204 insertions(+), 73 deletions(-) delete mode 100644 apps/resm/migrations/0007_auto_update_index_periodic_tasks.py create mode 100644 apps/resm/migrations/0008_papermonitor.py create mode 100644 apps/resm/migrations/0009_seed_monitors_and_schedule.py diff --git a/apps/resm/admin.py b/apps/resm/admin.py index 8c38f3f..8e53541 100644 --- a/apps/resm/admin.py +++ b/apps/resm/admin.py @@ -1,3 +1,14 @@ from django.contrib import admin +from apps.resm.models import PaperMonitor # Register your models here. 
+ + +@admin.register(PaperMonitor) +class PaperMonitorAdmin(admin.ModelAdmin): + list_display = ("type", "name", "value", "note", "is_active", + "days", "last_run", "last_count") + list_filter = ("type", "is_active", "note") + search_fields = ("name", "value", "note") + list_editable = ("is_active", "days") + ordering = ("type", "name") diff --git a/apps/resm/migrations/0007_auto_update_index_periodic_tasks.py b/apps/resm/migrations/0007_auto_update_index_periodic_tasks.py deleted file mode 100644 index 53819d9..0000000 --- a/apps/resm/migrations/0007_auto_update_index_periodic_tasks.py +++ /dev/null @@ -1,71 +0,0 @@ -"""注册“自动更新论文索引”的周期任务(django-celery-beat,DB 调度)。 - -本项目用 DatabaseScheduler,周期任务存在 DB 里。这里用数据迁移幂等地建两条 -每天跑一次的 PeriodicTask: - - apps.resm.tasks.update_paper_meta_from_openalex (03:00,主力增量源) - - apps.resm.tasks.update_paper_meta_from_elsevier (04:00,ScienceDirect 补充) -错开整点,避免同时打两个外部 API。update_or_create 保证迁移可安全重跑。 -""" -import json -from django.db import migrations - -OPENALEX_TASK = "apps.resm.tasks.update_paper_meta_from_openalex" -ELSEVIER_TASK = "apps.resm.tasks.update_paper_meta_from_elsevier" -OPENALEX_NAME = "resm: 自动增量更新论文索引 (OpenAlex)" -ELSEVIER_NAME = "resm: 自动增量更新论文索引 (Elsevier 补充)" - - -def _crontab(CrontabSchedule, hour): - schedule, _ = CrontabSchedule.objects.get_or_create( - minute="0", - hour=str(hour), - day_of_week="*", - day_of_month="*", - month_of_year="*", - ) - return schedule - - -def create_periodic_tasks(apps, schema_editor): - CrontabSchedule = apps.get_model("django_celery_beat", "CrontabSchedule") - PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask") - - PeriodicTask.objects.update_or_create( - name=OPENALEX_NAME, - defaults={ - "task": OPENALEX_TASK, - "crontab": _crontab(CrontabSchedule, 3), - "interval": None, - "kwargs": json.dumps({"days": 30}), - "enabled": True, - "description": "每天用 from_publication_date 拉取最近 30 天发表的论文,保持 resm_paper 索引更新(from_created_date 需 Premium)", - }, - ) - 
PeriodicTask.objects.update_or_create( - name=ELSEVIER_NAME, - defaults={ - "task": ELSEVIER_TASK, - "crontab": _crontab(CrontabSchedule, 4), - "interval": None, - "kwargs": json.dumps({"days": 7}), - "enabled": True, - "description": "每天用 ScienceDirect Search 补充 Elsevier(10.1016)新刊,补 OpenAlex 收录延迟", - }, - ) - - -def remove_periodic_tasks(apps, schema_editor): - PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask") - PeriodicTask.objects.filter(name__in=[OPENALEX_NAME, ELSEVIER_NAME]).delete() - - -class Migration(migrations.Migration): - - dependencies = [ - ("resm", "0006_pg_trgm_index"), - ("django_celery_beat", "__latest__"), - ] - - operations = [ - migrations.RunPython(create_periodic_tasks, remove_periodic_tasks), - ] diff --git a/apps/resm/migrations/0008_papermonitor.py b/apps/resm/migrations/0008_papermonitor.py new file mode 100644 index 0000000..34d0840 --- /dev/null +++ b/apps/resm/migrations/0008_papermonitor.py @@ -0,0 +1,40 @@ +# Generated by Django 4.2.27 on 2026-06-21 15:23 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ('resm', '0006_pg_trgm_index'), + ] + + operations = [ + migrations.CreateModel( + name='PaperMonitor', + fields=[ + ('id', models.CharField(editable=False, help_text='主键ID', max_length=20, primary_key=True, serialize=False, verbose_name='主键ID')), + ('create_time', models.DateTimeField(default=django.utils.timezone.now, help_text='创建时间', verbose_name='创建时间')), + ('update_time', models.DateTimeField(auto_now=True, help_text='修改时间', verbose_name='修改时间')), + ('is_deleted', models.BooleanField(default=False, help_text='删除标记', verbose_name='删除标记')), + ('type', models.CharField(choices=[('journal', '期刊(ISSN)'), ('search', '搜索词(标题/摘要)'), ('keyword', 'OpenAlex关键词ID')], db_index=True, max_length=20, 
verbose_name='监控类型')),
+                ('value', models.CharField(max_length=500, verbose_name='监控值')),
+                ('name', models.CharField(blank=True, max_length=200, null=True, verbose_name='名称')),
+                ('note', models.CharField(blank=True, max_length=100, null=True, verbose_name='方向标注')),
+                ('is_active', models.BooleanField(db_index=True, default=True, verbose_name='启用')),
+                ('days', models.IntegerField(default=30, verbose_name='回看窗口(天)')),
+                ('last_run', models.DateTimeField(blank=True, null=True, verbose_name='上次运行时间')),
+                ('last_count', models.IntegerField(default=0, verbose_name='上次拉取篇数(窗口内)')),
+                ('create_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_create_by', to=settings.AUTH_USER_MODEL, verbose_name='创建人')),
+                ('update_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_update_by', to=settings.AUTH_USER_MODEL, verbose_name='最后编辑人')),
+            ],
+            options={
+                'verbose_name': '论文监控',
+                'verbose_name_plural': '论文监控',
+            },
+        ),
+    ]
diff --git a/apps/resm/migrations/0009_seed_monitors_and_schedule.py b/apps/resm/migrations/0009_seed_monitors_and_schedule.py
new file mode 100644
index 0000000..98e69d8
--- /dev/null
+++ b/apps/resm/migrations/0009_seed_monitors_and_schedule.py
@@ -0,0 +1,95 @@
+"""Seed journal / search-term monitors for inorganic non-metallic materials
+and register the daily monitoring periodic task.
+
+- 8 journals of the field, matched by ISSN.
+- 5 English search terms (the original note says Chinese terms return nothing
+  from OpenAlex); ``note`` tags the research direction.
+- Registers PeriodicTask apps.resm.tasks.monitor_papers, daily at 05:00.
+- Deletes the two daily PeriodicTask rows created by the removed migration
+  0007, so databases that already applied it stop running them.
+get_or_create / update_or_create keep the migration safe to re-run.
+"""
+from django.db import migrations
+from apps.utils.snowflake import idWorker
+
+NOTE = "无机非金属材料"
+
+JOURNALS = [
+    ("0272-8842", "Ceramics International"),
+    ("0955-2219", "Journal of the European Ceramic Society"),
+    ("0008-8846", "Cement and Concrete Research"),
+    ("0958-9465", "Cement and Concrete Composites"),
+    ("0950-0618", "Construction and Building Materials"),
+    ("0022-3093", "Journal of Non-Crystalline Solids"),
+    ("0002-7820", "Journal of the American Ceramic Society"),
+    ("0022-2461", "Journal of Materials Science"),
+]
+
+SEARCHES = [
+    ("ceramics material", "陶瓷材料"),
+    ("glass material", "玻璃材料"),
+    ("cement", "水泥"),
+    ("refractory material", "耐火材料"),
+    ("crystalline material", "晶体材料"),
+]
+
+MONITOR_TASK = "apps.resm.tasks.monitor_papers"
+MONITOR_NAME = "resm: 论文监控(期刊/关键词)"
+
+# PeriodicTask names that the removed migration 0007 registered.
+LEGACY_TASK_NAMES = [
+    "resm: 自动增量更新论文索引 (OpenAlex)",
+    "resm: 自动增量更新论文索引 (Elsevier 补充)",
+]
+
+
+def seed(apps, schema_editor):
+    """Create the seed monitors, schedule monitor_papers, drop the 0007 tasks."""
+    PaperMonitor = apps.get_model("resm", "PaperMonitor")
+    rows = [("journal", issn, name) for issn, name in JOURNALS]
+    rows += [("search", term, name) for term, name in SEARCHES]
+    for monitor_type, value, name in rows:
+        PaperMonitor.objects.get_or_create(
+            type=monitor_type, value=value,
+            defaults={"id": idWorker.get_id(), "name": name, "note": NOTE,
+                      "is_active": True, "days": 7},
+        )
+
+    CrontabSchedule = apps.get_model("django_celery_beat", "CrontabSchedule")
+    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")
+    # The 0007 file is gone, so its reverse can never run; clean up its rows here.
+    PeriodicTask.objects.filter(name__in=LEGACY_TASK_NAMES).delete()
+    sched, _ = CrontabSchedule.objects.get_or_create(
+        minute="0", hour="5", day_of_week="*", day_of_month="*", month_of_year="*",
+    )
+    PeriodicTask.objects.update_or_create(
+        name=MONITOR_NAME,
+        defaults={
+            "task": MONITOR_TASK,
+            "crontab": sched,
+            "interval": None,
+            "enabled": True,
+            "description": "每天 05:00 拉取监控期刊/关键词的最新论文元数据(OpenAlex,跨出版商)",
+        },
+    )
+
+
+def unseed(apps, schema_editor):
+    """Remove the seed monitors and the monitor task (0007 tasks are not restored)."""
+    PaperMonitor = apps.get_model("resm", "PaperMonitor")
+    PaperMonitor.objects.filter(type="journal", value__in=[i for i, _ in JOURNALS]).delete()
+    PaperMonitor.objects.filter(type="search", value__in=[t for t, _ in SEARCHES]).delete()
+    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")
+    PeriodicTask.objects.filter(name=MONITOR_NAME).delete()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("resm", "0008_papermonitor"),
+        ("django_celery_beat", "__latest__"),
+    ]
+
+    operations = [
+        migrations.RunPython(seed, unseed),
+    ]
diff --git
a/apps/resm/models.py b/apps/resm/models.py
index 0cd3bb2..e294066 100644
--- a/apps/resm/models.py
+++ b/apps/resm/models.py
@@ -1,5 +1,5 @@
 from django.db import models
-from apps.utils.models import BaseModel
+from apps.utils.models import BaseModel, CommonAModel
 from django.conf import settings
 import os
 # Create your models here.
@@ -107,3 +107,39 @@ class PaperAbstract(BaseModel):
         max_length=20,
         verbose_name="摘要来源"  # openalex / elsevier / crossref
     )
+
+
+class PaperMonitor(CommonAModel):
+    """Paper-monitor subscription (journal, search term or OpenAlex keyword).
+
+    The monitoring task iterates the active rows, builds an OpenAlex filter
+    according to ``type`` and uses from_publication_date to pull metadata for
+    papers published in the last ``days`` days (via the shared crawl core,
+    which de-duplicates).  Journal and keyword monitors share this table and
+    are told apart by ``type``.
+    """
+    # Allowed values of ``type``; get_type_display() returns the paired label.
+    TYPE_JOURNAL = "journal"
+    TYPE_SEARCH = "search"
+    TYPE_KEYWORD = "keyword"
+    TYPE_CHOICES = (
+        (TYPE_JOURNAL, "期刊(ISSN)"),
+        (TYPE_SEARCH, "搜索词(标题/摘要)"),
+        (TYPE_KEYWORD, "OpenAlex关键词ID"),
+    )
+    type = models.CharField("监控类型", max_length=20, choices=TYPE_CHOICES, db_index=True)
+    value = models.CharField("监控值", max_length=500)  # ISSN / search term / keyword id
+    name = models.CharField("名称", max_length=200, null=True, blank=True)
+    note = models.CharField("方向标注", max_length=100, null=True, blank=True)  # research direction, e.g. 无机非金属材料
+    is_active = models.BooleanField("启用", default=True, db_index=True)
+    days = models.IntegerField("回看窗口(天)", default=30)  # look-back window in days
+    last_run = models.DateTimeField("上次运行时间", null=True, blank=True)
+    last_count = models.IntegerField("上次拉取篇数(窗口内)", default=0)
+
+    class Meta:
+        verbose_name = "论文监控"
+        verbose_name_plural = verbose_name
+
+    def __str__(self):
+        """Return "<type label>:<name, or value when name is empty>"."""
+        return f"{self.get_type_display()}:{self.name or self.value}"
diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py
index 9748b41..3d91283 100644
--- a/apps/resm/tasks.py
+++ b/apps/resm/tasks.py
@@ -3,7 +3,7 @@ from __future__ import absolute_import, unicode_literals
 from apps.utils.tasks import CustomTask
 from celery import shared_task
 from pyalex import Works, config
-from apps.resm.models import Paper, PaperAbstract
+from apps.resm.models import Paper, PaperAbstract, PaperMonitor
 from
apps.utils.snowflake import idWorker
 from django.core.cache import cache
 import requests
@@ -299,6 +299,77 @@ def backfill_paper_meta_from_openalex(from_publication_date: str, to_publication
     return f"combo {combo_index}/{len(combos)} done ({kw!r},{search!r}) -> next"
 
 
+@shared_task(base=CustomTask)
+def monitor_papers(monitor_id: str = None):
+    """Pull recent paper metadata for every active PaperMonitor.
+
+    Each monitor becomes an OpenAlex ``Works`` query limited to articles that
+    have a DOI and were published within the monitor's look-back window:
+
+    - journal: ``value`` is an ISSN, matched on primary_location.source.issn
+    - search : ``value`` is a search term passed to _apply_keyword_search
+    - keyword: ``value`` is an OpenAlex keyword id passed to
+      _apply_keyword_search
+
+    The query is crawled with _crawl_openalex_query and its return value is
+    stored in ``last_count``.  A monitor that raises is logged and skipped so
+    the remaining monitors still run; the first error is re-raised once the
+    loop has finished so the task is still reported as failed.
+
+    Args:
+        monitor_id: When given, only the monitor with this id is processed.
+
+    Returns:
+        "; "-joined ``<type>:<name or value>=<count>`` entries, or
+        "no active monitors" when no monitor was processed.
+
+    Raises:
+        Exception: The first error raised while crawling a monitor.
+    """
+    import logging
+
+    logger = logging.getLogger(__name__)
+    # is_deleted is the soft-delete flag; exclude such rows explicitly in case the default manager does not.
+    qs = PaperMonitor.objects.filter(is_active=True, is_deleted=False)
+    if monitor_id:
+        qs = qs.filter(id=monitor_id)
+    results = []
+    first_error = None
+    for m in qs:
+        # A zero, negative or missing window falls back to 30 days; a negative
+        # value would otherwise put the start date in the future.
+        days = m.days if m.days and m.days > 0 else 30
+        from_pub = (timezone.now() - timedelta(days=days)).date().isoformat()
+        base = Works().filter(has_doi=True, type="article", from_publication_date=from_pub)
+        kw, search = "", ""
+        if m.type == PaperMonitor.TYPE_JOURNAL:
+            base = base.filter(primary_location={"source": {"issn": m.value}})
+        elif m.type == PaperMonitor.TYPE_SEARCH:
+            search = m.value
+            base = _apply_keyword_search(base, "", search)
+        elif m.type == PaperMonitor.TYPE_KEYWORD:
+            kw = m.value
+            base = _apply_keyword_search(base, kw, "")
+        else:
+            continue
+        label = f"{m.type}:{m.name or m.value}"
+        try:
+            seen = _crawl_openalex_query(base, kw, search)
+        except Exception as exc:
+            # Keep going so one failing monitor does not starve the others.
+            logger.exception("monitor_papers: %s failed", label)
+            if first_error is None:
+                first_error = exc
+            continue
+        m.last_run = timezone.now()
+        m.last_count = seen
+        m.save(update_fields=["last_run", "last_count", "update_time"])
+        results.append(f"{label}={seen}")
+    if first_error is not None:
+        raise first_error
+    return "; ".join(results) or "no active monitors"
+
+
 SCIENCEDIRECT_SEARCH_URL = "https://api.elsevier.com/content/search/sciencedirect"