feat(resm): 期刊/关键词监控 PaperMonitor + 移除每日增量周期任务
- 新增 PaperMonitor model(type=journal/search/keyword、value、name、note、is_active、days、last_run、last_count)+ admin 管理 - 新增 monitor_papers 任务:遍历启用监控,journal→primary_location.source.issn / search→title_and_abstract / keyword→keywords.id,复用 _crawl_openalex_query 入库去重,每天 05:00 调度 - 迁移 0008 建表;0009 种子(8 本无机非金属材料期刊 + 5 英文方向词,note=无机非金属材料)并注册监控周期任务 - 移除 0007:update_paper_meta_from_openalex/elsevier 不再注册为每日周期任务(只需一次性回补,用 backfill_paper_meta_from_openalex);两任务函数保留供手动/回补调用 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7b38d4d234
commit
c5636b5131
|
|
@ -1,3 +1,14 @@
|
|||
from django.contrib import admin
|
||||
from apps.resm.models import PaperMonitor
|
||||
|
||||
# Register your models here.
|
||||
|
||||
|
||||
@admin.register(PaperMonitor)
class PaperMonitorAdmin(admin.ModelAdmin):
    """Admin configuration for PaperMonitor subscriptions."""

    # Columns shown in the change list.
    list_display = ("type", "name", "value", "note", "is_active",
                    "days", "last_run", "last_count")
    # Sidebar filters.
    list_filter = ("type", "is_active", "note")
    # Fields matched by the admin search box.
    search_fields = ("name", "value", "note")
    # Columns editable directly in the change list.
    list_editable = ("is_active", "days")
    # Default ordering of the change list.
    ordering = ("type", "name")
|
||||
|
|
|
|||
|
|
@ -1,71 +0,0 @@
|
|||
"""注册“自动更新论文索引”的周期任务(django-celery-beat,DB 调度)。
|
||||
|
||||
本项目用 DatabaseScheduler,周期任务存在 DB 里。这里用数据迁移幂等地建两条
|
||||
每天跑一次的 PeriodicTask:
|
||||
- apps.resm.tasks.update_paper_meta_from_openalex (03:00,主力增量源)
|
||||
- apps.resm.tasks.update_paper_meta_from_elsevier (04:00,ScienceDirect 补充)
|
||||
错开整点,避免同时打两个外部 API。update_or_create 保证迁移可安全重跑。
|
||||
"""
|
||||
import json
|
||||
from django.db import migrations
|
||||
|
||||
# Dotted paths of the Celery tasks scheduled by this migration.
OPENALEX_TASK = "apps.resm.tasks.update_paper_meta_from_openalex"
ELSEVIER_TASK = "apps.resm.tasks.update_paper_meta_from_elsevier"
# PeriodicTask names; used as the lookup key both when creating and when
# removing the rows.
OPENALEX_NAME = "resm: 自动增量更新论文索引 (OpenAlex)"
ELSEVIER_NAME = "resm: 自动增量更新论文索引 (Elsevier 补充)"
|
||||
|
||||
|
||||
def _crontab(CrontabSchedule, hour):
    """Return the crontab row for "every day at ``hour``:00", creating it if absent.

    Args:
        CrontabSchedule: the (historical) CrontabSchedule model class.
        hour: hour of day; stored as its ``str()`` form.

    Returns:
        The matching or newly created CrontabSchedule instance.
    """
    daily_at_hour = {
        "minute": "0",
        "hour": str(hour),
        "day_of_week": "*",
        "day_of_month": "*",
        "month_of_year": "*",
    }
    # get_or_create returns (instance, created); only the instance is needed.
    return CrontabSchedule.objects.get_or_create(**daily_at_hour)[0]
|
||||
|
||||
|
||||
def create_periodic_tasks(apps, schema_editor):
    """Create or update the two daily PeriodicTask rows.

    The OpenAlex task is scheduled at 03:00 and the Elsevier task at 04:00.
    Rows are keyed by name through update_or_create, so running this again
    updates the existing rows instead of duplicating them.
    """
    CrontabSchedule = apps.get_model("django_celery_beat", "CrontabSchedule")
    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")

    # (row name, task path, hour of day, task kwargs, description)
    specs = (
        (
            OPENALEX_NAME,
            OPENALEX_TASK,
            3,
            {"days": 30},
            "每天用 from_publication_date 拉取最近 30 天发表的论文,保持 resm_paper 索引更新(from_created_date 需 Premium)",
        ),
        (
            ELSEVIER_NAME,
            ELSEVIER_TASK,
            4,
            {"days": 7},
            "每天用 ScienceDirect Search 补充 Elsevier(10.1016)新刊,补 OpenAlex 收录延迟",
        ),
    )
    for row_name, task_path, hour, task_kwargs, description in specs:
        PeriodicTask.objects.update_or_create(
            name=row_name,
            defaults={
                "task": task_path,
                "crontab": _crontab(CrontabSchedule, hour),
                "interval": None,
                "kwargs": json.dumps(task_kwargs),
                "enabled": True,
                "description": description,
            },
        )
|
||||
|
||||
|
||||
def remove_periodic_tasks(apps, schema_editor):
    """Reverse step: delete the two PeriodicTask rows, matched by name."""
    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")
    registered_names = [OPENALEX_NAME, ELSEVIER_NAME]
    registered_rows = PeriodicTask.objects.filter(name__in=registered_names)
    registered_rows.delete()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Data-only migration: no schema operations, just one RunPython step
    # with a reverse function.

    dependencies = [
        ("resm", "0006_pg_trgm_index"),
        # The django_celery_beat models must exist before rows are written.
        ("django_celery_beat", "__latest__"),
    ]

    operations = [
        migrations.RunPython(create_periodic_tasks, remove_periodic_tasks),
    ]
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
# Generated by Django 4.2.27 on 2026-06-21 15:23
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Schema migration that creates the PaperMonitor model.

    dependencies = [
        # Needed for the create_by / update_by foreign keys to the user model.
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('resm', '0006_pg_trgm_index'),
    ]

    operations = [
        migrations.CreateModel(
            name='PaperMonitor',
            fields=[
                # id / create_time / update_time / is_deleted are not declared on
                # PaperMonitor in models.py; presumably inherited from its
                # CommonAModel base - confirm.
                ('id', models.CharField(editable=False, help_text='主键ID', max_length=20, primary_key=True, serialize=False, verbose_name='主键ID')),
                ('create_time', models.DateTimeField(default=django.utils.timezone.now, help_text='创建时间', verbose_name='创建时间')),
                ('update_time', models.DateTimeField(auto_now=True, help_text='修改时间', verbose_name='修改时间')),
                ('is_deleted', models.BooleanField(default=False, help_text='删除标记', verbose_name='删除标记')),
                # Monitor definition: what to watch and how far to look back.
                ('type', models.CharField(choices=[('journal', '期刊(ISSN)'), ('search', '搜索词(标题/摘要)'), ('keyword', 'OpenAlex关键词ID')], db_index=True, max_length=20, verbose_name='监控类型')),
                ('value', models.CharField(max_length=500, verbose_name='监控值')),
                ('name', models.CharField(blank=True, max_length=200, null=True, verbose_name='名称')),
                ('note', models.CharField(blank=True, max_length=100, null=True, verbose_name='方向标注')),
                ('is_active', models.BooleanField(db_index=True, default=True, verbose_name='启用')),
                ('days', models.IntegerField(default=30, verbose_name='回看窗口(天)')),
                # Bookkeeping for the most recent run.
                ('last_run', models.DateTimeField(blank=True, null=True, verbose_name='上次运行时间')),
                ('last_count', models.IntegerField(default=0, verbose_name='上次拉取篇数(窗口内)')),
                # Audit foreign keys; set to NULL when the referenced user is deleted.
                ('create_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_create_by', to=settings.AUTH_USER_MODEL, verbose_name='创建人')),
                ('update_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_update_by', to=settings.AUTH_USER_MODEL, verbose_name='最后编辑人')),
            ],
            options={
                'verbose_name': '论文监控',
                'verbose_name_plural': '论文监控',
            },
        ),
    ]
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
"""种子数据:无机非金属材料方向的期刊 / 关键词监控,并注册每天的监控周期任务。
|
||||
|
||||
- 8 本方向期刊(按 ISSN,OpenAlex 跨出版商都能抓)
|
||||
- 若干英文搜索词(OpenAlex 语料是英文,中文词搜不到,故用英文;note 标注方向)
|
||||
- 注册 PeriodicTask: apps.resm.tasks.monitor_papers,每天 05:00 跑一次
|
||||
get_or_create / update_or_create 保证迁移可安全重跑。
|
||||
"""
|
||||
from django.db import migrations
|
||||
from apps.utils.snowflake import idWorker
|
||||
|
||||
# Value written to the `note` column of every seeded monitor.
NOTE = "无机非金属材料"

# (ISSN, journal name) pairs seeded as type="journal" monitors.
JOURNALS = [
    ("0272-8842", "Ceramics International"),
    ("0955-2219", "Journal of the European Ceramic Society"),
    ("0008-8846", "Cement and Concrete Research"),
    ("0958-9465", "Cement and Concrete Composites"),
    ("0950-0618", "Construction and Building Materials"),
    ("0022-3093", "Journal of Non-Crystalline Solids"),
    ("0002-7820", "Journal of the American Ceramic Society"),
    ("0022-2461", "Journal of Materials Science"),
]

# (English search term, display name) pairs seeded as type="search" monitors.
SEARCHES = [
    ("ceramics material", "陶瓷材料"),
    ("glass material", "玻璃材料"),
    ("cement", "水泥"),
    ("refractory material", "耐火材料"),
    ("crystalline material", "晶体材料"),
]

# Task path and PeriodicTask name of the monitoring job; the name is the
# lookup key for both creating and removing the row.
MONITOR_TASK = "apps.resm.tasks.monitor_papers"
MONITOR_NAME = "resm: 论文监控(期刊/关键词)"
|
||||
|
||||
|
||||
def seed(apps, schema_editor):
    """Seed the default monitors and register the daily monitoring task.

    Steps, each safe to run again:

    1. get_or_create one PaperMonitor per entry of JOURNALS (type="journal")
       and SEARCHES (type="search"), with note=NOTE and a 7-day window.
    2. Delete the two daily incremental-update PeriodicTask rows that the
       removed migration 0007 created.
    3. update_or_create the PeriodicTask that runs MONITOR_TASK every day
       at 05:00.

    Args:
        apps: historical app registry supplied by RunPython.
        schema_editor: unused; required by the RunPython signature.
    """
    PaperMonitor = apps.get_model("resm", "PaperMonitor")
    seeds = [("journal", issn, name) for issn, name in JOURNALS]
    seeds += [("search", term, name) for term, name in SEARCHES]
    for monitor_type, value, name in seeds:
        PaperMonitor.objects.get_or_create(
            type=monitor_type,
            value=value,
            defaults={
                "id": idWorker.get_id(),
                "name": name,
                "note": NOTE,
                "is_active": True,
                "days": 7,
            },
        )

    CrontabSchedule = apps.get_model("django_celery_beat", "CrontabSchedule")
    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")

    # Deleting the 0007 migration file does not undo it: databases that had
    # already applied it still hold these two rows, and beat would keep
    # running the daily incremental tasks. Remove them by the exact names
    # 0007 used. This is a no-op on databases that never applied 0007.
    legacy_task_names = (
        "resm: 自动增量更新论文索引 (OpenAlex)",
        "resm: 自动增量更新论文索引 (Elsevier 补充)",
    )
    PeriodicTask.objects.filter(name__in=legacy_task_names).delete()

    # NOTE(review): historical models presumably do not fire
    # django-celery-beat's change signals, so an already-running beat may
    # need a restart to pick up these changes - confirm.
    sched, _ = CrontabSchedule.objects.get_or_create(
        minute="0", hour="5", day_of_week="*", day_of_month="*", month_of_year="*",
    )
    PeriodicTask.objects.update_or_create(
        name=MONITOR_NAME,
        defaults={
            "task": MONITOR_TASK,
            "crontab": sched,
            "interval": None,
            "enabled": True,
            "description": "每天 05:00 拉取监控期刊/关键词的最新论文元数据(OpenAlex,跨出版商)",
        },
    )
|
||||
|
||||
|
||||
def unseed(apps, schema_editor):
    """Reverse of seed: delete the seeded monitors and the monitoring PeriodicTask.

    Monitors are matched by type plus the seeded ISSNs / search terms; the
    PeriodicTask is matched by MONITOR_NAME.
    """
    PaperMonitor = apps.get_model("resm", "PaperMonitor")

    seeded_issns = [issn for issn, _name in JOURNALS]
    PaperMonitor.objects.filter(type="journal", value__in=seeded_issns).delete()

    seeded_terms = [term for term, _name in SEARCHES]
    PaperMonitor.objects.filter(type="search", value__in=seeded_terms).delete()

    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")
    monitor_task_rows = PeriodicTask.objects.filter(name=MONITOR_NAME)
    monitor_task_rows.delete()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Data-only migration: one RunPython step (seed) with its reverse (unseed).

    dependencies = [
        # The PaperMonitor table must exist before rows are seeded.
        ("resm", "0008_papermonitor"),
        # The django_celery_beat models must exist before rows are written.
        ("django_celery_beat", "__latest__"),
    ]

    operations = [
        migrations.RunPython(seed, unseed),
    ]
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
from django.db import models
|
||||
from apps.utils.models import BaseModel
|
||||
from apps.utils.models import BaseModel, CommonAModel
|
||||
from django.conf import settings
|
||||
import os
|
||||
# Create your models here.
|
||||
|
|
@ -107,3 +107,32 @@ class PaperAbstract(BaseModel):
|
|||
max_length=20,
|
||||
verbose_name="摘要来源" # openalex / elsevier / crossref
|
||||
)
|
||||
|
||||
|
||||
class PaperMonitor(CommonAModel):
    """Paper monitoring subscription.

    The monitoring task walks the active rows, builds an OpenAlex filter
    according to ``type``, and uses from_publication_date to pull metadata for
    papers published in the last ``days`` days into the database (through the
    shared crawl core, which de-duplicates). Journal monitors and keyword
    monitors share this table and are told apart by ``type``.
    """

    TYPE_JOURNAL = "journal"
    TYPE_SEARCH = "search"
    TYPE_KEYWORD = "keyword"
    TYPE_CHOICES = (
        (TYPE_JOURNAL, "期刊(ISSN)"),
        (TYPE_SEARCH, "搜索词(标题/摘要)"),
        (TYPE_KEYWORD, "OpenAlex关键词ID"),
    )

    type = models.CharField(
        verbose_name="监控类型",
        max_length=20,
        choices=TYPE_CHOICES,
        db_index=True,
    )
    # ISSN, search term, or OpenAlex keyword id, depending on `type`.
    value = models.CharField(verbose_name="监控值", max_length=500)
    name = models.CharField(verbose_name="名称", max_length=200, null=True, blank=True)
    # Research-direction label, e.g. "无机非金属材料".
    note = models.CharField(verbose_name="方向标注", max_length=100, null=True, blank=True)
    is_active = models.BooleanField(verbose_name="启用", default=True, db_index=True)
    days = models.IntegerField(verbose_name="回看窗口(天)", default=30)
    last_run = models.DateTimeField(verbose_name="上次运行时间", null=True, blank=True)
    last_count = models.IntegerField(verbose_name="上次拉取篇数(窗口内)", default=0)

    class Meta:
        verbose_name = "论文监控"
        verbose_name_plural = verbose_name

    def __str__(self):
        # Use the raw value when no display name is set.
        kind = self.get_type_display()
        label = self.name or self.value
        return f"{kind}:{label}"
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from __future__ import absolute_import, unicode_literals
|
|||
from apps.utils.tasks import CustomTask
|
||||
from celery import shared_task
|
||||
from pyalex import Works, config
|
||||
from apps.resm.models import Paper, PaperAbstract
|
||||
from apps.resm.models import Paper, PaperAbstract, PaperMonitor
|
||||
from apps.utils.snowflake import idWorker
|
||||
from django.core.cache import cache
|
||||
import requests
|
||||
|
|
@ -299,6 +299,43 @@ def backfill_paper_meta_from_openalex(from_publication_date: str, to_publication
|
|||
return f"combo {combo_index}/{len(combos)} done ({kw!r},{search!r}) -> next"
|
||||
|
||||
|
||||
@shared_task(base=CustomTask)
def monitor_papers(monitor_id: str = None):
    """Journal / keyword monitoring over the active PaperMonitor rows.

    For each active monitor, build an OpenAlex query limited to articles with
    a DOI published in the last ``days`` days (30 when ``days`` is falsy),
    narrow it by the monitor's type, crawl it, and record the run on the row.

    - journal: value is an ISSN, filtered on primary_location.source.issn;
      keyword and search term are passed on as empty strings.
    - search:  value is a search term, applied via _apply_keyword_search.
    - keyword: value is an OpenAlex keyword id, applied via
      _apply_keyword_search.

    Rows with any other type are skipped without being updated.

    Args:
        monitor_id: when given, only the active monitor with this id is run.

    Returns:
        "; "-joined "<type>:<name or value>=<count>" entries, one per
        processed monitor, or "no active monitors" if none was processed.
    """
    monitors = PaperMonitor.objects.filter(is_active=True)
    if monitor_id:
        monitors = monitors.filter(id=monitor_id)

    summary = []
    for monitor in monitors:
        window_days = monitor.days or 30
        since = (timezone.now() - timedelta(days=window_days)).date().isoformat()
        query = Works().filter(has_doi=True, type="article", from_publication_date=since)

        keyword_id, search_term = "", ""
        if monitor.type == PaperMonitor.TYPE_JOURNAL:
            query = query.filter(primary_location={"source": {"issn": monitor.value}})
        elif monitor.type in (PaperMonitor.TYPE_SEARCH, PaperMonitor.TYPE_KEYWORD):
            # Exactly one of the two is set; the other stays an empty string.
            if monitor.type == PaperMonitor.TYPE_SEARCH:
                search_term = monitor.value
            else:
                keyword_id = monitor.value
            query = _apply_keyword_search(query, keyword_id, search_term)
        else:
            continue

        fetched = _crawl_openalex_query(query, keyword_id, search_term)

        monitor.last_run = timezone.now()
        monitor.last_count = fetched
        monitor.save(update_fields=["last_run", "last_count", "update_time"])
        summary.append(f"{monitor.type}:{monitor.name or monitor.value}={fetched}")

    if not summary:
        return "no active monitors"
    return "; ".join(summary)
|
||||
|
||||
|
||||
SCIENCEDIRECT_SEARCH_URL = "https://api.elsevier.com/content/search/sciencedirect"
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue