feat(resm): 期刊/关键词监控 PaperMonitor + 移除每日增量周期任务

- 新增 PaperMonitor model(type=journal/search/keyword、value、name、note、is_active、days、last_run、last_count)+ admin 管理
- 新增 monitor_papers 任务:遍历启用监控,journal→primary_location.source.issn / search→title_and_abstract / keyword→keywords.id,复用 _crawl_openalex_query 入库去重,每天 05:00 调度
- 迁移 0008 建表;0009 种子(8 本无机非金属材料期刊 + 5 英文方向词,note=无机非金属材料)并注册监控周期任务
- 移除 0007:update_paper_meta_from_openalex/elsevier 不再注册为每日周期任务(只需一次性回补,用 backfill_paper_meta_from_openalex);两任务函数保留供手动/回补调用

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
caoqianming 2026-06-21 23:43:58 +08:00
parent 7b38d4d234
commit c5636b5131
6 changed files with 204 additions and 73 deletions

View File

@ -1,3 +1,14 @@
from django.contrib import admin from django.contrib import admin
from apps.resm.models import PaperMonitor
# Register your models here. # Register your models here.
@admin.register(PaperMonitor)
class PaperMonitorAdmin(admin.ModelAdmin):
    """Admin changelist configuration for PaperMonitor subscriptions."""

    # Columns of the changelist, in display order.
    list_display = (
        "type",
        "name",
        "value",
        "note",
        "is_active",
        "days",
        "last_run",
        "last_count",
    )
    # Columns that can be edited directly in the changelist.
    list_editable = (
        "is_active",
        "days",
    )
    # Sidebar filters and free-text search targets.
    list_filter = (
        "type",
        "is_active",
        "note",
    )
    search_fields = (
        "name",
        "value",
        "note",
    )
    # Default sort: by monitor type, then by display name.
    ordering = (
        "type",
        "name",
    )

View File

@ -1,71 +0,0 @@
"""注册“自动更新论文索引”的周期任务(django-celery-beat,DB 调度)。
本项目用 DatabaseScheduler,周期任务存在 DB 中;这里用数据迁移幂等地建两条
每天跑一次的 PeriodicTask:
- apps.resm.tasks.update_paper_meta_from_openalex (03:00,主力增量源)
- apps.resm.tasks.update_paper_meta_from_elsevier (04:00,ScienceDirect 补充)
错开整点,避免同时打两个外部 API。update_or_create 保证迁移可安全重跑。
"""
import json
from django.db import migrations
# Dotted task paths written to PeriodicTask.task by create_periodic_tasks.
OPENALEX_TASK = "apps.resm.tasks.update_paper_meta_from_openalex"
ELSEVIER_TASK = "apps.resm.tasks.update_paper_meta_from_elsevier"
# PeriodicTask names: the lookup key for update_or_create and for removal on reverse.
OPENALEX_NAME = "resm: 自动增量更新论文索引 (OpenAlex)"
ELSEVIER_NAME = "resm: 自动增量更新论文索引 (Elsevier 补充)"
def _crontab(CrontabSchedule, hour):
    """Return the crontab row firing at minute 0 of ``hour`` every day.

    The row is looked up with ``get_or_create`` on ``CrontabSchedule``, so an
    existing matching schedule is reused instead of duplicated.

    Args:
        CrontabSchedule: the django_celery_beat CrontabSchedule model class
            (the historical model when called from a migration).
        hour: hour of day; converted with ``str`` before the lookup.

    Returns:
        The matching (possibly newly created) schedule object.
    """
    spec = {
        "minute": "0",
        "hour": str(hour),
        "day_of_week": "*",
        "day_of_month": "*",
        "month_of_year": "*",
    }
    # get_or_create returns (object, created); only the object is needed.
    return CrontabSchedule.objects.get_or_create(**spec)[0]
def create_periodic_tasks(apps, schema_editor):
    """Forward step: create or refresh the two daily index-update PeriodicTasks.

    Each task is keyed by its name via ``update_or_create``, so re-running the
    migration overwrites the stored settings rather than adding duplicates.

    Args:
        apps: historical app registry supplied by RunPython.
        schema_editor: supplied by RunPython; not used here.
    """
    crontab_model = apps.get_model("django_celery_beat", "CrontabSchedule")
    task_model = apps.get_model("django_celery_beat", "PeriodicTask")

    # (name, task path, hour of day, task kwargs, description)
    specs = (
        (
            OPENALEX_NAME,
            OPENALEX_TASK,
            3,
            {"days": 30},
            "每天用 from_publication_date 拉取最近 30 天发表的论文,保持 resm_paper 索引更新(from_created_date 需 Premium)",
        ),
        (
            ELSEVIER_NAME,
            ELSEVIER_TASK,
            4,
            {"days": 7},
            "每天用 ScienceDirect Search 补充 Elsevier(10.1016)新刊,补 OpenAlex 收录延迟",
        ),
    )
    for task_name, task_path, hour, task_kwargs, description in specs:
        task_model.objects.update_or_create(
            name=task_name,
            defaults={
                "task": task_path,
                "crontab": _crontab(crontab_model, hour),
                "interval": None,
                "kwargs": json.dumps(task_kwargs),
                "enabled": True,
                "description": description,
            },
        )
def remove_periodic_tasks(apps, schema_editor):
    """Reverse step: delete the two PeriodicTasks created by the forward step.

    Args:
        apps: historical app registry supplied by RunPython.
        schema_editor: supplied by RunPython; not used here.
    """
    task_model = apps.get_model("django_celery_beat", "PeriodicTask")
    task_names = [OPENALEX_NAME, ELSEVIER_NAME]
    task_model.objects.filter(name__in=task_names).delete()
class Migration(migrations.Migration):
    # Data-only migration: it runs Python against existing tables and creates no schema.
    dependencies = [
        ("resm", "0006_pg_trgm_index"),
        # The django_celery_beat tables must exist before rows are written to them.
        ("django_celery_beat", "__latest__"),
    ]
    operations = [
        # Forward creates the periodic tasks; reverse deletes them by name.
        migrations.RunPython(create_periodic_tasks, remove_periodic_tasks),
    ]

View File

@ -0,0 +1,40 @@
# Generated by Django 4.2.27 on 2026-06-21 15:23
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
    # Schema migration that creates the PaperMonitor table.
    dependencies = [
        # Needed for the create_by / update_by foreign keys to the user model.
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('resm', '0006_pg_trgm_index'),
    ]
    operations = [
        migrations.CreateModel(
            name='PaperMonitor',
            fields=[
                # String primary key (max 20 chars), not an auto-increment integer.
                ('id', models.CharField(editable=False, help_text='主键ID', max_length=20, primary_key=True, serialize=False, verbose_name='主键ID')),
                ('create_time', models.DateTimeField(default=django.utils.timezone.now, help_text='创建时间', verbose_name='创建时间')),
                ('update_time', models.DateTimeField(auto_now=True, help_text='修改时间', verbose_name='修改时间')),
                ('is_deleted', models.BooleanField(default=False, help_text='删除标记', verbose_name='删除标记')),
                # Monitor kind: one of journal / search / keyword.
                ('type', models.CharField(choices=[('journal', '期刊(ISSN)'), ('search', '搜索词(标题/摘要)'), ('keyword', 'OpenAlex关键词ID')], db_index=True, max_length=20, verbose_name='监控类型')),
                ('value', models.CharField(max_length=500, verbose_name='监控值')),
                ('name', models.CharField(blank=True, max_length=200, null=True, verbose_name='名称')),
                ('note', models.CharField(blank=True, max_length=100, null=True, verbose_name='方向标注')),
                ('is_active', models.BooleanField(db_index=True, default=True, verbose_name='启用')),
                # Look-back window in days; defaults to 30.
                ('days', models.IntegerField(default=30, verbose_name='回看窗口(天)')),
                ('last_run', models.DateTimeField(blank=True, null=True, verbose_name='上次运行时间')),
                ('last_count', models.IntegerField(default=0, verbose_name='上次拉取篇数(窗口内)')),
                ('create_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_create_by', to=settings.AUTH_USER_MODEL, verbose_name='创建人')),
                ('update_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_update_by', to=settings.AUTH_USER_MODEL, verbose_name='最后编辑人')),
            ],
            options={
                'verbose_name': '论文监控',
                'verbose_name_plural': '论文监控',
            },
        ),
    ]

View File

@ -0,0 +1,85 @@
"""种子数据:无机非金属材料方向的期刊 / 关键词监控,并注册每周的监控周期任务。
- 8 本方向期刊(按 ISSN 监控,OpenAlex 跨出版商都能抓)
- 若干英文搜索词(OpenAlex 语料是英文,中文词搜不到,故用英文;note 标注方向)
- 注册 PeriodicTask: apps.resm.tasks.monitor_papers,每天 05:00 跑一次
get_or_create / update_or_create 保证迁移可安全重跑
"""
from django.db import migrations
from apps.utils.snowflake import idWorker
# Value stored in the `note` field of every seeded monitor.
NOTE = "无机非金属材料"
# (ISSN, journal name) pairs seeded as type="journal" monitors.
JOURNALS = [
    ("0272-8842", "Ceramics International"),
    ("0955-2219", "Journal of the European Ceramic Society"),
    ("0008-8846", "Cement and Concrete Research"),
    ("0958-9465", "Cement and Concrete Composites"),
    ("0950-0618", "Construction and Building Materials"),
    ("0022-3093", "Journal of Non-Crystalline Solids"),
    ("0002-7820", "Journal of the American Ceramic Society"),
    ("0022-2461", "Journal of Materials Science"),
]
# (English search term, display name) pairs seeded as type="search" monitors.
SEARCHES = [
    ("ceramics material", "陶瓷材料"),
    ("glass material", "玻璃材料"),
    ("cement", "水泥"),
    ("refractory material", "耐火材料"),
    ("crystalline material", "晶体材料"),
]
# Dotted task path written to PeriodicTask.task.
MONITOR_TASK = "apps.resm.tasks.monitor_papers"
# PeriodicTask name: the lookup key for update_or_create and for removal on reverse.
MONITOR_NAME = "resm: 论文监控(期刊/关键词)"
def seed(apps, schema_editor):
    """Forward step: seed the default monitors and register the daily monitor task.

    Steps, all safe to re-run:

    1. Create one PaperMonitor per entry in JOURNALS (type="journal") and
       SEARCHES (type="search"), keyed by (type, value). Existing rows are
       left untouched.
    2. Delete the two daily incremental-update PeriodicTasks that the removed
       migration 0007 used to register. Deleting that migration file does not
       remove rows it already wrote, so databases that had applied it would
       otherwise keep running those tasks every day.
    3. Create or refresh the PeriodicTask that runs MONITOR_TASK daily at 05:00.

    The reverse step does not recreate the tasks deleted in step 2.

    Args:
        apps: historical app registry supplied by RunPython.
        schema_editor: supplied by RunPython; not used here.
    """
    PaperMonitor = apps.get_model("resm", "PaperMonitor")
    seeds = [("journal", issn, name) for issn, name in JOURNALS]
    seeds += [("search", term, name) for term, name in SEARCHES]
    for kind, value, name in seeds:
        PaperMonitor.objects.get_or_create(
            type=kind,
            value=value,
            defaults={
                # The primary key is a string id, so it must be supplied explicitly.
                "id": idWorker.get_id(),
                "name": name,
                "note": NOTE,
                "is_active": True,
                "days": 7,
            },
        )

    CrontabSchedule = apps.get_model("django_celery_beat", "CrontabSchedule")
    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")

    # Names used by the removed migration 0007; a no-op where it never ran.
    legacy_task_names = (
        "resm: 自动增量更新论文索引 (OpenAlex)",
        "resm: 自动增量更新论文索引 (Elsevier 补充)",
    )
    PeriodicTask.objects.filter(name__in=legacy_task_names).delete()

    sched, _ = CrontabSchedule.objects.get_or_create(
        minute="0", hour="5", day_of_week="*", day_of_month="*", month_of_year="*",
    )
    PeriodicTask.objects.update_or_create(
        name=MONITOR_NAME,
        defaults={
            "task": MONITOR_TASK,
            "crontab": sched,
            "interval": None,
            "enabled": True,
            "description": "每天 05:00 拉取监控期刊/关键词的最新论文元数据(OpenAlex,跨出版商)",
        },
    )
def unseed(apps, schema_editor):
    """Reverse step: delete the seeded monitors and the monitor PeriodicTask.

    Monitors are matched by (type, value) against JOURNALS and SEARCHES, so a
    seeded row is deleted even if its other fields were edited afterwards.

    Args:
        apps: historical app registry supplied by RunPython.
        schema_editor: supplied by RunPython; not used here.
    """
    monitor_model = apps.get_model("resm", "PaperMonitor")
    journal_issns = [issn for issn, _name in JOURNALS]
    search_terms = [term for term, _name in SEARCHES]
    monitor_model.objects.filter(type="journal", value__in=journal_issns).delete()
    monitor_model.objects.filter(type="search", value__in=search_terms).delete()

    task_model = apps.get_model("django_celery_beat", "PeriodicTask")
    task_model.objects.filter(name=MONITOR_NAME).delete()
class Migration(migrations.Migration):
    # Data-only migration: seeds PaperMonitor rows and registers a periodic task.
    dependencies = [
        # The PaperMonitor table must exist before it is seeded.
        ("resm", "0008_papermonitor"),
        # The django_celery_beat tables must exist before rows are written to them.
        ("django_celery_beat", "__latest__"),
    ]
    operations = [
        # Forward seeds the data; reverse removes what was seeded.
        migrations.RunPython(seed, unseed),
    ]

View File

@ -1,5 +1,5 @@
from django.db import models from django.db import models
from apps.utils.models import BaseModel from apps.utils.models import BaseModel, CommonAModel
from django.conf import settings from django.conf import settings
import os import os
# Create your models here. # Create your models here.
@ -107,3 +107,32 @@ class PaperAbstract(BaseModel):
max_length=20, max_length=20,
verbose_name="摘要来源" # openalex / elsevier / crossref verbose_name="摘要来源" # openalex / elsevier / crossref
) )
class PaperMonitor(CommonAModel):
    """A paper-monitoring subscription.

    Journal monitors and keyword/search monitors share this table and are
    told apart by ``type``. Each row holds the value to monitor, a look-back
    window in days, and the time and paper count of the last run.
    """

    TYPE_JOURNAL = "journal"
    TYPE_SEARCH = "search"
    TYPE_KEYWORD = "keyword"
    TYPE_CHOICES = (
        (TYPE_JOURNAL, "期刊(ISSN)"),
        (TYPE_SEARCH, "搜索词(标题/摘要)"),
        (TYPE_KEYWORD, "OpenAlex关键词ID"),
    )

    type = models.CharField(
        verbose_name="监控类型",
        max_length=20,
        choices=TYPE_CHOICES,
        db_index=True,
    )
    # Holds an ISSN, a search term, or a keyword id, depending on `type`.
    value = models.CharField(verbose_name="监控值", max_length=500)
    name = models.CharField(
        verbose_name="名称",
        max_length=200,
        null=True,
        blank=True,
    )
    # Free-text research-direction label, e.g. 无机非金属材料.
    note = models.CharField(
        verbose_name="方向标注",
        max_length=100,
        null=True,
        blank=True,
    )
    is_active = models.BooleanField(
        verbose_name="启用",
        default=True,
        db_index=True,
    )
    days = models.IntegerField(verbose_name="回看窗口(天)", default=30)
    last_run = models.DateTimeField(
        verbose_name="上次运行时间",
        null=True,
        blank=True,
    )
    last_count = models.IntegerField(
        verbose_name="上次拉取篇数(窗口内)",
        default=0,
    )

    class Meta:
        verbose_name = "论文监控"
        verbose_name_plural = verbose_name

    def __str__(self):
        """Return "<type label>:<name>", using the value when no name is set."""
        kind_label = self.get_type_display()
        target = self.name or self.value
        return f"{kind_label}:{target}"

View File

@ -3,7 +3,7 @@ from __future__ import absolute_import, unicode_literals
from apps.utils.tasks import CustomTask from apps.utils.tasks import CustomTask
from celery import shared_task from celery import shared_task
from pyalex import Works, config from pyalex import Works, config
from apps.resm.models import Paper, PaperAbstract from apps.resm.models import Paper, PaperAbstract, PaperMonitor
from apps.utils.snowflake import idWorker from apps.utils.snowflake import idWorker
from django.core.cache import cache from django.core.cache import cache
import requests import requests
@ -299,6 +299,43 @@ def backfill_paper_meta_from_openalex(from_publication_date: str, to_publication
return f"combo {combo_index}/{len(combos)} done ({kw!r},{search!r}) -> next" return f"combo {combo_index}/{len(combos)} done ({kw!r},{search!r}) -> next"
@shared_task(base=CustomTask)
def monitor_papers(monitor_id: str = None):
    """Pull recent paper metadata from OpenAlex for every active PaperMonitor.

    Each monitor's query is limited to articles with a DOI published within
    its look-back window, then narrowed by monitor type:

    - journal: ``value`` is an ISSN, filtered on primary_location.source.issn
      (keyword and search are passed to the crawler as empty strings).
    - search: ``value`` is a search term, applied via
      ``_apply_keyword_search(base, "", value)``.
    - keyword: ``value`` is an OpenAlex keyword id, applied via
      ``_apply_keyword_search(base, value, "")``.

    Monitors of any other type are skipped. The query is then handed to
    ``_crawl_openalex_query``; its return value is stored as ``last_count``.
    NOTE(review): per the original author's notes the crawler de-duplicates
    against existing Paper rows — confirm in ``_crawl_openalex_query``.

    A failure in one monitor is logged and does not stop the remaining
    monitors. Without this, one bad admin-entered value or one transient API
    error would skip every monitor after it on every run.

    Args:
        monitor_id: when given, only the active monitor with this id is run.

    Returns:
        "; "-joined ``<type>:<name or value>=<count>`` entries, or
        "no active monitors" when nothing was processed.

    Raises:
        Exception: the first exception raised while processing any monitor,
            re-raised after all monitors have been attempted, so the task
            is still reported as failed.
    """
    import logging

    logger = logging.getLogger(__name__)

    monitors = PaperMonitor.objects.filter(is_active=True)
    if monitor_id:
        monitors = monitors.filter(id=monitor_id)

    results = []
    first_error = None
    for m in monitors:
        label = f"{m.type}:{m.name or m.value}"
        try:
            # Unset, zero or negative windows use the 30-day default; a
            # negative value would otherwise give a start date in the future.
            window_days = m.days if m.days and m.days > 0 else 30
            from_pub = (timezone.now() - timedelta(days=window_days)).date().isoformat()
            base = Works().filter(has_doi=True, type="article", from_publication_date=from_pub)
            kw, search = "", ""
            if m.type == PaperMonitor.TYPE_JOURNAL:
                base = base.filter(primary_location={"source": {"issn": m.value}})
            elif m.type == PaperMonitor.TYPE_SEARCH:
                search = m.value
                base = _apply_keyword_search(base, "", search)
            elif m.type == PaperMonitor.TYPE_KEYWORD:
                kw = m.value
                base = _apply_keyword_search(base, kw, "")
            else:
                continue
            seen = _crawl_openalex_query(base, kw, search)
        except Exception as exc:
            # Per-item boundary: record the failure and move on. The first
            # error is re-raised once the loop has finished.
            logger.exception("monitor_papers: monitor %s (%s) failed", m.id, label)
            results.append(f"{label}=ERROR")
            if first_error is None:
                first_error = exc
            continue

        # last_run / last_count are only updated after a successful crawl.
        m.last_run = timezone.now()
        m.last_count = seen
        m.save(update_fields=["last_run", "last_count", "update_time"])
        results.append(f"{label}={seen}")

    if first_error is not None:
        logger.error("monitor_papers finished with failures: %s", "; ".join(results))
        raise first_error
    return "; ".join(results) or "no active monitors"
SCIENCEDIRECT_SEARCH_URL = "https://api.elsevier.com/content/search/sciencedirect" SCIENCEDIRECT_SEARCH_URL = "https://api.elsevier.com/content/search/sciencedirect"