# --- apps/resm/__init__.py (new, empty file) ---

# --- apps/resm/admin.py (new file) ---
from django.contrib import admin

# Register your models here.

# --- apps/resm/apps.py (new file) ---
from django.apps import AppConfig


class ResmConfig(AppConfig):
    """Django application configuration for the ``apps.resm`` package."""

    # Default primary-key type for models that do not declare their own.
    default_auto_field = 'django.db.models.BigAutoField'
    # Dotted path of the application package.
    name = 'apps.resm'

# --- apps/resm/migrations/0001_initial.py (new file) ---
# Generated by Django 4.2.27 on 2026-01-23 01:53

from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone


class Migration(migrations.Migration):
    """Initial migration of the ``resm`` app.

    Creates two models:

    * ``Paper`` - bibliographic metadata plus fetch-status columns.
    * ``PaperAbstract`` - abstract text attached one-to-one to a ``Paper``.

    Both models declare the same four leading columns (``id``,
    ``create_time``, ``update_time``, ``is_deleted``); ``id`` is an explicit
    ``CharField`` primary key, not an auto-incrementing integer.
    """

    initial = True

    # No other migrations have to run before this one.
    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='Paper',
            fields=[
                # Shared leading columns (also present on PaperAbstract below).
                ('id', models.CharField(editable=False, help_text='主键ID', max_length=20, primary_key=True, serialize=False, verbose_name='主键ID')),
                ('create_time', models.DateTimeField(default=django.utils.timezone.now, help_text='创建时间', verbose_name='创建时间')),
                ('update_time', models.DateTimeField(auto_now=True, help_text='修改时间', verbose_name='修改时间')),
                ('is_deleted', models.BooleanField(default=False, help_text='删除标记', verbose_name='删除标记')),
                # Identifiers: both unique; only openalex_id is nullable.
                ('openalex_id', models.TextField(blank=True, null=True, unique=True, verbose_name='OpenAlex ID')),
                ('doi', models.TextField(unique=True, verbose_name='DOI')),
                # Basic bibliographic information.
                ('type', models.CharField(db_index=True, max_length=20)),
                ('title', models.TextField()),
                ('publication_date', models.DateField(blank=True, null=True)),
                ('publication_year', models.IntegerField(db_index=True)),
                ('first_author', models.TextField(blank=True, null=True)),
                ('first_author_institution', models.TextField(blank=True, null=True)),
                ('publication_name', models.TextField(blank=True, null=True)),
                # Open-access metadata.
                ('is_oa', models.BooleanField(db_index=True, default=False)),
                ('oa_url', models.TextField(blank=True, null=True)),
                # Indexed status columns.
                ('has_abstract', models.BooleanField(db_index=True, default=False)),
                ('has_fulltext', models.BooleanField(db_index=True, default=False)),
                ('fetch_status', models.CharField(db_index=True, default='meta_only', max_length=20)),
                ('fail_reason', models.CharField(blank=True, max_length=50, null=True)),
                ('source', models.CharField(default='openalex', max_length=20, verbose_name='元数据来源')),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='PaperAbstract',
            fields=[
                ('id', models.CharField(editable=False, help_text='主键ID', max_length=20, primary_key=True, serialize=False, verbose_name='主键ID')),
                ('create_time', models.DateTimeField(default=django.utils.timezone.now, help_text='创建时间', verbose_name='创建时间')),
                ('update_time', models.DateTimeField(auto_now=True, help_text='修改时间', verbose_name='修改时间')),
                ('is_deleted', models.BooleanField(default=False, help_text='删除标记', verbose_name='删除标记')),
                ('abstract', models.TextField()),
                ('source', models.CharField(max_length=20, verbose_name='摘要来源')),
                # One abstract per paper; deleting the paper cascades to its abstract.
                ('paper', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='abstract', to='resm.paper')),
            ],
            options={
                'abstract': False,
            },
        ),
    ]

# --- apps/resm/migrations/__init__.py (new, empty file) ---

# --- apps/resm/models.py (new file) ---
from django.db import models
from apps.utils.models import BaseModel
# Create your models here.


class Paper(BaseModel):
    """Metadata for one paper, unique by DOI and (when known) by OpenAlex ID."""

    # ===== Globally unique identifiers =====
    openalex_id = models.TextField(unique=True, verbose_name="OpenAlex ID", null=True, blank=True)
    doi = models.TextField(unique=True, verbose_name="DOI")
    # ===== Basic information =====
    type = models.CharField(max_length=20, db_index=True)
    title = models.TextField()
    publication_date = models.DateField(null=True, blank=True)
    publication_year = models.IntegerField(db_index=True)
    # ===== Authors (minimal usable set) =====
    first_author = models.TextField(null=True, blank=True)
    first_author_institution = models.TextField(null=True, blank=True)
    # ===== Journal =====
    publication_name = models.TextField(null=True, blank=True)
    # ===== Open-access metadata =====
    is_oa = models.BooleanField(default=False, db_index=True)
    oa_url = models.TextField(null=True, blank=True)
    # ===== Status flags (core of the scheduling logic) =====
    has_abstract = models.BooleanField(default=False, db_index=True)
    has_fulltext = models.BooleanField(default=False, db_index=True)
    fetch_status = models.CharField(
        max_length=20,
        default="meta_only",  # meta_only / abstract_ready / fulltext_ready / parsed / failed
        db_index=True
    )
    fail_reason = models.CharField(
        max_length=50,
        null=True,
        blank=True
    )

    # Where the metadata came from (verbose_name reads "metadata source").
    source = models.CharField(
        max_length=20,
        default="openalex",
        verbose_name="元数据来源"
    )


class PaperAbstract(BaseModel):
    """Abstract text of a paper, reachable from the paper as ``paper.abstract``."""

    # One abstract per paper; removed together with the paper.
    paper = models.OneToOneField(
        Paper,
        on_delete=models.CASCADE,
        related_name="abstract"
    )

    abstract = models.TextField()
    # Where the abstract came from (verbose_name reads "abstract source").
    source = models.CharField(
        max_length=20,
        verbose_name="摘要来源"  # openalex / elsevier / crossref
    )


# --- apps/resm/tasks.py (new file) ---
# Create your tasks here
# NOTE: ``from __future__ import absolute_import, unicode_literals`` was
# dropped; both behaviours are always enabled on Python 3.
import os
from itertools import chain

from celery import shared_task
from pyalex import Works, config

from apps.resm.models import Paper
from apps.utils.snowflake import idWorker
from apps.utils.tasks import CustomTask

# NOTE(security): credentials do not belong in version control. They can now
# be supplied through the environment; the literals below are kept only as a
# fallback so existing deployments keep working. The committed API key should
# be treated as leaked: rotate it and then remove the fallback.
config.email = os.environ.get("OPENALEX_EMAIL", "caoqianming@foxmail.com")
# NOTE(review): with max_retries = 0 the backoff factor and retry codes below
# presumably never take effect - confirm this is intended.
config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]
config.api_key = os.environ.get("OPENALEX_API_KEY", "4KJZdkCFA0uFb6IsYKc8cd")

# OpenAlex reports DOIs as full URLs; only the bare DOI is stored.
_DOI_URL_PREFIX = "https://doi.org/"
# Upper bound on the number of unsaved Paper objects held in memory at once.
_BULK_BATCH_SIZE = 1000


def _paper_fields_from_record(record, publication_year):
    """Map one OpenAlex work record to ``Paper`` field values.

    Args:
        record: Mapping returned by OpenAlex for a single work.
        publication_year: Year used when the record carries no usable
            ``publication_year`` of its own.

    Returns:
        A dict of ``Paper`` attribute names to values, or ``None`` when the
        record has no DOI and therefore cannot be stored.
    """
    doi = record.get("doi")
    if not doi:
        return None
    # Strip the resolver prefix only at the start, never inside the DOI.
    if doi.startswith(_DOI_URL_PREFIX):
        doi = doi[len(_DOI_URL_PREFIX):]
    if not doi:
        return None

    fields = {
        "type": "article",
        "openalex_id": record["id"].rsplit("/", 1)[-1],
        "doi": doi,
        # ``title`` is NOT NULL in the database; a null title would otherwise
        # reach the insert as None.
        "title": record.get("title") or "",
        "publication_date": record.get("publication_date"),
        "publication_year": record.get("publication_year") or publication_year,
    }

    open_access = record.get("open_access")
    if open_access:
        # ``is_oa`` is a non-nullable boolean column.
        fields["is_oa"] = bool(open_access.get("is_oa"))
        fields["oa_url"] = open_access.get("oa_url")

    authorships = record.get("authorships")
    if authorships:
        first_authorship = authorships[0]
        author = first_authorship.get("author") or {}
        fields["first_author"] = author.get("display_name")
        institutions = first_authorship.get("institutions")
        if institutions:
            fields["first_author_institution"] = institutions[0].get("display_name")

    primary_location = record.get("primary_location") or {}
    journal = primary_location.get("source")
    if journal:
        fields["publication_name"] = journal.get("display_name")

    return fields


@shared_task(base=CustomTask)
def get_paper_meta_from_openalex(publication_year: int, search_key: str) -> None:
    """Fetch article metadata from OpenAlex and store it as ``Paper`` rows.

    Searches OpenAlex for works of type ``article`` published in
    ``publication_year`` that match ``search_key``. Records without a DOI are
    skipped. Rows are inserted with ``ignore_conflicts=True``, so running the
    task again with overlapping results does not fail on existing papers.

    Args:
        publication_year: Publication year to filter on.
        search_key: Free-text search expression passed to OpenAlex.
    """
    query = Works().filter(
        publication_year=publication_year,
        type="article"
    ).search(search_key).select([
        "id", "doi", "title", "publication_date",
        "open_access", "authorships", "primary_location", "publication_year"
    ])

    # NOTE(review): pyalex's paginate() appears to cap results via its n_max
    # argument by default - confirm whether the full result set is wanted.
    pages = query.paginate(per_page=200)

    pending = []
    # chain.from_iterable pulls pages one at a time; chain(*pages) would
    # download every page before the first record is processed.
    for record in chain.from_iterable(pages):
        fields = _paper_fields_from_record(record, publication_year)
        if fields is None:
            continue
        paper = Paper()
        # The id is assigned here, before the bulk insert.
        paper.id = idWorker.get_id()
        for attr, value in fields.items():
            setattr(paper, attr, value)
        pending.append(paper)
        if len(pending) >= _BULK_BATCH_SIZE:
            Paper.objects.bulk_create(pending, ignore_conflicts=True)
            pending = []

    if pending:
        Paper.objects.bulk_create(pending, ignore_conflicts=True)


# --- apps/resm/tests.py (new file) ---
from django.test import TestCase

# Create your tests here.
diff --git a/apps/resm/urls.py b/apps/resm/urls.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/resm/views.py b/apps/resm/views.py new file mode 100644 index 0000000..91ea44a --- /dev/null +++ b/apps/resm/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. diff --git a/server/settings.py b/server/settings.py index d65a116..d44e34e 100755 --- a/server/settings.py +++ b/server/settings.py @@ -58,7 +58,8 @@ INSTALLED_APPS = [ 'apps.system', 'apps.auth1', 'apps.wf', - 'apps.ops' + 'apps.ops', + 'apps.resm' ] MIDDLEWARE = [