diff --git a/apps/resm/migrations/0003_paper_o_keywords.py b/apps/resm/migrations/0003_paper_o_keywords.py
new file mode 100644
index 0000000..3b534a2
--- /dev/null
+++ b/apps/resm/migrations/0003_paper_o_keywords.py
@@ -0,0 +1,25 @@
+# Generated by Django 4.2.27 on 2026-01-28 02:25
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('resm', '0002_paper_has_abstract_xml_paper_has_fulltext_pdf_and_more'),
+    ]
+
+    operations = [
+        # Rename here instead of editing migration 0002 in place, so that
+        # databases which already applied 0002 get the column renamed too.
+        migrations.RenameField(
+            model_name='paper',
+            old_name='search_word_first',
+            new_name='o_search',
+        ),
+        migrations.AddField(
+            model_name='paper',
+            name='o_keywords',
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]
diff --git a/apps/resm/models.py b/apps/resm/models.py
index 88ef40b..3eec872 100644
--- a/apps/resm/models.py
+++ b/apps/resm/models.py
@@ -43,7 +43,8 @@ class Paper(BaseModel):
         default="openalex",
         verbose_name="元数据来源"
     )
-    search_word_first = models.TextField(default="cement")
+    o_search = models.TextField(default="cement")
+    o_keywords = models.TextField(null=True, blank=True)
 
     def init_save_dir(self):
         publication_date = self.publication_date
diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py
index 83d2b51..13d55e6 100644
--- a/apps/resm/tasks.py
+++ b/apps/resm/tasks.py
@@ -3,14 +3,11 @@ from __future__ import absolute_import, unicode_literals
 from apps.utils.tasks import CustomTask
 from celery import shared_task
 from pyalex import Works, config
-from itertools import chain
 from apps.resm.models import Paper, PaperAbstract
 from apps.utils.snowflake import idWorker
 from django.core.cache import cache
 import requests
 from lxml import etree
-from django.conf import settings
-import os
 from celery import current_app
 from datetime import datetime
 
@@ -21,17 +18,37 @@ config.retry_http_codes = [429, 500, 503]
 config.api_key = "4KJZdkCFA0uFb6IsYKc8cd"
 
 @shared_task(base=CustomTask)
-def get_paper_meta_from_openalex(publication_year:int, search_key:str, end_year:int=None):
-    cache_key = f"openalex_cursor_{publication_year}_{search_key}"
+def get_paper_meta_from_openalex(publication_year:int, keywords:str="", search:str="", end_year:int=None):
+    """Fetch OpenAlex article metadata for one publication year.
+
+    At least one of ``keywords`` (keyword ids separated by ``|``) or
+    ``search`` must be non-empty, otherwise ``ValueError`` is raised.
+    Paging starts from the cached cursor (``"*"`` when none is cached).
+    """
+    cache_key = f"openalex_cursor_{publication_year}_{keywords}{search}"
     cache_cursor = cache.get(cache_key, "*")
+    if not (keywords or search):
+        raise ValueError("keywords or search must be provided")
+    # filter=keywords.id:clinker|cement
     pager = Works().filter(
         publication_year=publication_year,
         has_doi=True,
-        type="article" # 将 type 移到 filter 中
-    ).search(search_key).select([
+        type="article"
+    )
+    if keywords:
+        # NOTE(review): confirm how pyalex combines a list value; the
+        # example above is an OR filter (a|b). split() needs no "|" check.
+        pager = pager.filter(
+            keywords={"id": keywords.split("|")}
+        )
+    if search:
+        pager = pager.filter(
+            search=search
+        )
+    pager = pager.select([
         "id", "doi", "title", "publication_date",
         "open_access", "authorships", "primary_location", "publication_year",
-        "display_name"
+        "display_name", "content_urls"
     ]).paginate(per_page=200, n_max=None, cursor=cache_cursor)
     next_cursor = pager._next_value
     for page in pager:
@@ -40,7 +57,8 @@ def get_paper_meta_from_openalex(publication_year:int, search_key:str, end_year:
             if record["doi"] and (record["display_name"] or record["title"]):
                 paper = Paper()
                 paper.id = idWorker.get_id()
-                paper.search_word_first = search_key
+                paper.o_keywords = keywords
+                paper.o_search = search
                 paper.source = "openalex"
                 paper.type = "article"
                 paper.openalex_id = record["id"].split("/")[-1]
@@ -67,7 +85,8 @@ def get_paper_meta_from_openalex(publication_year:int, search_key:str, end_year:
             "apps.resm.tasks.get_paper_meta_from_openalex",
             kwargs={
                 "publication_year": publication_year + 1,
-                "search_key": search_key,
+                "keywords": keywords,
+                "search": search,
                 "end_year": end_year
             },
             countdown=5