feat: 添加o_keywords 字段

This commit is contained in:
caoqianming 2026-01-28 10:26:41 +08:00
parent 5b6e4ee591
commit f57f624b65
4 changed files with 48 additions and 12 deletions

View File

@ -27,7 +27,7 @@ class Migration(migrations.Migration):
), ),
migrations.AddField( migrations.AddField(
model_name='paper', model_name='paper',
name='search_word_first', name='o_search',
field=models.TextField(default='cement'), field=models.TextField(default='cement'),
), ),
] ]

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.27 on 2026-01-28 02:25
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the optional ``o_keywords`` text field to the ``paper`` model."""

    # This migration is ordered after migration 0002 of the 'resm' app.
    dependencies = [
        ('resm', '0002_paper_has_abstract_xml_paper_has_fulltext_pdf_and_more'),
    ]

    operations = [
        # The new field is declared with blank=True and null=True, so it may
        # be left empty or stored as NULL.
        migrations.AddField(
            model_name='paper',
            name='o_keywords',
            field=models.TextField(blank=True, null=True),
        ),
    ]

View File

@ -43,7 +43,8 @@ class Paper(BaseModel):
default="openalex", default="openalex",
verbose_name="元数据来源" verbose_name="元数据来源"
) )
search_word_first = models.TextField(default="cement") o_search = models.TextField(default="cement")
o_keywords = models.TextField(null=True, blank=True)
def init_save_dir(self): def init_save_dir(self):
publication_date = self.publication_date publication_date = self.publication_date

View File

@ -3,14 +3,11 @@ from __future__ import absolute_import, unicode_literals
from apps.utils.tasks import CustomTask from apps.utils.tasks import CustomTask
from celery import shared_task from celery import shared_task
from pyalex import Works, config from pyalex import Works, config
from itertools import chain
from apps.resm.models import Paper, PaperAbstract from apps.resm.models import Paper, PaperAbstract
from apps.utils.snowflake import idWorker from apps.utils.snowflake import idWorker
from django.core.cache import cache from django.core.cache import cache
import requests import requests
from lxml import etree from lxml import etree
from django.conf import settings
import os
from celery import current_app from celery import current_app
from datetime import datetime from datetime import datetime
@ -21,17 +18,35 @@ config.retry_http_codes = [429, 500, 503]
config.api_key = "4KJZdkCFA0uFb6IsYKc8cd" config.api_key = "4KJZdkCFA0uFb6IsYKc8cd"
@shared_task(base=CustomTask) @shared_task(base=CustomTask)
def get_paper_meta_from_openalex(publication_year:int, search_key:str, end_year:int=None): def get_paper_meta_from_openalex(publication_year:int, keywords:str="", search:str="", end_year:int=None):
cache_key = f"openalex_cursor_{publication_year}_{search_key}" cache_key = f"openalex_cursor_{publication_year}_{keywords}{search}"
cache_cursor = cache.get(cache_key, "*") cache_cursor = cache.get(cache_key, "*")
if keywords or search:
pass
else:
raise Exception("keywords or search must be provided")
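# Target OpenAlex query when several keywords are given (pipe = OR): filter=keywords.id:clinker|cement
# NOTE(review): pyalex documents list values in filter() as AND and a single "a|b" string as OR — confirm that splitting on "|" below still yields OR semantics.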
pager = Works().filter( pager = Works().filter(
publication_year=publication_year, publication_year=publication_year,
has_doi=True, has_doi=True,
type="article" # 将 type 移到 filter 中 type="article"
).search(search_key).select([ )
if keywords:
if "|" in keywords:
keywords_list = keywords.split("|")
else:
keywords_list = [keywords]
pager = pager.filter(
keywords={"id": keywords_list}
)
if search:
pager = pager.filter(
search=search
)
pager = pager.select([
"id", "doi", "title", "publication_date", "id", "doi", "title", "publication_date",
"open_access", "authorships", "primary_location", "publication_year", "open_access", "authorships", "primary_location", "publication_year",
"display_name" "display_name", "content_urls"
]).paginate(per_page=200, n_max=None, cursor=cache_cursor) ]).paginate(per_page=200, n_max=None, cursor=cache_cursor)
next_cursor = pager._next_value next_cursor = pager._next_value
for page in pager: for page in pager:
@ -40,7 +55,8 @@ def get_paper_meta_from_openalex(publication_year:int, search_key:str, end_year:
if record["doi"] and (record["display_name"] or record["title"]): if record["doi"] and (record["display_name"] or record["title"]):
paper = Paper() paper = Paper()
paper.id = idWorker.get_id() paper.id = idWorker.get_id()
paper.search_word_first = search_key paper.o_keywords = keywords
paper.o_search = search
paper.source = "openalex" paper.source = "openalex"
paper.type = "article" paper.type = "article"
paper.openalex_id = record["id"].split("/")[-1] paper.openalex_id = record["id"].split("/")[-1]
@ -67,7 +83,8 @@ def get_paper_meta_from_openalex(publication_year:int, search_key:str, end_year:
"apps.resm.tasks.get_paper_meta_from_openalex", "apps.resm.tasks.get_paper_meta_from_openalex",
kwargs={ kwargs={
"publication_year": publication_year + 1, "publication_year": publication_year + 1,
"search_key": search_key, "keywords": keywords,
"search": search,
"end_year": end_year "end_year": end_year
}, },
countdown=5 countdown=5