feat: 优化get_paper_meta_from_openalex

This commit is contained in:
caoqianming 2026-01-23 13:47:42 +08:00
parent ed213260ca
commit 656cf39988
1 changed file with 30 additions and 27 deletions

View File

@ -6,6 +6,7 @@ from pyalex import Works, config
from itertools import chain from itertools import chain
from apps.resm.models import Paper from apps.resm.models import Paper
from apps.utils.snowflake import idWorker from apps.utils.snowflake import idWorker
from django.cache import cache
config.email = "caoqianming@foxmail.com" config.email = "caoqianming@foxmail.com"
config.max_retries = 0 config.max_retries = 0
@ -15,22 +16,26 @@ config.api_key = "4KJZdkCFA0uFb6IsYKc8cd"
@shared_task(base=CustomTask) @shared_task(base=CustomTask)
def get_paper_meta_from_openalex(publication_year:int, search_key:str): def get_paper_meta_from_openalex(publication_year:int, search_key:str):
query = Works().filter( cache_key = f"openalex_cursor_{publication_year}_{search_key}"
cache_cursor = cache.get(cache_key, "*")
pager = Works().filter(
publication_year=publication_year, publication_year=publication_year,
type="article" # 将 type 移到 filter 中 type="article" # 将 type 移到 filter 中
).search(search_key).select([ ).search(search_key).select([
"id", "doi", "title", "publication_date", "id", "doi", "title", "publication_date",
"open_access", "authorships", "primary_location", "publication_year" "open_access", "authorships", "primary_location", "publication_year"
]) ]).paginate(per_page=200, n_max=None, cursor=cache_cursor)
next_cursor = pager._next_value
for page in pager:
papers = [] papers = []
for record in chain(*query.paginate(per_page=200)): for record in page:
if record["doi"]: if record["doi"]:
paper = Paper() paper = Paper()
paper.id = idWorker.get_id() paper.id = idWorker.get_id()
paper.type = "article" paper.type = "article"
paper.openalex_id = record["id"].split("/")[-1] paper.openalex_id = record["id"].split("/")[-1]
paper.doi = record["doi"].replace("https://doi.org/", "") paper.doi = record["doi"].replace("https://doi.org/", "")
paper.title = record["title"] paper.title = record["display_name"]
paper.publication_date = record["publication_date"] paper.publication_date = record["publication_date"]
paper.publication_year = record["publication_year"] paper.publication_year = record["publication_year"]
if record["open_access"]: if record["open_access"]:
@ -43,7 +48,5 @@ def get_paper_meta_from_openalex(publication_year:int, search_key:str):
if record["primary_location"] and record["primary_location"]["source"]: if record["primary_location"] and record["primary_location"]["source"]:
paper.publication_name = record["primary_location"]["source"]["display_name"] paper.publication_name = record["primary_location"]["source"]["display_name"]
papers.append(paper) papers.append(paper)
if len(papers) >= 100:
Paper.objects.bulk_create(papers, ignore_conflicts=True)
papers = []
Paper.objects.bulk_create(papers, ignore_conflicts=True) Paper.objects.bulk_create(papers, ignore_conflicts=True)
cache.set(cache_key, next_cursor, timeout=None)