feat: get_pdf_from_openalex

This commit is contained in:
caoqianming 2026-02-10 13:56:44 +08:00
parent 76e8204680
commit 1ddca4d34d
1 changed file with 31 additions and 0 deletions

View File

@ -15,6 +15,7 @@ from .d_oaurl import download_from_url_playwright
import asyncio
import sys
import os
from django.db.models import Q
# Contact email assigned on the imported `config` object (presumably the
# OpenAlex client configuration -- confirm against the import above this view).
# Previously used address, kept for reference: caoqianming@foxmail.com
config.email = "caoqianming@ctc.ac.cn"
@ -144,6 +145,36 @@ def get_random_headers():
def show_task_run(def_name: str):
    """Return the value cached under ``def_name``, or ``True`` if absent.

    Callers in this module treat the result as a run/stop switch: a falsy
    cached value makes them stop, and a missing key means "keep running".
    """
    run_flag = cache.get(def_name, True)
    return run_flag
# Registered as a task like its sibling tasks in this module: the body reads
# the task's ``.name`` attribute and re-enqueues itself by task name, neither
# of which works on a plain function.
@shared_task(base=CustomTask)
def get_pdf_from_openalex(number_of_task: int = 10):
    """Fetch PDFs for one batch of open-access papers that lack full text.

    Selects up to ``number_of_task`` papers with ``is_oa=True`` and
    ``has_fulltext=False`` whose ``fetch_status`` is not ``"downloading"``,
    runs ``save_pdf_from_openalex`` on each of them, and finally re-enqueues
    this task by name so that the next batch gets processed.

    Args:
        number_of_task: Maximum number of papers handled in a single run.

    Returns:
        ``"stoped"`` if the run flag for this task is off on entry,
        ``"done"`` if no paper matches the selection, otherwise the number
        of papers in this batch whose ``has_fulltext_pdf`` is truthy after
        processing.
    """
    def_name = get_pdf_from_openalex.name
    if not show_task_run(def_name):
        return "stoped"

    # Evaluate the sliced queryset exactly once so the emptiness check and
    # the loop below work on the same rows (and only one query is issued).
    papers = list(
        Paper.objects.filter(is_oa=True, has_fulltext=False).exclude(
            fetch_status="downloading")[:number_of_task]
    )
    if not papers:
        return "done"

    count = 0
    for paper in papers:
        # Re-check the flag per paper so a stop request takes effect mid-batch.
        if not show_task_run(def_name):
            break
        paper.fetch("downloading")
        try:
            save_pdf_from_openalex(paper)
        finally:
            # Always close the fetch, even when the download raises, so the
            # paper is not left in the "downloading" status that the query
            # above excludes. Assumes fetch_end() is the counterpart of
            # fetch("downloading") -- confirm against the Paper model.
            paper.fetch_end()
        if paper.has_fulltext_pdf:
            count += 1

    countdown = 2
    if cache.get("openalex_api_exceed"):
        countdown = 5 * 60  # retry in 5 minutes
    if show_task_run(def_name):
        # Chain the next batch unless the task has been switched off.
        current_app.send_task(
            "apps.resm.tasks.get_pdf_from_openalex",
            kwargs={
                "number_of_task": number_of_task,
            },
            countdown=countdown,
        )
    return count
@shared_task(base=CustomTask)
def get_abstract_from_elsevier(number_of_task:int = 20, exclude_failed:bool=True):
def_name = get_abstract_from_elsevier.name