From 1ddca4d34daae11447a83df9b3fc9ed01efea14c Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Tue, 10 Feb 2026 13:56:44 +0800
Subject: [PATCH] feat: get_pdf_from_openalex

---
 apps/resm/tasks.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py
index 8685726..d2172fa 100644
--- a/apps/resm/tasks.py
+++ b/apps/resm/tasks.py
@@ -15,6 +15,7 @@ from .d_oaurl import download_from_url_playwright
 import asyncio
 import sys
 import os
+from django.db.models import Q
 
 # config.email = "caoqianming@foxmail.com"
 config.email = "caoqianming@ctc.ac.cn"
@@ -144,6 +145,49 @@ def get_random_headers():
 def show_task_run(def_name: str):
     return cache.get(def_name, True)
 
+@shared_task(base=CustomTask)
+def get_pdf_from_openalex(number_of_task: int = 10):
+    """Process one batch of papers through ``save_pdf_from_openalex``.
+
+    Selects up to ``number_of_task`` papers with ``is_oa=True`` and
+    ``has_fulltext=False`` whose ``fetch_status`` is not ``"downloading"``,
+    then re-queues itself unless ``show_task_run`` reports the task as off.
+
+    Returns:
+        ``"stoped"`` when the task is switched off on entry, ``"done"`` when
+        no paper matches, otherwise the number of papers in this batch whose
+        ``has_fulltext_pdf`` is truthy after processing.
+    """
+    # ``.name`` exists only on Celery task objects, hence the decorator above.
+    def_name = get_pdf_from_openalex.name
+    if not show_task_run(def_name):
+        return "stoped"
+    count = 0
+    qs = Paper.objects.filter(is_oa=True, has_fulltext=False).exclude(
+        fetch_status="downloading")[:number_of_task]
+    if not qs.exists():
+        return "done"
+    for paper in qs:
+        if not show_task_run(def_name):
+            break
+        paper.fetch("downloading")
+        save_pdf_from_openalex(paper)
+        paper.fetch_end()
+        if paper.has_fulltext_pdf:
+            count += 1
+    countdown = 2
+    if cache.get("openalex_api_exceed"):
+        countdown = 5 * 60  # retry after 5 minutes
+    if show_task_run(def_name):
+        current_app.send_task(
+            "apps.resm.tasks.get_pdf_from_openalex",
+            kwargs={
+                "number_of_task": number_of_task,
+            },
+            countdown=countdown,
+        )
+    return count
+
 @shared_task(base=CustomTask)
 def get_abstract_from_elsevier(number_of_task:int = 20, exclude_failed:bool=True):
     def_name = get_abstract_from_elsevier.name