# NOTE(review): reconstructed from the post-image of a two-hunk patch to
# apps/resm/tasks.py. Code outside the two hunks is not visible here and
# must be kept as it is in the real file.
from apps.utils.tasks import CustomTask
from celery import shared_task
from pyalex import Works, config
from itertools import chain
from apps.resm.models import Paper, PaperAbstract
from apps.utils.snowflake import idWorker
from django.core.cache import cache
import requests
from lxml import etree
from django.conf import settings
import os

config.email = "caoqianming@foxmail.com"
config.max_retries = 0

# NOTE(review): the code between this point and ELSEVIER_APIKEY (it includes
# get_paper_meta_from_openalex) is not shown in the patch; keep it unchanged.

# NOTE(review): this API key is committed to source control. Consider loading
# it from settings or the environment and rotating the key.
ELSEVIER_APIKEY = 'aa8868cac9e27d6153ab0a0acd7b50bf'

# Namespace prefixes used by the XPath queries on Elsevier article XML.
_ELSEVIER_NS = {
    "dc": "http://purl.org/dc/elements/1.1/",
    "ce": "http://www.elsevier.com/xml/common/dtd",
}

# fail_reason values that make a retry pointless; such papers are skipped.
_ELSEVIER_PERMANENT_FAILURES = ("elsevier_doi_not_found", "elsevier_no_abstract")

# HTTP statuses after which the remaining requests of a batch would fail the
# same way (rejected credentials, exhausted quota), so the batch stops early.
_ELSEVIER_STOP_STATUSES = (401, 429)


def _first_non_blank(text_nodes):
    """Return the first non-blank string in *text_nodes*, stripped, or ""."""
    for node in text_nodes:
        stripped = node.strip()
        if stripped:
            return stripped
    return ""


def _store_elsevier_xml(paper, publication_year, xml_bytes):
    """Write the raw article XML under media/papers/<year>/<month>/<day>/.

    The file is named after the DOI with "/" replaced by "_". When the paper
    has no publication date, the file goes to
    media/papers/<publication_year>/unknown_date/ instead of failing.
    """
    publication_date = paper.publication_date
    if publication_date is not None:
        date_parts = (
            str(publication_date.year),
            str(publication_date.month),
            str(publication_date.day),
        )
    else:
        # NOTE(review): fallback for a missing date; confirm whether
        # Paper.publication_date can actually be null.
        date_parts = (str(publication_year), "unknown_date")

    paper_dir = os.path.join(settings.BASE_DIR, "media/papers", *date_parts)
    os.makedirs(paper_dir, exist_ok=True)

    safe_doi = paper.doi.replace("/", "_")
    paper_file = os.path.join(paper_dir, f"{safe_doi}.xml")
    with open(paper_file, "wb") as f:
        f.write(xml_bytes)


@shared_task(base=CustomTask)
def get_abstract_from_elsevier(publication_year: int, number_of_task: int = 100) -> str:
    """Fetch abstracts (and full text, when present) from the Elsevier API.

    Processes up to ``number_of_task`` papers of ``publication_year`` that
    have no abstract yet, oldest publication date first. For every paper the
    article XML is requested by DOI:

    * 200 with a ``dc:description``: the abstract is stored in
      ``PaperAbstract`` and the paper is flagged ``has_abstract``.
    * 200 with ``ce:para`` elements: the paper is flagged ``has_fulltext``.
    * 200 with either of the above: the raw XML is written to disk.
    * 200 with no abstract: ``fail_reason`` becomes ``elsevier_no_abstract``.
    * 200 with unparsable XML: ``fail_reason`` becomes ``elsevier_xml_error``
      (the paper is retried by later runs).
    * 404: ``fail_reason`` becomes ``elsevier_doi_not_found``.
    * any other status: reported in the return message; 401 and 429 also end
      the batch.

    Args:
        publication_year: Year whose papers are processed.
        number_of_task: Maximum number of papers handled by one call.

    Returns:
        ``"done"`` when no eligible paper is left, otherwise
        ``"remaining N papers"``, prefixed with ``"<error>, "`` when a request
        error or an unexpected HTTP status occurred.
    """
    qs = Paper.objects.filter(
        publication_year=publication_year,
        has_abstract=False,
    ).exclude(
        fail_reason__in=_ELSEVIER_PERMANENT_FAILURES
    ).order_by("publication_date")

    if not qs.exists():
        return "done"

    params = {
        "apiKey": ELSEVIER_APIKEY,
        "httpAccept": "text/xml",
    }
    err_msg = ""

    # The context manager releases the pooled connections when the batch ends.
    with requests.Session() as session:
        for paper in qs[:number_of_task]:
            try:
                res = session.get(
                    f"https://api.elsevier.com/content/article/doi/{paper.doi}",
                    params=params,
                    timeout=(3, 15),
                )
            except requests.RequestException:
                err_msg = "elsevier_request_error"
                break

            if res.status_code == 404:
                paper.fail_reason = "elsevier_doi_not_found"
                paper.save(update_fields=["fail_reason"])
                continue

            if res.status_code != 200:
                err_msg = f"elsevier_http_{res.status_code}"
                if res.status_code in _ELSEVIER_STOP_STATUSES:
                    break
                continue

            # Parse and store the raw bytes: decoding via res.text and
            # re-encoding would corrupt the payload whenever requests guesses
            # the charset wrong, while lxml honours the XML declaration.
            xml_bytes = res.content
            try:
                root = etree.fromstring(xml_bytes)
            except etree.XMLSyntaxError:
                paper.fail_reason = "elsevier_xml_error"
                paper.save(update_fields=["fail_reason"])
                continue

            abstract_text = _first_non_blank(
                root.xpath("//dc:description/text()", namespaces=_ELSEVIER_NS)
            )
            has_fulltext = bool(root.xpath("//ce:para", namespaces=_ELSEVIER_NS))

            if abstract_text:
                PaperAbstract.objects.update_or_create(
                    paper=paper,
                    defaults={
                        "abstract": abstract_text,
                        "source": "elsevier",
                    },
                )
                paper.has_abstract = True
                paper.has_abstract_xml = True
            else:
                # Without this marker the paper would stay at the head of the
                # queue and be requested again on every run.
                paper.fail_reason = "elsevier_no_abstract"

            if has_fulltext:
                paper.has_fulltext = True
                paper.has_fulltext_xml = True

            if not abstract_text and not has_fulltext:
                # Nothing worth keeping in this response.
                paper.save(update_fields=["fail_reason"])
                continue

            # Write the file before saving the flags so that a *_xml flag is
            # never persisted without its file on disk.
            _store_elsevier_xml(paper, publication_year, xml_bytes)

            update_fields = [
                "has_abstract",
                "has_abstract_xml",
                "has_fulltext",
                "has_fulltext_xml",
                "update_time",
            ]
            if not abstract_text:
                update_fields.append("fail_reason")
            paper.save(update_fields=update_fields)

    remaining_count = qs.count()
    if remaining_count == 0:
        return "done"

    remaining = f"remaining {remaining_count} papers"
    return f"{err_msg}, {remaining}" if err_msg else remaining