diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py
index fb2f39b..e0b5a52 100644
--- a/apps/resm/tasks.py
+++ b/apps/resm/tasks.py
@@ -11,6 +11,7 @@ import requests
 from lxml import etree
 from django.conf import settings
 import os
+from celery import current_app
 
 config.email = "caoqianming@foxmail.com"
 config.max_retries = 0
@@ -61,11 +62,24 @@ def get_paper_meta_from_openalex(publication_year:int, search_key:str):
 
 ELSEVIER_APIKEY = 'aa8868cac9e27d6153ab0a0acd7b50bf'
 @shared_task(base=CustomTask)
-def get_abstract_from_elsevier(publication_year: int, number_of_task:int = 100):
-    qs = Paper.objects.filter(
-        publication_year=publication_year,
-        has_abstract=False
-    ).exclude(
+def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int = 100):
+    """Fetch abstracts (and full-text XML when present) from the Elsevier API.
+
+    Processes papers that still lack an abstract, oldest publication date
+    first, and re-queues itself while a run keeps shrinking the backlog.
+
+    Args:
+        publication_year: Restrict the run to this year; ``None`` means all years.
+        number_of_task: Maximum papers handled per run; ``None`` means no limit.
+
+    Returns:
+        ``"done"`` when no candidate papers remain, otherwise a status string
+        holding the last error marker and the number of remaining papers.
+    """
+    qs = Paper.objects.filter(has_abstract=False)
+    if publication_year is not None:
+        qs = qs.filter(publication_year=publication_year)
+    qs = qs.exclude(
         fail_reason="elsevier_doi_not_found"
     ).order_by("publication_date")
 
@@ -77,70 +91,91 @@ def get_abstract_from_elsevier(publication_year: int, number_of_task:int = 100):
         "httpAccept": "text/xml"
     }
     err_msg = ""
-    req = requests.Session()
-    for paper in qs[:number_of_task]:
-        try:
-            res = req.get(
-                f"https://api.elsevier.com/content/article/doi/{paper.doi}",
-                params=params,
-                timeout=(3, 15)
-            )
-        except requests.RequestException:
-            err_msg = "elsevier_request_error"
-            break
-
-        if res.status_code == 200:
-            xml_str = res.text
+    # Snapshot the backlog so the re-queue check below can tell whether this
+    # run actually made progress.
+    initial_count = qs.count()
+    if number_of_task is None:
+        papers = qs.all()
+    else:
+        papers = qs[:number_of_task]
+    with requests.Session() as req:
+        for paper in papers:
             try:
-                root = etree.fromstring(xml_str.encode("utf-8"))
-            except etree.XMLSyntaxError:
-                paper.fail_reason = "elsevier_xml_error"
-                paper.save(update_fields=["fail_reason"])
-                continue
-
-            ns = {"dc": "http://purl.org/dc/elements/1.1/",
-                  "ce": "http://www.elsevier.com/xml/common/dtd"}
-            abstract = root.xpath("//dc:description/text()", namespaces=ns)
-            if abstract:
-                PaperAbstract.objects.update_or_create(
-                    paper=paper,
-                    defaults={
-                        "abstract": abstract[0].strip(),
-                        "source": "elsevier"
-                    }
+                res = req.get(
+                    f"https://api.elsevier.com/content/article/doi/{paper.doi}",
+                    params=params,
+                    timeout=(3, 15)
                 )
-                paper.has_abstract = True
-                paper.has_abstract_xml = True
+            except requests.RequestException:
+                err_msg = "elsevier_request_error"
+                break
 
-            paras = root.xpath("//ce:para", namespaces=ns)
-            has_fulltext = len(paras) > 0
-            if has_fulltext:
-                paper.has_fulltext = True
-                paper.has_fulltext_xml = True
-
-            publication_date = paper.publication_date
-            paper_dir = os.path.join(
-                settings.BASE_DIR,
-                "media/papers",
-                str(publication_date.year),
-                str(publication_date.month),
-                str(publication_date.day)
-            )
-            os.makedirs(paper_dir, exist_ok=True)
+            if res.status_code == 200:
+                xml_str = res.text
+                try:
+                    root = etree.fromstring(xml_str.encode("utf-8"))
+                except etree.XMLSyntaxError:
+                    paper.fail_reason = "elsevier_xml_error"
+                    paper.save(update_fields=["fail_reason"])
+                    continue
 
-            safe_doi = paper.doi.replace("/", "_")
-            paper_file = os.path.join(paper_dir, f"{safe_doi}.xml")
-            with open(paper_file, "wb") as f:
-                f.write(xml_str.encode("utf-8"))
+                ns = {"dc": "http://purl.org/dc/elements/1.1/",
+                      "ce": "http://www.elsevier.com/xml/common/dtd"}
+                abstract = root.xpath("//dc:description/text()", namespaces=ns)
+                if abstract:
+                    PaperAbstract.objects.update_or_create(
+                        paper=paper,
+                        defaults={
+                            "abstract": abstract[0].strip(),
+                            "source": "elsevier"
+                        }
+                    )
+                    paper.has_abstract = True
+                    paper.has_abstract_xml = True
+                    paper.fetch_status = "abstract_ready"
 
-            paper.save(update_fields=["has_abstract", "has_abstract_xml", "has_fulltext", "has_fulltext_xml", "update_time"])
+                paras = root.xpath("//ce:para", namespaces=ns)
+                has_fulltext = len(paras) > 0
+                if has_fulltext:
+                    paper.has_fulltext = True
+                    paper.has_fulltext_xml = True
+                    paper.fetch_status = "fulltext_ready"
+
+                publication_date = paper.publication_date
+                paper_dir = os.path.join(
+                    settings.BASE_DIR,
+                    "media/papers",
+                    str(publication_date.year),
+                    str(publication_date.month),
+                    str(publication_date.day)
+                )
+                os.makedirs(paper_dir, exist_ok=True)
 
-        elif res.status_code == 404:
-            paper.fail_reason = "elsevier_doi_not_found"
-            paper.save(update_fields=["fail_reason"])
+                safe_doi = paper.doi.replace("/", "_")
+                paper_file = os.path.join(paper_dir, f"{safe_doi}.xml")
+                with open(paper_file, "wb") as f:
+                    f.write(xml_str.encode("utf-8"))
+
+                # fetch_status is assigned above, so it must be listed here to be persisted.
+                paper.save(update_fields=["has_abstract", "has_abstract_xml", "has_fulltext", "has_fulltext_xml", "fetch_status", "update_time"])
+
+            elif res.status_code == 404:
+                paper.fail_reason = "elsevier_doi_not_found"
+                paper.save(update_fields=["fail_reason"])
 
     remaining_count = qs.count()
     if remaining_count == 0:
         return "done"
     else:
+        # Only re-queue when this run shrank the backlog; otherwise papers that
+        # can never succeed would make the task re-schedule itself forever.
+        if remaining_count < initial_count:
+            current_app.send_task(
+                "apps.resm.tasks.get_abstract_from_elsevier",
+                kwargs={
+                    "publication_year": publication_year,
+                    "number_of_task": number_of_task,
+                },
+                countdown=5,
+            )
         return f'{err_msg}, remaining {remaining_count} papers'
\ No newline at end of file