diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index 756d620..5d9a11d 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -139,60 +139,70 @@ def get_abstract_from_elsevier(number_of_task:int = 20): for paper in qs[:number_of_task]: if not show_task_run(def_name): break + original_status = paper.fetch_status + if original_status == "downloading": + return f"paper {paper.id} is already downloading" + paper.fetch_status = "downloading" + paper.save(update_fields=["fetch_status", "update_time"]) try: - res = req.get( - f"https://api.elsevier.com/content/article/doi/{paper.doi}", - params=params, - timeout=(3, 15) - ) - except requests.RequestException: - err_msg = "elsevier_request_error" - break - if res.status_code == 200: - xml_str = res.text try: - root = etree.fromstring(xml_str.encode("utf-8")) - except etree.XMLSyntaxError: - paper.save_fail_reason("elsevier_xml_error") - continue - - ns = {"dc": "http://purl.org/dc/elements/1.1/", - "ce": "http://www.elsevier.com/xml/common/dtd", - "xocs": "http://www.elsevier.com/xml/xocs/dtd",} - abstract = root.xpath("//dc:description/text()", namespaces=ns) - if abstract: - PaperAbstract.objects.update_or_create( - paper=paper, - defaults={ - "abstract": abstract[0].strip(), - "source": "elsevier" - } + res = req.get( + f"https://api.elsevier.com/content/article/doi/{paper.doi}", + params=params, + timeout=(3, 15) ) - paper.has_abstract = True - paper.has_abstract_xml = True - paper.fetch_status = "abstract_ready" - else: - paper.save_fail_reason("elsevier_abstract_not_found") - continue + except requests.RequestException: + err_msg = "elsevier_request_error" + break + if res.status_code == 200: + xml_str = res.text + try: + root = etree.fromstring(xml_str.encode("utf-8")) + except etree.XMLSyntaxError: + paper.save_fail_reason("elsevier_xml_error") + continue - paras = root.xpath("//ce:para", namespaces=ns) - has_fulltext = len(paras) > 0 - if has_fulltext is False: - rawtexts = root.xpath("//xocs:rawtext/text()",namespaces=ns) - if rawtexts and len(rawtexts[0].strip()) > 2000: - has_fulltext = True - if has_fulltext: - paper.has_fulltext = True - paper.has_fulltext_xml = True - paper.fetch_status = "fulltext_ready" - - paper.save_file_xml(xml_str) - paper.save(update_fields=["has_abstract", - "has_abstract_xml", "has_fulltext", - "has_fulltext_xml", "update_time", "fetch_status"]) + ns = {"dc": "http://purl.org/dc/elements/1.1/", + "ce": "http://www.elsevier.com/xml/common/dtd", + "xocs": "http://www.elsevier.com/xml/xocs/dtd",} + abstract = root.xpath("//dc:description/text()", namespaces=ns) + if abstract: + PaperAbstract.objects.update_or_create( + paper=paper, + defaults={ + "abstract": abstract[0].strip(), + "source": "elsevier" + } + ) + paper.has_abstract = True + paper.has_abstract_xml = True + paper.fetch_status = "abstract_ready" + else: + paper.save_fail_reason("elsevier_abstract_not_found") + continue - elif res.status_code == 404: - paper.save_fail_reason("elsevier_doi_not_found") + paras = root.xpath("//ce:para", namespaces=ns) + has_fulltext = len(paras) > 0 + if has_fulltext is False: + rawtexts = root.xpath("//xocs:rawtext/text()",namespaces=ns) + if rawtexts and len(rawtexts[0].strip()) > 2000: + has_fulltext = True + if has_fulltext: + paper.has_fulltext = True + paper.has_fulltext_xml = True + paper.fetch_status = "fulltext_ready" + + paper.save_file_xml(xml_str) + paper.save(update_fields=["has_abstract", + "has_abstract_xml", "has_fulltext", + "has_fulltext_xml", "update_time", "fetch_status"]) + + elif res.status_code == 404: + paper.save_fail_reason("elsevier_doi_not_found") + finally: + if paper.fetch_status == "downloading": + paper.fetch_status = original_status + paper.save(update_fields=["fetch_status", "update_time"]) qs_count = qs.count() if show_task_run(def_name) and qs_count > 0: @@ -315,8 +325,7 @@ def download_pdf(paper_id): msg = save_pdf_from_openalex(paper) return msg, current_from finally: - # 出错时恢复到原状态 - if paper.fetch_status == "downloading" and paper.has_fulltext_pdf is False: + if paper.fetch_status == "downloading": paper.fetch_status = original_status paper.save(update_fields=['fetch_status', 'update_time'])