From 90a54d2279a3e93682df925522c74472042566ae Mon Sep 17 00:00:00 2001 From: caoqianming Date: Wed, 28 Jan 2026 11:24:52 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0get=5Fpdf=5Ffrom=5Foa?= =?UTF-8?q?=5Furl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/resm/models.py | 7 +++++ apps/resm/tasks.py | 75 ++++++++++++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 22 deletions(-) diff --git a/apps/resm/models.py b/apps/resm/models.py index 3eec872..666617e 100644 --- a/apps/resm/models.py +++ b/apps/resm/models.py @@ -70,6 +70,13 @@ class Paper(BaseModel): with open(paper_file, "wb") as f: f.write(content) + def save_fail_reason(self, reason): + if self.fail_reason: + self.fail_reason += f";{reason}" + else: + self.fail_reason = reason + self.save(update_fields=["fail_reason", "update_time"]) + class PaperAbstract(BaseModel): diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index 13d55e6..8610cf1 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -92,19 +92,20 @@ def get_paper_meta_from_openalex(publication_year:int, keywords:str="", search:s ELSEVIER_APIKEY = 'aa8868cac9e27d6153ab0a0acd7b50bf' -def is_elsevier_abstract_task_enabled(): - return cache.get("elsevier_abstract_task_enabled", True) +def show_task_run(def_name: str): + return cache.get(def_name, True) @shared_task(base=CustomTask) def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int = 100): - if not is_elsevier_abstract_task_enabled(): + def_name = get_abstract_from_elsevier.__name__ + if not show_task_run(def_name): return "stoped" qs = Paper.objects.filter(has_abstract=False) if publication_year is not None: qs = qs.filter(publication_year=publication_year) qs = qs.exclude( - fail_reason="elsevier_doi_not_found" - ).exclude(fail_reason="elsevier_abstract_not_found").order_by("publication_date") + fail_reason__contains="elsevier_doi_not_found" + ).exclude(fail_reason__contains="elsevier_abstract_not_found").order_by("publication_date") if not qs.exists(): return "done" @@ -116,7 +117,7 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int err_msg = "" with requests.Session() as req: for paper in qs[:number_of_task]: - if not is_elsevier_abstract_task_enabled(): + if not show_task_run(def_name): break try: res = req.get( @@ -132,8 +133,7 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int try: root = etree.fromstring(xml_str.encode("utf-8")) except etree.XMLSyntaxError: - paper.fail_reason = "elsevier_xml_error" - paper.save(update_fields=["fail_reason", "update_time"]) + paper.save_fail_reason("elsevier_xml_error") continue ns = {"dc": "http://purl.org/dc/elements/1.1/", @@ -152,8 +152,7 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int paper.has_abstract_xml = True paper.fetch_status = "abstract_ready" else: - paper.fail_reason = "elsevier_abstract_not_found" - paper.save(update_fields=["fail_reason", "update_time"]) + paper.save_fail_reason("elsevier_abstract_not_found") continue paras = root.xpath("//ce:para", namespaces=ns) @@ -170,13 +169,13 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int paper.save_file_xml(xml_str) paper.save(update_fields=["has_abstract", "has_abstract_xml", "has_fulltext", - "has_fulltext_xml", "update_time", "fetch_status", "fail_reason"]) + "has_fulltext_xml", "update_time", "fetch_status"]) elif res.status_code == 404: - paper.fail_reason = "elsevier_doi_not_found" - paper.save(update_fields=["fail_reason", "update_time"]) + paper.save_fail_reason("elsevier_doi_not_found") + qs_count = qs.count() - if is_elsevier_abstract_task_enabled() and qs_count > 0: + if show_task_run(def_name) and qs_count > 0: current_app.send_task( "apps.resm.tasks.get_abstract_from_elsevier", kwargs={ @@ -188,22 +187,19 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int return f'{err_msg}, remaining {qs_count} papers' -def is_elsevier_pdf_task_enabled(): - return cache.get("elsevier_pdf_task_enabled", True) - - @shared_task(base=CustomTask) def get_pdf_from_elsevier(number_of_task=100): """ 获取elsevier全文 """ - if not is_elsevier_pdf_task_enabled(): + def_name = get_pdf_from_elsevier.__name__ + if not show_task_run(def_name): return "stoped" qs = Paper.objects.filter(has_fulltext=True, has_fulltext_pdf=False) err_msg = "" with requests.Session() as req: for paper in qs[:number_of_task]: - if not is_elsevier_pdf_task_enabled(): + if not show_task_run(def_name): break params = { "apiKey": ELSEVIER_APIKEY, @@ -223,7 +219,7 @@ def get_pdf_from_elsevier(number_of_task=100): paper.has_fulltext_pdf = True paper.save(update_fields=["has_fulltext_pdf", "update_time"]) qs_count = qs.count() - if is_elsevier_pdf_task_enabled() and qs_count > 0: + if show_task_run(def_name) and qs_count > 0: current_app.send_task( "apps.resm.tasks.get_pdf_from_elsevier", kwargs={ @@ -232,4 +228,39 @@ def get_pdf_from_elsevier(number_of_task=100): countdown=5, ) return f'{err_msg}, remaining {qs_count} papers' - \ No newline at end of file + + +@shared_task(base=CustomTask) +def get_pdf_from_oa_url(number_of_task=100): + def_name = get_pdf_from_oa_url.__name__ + if not show_task_run(def_name): + return "stoped" + qs = Paper.objects.filter(is_oa=True, has_fulltext=False).exclude( + fail_reason__contains="oa_url_request_error" + ).exclude(fail_reason__contains="oa_url_not_pdf") + err_msg = "" + for paper in qs[:number_of_task]: + if paper.oa_url: + try: + res = requests.get(paper.oa_url, timeout=(3, 15)) + except requests.RequestException: + paper.save_fail_reason("oa_url_request_error") + continue + if res.status_code == 200 and res.headers["content-type"] == "application/pdf": + paper.save_file_pdf(res.content) + paper.has_fulltext = True + paper.has_fulltext_pdf = True + paper.fetch_status = "fulltext_ready" + paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "fetch_status", "update_time"]) + else: + paper.save_fail_reason("oa_url_not_pdf") + qs_count = qs.count() + if show_task_run(def_name) and qs_count > 0: + current_app.send_task( + "apps.resm.tasks.get_pdf_from_oa_url", + kwargs={ + "number_of_task": number_of_task, + }, + countdown=5, + ) + return f'{err_msg}, remaining {qs_count} papers' \ No newline at end of file