diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index bc1437b..1248acb 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -215,7 +215,7 @@ def get_pdf_from_elsevier(number_of_task=100): def_name = get_pdf_from_elsevier.name if not show_task_run(def_name): return "stoped" - qs = Paper.objects.filter(has_fulltext=True, has_fulltext_pdf=False) + qs = Paper.objects.filter(has_fulltext=True, has_fulltext_pdf=False, has_abstract=True) err_msg = "" with requests.Session() as req: for paper in qs[:number_of_task]: @@ -257,13 +257,11 @@ def get_pdf_from_elsevier(number_of_task=100): @shared_task(base=CustomTask) -def get_pdf_from_oa_url(number_of_task=100): - def_name = get_pdf_from_oa_url.name +def send_download_fulltext_task(number_of_task=100): + def_name = send_download_fulltext_task.name if not show_task_run(def_name): return "stoped" - qs = Paper.objects.filter(is_oa=True, has_fulltext=False).exclude( - fail_reason__contains="oa_url_request_error" - ).exclude(fail_reason__contains="oa_url_not_pdf") + qs = Paper.objects.filter(is_oa=True, has_fulltext=False, fail_reason=None) if not qs.exists(): return "done" @@ -290,7 +288,7 @@ def get_pdf_from_oa_url(number_of_task=100): qs_count = qs.count() if show_task_run(def_name) and qs_count > 0: current_app.send_task( - "apps.resm.tasks.get_pdf_from_oa_url", + def_name, kwargs={ "number_of_task": number_of_task, }, @@ -317,12 +315,19 @@ def download_pdf(paper_id): # 设置处理中标记,防止并发重复处理 cache.set(cache_key, True, timeout=3600) + msg = save_pdf_from_oa_url(paper) + if paper.has_fulltext_pdf is False: + msg = save_pdf_from_openalex(paper) + return msg + + +def save_pdf_from_oa_url(paper:Paper): try: headers = get_random_headers() res = requests.get(paper.oa_url, headers=headers, timeout=(3, 15)) except requests.RequestException as e: paper.save_fail_reason("oa_url_request_error") - return save_pdf_from_openalex(paper) + return f"oa_url_request_error: {str(e)}" if res.status_code == 200: # 检查是否是PDF文件:检查魔数 %PDF 或 content-type @@ -338,11 +343,6 @@ def download_pdf(paper_id): paper.fetch_status = "fulltext_ready" paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "fetch_status", "update_time"]) return "success" - else: - return save_pdf_from_openalex(paper) - else: - return save_pdf_from_openalex(paper) - def save_pdf_from_openalex(paper:Paper): # 尝试openalex下载 @@ -352,7 +352,7 @@ def save_pdf_from_openalex(paper:Paper): "api_key": OPENALEX_KEY }) except requests.RequestException as e: - paper.save_fail_reason("oa_url_not_pdf;openalex_pdf_error") + paper.save_fail_reason("openalex_pdf_error") return f"openalex_pdf_error: {str(e)}" if res.status_code == 200: paper.save_file_pdf(res.content) @@ -361,6 +361,6 @@ def save_pdf_from_openalex(paper:Paper): paper.fetch_status = "fulltext_ready" paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "fetch_status", "update_time"]) return "success" - else: - paper.save_fail_reason("oa_url_not_pdf;openalex_pdf_error") - return f"openalex_pdf_error: {res.status_code}" \ No newline at end of file + + +# https://sci.bban.top/pdf/10.1016/j.conbuildmat.2020.121016.pdf?download=true \ No newline at end of file