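feat: send_download_fulltext_task (renamed from get_pdf_from_oa_url; download_pdf now tries save_pdf_from_oa_url first, then save_pdf_from_openalex)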
This commit is contained in:
parent
67b3a4d8c3
commit
eda1435b58
|
|
@ -215,7 +215,7 @@ def get_pdf_from_elsevier(number_of_task=100):
|
||||||
def_name = get_pdf_from_elsevier.name
|
def_name = get_pdf_from_elsevier.name
|
||||||
if not show_task_run(def_name):
|
if not show_task_run(def_name):
|
||||||
return "stoped"
|
return "stoped"
|
||||||
qs = Paper.objects.filter(has_fulltext=True, has_fulltext_pdf=False)
|
qs = Paper.objects.filter(has_fulltext=True, has_fulltext_pdf=False, has_abstract=True)
|
||||||
err_msg = ""
|
err_msg = ""
|
||||||
with requests.Session() as req:
|
with requests.Session() as req:
|
||||||
for paper in qs[:number_of_task]:
|
for paper in qs[:number_of_task]:
|
||||||
|
|
@ -257,13 +257,11 @@ def get_pdf_from_elsevier(number_of_task=100):
|
||||||
|
|
||||||
|
|
||||||
@shared_task(base=CustomTask)
|
@shared_task(base=CustomTask)
|
||||||
def get_pdf_from_oa_url(number_of_task=100):
|
def send_download_fulltext_task(number_of_task=100):
|
||||||
def_name = get_pdf_from_oa_url.name
|
def_name = send_download_fulltext_task.name
|
||||||
if not show_task_run(def_name):
|
if not show_task_run(def_name):
|
||||||
return "stoped"
|
return "stoped"
|
||||||
qs = Paper.objects.filter(is_oa=True, has_fulltext=False).exclude(
|
qs = Paper.objects.filter(is_oa=True, has_fulltext=False, fail_reason=None)
|
||||||
fail_reason__contains="oa_url_request_error"
|
|
||||||
).exclude(fail_reason__contains="oa_url_not_pdf")
|
|
||||||
|
|
||||||
if not qs.exists():
|
if not qs.exists():
|
||||||
return "done"
|
return "done"
|
||||||
|
|
@ -290,7 +288,7 @@ def get_pdf_from_oa_url(number_of_task=100):
|
||||||
qs_count = qs.count()
|
qs_count = qs.count()
|
||||||
if show_task_run(def_name) and qs_count > 0:
|
if show_task_run(def_name) and qs_count > 0:
|
||||||
current_app.send_task(
|
current_app.send_task(
|
||||||
"apps.resm.tasks.get_pdf_from_oa_url",
|
def_name,
|
||||||
kwargs={
|
kwargs={
|
||||||
"number_of_task": number_of_task,
|
"number_of_task": number_of_task,
|
||||||
},
|
},
|
||||||
|
|
@ -317,12 +315,19 @@ def download_pdf(paper_id):
|
||||||
# 设置处理中标记,防止并发重复处理
|
# 设置处理中标记,防止并发重复处理
|
||||||
cache.set(cache_key, True, timeout=3600)
|
cache.set(cache_key, True, timeout=3600)
|
||||||
|
|
||||||
|
msg = save_pdf_from_oa_url(paper)
|
||||||
|
if paper.has_fulltext_pdf is False:
|
||||||
|
msg = save_pdf_from_openalex(paper)
|
||||||
|
return msg
|
||||||
|
|
||||||
|
|
||||||
|
def save_pdf_from_oa_url(paper:Paper):
|
||||||
try:
|
try:
|
||||||
headers = get_random_headers()
|
headers = get_random_headers()
|
||||||
res = requests.get(paper.oa_url, headers=headers, timeout=(3, 15))
|
res = requests.get(paper.oa_url, headers=headers, timeout=(3, 15))
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
paper.save_fail_reason("oa_url_request_error")
|
paper.save_fail_reason("oa_url_request_error")
|
||||||
return save_pdf_from_openalex(paper)
|
return f"oa_url_request_error: {str(e)}"
|
||||||
|
|
||||||
if res.status_code == 200:
|
if res.status_code == 200:
|
||||||
# 检查是否是PDF文件:检查魔数 %PDF 或 content-type
|
# 检查是否是PDF文件:检查魔数 %PDF 或 content-type
|
||||||
|
|
@ -338,11 +343,6 @@ def download_pdf(paper_id):
|
||||||
paper.fetch_status = "fulltext_ready"
|
paper.fetch_status = "fulltext_ready"
|
||||||
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "fetch_status", "update_time"])
|
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "fetch_status", "update_time"])
|
||||||
return "success"
|
return "success"
|
||||||
else:
|
|
||||||
return save_pdf_from_openalex(paper)
|
|
||||||
else:
|
|
||||||
return save_pdf_from_openalex(paper)
|
|
||||||
|
|
||||||
|
|
||||||
def save_pdf_from_openalex(paper:Paper):
|
def save_pdf_from_openalex(paper:Paper):
|
||||||
# 尝试openalex下载
|
# 尝试openalex下载
|
||||||
|
|
@ -352,7 +352,7 @@ def save_pdf_from_openalex(paper:Paper):
|
||||||
"api_key": OPENALEX_KEY
|
"api_key": OPENALEX_KEY
|
||||||
})
|
})
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
paper.save_fail_reason("oa_url_not_pdf;openalex_pdf_error")
|
paper.save_fail_reason("openalex_pdf_error")
|
||||||
return f"openalex_pdf_error: {str(e)}"
|
return f"openalex_pdf_error: {str(e)}"
|
||||||
if res.status_code == 200:
|
if res.status_code == 200:
|
||||||
paper.save_file_pdf(res.content)
|
paper.save_file_pdf(res.content)
|
||||||
|
|
@ -361,6 +361,6 @@ def save_pdf_from_openalex(paper:Paper):
|
||||||
paper.fetch_status = "fulltext_ready"
|
paper.fetch_status = "fulltext_ready"
|
||||||
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "fetch_status", "update_time"])
|
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "fetch_status", "update_time"])
|
||||||
return "success"
|
return "success"
|
||||||
else:
|
|
||||||
paper.save_fail_reason("oa_url_not_pdf;openalex_pdf_error")
|
|
||||||
return f"openalex_pdf_error: {res.status_code}"
|
# https://sci.bban.top/pdf/10.1016/j.conbuildmat.2020.121016.pdf?download=true
|
||||||
Loading…
Reference in New Issue