feat: 恢复状态

This commit is contained in:
caoqianming 2026-01-29 18:08:32 +08:00
parent df7dbc6717
commit f922685561
1 changed file with 60 additions and 51 deletions

View File

@ -139,60 +139,70 @@ def get_abstract_from_elsevier(number_of_task:int = 20):
for paper in qs[:number_of_task]: for paper in qs[:number_of_task]:
if not show_task_run(def_name): if not show_task_run(def_name):
break break
original_status = paper.fetch_status
if original_status == "downloading":
return f"paper {paper.id} is already downloading"
paper.fetch_status = "downloading"
paper.save(update_fields=["fetch_status", "update_time"])
try: try:
res = req.get(
f"https://api.elsevier.com/content/article/doi/{paper.doi}",
params=params,
timeout=(3, 15)
)
except requests.RequestException:
err_msg = "elsevier_request_error"
break
if res.status_code == 200:
xml_str = res.text
try: try:
root = etree.fromstring(xml_str.encode("utf-8")) res = req.get(
except etree.XMLSyntaxError: f"https://api.elsevier.com/content/article/doi/{paper.doi}",
paper.save_fail_reason("elsevier_xml_error") params=params,
continue timeout=(3, 15)
ns = {"dc": "http://purl.org/dc/elements/1.1/",
"ce": "http://www.elsevier.com/xml/common/dtd",
"xocs": "http://www.elsevier.com/xml/xocs/dtd",}
abstract = root.xpath("//dc:description/text()", namespaces=ns)
if abstract:
PaperAbstract.objects.update_or_create(
paper=paper,
defaults={
"abstract": abstract[0].strip(),
"source": "elsevier"
}
) )
paper.has_abstract = True except requests.RequestException:
paper.has_abstract_xml = True err_msg = "elsevier_request_error"
paper.fetch_status = "abstract_ready" break
else: if res.status_code == 200:
paper.save_fail_reason("elsevier_abstract_not_found") xml_str = res.text
continue try:
root = etree.fromstring(xml_str.encode("utf-8"))
except etree.XMLSyntaxError:
paper.save_fail_reason("elsevier_xml_error")
continue
paras = root.xpath("//ce:para", namespaces=ns) ns = {"dc": "http://purl.org/dc/elements/1.1/",
has_fulltext = len(paras) > 0 "ce": "http://www.elsevier.com/xml/common/dtd",
if has_fulltext is False: "xocs": "http://www.elsevier.com/xml/xocs/dtd",}
rawtexts = root.xpath("//xocs:rawtext/text()",namespaces=ns) abstract = root.xpath("//dc:description/text()", namespaces=ns)
if rawtexts and len(rawtexts[0].strip()) > 2000: if abstract:
has_fulltext = True PaperAbstract.objects.update_or_create(
if has_fulltext: paper=paper,
paper.has_fulltext = True defaults={
paper.has_fulltext_xml = True "abstract": abstract[0].strip(),
paper.fetch_status = "fulltext_ready" "source": "elsevier"
}
)
paper.has_abstract = True
paper.has_abstract_xml = True
paper.fetch_status = "abstract_ready"
else:
paper.save_fail_reason("elsevier_abstract_not_found")
continue
paper.save_file_xml(xml_str) paras = root.xpath("//ce:para", namespaces=ns)
paper.save(update_fields=["has_abstract", has_fulltext = len(paras) > 0
"has_abstract_xml", "has_fulltext", if has_fulltext is False:
"has_fulltext_xml", "update_time", "fetch_status"]) rawtexts = root.xpath("//xocs:rawtext/text()",namespaces=ns)
if rawtexts and len(rawtexts[0].strip()) > 2000:
has_fulltext = True
if has_fulltext:
paper.has_fulltext = True
paper.has_fulltext_xml = True
paper.fetch_status = "fulltext_ready"
elif res.status_code == 404: paper.save_file_xml(xml_str)
paper.save_fail_reason("elsevier_doi_not_found") paper.save(update_fields=["has_abstract",
"has_abstract_xml", "has_fulltext",
"has_fulltext_xml", "update_time", "fetch_status"])
elif res.status_code == 404:
paper.save_fail_reason("elsevier_doi_not_found")
finally:
if paper.fetch_status == "downloading":
paper.fetch_status = original_status
paper.save(update_fields=["fetch_status", "update_time"])
qs_count = qs.count() qs_count = qs.count()
if show_task_run(def_name) and qs_count > 0: if show_task_run(def_name) and qs_count > 0:
@ -315,8 +325,7 @@ def download_pdf(paper_id):
msg = save_pdf_from_openalex(paper) msg = save_pdf_from_openalex(paper)
return msg, current_from return msg, current_from
finally: finally:
# 出错时恢复到原状态 if paper.fetch_status == "downloading":
if paper.fetch_status == "downloading" and paper.has_fulltext_pdf is False:
paper.fetch_status = original_status paper.fetch_status = original_status
paper.save(update_fields=['fetch_status', 'update_time']) paper.save(update_fields=['fetch_status', 'update_time'])