feat: 添加任务get_abstract_from_elsevier

This commit is contained in:
caoqianming 2026-01-26 16:38:40 +08:00
parent c39fd7d990
commit a11dce1358
1 changed files with 88 additions and 3 deletions

View File

@ -4,9 +4,13 @@ from apps.utils.tasks import CustomTask
from celery import shared_task
from pyalex import Works, config
from itertools import chain
from apps.resm.models import Paper
from apps.resm.models import Paper, PaperAbstract
from apps.utils.snowflake import idWorker
from django.core.cache import cache
import requests
from lxml import etree
from django.conf import settings
import os
# Module-level configuration of the shared pyalex `config` object; these
# assignments run once, when this module is imported.
config.email = "caoqianming@foxmail.com"  # contact e-mail set on pyalex's config
config.max_retries = 0  # retry count on pyalex's config is set to zero
@ -57,5 +61,86 @@ def get_paper_meta_from_openalex(publication_year:int, search_key:str):
# NOTE(security): this key is a credential and should not live in source
# control. Supply it through the ELSEVIER_APIKEY environment variable; the
# literal below is kept only as a fallback so existing deployments keep working.
ELSEVIER_APIKEY = os.environ.get("ELSEVIER_APIKEY", 'aa8868cac9e27d6153ab0a0acd7b50bf')
@shared_task(base=CustomTask)
def get_abstract_from_elsevier(publication_year: int, number_of_task: int = 100):
    """Fetch abstracts (and full-text XML, when present) from the Elsevier API.

    Selects papers of ``publication_year`` that have no abstract yet and were
    not previously marked ``elsevier_doi_not_found``, oldest publication date
    first, and requests each one by DOI from the Elsevier article endpoint.

    Args:
        publication_year: Publication year of the papers to process.
        number_of_task: Maximum number of papers to request in this run.

    Returns:
        ``"done"`` when no matching papers remain; otherwise a string of the
        form ``"<err_msg>, remaining <n> papers"`` where ``err_msg`` is empty
        unless a request failed.
    """
    pending = Paper.objects.filter(
        publication_year=publication_year,
        has_abstract=False,
    ).exclude(
        fail_reason="elsevier_doi_not_found"
    ).order_by("publication_date")
    if not pending.exists():
        return "done"

    params = {
        "apiKey": ELSEVIER_APIKEY,
        "httpAccept": "text/xml",
    }
    # Namespace prefixes used by the XPath queries below (loop-invariant).
    ns = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "ce": "http://www.elsevier.com/xml/common/dtd",
    }
    flag_fields = [
        "has_abstract",
        "has_abstract_xml",
        "has_fulltext",
        "has_fulltext_xml",
        "update_time",
    ]
    err_msg = ""

    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        for paper in pending[:number_of_task]:
            try:
                res = session.get(
                    f"https://api.elsevier.com/content/article/doi/{paper.doi}",
                    params=params,
                    timeout=(3, 15),
                )
            except requests.RequestException:
                # A transport failure aborts the rest of this batch.
                err_msg = "elsevier_request_error"
                break

            if res.status_code == 404:
                paper.fail_reason = "elsevier_doi_not_found"
                paper.save(update_fields=["fail_reason"])
                continue
            if res.status_code != 200:
                # NOTE(review): other statuses leave the paper unmarked, so it
                # stays at the head of the queue for the next run.
                continue

            # Parse the raw bytes so lxml honours the encoding declared in the
            # document instead of relying on requests' charset guess.
            xml_bytes = res.content
            try:
                root = etree.fromstring(xml_bytes)
            except etree.XMLSyntaxError:
                paper.fail_reason = "elsevier_xml_error"
                paper.save(update_fields=["fail_reason"])
                continue

            changed = False
            abstract = root.xpath("//dc:description/text()", namespaces=ns)
            if abstract:
                PaperAbstract.objects.update_or_create(
                    paper=paper,
                    defaults={
                        "abstract": abstract[0].strip(),
                        "source": "elsevier",
                    },
                )
                paper.has_abstract = True
                paper.has_abstract_xml = True
                changed = True

            # Any <ce:para> element is treated as evidence of full text.
            if root.xpath("//ce:para", namespaces=ns):
                paper.has_fulltext = True
                paper.has_fulltext_xml = True
                _store_elsevier_xml(paper, xml_bytes)
                changed = True

            # Only write back when a flag changed, so update_time is not
            # bumped for a response that yielded nothing.
            if changed:
                paper.save(update_fields=flag_fields)

    remaining_count = pending.count()
    if remaining_count == 0:
        return "done"
    return f'{err_msg}, remaining {remaining_count} papers'


def _store_elsevier_xml(paper, xml_bytes):
    """Write the article XML under media/papers/<year>/<month>/<day>/<doi>.xml."""
    publication_date = paper.publication_date
    paper_dir = os.path.join(
        settings.BASE_DIR,
        "media/papers",
        str(publication_date.year),
        str(publication_date.month),
        str(publication_date.day),
    )
    os.makedirs(paper_dir, exist_ok=True)
    # DOIs contain "/", which cannot appear in a file name.
    safe_doi = paper.doi.replace("/", "_")
    paper_file = os.path.join(paper_dir, f"{safe_doi}.xml")
    with open(paper_file, "wb") as f:
        f.write(xml_bytes)