diff --git a/apps/resm/models.py b/apps/resm/models.py index bbff44d..88ef40b 100644 --- a/apps/resm/models.py +++ b/apps/resm/models.py @@ -1,5 +1,7 @@ from django.db import models from apps.utils.models import BaseModel +from django.conf import settings +import os # Create your models here. class Paper(BaseModel): @@ -43,6 +45,32 @@ class Paper(BaseModel): ) search_word_first = models.TextField(default="cement") + def init_save_dir(self): + publication_date = self.publication_date + paper_dir = os.path.join( + settings.BASE_DIR, + "media/papers", + str(publication_date.year), + str(publication_date.month), + str(publication_date.day) + ) + os.makedirs(paper_dir, exist_ok=True) + return paper_dir + + def save_file_xml(self, content): + safe_doi = self.doi.replace("/", "_") + paper_file = os.path.join(self.init_save_dir(), f"{safe_doi}.xml") + with open(paper_file, "wb") as f: + f.write(content.encode("utf-8")) + + def save_file_pdf(self, content): + safe_doi = self.doi.replace("/", "_") + paper_file = os.path.join(self.init_save_dir(), f"{safe_doi}.pdf") + with open(paper_file, "wb") as f: + f.write(content) + + + class PaperAbstract(BaseModel): paper = models.OneToOneField( Paper, diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index 0b3d529..9a39e8a 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -120,7 +120,8 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int continue ns = {"dc": "http://purl.org/dc/elements/1.1/", - "ce": "http://www.elsevier.com/xml/common/dtd"} + "ce": "http://www.elsevier.com/xml/common/dtd", + "xocs": "http://www.elsevier.com/xml/xocs/dtd",} abstract = root.xpath("//dc:description/text()", namespaces=ns) if abstract: PaperAbstract.objects.update_or_create( @@ -140,25 +141,16 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int paras = root.xpath("//ce:para", namespaces=ns) has_fulltext = len(paras) > 0 + if has_fulltext is False: + rawtexts = root.xpath("//xocs:rawtext/text()",namespaces=ns) + if rawtexts and len(rawtexts[0].strip()) > 2000: + has_fulltext = True if has_fulltext: paper.has_fulltext = True paper.has_fulltext_xml = True paper.fetch_status = "fulltext_ready" - publication_date = paper.publication_date - paper_dir = os.path.join( - settings.BASE_DIR, - "media/papers", - str(publication_date.year), - str(publication_date.month), - str(publication_date.day) - ) - os.makedirs(paper_dir, exist_ok=True) - - safe_doi = paper.doi.replace("/", "_") - paper_file = os.path.join(paper_dir, f"{safe_doi}.xml") - with open(paper_file, "wb") as f: - f.write(xml_str.encode("utf-8")) + paper.save_file_xml(xml_str) paper.save(update_fields=["has_abstract", "has_abstract_xml", "has_fulltext", "has_fulltext_xml", "update_time", "fetch_status", "fail_reason"]) @@ -175,4 +167,50 @@ def get_abstract_from_elsevier(publication_year: int = None, number_of_task:int }, countdown=5, ) - return f'{err_msg}, remaining {qs.count()} papers' \ No newline at end of file + return f'{err_msg}, remaining {qs.count()} papers' + + +def is_elsevier_pdf_task_enabled(): + return cache.get("elsevier_pdf_task_enabled", True) + + +@shared_task(base=CustomTask) +def get_pdf_from_elsevier(number_of_task=100): + """ + 获取elsevier全文 + """ + if not is_elsevier_pdf_task_enabled(): + return "stoped" + qs = Paper.objects.filter(has_fulltext=True, has_fulltext_pdf=False) + err_msg = "" + with requests.Session() as req: + for paper in qs[:number_of_task]: + if not is_elsevier_pdf_task_enabled(): + break + params = { + "apiKey": ELSEVIER_APIKEY, + "httpAccept": "application/pdf" + } + try: + res = req.get( + f"https://api.elsevier.com/content/article/doi/{paper.doi}", + params=params, + timeout=(3, 15) + ) + except requests.RequestException: + err_msg = "elsevier_request_error" + break + if res.status_code == 200 and res.headers["content-type"] == "application/pdf": + paper.save_file_pdf(res.content) + paper.has_fulltext_pdf = True + paper.save(update_fields=["has_fulltext_pdf", "update_time"]) + if is_elsevier_pdf_task_enabled(): + current_app.send_task( + "apps.resm.tasks.get_pdf_from_elsevier", + kwargs={ + "number_of_task": number_of_task, + }, + countdown=5, + ) + return f'{err_msg}, remaining {qs.count()} papers' + \ No newline at end of file