diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index bb9b7b0..18fc75d 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -712,9 +712,12 @@ def can_send_more(max_running): @shared_task(base=CustomTask) def send_download_fulltext_task(number_of_task=100): - qs = Paper.objects.filter(has_fulltext=False, fail_reason=None, is_oa=True).exclude( + # 只排除"本下载链路已尝试过"的论文(download_pdf 终态会打 download_pdf_tried 标记), + # 不再用 fail_reason=None —— 否则被 openalex 保活链失败标记蹭上 fail_reason 的论文会被 + # 永久遮蔽, 其 oa_url/elsevier/scihub 兜底路径永远不会被尝试。 + qs = Paper.objects.filter(has_fulltext=False, is_oa=True).exclude( fetch_status='downloading' - ) + ).exclude(fail_reason__contains="download_pdf_tried") if not qs.exists(): return "done" qs0 = qs.order_by("?") @@ -772,8 +775,12 @@ def download_pdf(paper_id): # if paper.has_fulltext_pdf is False and cache.get("openalex_api_exceed") is None: # current_from = "openalex" # msg = save_pdf_from_openalex(paper) - if paper.fail_reason is None and paper.has_fulltext_pdf is False: - paper.save_fail_reason(msg) + if paper.has_fulltext_pdf is False: + # 终态标记: 无论该论文之前是否已被 openalex 保活链写过 fail_reason, 这里都追加 + # 一条带 download_pdf_tried 的终态, 供 send_download_fulltext_task 据此排除。 + # 既防本下载链路(oa_url/elsevier/scihub)对同一篇无限重试, 也避免 openalex 链 + # 留下的 fail_reason 把本链路的首次尝试给遮蔽掉。 + paper.save_fail_reason(f"download_pdf_tried:{msg}") return msg, current_from finally: paper.fetch_end()