From 75d02814c6d794b3a531ee3ad7dd9135b5d0861a Mon Sep 17 00:00:00 2001 From: caoqianming Date: Tue, 23 Jun 2026 10:44:34 +0800 Subject: [PATCH] =?UTF-8?q?fix(resm):=20=E4=BF=AE=E5=A4=8D=20openalex=20?= =?UTF-8?q?=E9=93=BE=E5=A4=B1=E8=B4=A5=E9=81=AE=E8=94=BD=20PDF=20=E5=85=9C?= =?UTF-8?q?=E5=BA=95=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit send_download_fulltext_task 原用 fail_reason=None 选取, 导致被 openalex 保活链写过 fail_reason 的论文被永久遮蔽, oa_url/elsevier/scihub 兜底路径 永不尝试。改为 download_pdf 终态打稳定标记 download_pdf_tried, 选取时据此 排除 —— 既解除遮蔽, 又防本链路对同一篇无限重试。 Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/resm/tasks.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index bb9b7b0..18fc75d 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -712,9 +712,12 @@ def can_send_more(max_running): @shared_task(base=CustomTask) def send_download_fulltext_task(number_of_task=100): - qs = Paper.objects.filter(has_fulltext=False, fail_reason=None, is_oa=True).exclude( + # 只排除"本下载链路已尝试过"的论文(download_pdf 终态会打 download_pdf_tried 标记), + # 不再用 fail_reason=None —— 否则被 openalex 保活链失败标记蹭上 fail_reason 的论文会被 + # 永久遮蔽, 其 oa_url/elsevier/scihub 兜底路径永远不会被尝试。 + qs = Paper.objects.filter(has_fulltext=False, is_oa=True).exclude( fetch_status='downloading' - ) + ).exclude(fail_reason__contains="download_pdf_tried") if not qs.exists(): return "done" qs0 = qs.order_by("?") @@ -772,8 +775,12 @@ def download_pdf(paper_id): # if paper.has_fulltext_pdf is False and cache.get("openalex_api_exceed") is None: # current_from = "openalex" # msg = save_pdf_from_openalex(paper) - if paper.fail_reason is None and paper.has_fulltext_pdf is False: - paper.save_fail_reason(msg) + if paper.has_fulltext_pdf is False: + # 终态标记: 无论该论文之前是否已被 openalex 保活链写过 fail_reason, 这里都追加 + # 一条带 download_pdf_tried 的终态, 供 send_download_fulltext_task 据此排除。 + # 既防本下载链路(oa_url/elsevier/scihub)对同一篇无限重试, 也避免 openalex 链 + # 留下的 fail_reason 把本链路的首次尝试给遮蔽掉。 + paper.save_fail_reason(f"download_pdf_tried:{msg}") return msg, current_from finally: paper.fetch_end()