fix(resm): 修复 openalex 链失败遮蔽 PDF 兜底下载

send_download_fulltext_task 原用 fail_reason=None 选取, 导致被 openalex
保活链写过 fail_reason 的论文被永久遮蔽, oa_url/elsevier/scihub 兜底路径
永不尝试。改为 download_pdf 终态打稳定标记 download_pdf_tried, 选取时据此
排除 —— 既解除遮蔽, 又防本链路对同一篇无限重试。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
caoqianming 2026-06-23 10:44:34 +08:00
parent 12f97fc47f
commit 75d02814c6
1 changed file with 11 additions and 4 deletions

View File

@@ -712,9 +712,12 @@ def can_send_more(max_running):
@shared_task(base=CustomTask)
def send_download_fulltext_task(number_of_task=100):
qs = Paper.objects.filter(has_fulltext=False, fail_reason=None, is_oa=True).exclude(
# 只排除"本下载链路已尝试过"的论文(download_pdf 终态会打 download_pdf_tried 标记),
# 不再用 fail_reason=None —— 否则被 openalex 保活链失败标记蹭上 fail_reason 的论文会被
# 永久遮蔽, 其 oa_url/elsevier/scihub 兜底路径永远不会被尝试。
qs = Paper.objects.filter(has_fulltext=False, is_oa=True).exclude(
fetch_status='downloading'
)
).exclude(fail_reason__contains="download_pdf_tried")
if not qs.exists():
return "done"
qs0 = qs.order_by("?")
@@ -772,8 +775,12 @@ def download_pdf(paper_id):
# if paper.has_fulltext_pdf is False and cache.get("openalex_api_exceed") is None:
# current_from = "openalex"
# msg = save_pdf_from_openalex(paper)
if paper.fail_reason is None and paper.has_fulltext_pdf is False:
paper.save_fail_reason(msg)
if paper.has_fulltext_pdf is False:
# 终态标记: 无论该论文之前是否已被 openalex 保活链写过 fail_reason, 这里都追加
# 一条带 download_pdf_tried 的终态, 供 send_download_fulltext_task 据此排除。
# 既防本下载链路(oa_url/elsevier/scihub)对同一篇无限重试, 也避免 openalex 链
# 留下的 fail_reason 把本链路的首次尝试给遮蔽掉。
paper.save_fail_reason(f"download_pdf_tried:{msg}")
return msg, current_from
finally:
paper.fetch_end()