feat: 优化save_pdf_from_scihub

This commit is contained in:
caoqianming 2026-02-04 11:26:55 +08:00
parent b9f06b4859
commit d7aa8f8ada
2 changed files with 32 additions and 32 deletions

View File

@@ -140,39 +140,39 @@ async def download_pdf_with_playwright(url: str, output: str = "paper.pdf", head
logger.info("主动等待 PDF 响应超时,继续其他方式")
# 尝试通过页面下载按钮
download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
for sel in download_selectors:
try:
if await page.locator(sel).count() > 0:
logger.info(f"尝试点击下载元素: {sel}")
async with page.expect_download() as di:
await page.click(sel)
download = await di.value
await download.save_as(output)
logger.info(f"已保存 PDF: {output}")
with open(output, "rb") as f:
pdf_content = f.read()
break
except Exception:
logger.exception(f"通过选择器下载失败: {sel}")
# download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
# for sel in download_selectors:
# try:
# if await page.locator(sel).count() > 0:
# logger.info(f"尝试点击下载元素: {sel}")
# async with page.expect_download() as di:
# await page.click(sel)
# download = await di.value
# await download.save_as(output)
# logger.info(f"已保存 PDF: {output}")
# with open(output, "rb") as f:
# pdf_content = f.read()
# break
# except Exception:
# logger.exception(f"通过选择器下载失败: {sel}")
# 回退:查找页面内 PDF 链接并直接访问
if not pdf_content:
logger.info("尝试查找页面内 PDF 链接")
try:
links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
candidates = [u for u in links if ".pdf" in u]
if candidates:
pdf_url = candidates[0]
logger.info(f"直接访问 PDF 链接: {pdf_url}")
resp = await page.goto(pdf_url, wait_until="networkidle")
if resp and resp.status == 200:
pdf_content = await resp.body()
with open(output, "wb") as f:
f.write(pdf_content)
logger.info(f"已保存 PDF: {output}")
except Exception:
logger.exception("直接访问 PDF 链接失败")
# if not pdf_content:
# logger.info("尝试查找页面内 PDF 链接")
# try:
# links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
# candidates = [u for u in links if ".pdf" in u]
# if candidates:
# pdf_url = candidates[0]
# logger.info(f"直接访问 PDF 链接: {pdf_url}")
# resp = await page.goto(pdf_url, wait_until="networkidle")
# if resp and resp.status == 200:
# pdf_content = await resp.body()
# with open(output, "wb") as f:
# f.write(pdf_content)
# logger.info(f"已保存 PDF: {output}")
# except Exception:
# logger.exception("直接访问 PDF 链接失败")
if pdf_content:
logger.info(f"下载成功,大小: {len(pdf_content)} bytes")

View File

@@ -412,7 +412,7 @@ def save_pdf_from_scihub(paper:Paper):
from .d_scihub import download_paper_by_doi
is_ok, err_msg = download_paper_by_doi(paper.doi, paper.init_paper_path("pdf"))
if is_ok:
paper.has_fulltext_pdf = True
paper.has_fulltext = True
paper.has_fulltext_pdf = True
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf"])
else: