feat: 优化save_pdf_from_scihub
This commit is contained in:
parent
b9f06b4859
commit
d7aa8f8ada
|
|
@ -140,39 +140,39 @@ async def download_pdf_with_playwright(url: str, output: str = "paper.pdf", head
|
|||
logger.info("主动等待 PDF 响应超时,继续其他方式")
|
||||
|
||||
# 尝试通过页面下载按钮
|
||||
download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
|
||||
for sel in download_selectors:
|
||||
try:
|
||||
if await page.locator(sel).count() > 0:
|
||||
logger.info(f"尝试点击下载元素: {sel}")
|
||||
async with page.expect_download() as di:
|
||||
await page.click(sel)
|
||||
download = await di.value
|
||||
await download.save_as(output)
|
||||
logger.info(f"已保存 PDF: {output}")
|
||||
with open(output, "rb") as f:
|
||||
pdf_content = f.read()
|
||||
break
|
||||
except Exception:
|
||||
logger.exception(f"通过选择器下载失败: {sel}")
|
||||
# download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
|
||||
# for sel in download_selectors:
|
||||
# try:
|
||||
# if await page.locator(sel).count() > 0:
|
||||
# logger.info(f"尝试点击下载元素: {sel}")
|
||||
# async with page.expect_download() as di:
|
||||
# await page.click(sel)
|
||||
# download = await di.value
|
||||
# await download.save_as(output)
|
||||
# logger.info(f"已保存 PDF: {output}")
|
||||
# with open(output, "rb") as f:
|
||||
# pdf_content = f.read()
|
||||
# break
|
||||
# except Exception:
|
||||
# logger.exception(f"通过选择器下载失败: {sel}")
|
||||
|
||||
# 回退:查找页面内 PDF 链接并直接访问
|
||||
if not pdf_content:
|
||||
logger.info("尝试查找页面内 PDF 链接")
|
||||
try:
|
||||
links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
|
||||
candidates = [u for u in links if ".pdf" in u]
|
||||
if candidates:
|
||||
pdf_url = candidates[0]
|
||||
logger.info(f"直接访问 PDF 链接: {pdf_url}")
|
||||
resp = await page.goto(pdf_url, wait_until="networkidle")
|
||||
if resp and resp.status == 200:
|
||||
pdf_content = await resp.body()
|
||||
with open(output, "wb") as f:
|
||||
f.write(pdf_content)
|
||||
logger.info(f"已保存 PDF: {output}")
|
||||
except Exception:
|
||||
logger.exception("直接访问 PDF 链接失败")
|
||||
# if not pdf_content:
|
||||
# logger.info("尝试查找页面内 PDF 链接")
|
||||
# try:
|
||||
# links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
|
||||
# candidates = [u for u in links if ".pdf" in u]
|
||||
# if candidates:
|
||||
# pdf_url = candidates[0]
|
||||
# logger.info(f"直接访问 PDF 链接: {pdf_url}")
|
||||
# resp = await page.goto(pdf_url, wait_until="networkidle")
|
||||
# if resp and resp.status == 200:
|
||||
# pdf_content = await resp.body()
|
||||
# with open(output, "wb") as f:
|
||||
# f.write(pdf_content)
|
||||
# logger.info(f"已保存 PDF: {output}")
|
||||
# except Exception:
|
||||
# logger.exception("直接访问 PDF 链接失败")
|
||||
|
||||
if pdf_content:
|
||||
logger.info(f"下载成功,大小: {len(pdf_content)} bytes")
|
||||
|
|
|
|||
|
|
@ -412,7 +412,7 @@ def save_pdf_from_scihub(paper:Paper):
|
|||
from .d_scihub import download_paper_by_doi
|
||||
is_ok, err_msg = download_paper_by_doi(paper.doi, paper.init_paper_path("pdf"))
|
||||
if is_ok:
|
||||
paper.has_fulltext_pdf = True
|
||||
paper.has_fulltext = True
|
||||
paper.has_fulltext_pdf = True
|
||||
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf"])
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue