feat: 优化save_pdf_from_scihub
This commit is contained in:
parent
b9f06b4859
commit
d7aa8f8ada
|
|
@@ -140,39 +140,39 @@ async def download_pdf_with_playwright(url: str, output: str = "paper.pdf", head
|
||||||
logger.info("主动等待 PDF 响应超时,继续其他方式")
|
logger.info("主动等待 PDF 响应超时,继续其他方式")
|
||||||
|
|
||||||
# 尝试通过页面下载按钮
|
# 尝试通过页面下载按钮
|
||||||
download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
|
# download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
|
||||||
for sel in download_selectors:
|
# for sel in download_selectors:
|
||||||
try:
|
# try:
|
||||||
if await page.locator(sel).count() > 0:
|
# if await page.locator(sel).count() > 0:
|
||||||
logger.info(f"尝试点击下载元素: {sel}")
|
# logger.info(f"尝试点击下载元素: {sel}")
|
||||||
async with page.expect_download() as di:
|
# async with page.expect_download() as di:
|
||||||
await page.click(sel)
|
# await page.click(sel)
|
||||||
download = await di.value
|
# download = await di.value
|
||||||
await download.save_as(output)
|
# await download.save_as(output)
|
||||||
logger.info(f"已保存 PDF: {output}")
|
# logger.info(f"已保存 PDF: {output}")
|
||||||
with open(output, "rb") as f:
|
# with open(output, "rb") as f:
|
||||||
pdf_content = f.read()
|
# pdf_content = f.read()
|
||||||
break
|
# break
|
||||||
except Exception:
|
# except Exception:
|
||||||
logger.exception(f"通过选择器下载失败: {sel}")
|
# logger.exception(f"通过选择器下载失败: {sel}")
|
||||||
|
|
||||||
# 回退:查找页面内 PDF 链接并直接访问
|
# 回退:查找页面内 PDF 链接并直接访问
|
||||||
if not pdf_content:
|
# if not pdf_content:
|
||||||
logger.info("尝试查找页面内 PDF 链接")
|
# logger.info("尝试查找页面内 PDF 链接")
|
||||||
try:
|
# try:
|
||||||
links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
|
# links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
|
||||||
candidates = [u for u in links if ".pdf" in u]
|
# candidates = [u for u in links if ".pdf" in u]
|
||||||
if candidates:
|
# if candidates:
|
||||||
pdf_url = candidates[0]
|
# pdf_url = candidates[0]
|
||||||
logger.info(f"直接访问 PDF 链接: {pdf_url}")
|
# logger.info(f"直接访问 PDF 链接: {pdf_url}")
|
||||||
resp = await page.goto(pdf_url, wait_until="networkidle")
|
# resp = await page.goto(pdf_url, wait_until="networkidle")
|
||||||
if resp and resp.status == 200:
|
# if resp and resp.status == 200:
|
||||||
pdf_content = await resp.body()
|
# pdf_content = await resp.body()
|
||||||
with open(output, "wb") as f:
|
# with open(output, "wb") as f:
|
||||||
f.write(pdf_content)
|
# f.write(pdf_content)
|
||||||
logger.info(f"已保存 PDF: {output}")
|
# logger.info(f"已保存 PDF: {output}")
|
||||||
except Exception:
|
# except Exception:
|
||||||
logger.exception("直接访问 PDF 链接失败")
|
# logger.exception("直接访问 PDF 链接失败")
|
||||||
|
|
||||||
if pdf_content:
|
if pdf_content:
|
||||||
logger.info(f"下载成功,大小: {len(pdf_content)} bytes")
|
logger.info(f"下载成功,大小: {len(pdf_content)} bytes")
|
||||||
|
|
|
||||||
|
|
@@ -412,7 +412,7 @@ def save_pdf_from_scihub(paper:Paper):
|
||||||
from .d_scihub import download_paper_by_doi
|
from .d_scihub import download_paper_by_doi
|
||||||
is_ok, err_msg = download_paper_by_doi(paper.doi, paper.init_paper_path("pdf"))
|
is_ok, err_msg = download_paper_by_doi(paper.doi, paper.init_paper_path("pdf"))
|
||||||
if is_ok:
|
if is_ok:
|
||||||
paper.has_fulltext_pdf = True
|
paper.has_fulltext = True
|
||||||
paper.has_fulltext_pdf = True
|
paper.has_fulltext_pdf = True
|
||||||
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf"])
|
paper.save(update_fields=["has_fulltext", "has_fulltext_pdf"])
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue