diff --git a/apps/resm/d_scihub.py b/apps/resm/d_scihub.py
index 5039ec5..853a415 100644
--- a/apps/resm/d_scihub.py
+++ b/apps/resm/d_scihub.py
@@ -140,39 +140,39 @@ async def download_pdf_with_playwright(url: str, output: str = "paper.pdf", head
             logger.info("主动等待 PDF 响应超时,继续其他方式")
 
         # 尝试通过页面下载按钮
-        download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
-        for sel in download_selectors:
-            try:
-                if await page.locator(sel).count() > 0:
-                    logger.info(f"尝试点击下载元素: {sel}")
-                    async with page.expect_download() as di:
-                        await page.click(sel)
-                    download = await di.value
-                    await download.save_as(output)
-                    logger.info(f"已保存 PDF: {output}")
-                    with open(output, "rb") as f:
-                        pdf_content = f.read()
-                    break
-            except Exception:
-                logger.exception(f"通过选择器下载失败: {sel}")
+        # download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
+        # for sel in download_selectors:
+        #     try:
+        #         if await page.locator(sel).count() > 0:
+        #             logger.info(f"尝试点击下载元素: {sel}")
+        #             async with page.expect_download() as di:
+        #                 await page.click(sel)
+        #             download = await di.value
+        #             await download.save_as(output)
+        #             logger.info(f"已保存 PDF: {output}")
+        #             with open(output, "rb") as f:
+        #                 pdf_content = f.read()
+        #             break
+        #     except Exception:
+        #         logger.exception(f"通过选择器下载失败: {sel}")
 
         # 回退:查找页面内 PDF 链接并直接访问
-        if not pdf_content:
-            logger.info("尝试查找页面内 PDF 链接")
-            try:
-                links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
-                candidates = [u for u in links if ".pdf" in u]
-                if candidates:
-                    pdf_url = candidates[0]
-                    logger.info(f"直接访问 PDF 链接: {pdf_url}")
-                    resp = await page.goto(pdf_url, wait_until="networkidle")
-                    if resp and resp.status == 200:
-                        pdf_content = await resp.body()
-                        with open(output, "wb") as f:
-                            f.write(pdf_content)
-                        logger.info(f"已保存 PDF: {output}")
-            except Exception:
-                logger.exception("直接访问 PDF 链接失败")
+        # if not pdf_content:
+        #     logger.info("尝试查找页面内 PDF 链接")
+        #     try:
+        #         links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
+        #         candidates = [u for u in links if ".pdf" in u]
+        #         if candidates:
+        #             pdf_url = candidates[0]
+        #             logger.info(f"直接访问 PDF 链接: {pdf_url}")
+        #             resp = await page.goto(pdf_url, wait_until="networkidle")
+        #             if resp and resp.status == 200:
+        #                 pdf_content = await resp.body()
+        #                 with open(output, "wb") as f:
+        #                     f.write(pdf_content)
+        #                 logger.info(f"已保存 PDF: {output}")
+        #     except Exception:
+        #         logger.exception("直接访问 PDF 链接失败")
 
         if pdf_content:
             logger.info(f"下载成功,大小: {len(pdf_content)} bytes")
diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py
index 88e59e3..b1b7ef5 100644
--- a/apps/resm/tasks.py
+++ b/apps/resm/tasks.py
@@ -412,7 +412,7 @@ def save_pdf_from_scihub(paper:Paper):
     from .d_scihub import download_paper_by_doi
     is_ok, err_msg = download_paper_by_doi(paper.doi, paper.init_paper_path("pdf"))
     if is_ok:
-        paper.has_fulltext_pdf = True
+        paper.has_fulltext = True
         paper.has_fulltext_pdf = True
         paper.save(update_fields=["has_fulltext", "has_fulltext_pdf"])
     else: