feat: 优化save_pdf_from_scihub

2026-02-04 11:26:55 +08:00 · 2026-02-04 11:26:55 +08:00 · d7aa8f8ada
parent b9f06b4859
commit d7aa8f8ada
2 changed files with 32 additions and 32 deletions
--- a/apps/resm/d_scihub.py
+++ b/apps/resm/d_scihub.py
@@ -140,39 +140,39 @@ async def download_pdf_with_playwright(url: str, output: str = "paper.pdf", head
                    logger.info("主动等待 PDF 响应超时，继续其他方式")

            # 尝试通过页面下载按钮
-            download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
-            for sel in download_selectors:
-                try:
-                    if await page.locator(sel).count() > 0:
-                        logger.info(f"尝试点击下载元素: {sel}")
-                        async with page.expect_download() as di:
-                            await page.click(sel)
-                        download = await di.value
-                        await download.save_as(output)
-                        logger.info(f"已保存 PDF: {output}")
-                        with open(output, "rb") as f:
-                            pdf_content = f.read()
-                        break
-                except Exception:
-                    logger.exception(f"通过选择器下载失败: {sel}")
+            # download_selectors = ["a[href*='.pdf']", "button:has-text('Download')", "a:has-text('PDF')"]
+            # for sel in download_selectors:
+            #     try:
+            #         if await page.locator(sel).count() > 0:
+            #             logger.info(f"尝试点击下载元素: {sel}")
+            #             async with page.expect_download() as di:
+            #                 await page.click(sel)
+            #             download = await di.value
+            #             await download.save_as(output)
+            #             logger.info(f"已保存 PDF: {output}")
+            #             with open(output, "rb") as f:
+            #                 pdf_content = f.read()
+            #             break
+            #     except Exception:
+            #         logger.exception(f"通过选择器下载失败: {sel}")

            # 回退：查找页面内 PDF 链接并直接访问
-            if not pdf_content:
-                logger.info("尝试查找页面内 PDF 链接")
-                try:
-                    links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
-                    candidates = [u for u in links if ".pdf" in u]
-                    if candidates:
-                        pdf_url = candidates[0]
-                        logger.info(f"直接访问 PDF 链接: {pdf_url}")
-                        resp = await page.goto(pdf_url, wait_until="networkidle")
-                        if resp and resp.status == 200:
-                            pdf_content = await resp.body()
-                            with open(output, "wb") as f:
-                                f.write(pdf_content)
-                            logger.info(f"已保存 PDF: {output}")
-                except Exception:
-                    logger.exception("直接访问 PDF 链接失败")
+            # if not pdf_content:
+            #     logger.info("尝试查找页面内 PDF 链接")
+            #     try:
+            #         links = await page.eval_on_selector_all("a[href]", "els => els.map(e=>e.href)")
+            #         candidates = [u for u in links if ".pdf" in u]
+            #         if candidates:
+            #             pdf_url = candidates[0]
+            #             logger.info(f"直接访问 PDF 链接: {pdf_url}")
+            #             resp = await page.goto(pdf_url, wait_until="networkidle")
+            #             if resp and resp.status == 200:
+            #                 pdf_content = await resp.body()
+            #                 with open(output, "wb") as f:
+            #                     f.write(pdf_content)
+            #                 logger.info(f"已保存 PDF: {output}")
+            #     except Exception:
+            #         logger.exception("直接访问 PDF 链接失败")

            if pdf_content:
                logger.info(f"下载成功，大小: {len(pdf_content)} bytes")
--- a/apps/resm/tasks.py
+++ b/apps/resm/tasks.py
@@ -412,7 +412,7 @@ def save_pdf_from_scihub(paper:Paper):
    from .d_scihub import download_paper_by_doi
    is_ok, err_msg = download_paper_by_doi(paper.doi, paper.init_paper_path("pdf"))
    if is_ok:
-        paper.has_fulltext_pdf = True
+        paper.has_fulltext = True
        paper.has_fulltext_pdf = True
        paper.save(update_fields=["has_fulltext", "has_fulltext_pdf"])
    else: