"""Sci-Hub PDF downloader driven by Playwright.

Module path: paper_server/apps/resm/d_scihub.py

Resolves a DOI against a list of Sci-Hub mirrors, drives a stealth-configured
Chromium instance through any bot-check interstitials, and saves the paper PDF.
"""

import argparse
import asyncio
import logging
from pathlib import Path
from typing import Optional
from playwright.async_api import async_playwright, Page, Browser
# Optional dependency: playwright-stealth hides more automation fingerprints
# than the manual navigator.webdriver init-script fallback used later.
stealth_async = None
try:
    from playwright_stealth import stealth_async
except ImportError:
    pass
# Logging goes both to log/scihub_downloader.log and to the console.
_LOG_DIR = Path("log")
_LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_PATH = _LOG_DIR / "scihub_downloader.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[
        logging.FileHandler(LOG_PATH, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger("scihub")
# 隐藏自动化特征的浏览器启动参数
_STEALTH_ARGS = [
"--disable-blink-features=AutomationControlled",
"--no-first-run",
"--no-default-browser-check",
"--disable-infobars",
"--disable-extensions",
"--disable-notifications",
"--disable-popup-blocking",
]
_USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/131.0.0.0 Safari/537.36"
)
async def _wait_for_user_to_solve_challenge(page: Page) -> None:
    """Block until the user confirms (presses Enter) that a manual challenge is solved.

    Runs the blocking ``input()`` in the default thread-pool executor so the
    event loop stays responsive while waiting.
    """
    logger.info("请在浏览器中完成验证(如果需要),完成后按回车继续...")
    # asyncio.get_event_loop() inside a coroutine is deprecated since Python
    # 3.10; get_running_loop() is the supported way to reach the current loop.
    await asyncio.get_running_loop().run_in_executor(None, input)
async def _try_click_robot_button(page: Page, headless: bool) -> bool:
    """Try to click an "are you a robot" verification element (optional step).

    Iterates a list of candidate selectors; if one matches it is clicked, a
    possible navigation is awaited, and manual captcha help is requested when
    a reCAPTCHA frame appears (only in headed mode).

    Returns:
        True if a verification element was found and handled, False otherwise.
    """
    selectors = ["text=/are you a robot/i", "div.ask", "div.altcha-checkbox", "text=Are you a robot"]
    for sel in selectors:
        try:
            loc = page.locator(sel)
            if await loc.count() > 0:
                logger.info(f"找到验证元素,尝试点击: {sel}")
                try:
                    await loc.first.click()
                except Exception:
                    # Locator click failed; fall back to the page-level click.
                    try:
                        await page.click(sel)
                    except Exception as click_err:
                        logger.warning(f"点击验证元素失败: {click_err}")
                await page.wait_for_timeout(1500)
                # Wait for a possible navigation/redirect after the click.
                try:
                    await page.wait_for_load_state("domcontentloaded", timeout=8000)
                    logger.info("点击验证后检测到导航完成")
                except Exception:
                    await page.wait_for_timeout(500)
                # If a human-solvable captcha shows up, ask the user (headed only).
                if any("recaptcha" in f.url for f in page.frames):
                    if not headless:
                        await _wait_for_user_to_solve_challenge(page)
                return True
        except Exception as e:
            logger.debug(f"尝试点击验证元素失败: {sel}: {e}")
    logger.info("页面上未发现验证按钮,直接继续")
    return False
async def _click_no_button(page: Page) -> bool:
    """Try to click the 'No' answer button (optional step).

    After a successful click the resulting page is snapshotted to
    after_click_no.png / after_click_no.html for debugging.

    Returns:
        True if a 'No' element was found and clicked, False otherwise.
    """
    selectors = ["div.answer[onclick=\"check()\"]", "div.answer:has-text('No')", "text=No"]
    for sel in selectors:
        try:
            loc = page.locator(sel)
            if await loc.count() > 0:
                logger.info(f"找到 'No' 元素,尝试点击: {sel}")
                try:
                    await loc.first.click()
                except Exception:
                    # Locator click failed; fall back to the page-level click.
                    try:
                        await page.click(sel)
                    except Exception as click_err:
                        logger.warning(f"点击 'No' 失败: {click_err}")
                await page.wait_for_timeout(1200)
                try:
                    await page.wait_for_load_state("domcontentloaded", timeout=8000)
                    logger.info("点击 No 后检测到导航完成")
                except Exception:
                    pass
                # Best-effort debug snapshot of the post-click page.
                try:
                    await page.screenshot(path="after_click_no.png", full_page=True)
                    html = await page.content()
                    with open("after_click_no.html", "w", encoding="utf-8") as f:
                        f.write(html)
                    logger.info("已保存 after_click_no.png / after_click_no.html")
                except Exception:
                    logger.exception("保存点击 No 的结果失败")
                return True
        except Exception as e:
            logger.debug(f"检查 'No' 按钮时出错: {sel}: {e}")
    logger.info("页面上未发现 'No' 按钮")
    return False
# 各种 bot 检测页面的标题关键词
_CHALLENGE_TITLES = ["just a moment", "cloudflare", "checking your browser", "ddos-guard", "attention required", "ddos guard"]
async def _wait_challenge_clear(page: Page, timeout: int = 45000) -> bool:
    """Wait for a bot-check challenge page (Cloudflare / DDoS-Guard etc.) to pass.

    Args:
        page: the Playwright page currently showing the (possible) challenge.
        timeout: max milliseconds to wait for the challenge title to clear.

    Returns:
        True when no challenge is present or it cleared in time, False on
        timeout or if the page title cannot be read.
    """
    try:
        title = await page.title()
    except Exception:
        return False
    title_l = title.lower()
    if not any(kw in title_l for kw in _CHALLENGE_TITLES):
        return True  # no challenge, pass straight through
    logger.info(f"检测到安全挑战页面(标题: {title!r}),等待 JS 自动通过...")
    # Try to click a Cloudflare Turnstile checkbox if one is present.
    for frame_sel in [
        'iframe[src*="challenges.cloudflare.com"]',
        'iframe[title*="cloudflare"]',
        'iframe[title*="challenge"]',
    ]:
        try:
            checkbox = page.frame_locator(frame_sel).locator('input[type="checkbox"]')
            if await checkbox.count() > 0:
                await checkbox.click(timeout=3000)
                logger.info("点击了验证复选框")
                break
        except Exception:
            pass
    # Poll the document title until none of the challenge keywords remain.
    try:
        await page.wait_for_function(
            """() => {
                const t = document.title.toLowerCase();
                const keywords = ['just a moment', 'cloudflare', 'checking your browser',
                                  'ddos-guard', 'ddos guard', 'attention required'];
                return !keywords.some(kw => t.includes(kw));
            }""",
            timeout=timeout,
        )
        logger.info(f"挑战已通过,当前标题: {await page.title()!r}")
        return True
    except Exception as e:
        # Fixed: the original message was missing the closing paren after "ms".
        logger.warning(f"等待挑战超时({timeout}ms): {e}")
        return False
async def download_pdf_with_playwright(url: str, output: str = "paper.pdf", headless: bool = False) -> Optional[bytes]:
    """Open *url* in a stealth-configured Chromium and try to capture a PDF.

    Flow: navigate, wait out any bot-check challenge, click through optional
    "robot"/"No" prompts, sniff the first ``application/pdf`` response URL,
    then re-fetch that URL for the full bytes (navigation first, in-page
    ``fetch`` as fallback).

    Args:
        url: page expected to serve or redirect to a PDF.
        output: NOTE(review) — currently unused inside this function; the
            caller writes the returned bytes itself.
        headless: run the browser headless; headed mode allows manual captcha
            solving.

    Returns:
        PDF bytes when a payload larger than 10 KiB was captured, else None
        (a screenshot and HTML dump are saved for debugging on failure).
    """
    async with async_playwright() as p:
        browser: Browser = await p.chromium.launch(
            headless=headless,
            args=_STEALTH_ARGS,
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent=_USER_AGENT,
            locale="zh-CN",
            timezone_id="Asia/Shanghai",
        )
        page = await context.new_page()
        # Apply stealth: the playwright-stealth plugin when available,
        # otherwise a minimal navigator.webdriver override.
        if stealth_async:
            await stealth_async(page)
        else:
            await page.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>false});"
            )
        pdf_url: Optional[str] = None
        pdf_content: Optional[bytes] = None

        async def on_response(response):
            # Passive sniffer: remember the first real PDF response URL.
            nonlocal pdf_url
            ct = response.headers.get("content-type", "")
            resp_url = response.url
            # Ignore chrome-extension and other internal URLs.
            if "application/pdf" in ct and resp_url.startswith("http"):
                logger.info(f"检测到 PDF 响应 URL: {resp_url}")
                pdf_url = resp_url

        page.on("response", on_response)
        try:
            logger.info(f"打开: {url}")
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            except Exception:
                # Timeout may just mean a security check is in progress.
                logger.info("页面加载超时(可能正在处理安全验证),继续等待...")
                await page.wait_for_timeout(3000)
            # Wait out bot-detection challenges (DDoS-Guard / Cloudflare etc.).
            await _wait_challenge_clear(page)
            await page.wait_for_timeout(1000)
            # Optional click-through of verification and 'No' prompts.
            await _try_click_robot_button(page, headless)
            await _click_no_button(page)
            # Give the sniffer time to see a PDF response.
            logger.info("等待 PDF 响应...")
            await page.wait_for_timeout(3000)
            if not pdf_url:
                # Sniffer saw nothing yet: actively wait for a PDF response.
                try:
                    resp = await page.wait_for_response(
                        lambda r: "application/pdf" in r.headers.get("content-type", "")
                        and r.url.startswith("http"),
                        timeout=8000,
                    )
                    pdf_url = resp.url
                    logger.info(f"主动等待到 PDF URL: {pdf_url}")
                except Exception:
                    logger.info("等待 PDF 响应超时")
            # Navigate straight to the PDF URL to download the full content.
            if pdf_url:
                logger.info(f"直接请求 PDF: {pdf_url}")
                try:
                    pdf_resp = await page.goto(pdf_url, wait_until="networkidle", timeout=30000)
                    if pdf_resp and pdf_resp.status == 200:
                        pdf_content = await pdf_resp.body()
                        logger.info(f"下载成功,大小: {len(pdf_content)} bytes")
                except Exception as e:
                    # Navigation failed — fetch the bytes from inside the page
                    # instead (reuses the page's cookies/session).
                    logger.warning(f"直接导航下载失败: {e},尝试 fetch")
                    try:
                        pdf_content = await page.evaluate(f"""
                            async () => {{
                                const r = await fetch({pdf_url!r});
                                const buf = await r.arrayBuffer();
                                return Array.from(new Uint8Array(buf));
                            }}
                        """)
                        if pdf_content:
                            # evaluate() returns a list of ints; re-pack as bytes.
                            pdf_content = bytes(pdf_content)
                    except Exception as e2:
                        logger.warning(f"fetch 下载也失败: {e2}")
            # Anything under 10 KiB is assumed to be an error page, not a PDF.
            if pdf_content and len(pdf_content) > 10240:
                logger.info(f"下载成功,大小: {len(pdf_content)} bytes")
                return pdf_content
            else:
                if pdf_content:
                    logger.warning(f"PDF 文件过小({len(pdf_content)} bytes可能是错误页")
                logger.warning("未能获取有效 PDF已保存页面快照供排查")
                # Save a snapshot of the final page for offline debugging.
                try:
                    await page.screenshot(path="scihub_screenshot.png", full_page=True)
                    html = await page.content()
                    with open("scihub_page.html", "w", encoding="utf-8") as f:
                        f.write(html)
                except Exception:
                    logger.exception("保存调试信息失败")
                return None
        finally:
            # Best-effort teardown; never mask the real result with a close error.
            try:
                await context.close()
            except Exception:
                logger.exception("关闭 context 失败")
            try:
                await browser.close()
            except Exception:
                logger.exception("关闭 browser 失败")
# 按优先级排列的 sci-hub 域名(国内相对可访问)
_SCIHUB_DOMAINS = [
"sci-hub.ren",
"sci-hub.ee",
"sci-hub.st",
"sci-hub.se",
]
def download_paper_by_doi(doi: str, output: Optional[str] = None, headless: bool = True) -> tuple[bool, str]:
    """Download a paper PDF by DOI (entry point for task code).

    Tries each mirror in ``_SCIHUB_DOMAINS`` in order until one yields a PDF.

    Args:
        doi: DOI string, e.g. "10.1016/j.conbuildmat.2017.10.091".
        output: output file path; defaults to the DOI with "/" replaced by
            "_" plus a ".pdf" suffix.
        headless: run the browser headless (default True).

    Returns:
        (True, file_path) on success, otherwise
        (False, "scihub_error_*: details") with one of these prefixes:
        - scihub_error_empty_doi: DOI is empty
        - scihub_error_timeout: page load timed out
        - scihub_error_load_failed: page failed to load
        - scihub_error_pdf_not_found: no mirror returned a PDF
        - scihub_error_exception: any other exception
    """
    try:
        doi = doi.strip()
        if not doi:
            err = "scihub_error_empty_doi: DOI 为空"
            logger.error(err)
            return False, err
        output_path = output or f"{doi.replace('/', '_')}.pdf"
        for domain in _SCIHUB_DOMAINS:
            url = f"https://{domain}/{doi}"
            logger.info(f"尝试域名: {url}")
            try:
                pdf_content = asyncio.run(download_pdf_with_playwright(url, output=output_path, headless=headless))
            except asyncio.TimeoutError:
                logger.warning(f"{domain} 超时,尝试下一个域名")
                continue
            except Exception as e:
                logger.warning(f"{domain} 出错: {e},尝试下一个域名")
                continue
            if pdf_content:
                # Persist the PDF. Use pathlib (imported at module top) instead
                # of the previous function-local `import os` + os.makedirs.
                target = Path(output_path).absolute()
                target.parent.mkdir(parents=True, exist_ok=True)
                target.write_bytes(pdf_content)
                # Fixed: the success message was missing the closing paren.
                logger.info(f"✓ 成功下载({domain}): {output_path} ({len(pdf_content)} bytes)")
                return True, output_path
            logger.warning(f"{domain} 未获取到 PDF尝试下一个域名")
        err = "scihub_error_pdf_not_found: 所有域名均无法获取 PDF"
        logger.error(err)
        return False, err
    except Exception as e:
        err = f"scihub_error_exception: 执行下载时发生异常 - {str(e)}"
        logger.exception(err)
        return False, err
def _parse_args() -> argparse.Namespace:
    """Parse command-line options for standalone (script) use."""
    parser = argparse.ArgumentParser(description="简化的 Sci-Hub PDF 下载器,支持 DOI")
    parser.add_argument("--doi", help="DOI例如 10.1016/j.conbuildmat.2017.10.091")
    parser.add_argument("-o", "--output", help="输出文件名(默认基于 DOI")
    parser.add_argument("--headless", action="store_true", help="无头模式")
    return parser.parse_args()
if __name__ == "__main__":
    # CLI entry point: require a DOI, then delegate to download_paper_by_doi.
    args = _parse_args()
    if not args.doi:
        logger.error("请通过 --doi 提供 DOI")
        raise SystemExit(1)
    ok, message = download_paper_by_doi(args.doi, output=args.output, headless=args.headless)
    if ok:
        logger.info(f"完成: {message}")
    else:
        logger.error(f"失败: {message}")
        raise SystemExit(1)