# paper_server/apps/resm/d_oaurl.py

from playwright.async_api import async_playwright, Page, Browser
from typing import Optional
import asyncio
import sys
import os

# Try to import playwright-stealth; when it is not installed, `stealth_async`
# is set to None so callers can test for its availability.
try:
    from playwright_stealth import stealth_async
except ImportError:
    stealth_async = None

# Chromium launch arguments intended to hide automation characteristics
# (e.g. the AutomationControlled Blink feature) and suppress first-run UI.
_STEALTH_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run",
    "--no-default-browser-check",
    "--disable-infobars",
    "--disable-extensions",
    "--disable-notifications",
]

# User-agent string presented to sites: desktop Chrome 131 on Windows 10 x64.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)


async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool, str]:
    """Download a PDF by opening *url* in a Playwright-driven Chromium browser.

    A headed Chromium instance is launched with the module's stealth launch
    arguments and user agent. Every response whose ``content-type`` header
    mentions ``application/pdf`` is captured; if none has been captured after
    the page load and the challenge handling, the function waits up to 15
    more seconds for one. The captured bytes are written to *save_path*.

    Args:
        url: Address to navigate to.
        save_path: Destination file path. Missing parent directories are
            created before writing.

    Returns:
        ``(True, "")`` when a PDF of at least 10240 bytes was saved, otherwise
        ``(False, reason)`` where *reason* describes the failure. Exceptions
        are not propagated; they are printed and reported through *reason*.
    """
    min_pdf_bytes = 10240  # anything smaller is treated as an incomplete download

    def _is_pdf_response(response) -> bool:
        # Media types are case-insensitive, so normalise before matching.
        return "application/pdf" in response.headers.get("content-type", "").lower()

    async with async_playwright() as p:
        # Bound before the try so the finally clause can tell whether the
        # launch succeeded instead of tripping over an unbound name.
        browser = None
        try:
            browser = await p.chromium.launch(
                headless=False,
                args=_STEALTH_ARGS,
            )
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=_USER_AGENT,
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
            )
            page = await context.new_page()
            page.set_default_timeout(60000)

            # Apply stealth mode when playwright-stealth is available;
            # otherwise at least mask navigator.webdriver.
            if stealth_async:
                await stealth_async(page)
            else:
                await page.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>false});"
                )

            pdf_content: Optional[bytes] = None

            async def on_response(response):
                # Capture the body of any PDF response seen on the page.
                nonlocal pdf_content
                if _is_pdf_response(response):
                    try:
                        pdf_content = await response.body()
                        print(f"✓ 成功捕获 PDF，大小: {len(pdf_content)} bytes")
                    except Exception as e:
                        print(f"⚠ 读取 PDF 响应体失败: {e}")

            page.on("response", on_response)

            # Load the page. Any navigation error (not only a timeout) lands
            # here; processing continues because the response listener may
            # still have captured the PDF.
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            except Exception:
                print("⚠ 页面加载超时，但继续处理...")

            # Give a bot-detection challenge page the chance to clear.
            await handle_cloudflare(page)
            await page.wait_for_timeout(2000)

            # Nothing captured yet: wait explicitly for a PDF response.
            if not pdf_content:
                print("等待 PDF 响应...")
                try:
                    response = await page.wait_for_response(
                        _is_pdf_response,
                        timeout=15000,
                    )
                    pdf_content = await response.body()
                    print(f"✓ 通过 wait_for_response 获取 PDF，大小: {len(pdf_content)} bytes")
                except Exception as e:
                    print(f"⚠ 等待 PDF 响应超时: {e}")

            if not pdf_content:
                return False, "未能获取 PDF 内容"

            pdf_size = len(pdf_content)
            if pdf_size < min_pdf_bytes:
                return False, f"PDF 文件过小: {pdf_size} bytes，可能下载不完整"

            # Create the target directory first, matching the behaviour of
            # download_pdf_with_curl_cffi.
            os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
            with open(save_path, "wb") as f:
                f.write(pdf_content)
            print(f"✓ PDF 已保存到: {save_path}，大小: {pdf_size} bytes")
            return True, ""
        except Exception as e:
            print(f"异常: {e}")
            return False, str(e)
        finally:
            # Single cleanup point for every exit path. Closing is
            # deliberately best-effort: a failure to close must not replace
            # the result that is already being returned.
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    pass


def download_pdf_with_curl_cffi(url: str, save_path: str) -> tuple[bool, str]:
    """Fetch a PDF over HTTP with curl-cffi impersonating Chrome 131.

    The request is sent with ``impersonate="chrome131"`` so that it presents
    Chrome's TLS fingerprint; this is meant to get past Cloudflare-style
    checks, though whether it succeeds depends on the target site.

    Args:
        url: Address of the PDF; also sent as the ``Referer`` header.
        save_path: Destination file path. Missing parent directories are
            created before writing.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, reason)`` where
        *reason* is ``"http_<status>"`` for a non-200 reply,
        ``"not_valid_pdf"`` when the body does not start with ``%PDF`` or is
        shorter than 10240 bytes, ``"curl_cffi_not_installed"`` when the
        library cannot be imported, or the text of any other exception.
    """
    import os

    request_headers = {
        "Accept": "application/pdf,application/octet-stream,*/*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Referer": url,
    }

    try:
        # Imported lazily so a missing optional dependency becomes an
        # ordinary failure result rather than an import-time crash.
        import curl_cffi.requests as cf

        reply = cf.get(
            url,
            impersonate="chrome131",
            headers=request_headers,
            timeout=30,
            allow_redirects=True,
        )

        status = reply.status_code
        if status != 200:
            return False, f"http_{status}"

        payload = reply.content
        looks_like_pdf = payload.startswith(b"%PDF") and len(payload) >= 10240
        if not looks_like_pdf:
            return False, "not_valid_pdf"

        target_dir = os.path.dirname(os.path.abspath(save_path))
        os.makedirs(target_dir, exist_ok=True)
        with open(save_path, "wb") as out:
            out.write(payload)
    except ImportError:
        return False, "curl_cffi_not_installed"
    except Exception as exc:
        return False, str(exc)

    return True, ""


# Lower-case substrings of a page title that indicate a bot-detection
# challenge page. This list is the single source of truth: handle_cloudflare
# also passes it into the in-page polling function.
_CHALLENGE_TITLES = [
    "just a moment",
    "cloudflare",
    "checking your browser",
    "ddos-guard",
    "ddos guard",
    "attention required",
]


async def handle_cloudflare(page: Page, timeout: int = 45000) -> bool:
    """Wait for a bot-detection challenge page (Cloudflare, DDoS-Guard, ...) to clear.

    Steps:
    1. Detect a challenge by looking for a known keyword in the page title.
    2. Try to click a checkbox inside a challenge iframe, if one is present.
    3. Wait until the title no longer contains any challenge keyword.

    Args:
        page: The Playwright page to inspect.
        timeout: Maximum time, in milliseconds, to wait for the title to change.

    Returns:
        True when no challenge is detected or the title stops matching within
        *timeout*; False when the title cannot be read or the wait fails.
    """
    try:
        title = await page.title()
    except Exception:
        return False

    title_l = title.lower()
    if not any(kw in title_l for kw in _CHALLENGE_TITLES):
        return True  # no challenge detected

    print(f"检测到安全挑战页面（标题: {title!r}），等待 JS 自动通过...")

    # Try to click the verification checkbox inside a challenge iframe.
    # Purely best-effort: a missing frame or a failed click is not an error,
    # the title wait below decides the outcome.
    for frame_sel in (
        'iframe[src*="challenges.cloudflare.com"]',
        'iframe[title*="cloudflare"]',
        'iframe[title*="challenge"]',
    ):
        try:
            checkbox = page.frame_locator(frame_sel).locator('input[type="checkbox"]')
            if await checkbox.count() > 0:
                await checkbox.click(timeout=3000)
                print("  ✓ 点击了验证复选框")
                break
        except Exception:
            continue

    try:
        # The keyword list is handed to the page as an argument so the
        # in-page check can never drift from _CHALLENGE_TITLES.
        await page.wait_for_function(
            """(kws) => {
                const t = document.title.toLowerCase();
                return !kws.some(kw => t.includes(kw));
            }""",
            arg=_CHALLENGE_TITLES,
            timeout=timeout,
        )
        print("  ✓ 安全挑战已通过")
        return True
    except Exception:
        # Any failure of the wait (typically the timeout) is reported the
        # same way; the caller continues and still tries to obtain the PDF.
        print("⚠ 安全挑战处理超时，继续尝试获取 PDF...")
        return False