"""Download PDFs from sites protected by bot-detection challenges.

Two strategies are provided: driving a Chromium instance through Playwright
(`download_from_url_playwright`) and a plain HTTP request through curl_cffi
with a Chrome impersonation profile (`download_pdf_with_curl_cffi`).
"""

import asyncio
import os
import sys
from typing import Optional

from playwright.async_api import async_playwright, Page, Browser

# playwright-stealth is optional; when it is missing a small init script is
# injected instead (see download_from_url_playwright).
try:
    from playwright_stealth import stealth_async
except ImportError:
    stealth_async = None

# Browser launch flags intended to hide automation fingerprints.
_STEALTH_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run",
    "--no-default-browser-check",
    "--disable-infobars",
    "--disable-extensions",
    "--disable-notifications",
]

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)

# Payloads smaller than this (10 KiB) are treated as incomplete or invalid.
_MIN_PDF_SIZE = 10240


def _is_pdf_response(response) -> bool:
    """Return True when the response's content-type header mentions PDF."""
    return "application/pdf" in response.headers.get("content-type", "")


def _ensure_parent_dir(path: str) -> None:
    """Create the directory that will contain *path* if it does not exist."""
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)


async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool, str]:
    """Open *url* in Chromium and save the first usable PDF response.

    The page is loaded in a headed browser, any challenge page is handled via
    `handle_cloudflare`, and responses whose content-type contains
    ``application/pdf`` are captured and written to *save_path*.

    Args:
        url: Address expected to serve (or redirect to) a PDF.
        save_path: Destination file; its parent directory is created if needed.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, reason)``. Exceptions
        are caught and reported through the second tuple element.
    """
    async with async_playwright() as p:
        # Initialised before the try so the finally clause never touches an
        # unbound name when the launch itself fails.
        browser: Optional[Browser] = None
        try:
            browser = await p.chromium.launch(
                headless=False,
                args=_STEALTH_ARGS,
            )
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=_USER_AGENT,
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
            )
            page = await context.new_page()
            page.set_default_timeout(60000)

            # Apply stealth mode to get past anti-bot detection.
            if stealth_async:
                await stealth_async(page)
            else:
                await page.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>false});"
                )

            pdf_content: Optional[bytes] = None

            async def on_response(response):
                nonlocal pdf_content
                if not _is_pdf_response(response):
                    return
                try:
                    pdf_content = await response.body()
                    print(f"✓ 成功捕获 PDF,大小: {len(pdf_content)} bytes")
                except Exception as e:
                    print(f"⚠ 读取 PDF 响应体失败: {e}")

            page.on("response", on_response)

            # Load the page; a failure here is tolerated because the PDF may
            # still arrive through the response listener.
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            except Exception:
                print("⚠ 页面加载超时,但继续处理...")

            # Wait for any Cloudflare-style challenge to finish.
            await handle_cloudflare(page)
            await page.wait_for_timeout(2000)

            # Nothing captured yet: wait explicitly for a PDF response.
            if not pdf_content:
                print("等待 PDF 响应...")
                try:
                    response = await page.wait_for_response(
                        _is_pdf_response,
                        timeout=15000,
                    )
                    pdf_content = await response.body()
                    print(f"✓ 通过 wait_for_response 获取 PDF,大小: {len(pdf_content)} bytes")
                except Exception as e:
                    print(f"⚠ 等待 PDF 响应超时: {e}")

            if not pdf_content:
                return False, "未能获取 PDF 内容"

            pdf_size = len(pdf_content)
            if pdf_size < _MIN_PDF_SIZE:
                return False, f"PDF 文件过小: {pdf_size} bytes,可能下载不完整"

            _ensure_parent_dir(save_path)
            with open(save_path, "wb") as f:
                f.write(pdf_content)
            print(f"✓ PDF 已保存到: {save_path},大小: {pdf_size} bytes")
            return True, ""
        except Exception as e:
            print(f"异常: {e}")
            return False, str(e)
        finally:
            # Single, best-effort close: a failure while shutting the browser
            # down must not turn an already-saved download into an error.
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    pass


def download_pdf_with_curl_cffi(url: str, save_path: str) -> tuple[bool, str]:
    """Download a PDF with curl_cffi impersonating Chrome's TLS fingerprint.

    Args:
        url: Address of the PDF; also sent as the ``Referer`` header.
        save_path: Destination file; its parent directory is created if needed.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, reason)`` where reason
        is ``"curl_cffi_not_installed"``, ``"http_<status>"``,
        ``"not_valid_pdf"`` or the text of the exception that was raised.
    """
    # curl_cffi is an optional dependency, so it is imported lazily.
    try:
        import curl_cffi.requests as cf
    except ImportError:
        return False, "curl_cffi_not_installed"

    try:
        resp = cf.get(
            url,
            impersonate="chrome131",
            headers={
                "Accept": "application/pdf,application/octet-stream,*/*",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Referer": url,
            },
            timeout=30,
            allow_redirects=True,
        )
        if resp.status_code != 200:
            return False, f"http_{resp.status_code}"

        content = resp.content
        # Reject bodies without the PDF magic header or that are too small.
        if not content.startswith(b"%PDF") or len(content) < _MIN_PDF_SIZE:
            return False, "not_valid_pdf"

        _ensure_parent_dir(save_path)
        with open(save_path, "wb") as f:
            f.write(content)
        return True, ""
    except Exception as e:
        return False, str(e)


# Lower-case title fragments that identify a bot-detection challenge page.
_CHALLENGE_TITLES = [
    "just a moment",
    "cloudflare",
    "checking your browser",
    "ddos-guard",
    "ddos guard",
    "attention required",
]


async def handle_cloudflare(page: Page, timeout: int = 45000) -> bool:
    """Wait for a bot-detection challenge (Cloudflare, DDoS-Guard, ...) to clear.

    Steps:
        1. Detect a challenge page by matching the page title against
           ``_CHALLENGE_TITLES``.
        2. Try to click a Turnstile checkbox inside a challenge iframe, if any.
        3. Wait until the title no longer matches any challenge keyword.

    Args:
        page: The Playwright page to inspect.
        timeout: Maximum wait for the title to change, passed to
            ``page.wait_for_function`` (milliseconds).

    Returns:
        True when no challenge was detected or the title stopped matching
        within *timeout*; False when the title could not be read or the wait
        failed.
    """
    try:
        title = await page.title()
    except Exception:
        return False

    title_l = title.lower()
    if not any(kw in title_l for kw in _CHALLENGE_TITLES):
        return True  # no challenge present

    print(f"检测到安全挑战页面(标题: {title!r}),等待 JS 自动通过...")

    # Try to click the Cloudflare Turnstile checkbox, if one exists.
    for frame_sel in (
        'iframe[src*="challenges.cloudflare.com"]',
        'iframe[title*="cloudflare"]',
        'iframe[title*="challenge"]',
    ):
        try:
            checkbox = page.frame_locator(frame_sel).locator('input[type="checkbox"]')
            if await checkbox.count() > 0:
                await checkbox.click(timeout=3000)
                print(" ✓ 点击了验证复选框")
                break
        except Exception:
            # Best effort only: the challenge may still pass on its own.
            continue

    try:
        # The keyword list is passed in as an argument so the in-page check
        # always agrees with _CHALLENGE_TITLES.
        await page.wait_for_function(
            """(kws) => {
                const t = document.title.toLowerCase();
                return !kws.some(kw => t.includes(kw));
            }""",
            arg=_CHALLENGE_TITLES,
            timeout=timeout,
        )
        print(" ✓ 安全挑战已通过")
        return True
    except Exception:
        print("⚠ 安全挑战处理超时,继续尝试获取 PDF...")
        return False