# Listing metadata from the code viewer: 196 lines, 6.8 KiB, Python
from playwright.async_api import async_playwright, Page, Browser
|
||
from typing import Optional
|
||
import asyncio
|
||
import sys
|
||
import os
|
||
|
||
# Try to import playwright-stealth; carry on without it when it is not installed.
try:
    from playwright_stealth import stealth_async
except ImportError:
    # Falsy placeholder: code that tests `if stealth_async:` takes its fallback path.
    stealth_async = None
|
||
|
||
# 隐藏自动化特征的浏览器启动参数
|
||
_STEALTH_ARGS = [
|
||
"--disable-blink-features=AutomationControlled",
|
||
"--no-first-run",
|
||
"--no-default-browser-check",
|
||
"--disable-infobars",
|
||
"--disable-extensions",
|
||
"--disable-notifications",
|
||
]
|
||
|
||
_USER_AGENT = (
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||
"Chrome/131.0.0.0 Safari/537.36"
|
||
)
|
||
|
||
|
||
async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool, str]:
    """Download a PDF by opening ``url`` in a Playwright-driven Chromium browser.

    A headed Chromium instance is launched with ``_STEALTH_ARGS``. The body of
    any response whose ``content-type`` header contains ``application/pdf`` is
    captured (a later match replaces an earlier one) and written to
    ``save_path``.

    Args:
        url: Address to navigate to.
        save_path: Destination file; missing parent directories are created.

    Returns:
        ``(True, "")`` when a PDF of at least 10240 bytes was saved, otherwise
        ``(False, reason)``. Exceptions are not propagated; their text is
        returned as the reason.
    """
    # Anything smaller than this is treated as an incomplete download.
    min_pdf_bytes = 10240

    async with async_playwright() as p:
        # Bound before the try so the cleanup in ``finally`` never touches an
        # undefined name when the launch itself fails.
        browser = None
        try:
            browser = await p.chromium.launch(
                headless=False,
                args=_STEALTH_ARGS,
            )
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=_USER_AGENT,
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
            )
            page = await context.new_page()
            page.set_default_timeout(60000)

            # Apply stealth mode to get past anti-bot detection; without the
            # optional package, at least mask ``navigator.webdriver``.
            if stealth_async:
                await stealth_async(page)
            else:
                await page.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>false});"
                )

            pdf_content: Optional[bytes] = None

            async def on_response(response):
                """Keep the body of any response served as application/pdf."""
                nonlocal pdf_content
                if "application/pdf" in response.headers.get("content-type", ""):
                    try:
                        pdf_content = await response.body()
                        print(f"✓ 成功捕获 PDF,大小: {len(pdf_content)} bytes")
                    except Exception as e:
                        print(f"⚠ 读取 PDF 响应体失败: {e}")

            page.on("response", on_response)

            # Load the page. A failure here is deliberately non-fatal: the
            # response listener may still have captured the PDF.
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            except Exception:
                print("⚠ 页面加载超时,但继续处理...")

            # Wait for a Cloudflare-style challenge to finish.
            await handle_cloudflare(page)
            await page.wait_for_timeout(2000)

            # Nothing captured yet: wait explicitly for a PDF response.
            if not pdf_content:
                print("等待 PDF 响应...")
                try:
                    response = await page.wait_for_response(
                        lambda response: "application/pdf" in response.headers.get("content-type", ""),
                        timeout=15000,
                    )
                    pdf_content = await response.body()
                    print(f"✓ 通过 wait_for_response 获取 PDF,大小: {len(pdf_content)} bytes")
                except Exception as e:
                    print(f"⚠ 等待 PDF 响应超时: {e}")

            if not pdf_content:
                return False, "未能获取 PDF 内容"

            pdf_size = len(pdf_content)
            if pdf_size < min_pdf_bytes:
                return False, f"PDF 文件过小: {pdf_size} bytes,可能下载不完整"

            # Create the target directory first, matching the curl-cffi
            # downloader, so a fresh output folder does not make open() fail.
            os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
            with open(save_path, "wb") as f:
                f.write(pdf_content)
            print(f"✓ PDF 已保存到: {save_path},大小: {pdf_size} bytes")
            return True, ""
        except Exception as e:
            print(f"异常: {e}")
            return False, str(e)
        finally:
            # Single cleanup point; runs on every return path above.
            if browser is not None:
                try:
                    await browser.close()
                except Exception as close_err:
                    # Best effort: a failed close must not mask the result.
                    print(f"⚠ 关闭浏览器失败: {close_err}")
|
||
|
||
|
||
def download_pdf_with_curl_cffi(url: str, save_path: str) -> tuple[bool, str]:
    """Download a PDF with curl-cffi while impersonating Chrome's TLS fingerprint.

    The impersonation is intended to get past Cloudflare's JS challenge.

    Args:
        url: Address of the PDF; also sent as the ``Referer`` header.
        save_path: Destination file; missing parent directories are created.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, reason)`` where reason is
        ``"http_<status>"``, ``"not_valid_pdf"``, ``"curl_cffi_not_installed"``
        or the text of the exception that was raised.
    """
    try:
        import curl_cffi.requests as cf
        import os

        request_headers = {
            "Accept": "application/pdf,application/octet-stream,*/*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Referer": url,
        }
        reply = cf.get(
            url,
            impersonate="chrome131",
            headers=request_headers,
            timeout=30,
            allow_redirects=True,
        )

        status = reply.status_code
        if status != 200:
            return False, f"http_{status}"

        # Accept only payloads that start with the PDF magic bytes and are at
        # least 10 KiB long.
        payload = reply.content
        looks_like_pdf = payload.startswith(b"%PDF") and len(payload) >= 10240
        if not looks_like_pdf:
            return False, "not_valid_pdf"

        target_dir = os.path.dirname(os.path.abspath(save_path))
        os.makedirs(target_dir, exist_ok=True)
        with open(save_path, "wb") as out:
            out.write(payload)
        return True, ""
    except ImportError:
        return False, "curl_cffi_not_installed"
    except Exception as exc:
        return False, str(exc)
|
||
|
||
|
||
_CHALLENGE_TITLES = ["just a moment", "cloudflare", "checking your browser", "ddos-guard", "ddos guard", "attention required"]
|
||
|
||
|
||
async def handle_cloudflare(page: Page, timeout: int = 45000) -> bool:
    """Wait for a bot-detection interstitial (Cloudflare, DDoS-Guard, ...) to clear.

    Steps:
        1. Detect a challenge page from the page title.
        2. Try to click the Turnstile checkbox inside its iframe, if present.
        3. Wait until the title no longer contains a challenge keyword.

    Args:
        page: Playwright page that has already been navigated.
        timeout: Maximum time in milliseconds to wait for the title to change.

    Returns:
        ``True`` when no challenge was detected or it cleared in time;
        ``False`` when the title could not be read or the wait failed.
    """
    try:
        title = await page.title()
    except Exception:
        return False

    title_l = title.lower()
    if not any(kw in title_l for kw in _CHALLENGE_TITLES):
        return True  # no challenge

    print(f"检测到安全挑战页面(标题: {title!r}),等待 JS 自动通过...")

    # Try to click the Cloudflare Turnstile checkbox (if one exists).
    for frame_sel in (
        'iframe[src*="challenges.cloudflare.com"]',
        'iframe[title*="cloudflare"]',
        'iframe[title*="challenge"]',
    ):
        try:
            checkbox = page.frame_locator(frame_sel).locator('input[type="checkbox"]')
            if await checkbox.count() > 0:
                await checkbox.click(timeout=3000)
                print(" ✓ 点击了验证复选框")
                break
        except Exception:
            # Best effort: a missing or unclickable widget is expected;
            # move on to the next selector.
            continue

    try:
        # The keyword list is handed to the page as the predicate's argument,
        # so the in-browser check uses the same list as the detection above.
        await page.wait_for_function(
            """(kws) => {
                const t = document.title.toLowerCase();
                return !kws.some(kw => t.includes(kw));
            }""",
            arg=list(_CHALLENGE_TITLES),
            timeout=timeout,
        )
    except Exception:
        print("⚠ 安全挑战处理超时,继续尝试获取 PDF...")
        return False

    print(" ✓ 安全挑战已通过")
    return True
|