paper_server/apps/resm/d_oaurl.py

196 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from playwright.async_api import async_playwright, Page, Browser
from typing import Optional
import asyncio
import sys
import os
# Try to import playwright-stealth; when the package is not installed,
# `stealth_async` falls back to None so callers can test it for truthiness.
try:
    from playwright_stealth import stealth_async
except ImportError:
    stealth_async = None
# Browser launch arguments that hide automation fingerprints.
_STEALTH_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run",
    "--no-default-browser-check",
    "--disable-infobars",
    "--disable-extensions",
    "--disable-notifications",
]

# User-agent string of desktop Chrome 131 on 64-bit Windows 10, assembled
# from its three space-separated product tokens.
_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0 Safari/537.36",
    )
)
async def download_from_url_playwright(
    url: str, save_path: str, *, headless: bool = False
) -> tuple[bool, str]:
    """Download a PDF by loading ``url`` in a Playwright-driven Chromium.

    A response listener captures the body of any response whose
    ``Content-Type`` contains ``application/pdf``. After navigation the
    function lets ``handle_cloudflare`` deal with a possible bot-protection
    challenge and, if no PDF has been captured by then, waits up to 15 s
    for a PDF response.

    Args:
        url: Address expected to serve (or lead to) a PDF.
        save_path: Destination file; missing parent directories are created.
        headless: Launch Chromium without a visible window. Defaults to
            ``False`` (a visible window), which is what the function did
            before this parameter existed.

    Returns:
        ``(True, "")`` when a PDF of at least 10240 bytes was written to
        ``save_path``; otherwise ``(False, reason)`` where ``reason`` is a
        human-readable message or the text of the exception that was raised.
    """

    def is_pdf(response) -> bool:
        # A response counts as a PDF purely by its Content-Type header.
        return "application/pdf" in response.headers.get("content-type", "")

    async with async_playwright() as p:
        # Bound before launch so the cleanup in `finally` never touches an
        # unassigned name when `launch()` itself fails.
        browser: Optional[Browser] = None
        try:
            browser = await p.chromium.launch(
                headless=headless,
                args=_STEALTH_ARGS,
            )
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=_USER_AGENT,
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
            )
            page = await context.new_page()
            page.set_default_timeout(60000)

            # Apply stealth patches to get past anti-bot detection; without
            # playwright-stealth, at least mask `navigator.webdriver`.
            if stealth_async:
                await stealth_async(page)
            else:
                await page.add_init_script(
                    "Object.defineProperty(navigator,'webdriver',{get:()=>false});"
                )

            pdf_content: Optional[bytes] = None

            async def on_response(response):
                nonlocal pdf_content
                if is_pdf(response):
                    try:
                        pdf_content = await response.body()
                        print(f"✓ 成功捕获 PDF大小: {len(pdf_content)} bytes")
                    except Exception as e:
                        print(f"⚠ 读取 PDF 响应体失败: {e}")

            # Register the listener before navigating so the PDF response
            # cannot slip past it.
            page.on("response", on_response)

            # Load the page. Any navigation error is tolerated on purpose:
            # the listener may still have captured (or may yet capture) the PDF.
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            except Exception:
                print("⚠ 页面加载超时,但继续处理...")

            # Give a Cloudflare-style challenge the chance to complete.
            await handle_cloudflare(page)
            await page.wait_for_timeout(2000)

            # Nothing captured yet: explicitly wait for a PDF response.
            if not pdf_content:
                print("等待 PDF 响应...")
                try:
                    response = await page.wait_for_response(is_pdf, timeout=15000)
                    pdf_content = await response.body()
                    print(f"✓ 通过 wait_for_response 获取 PDF大小: {len(pdf_content)} bytes")
                except Exception as e:
                    print(f"⚠ 等待 PDF 响应超时: {e}")

            if not pdf_content:
                return False, "未能获取 PDF 内容"

            pdf_size = len(pdf_content)
            # Anything under 10 KiB is treated as a truncated or placeholder file.
            if pdf_size < 10240:
                return False, f"PDF 文件过小: {pdf_size} bytes可能下载不完整"

            # Create the target directory first, as the curl-cffi downloader does.
            os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
            with open(save_path, "wb") as f:
                f.write(pdf_content)
            print(f"✓ PDF 已保存到: {save_path},大小: {pdf_size} bytes")
            return True, ""
        except Exception as e:
            print(f"异常: {e}")
            return False, str(e)
        finally:
            # Single cleanup path for every exit above. Closing is best-effort:
            # a failure is reported but never replaces the result being returned.
            if browser is not None:
                try:
                    await browser.close()
                except Exception as close_err:
                    print(f"⚠ 关闭浏览器失败: {close_err}")
def download_pdf_with_curl_cffi(url: str, save_path: str) -> tuple[bool, str]:
    """Download a PDF with curl-cffi while impersonating Chrome's TLS fingerprint.

    The impersonation is intended to get past Cloudflare's JS challenge.

    Args:
        url: Address of the PDF; it is also sent as the ``Referer`` header.
        save_path: Destination file; missing parent directories are created.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, reason)`` where
        ``reason`` is ``"http_<status>"`` for a non-200 reply,
        ``"not_valid_pdf"`` when the body lacks the ``%PDF`` magic or is
        smaller than 10240 bytes, ``"curl_cffi_not_installed"`` when the
        import fails, or the text of any other exception.
    """
    try:
        import os
        import curl_cffi.requests as curl_requests

        request_headers = {
            "Accept": "application/pdf,application/octet-stream,*/*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Referer": url,
        }
        reply = curl_requests.get(
            url,
            impersonate="chrome131",
            headers=request_headers,
            timeout=30,
            allow_redirects=True,
        )
        if reply.status_code != 200:
            return False, f"http_{reply.status_code}"

        payload = reply.content
        # Valid only with the PDF magic bytes AND a plausible minimum size.
        looks_like_pdf = payload.startswith(b"%PDF") and len(payload) >= 10240
        if not looks_like_pdf:
            return False, "not_valid_pdf"

        target_dir = os.path.dirname(os.path.abspath(save_path))
        os.makedirs(target_dir, exist_ok=True)
        with open(save_path, "wb") as out_file:
            out_file.write(payload)
        return True, ""
    except ImportError:
        return False, "curl_cffi_not_installed"
    except Exception as exc:
        return False, str(exc)
# Lower-case substrings of a page title that mark a bot-protection challenge
# page. This list is the single source of truth: it is also handed to the
# in-page predicate in `handle_cloudflare`.
_CHALLENGE_TITLES = [
    "just a moment",
    "cloudflare",
    "checking your browser",
    "ddos-guard",
    "ddos guard",
    "attention required",
]


async def handle_cloudflare(page: Page, timeout: int = 45000) -> bool:
    """Wait for a bot-detection challenge (Cloudflare, DDoS-Guard, ...) to clear.

    Steps:
      1. Detect a challenge page by looking for known keywords in the title.
      2. Try to click the Turnstile checkbox inside a challenge iframe, if any.
      3. Wait until the title no longer contains any challenge keyword.

    Args:
        page: The Playwright page to inspect.
        timeout: Maximum time in milliseconds to wait for the title to change.

    Returns:
        ``True`` when no challenge was detected or the challenge cleared in
        time; ``False`` when the title could not be read or the wait failed.
    """
    try:
        title = await page.title()
    except Exception:
        return False

    if not any(kw in title.lower() for kw in _CHALLENGE_TITLES):
        return True  # no challenge

    print(f"检测到安全挑战页面(标题: {title!r}),等待 JS 自动通过...")

    # Try to click the Cloudflare Turnstile checkbox (if present). This is
    # best-effort: a selector that fails just moves on to the next one.
    for frame_sel in (
        'iframe[src*="challenges.cloudflare.com"]',
        'iframe[title*="cloudflare"]',
        'iframe[title*="challenge"]',
    ):
        try:
            checkbox = page.frame_locator(frame_sel).locator('input[type="checkbox"]')
            if await checkbox.count() > 0:
                await checkbox.click(timeout=3000)
                print(" ✓ 点击了验证复选框")
                break
        except Exception:
            continue

    try:
        # The keyword list is passed in as the predicate's argument so the
        # Python-side and browser-side checks can never drift apart.
        await page.wait_for_function(
            """(keywords) => {
                const t = document.title.toLowerCase();
                return !keywords.some(kw => t.includes(kw));
            }""",
            arg=_CHALLENGE_TITLES,
            timeout=timeout,
        )
    except Exception:
        print("⚠ 安全挑战处理超时,继续尝试获取 PDF...")
        return False

    print(" ✓ 安全挑战已通过")
    return True