From 94f269626ddb40baf2327353591ad6994bed8dcc Mon Sep 17 00:00:00 2001 From: caoqianming Date: Mon, 9 Feb 2026 15:17:02 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0pyautogui=E8=B0=83?= =?UTF-8?q?=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/resm/cloudflare_checkbox2.png | Bin 0 -> 492 bytes apps/resm/d_oaurl.py | 196 +++++++++++++++++++++++++++++ apps/resm/tasks.py | 37 +++++- requirements.txt | 5 + 4 files changed, 236 insertions(+), 2 deletions(-) create mode 100644 apps/resm/cloudflare_checkbox2.png create mode 100644 apps/resm/d_oaurl.py diff --git a/apps/resm/cloudflare_checkbox2.png b/apps/resm/cloudflare_checkbox2.png new file mode 100644 index 0000000000000000000000000000000000000000..e49579928c311bfb0b706f3a0bd33451c210f7d5 GIT binary patch literal 492 zcmeAS@N?(olHy`uVBq!ia0vp^UO?>6!2~1^*B_Y*q&N#aB8wRq_>O=u<5X=vX$A(y zd7dtgAr*7p&f0jY*+8T<_~-JzcYW&~%@Q|QKVkXnIpH1p3U`lOHLP}T=X)s6CDd}u zL%HhI&t=^^PyFm~YpJfZoByc%WFOxK{VUFGRVM=YC0{8z+or5&KXZRs-t(C2rH?J_ z=Gx>Z`I`z)vJ`F_|yB6(TntjBw+ITtJSJ+`V?zyS=ghXIf<-GG>4S3?0c)3X^_iR`2TCa0@r`fSDwe=6= zs^0(JXMXHV_@N5!MQTh-r+okYH`M=f$?m&BPkh>@e7YNS<(;VO!YN)-nvz~Cr%rr) zWAW0=S;A6vGG_M-y#GF4{HWu@<~}6r|1fVk-Fmz^aOXB)^f7q4`njxgN@xNAKs)9Y literal 0 HcmV?d00001 diff --git a/apps/resm/d_oaurl.py b/apps/resm/d_oaurl.py new file mode 100644 index 0000000..a3c4397 --- /dev/null +++ b/apps/resm/d_oaurl.py @@ -0,0 +1,196 @@ +from playwright.async_api import async_playwright, Page, Browser +from typing import Optional +import asyncio +import sys +import os +import pyautogui +import time +from PIL import Image +import io + +# 尝试导入 OpenCV 用于更好的模板匹配 +try: + import cv2 + HAS_CV2 = True +except ImportError: + HAS_CV2 = False + +# 尝试导入 playwright-stealth,如果没有安装则忽略 +try: + from playwright_stealth import stealth_async +except ImportError: + stealth_async = None + + +async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool, str]: + async with async_playwright() as p: + try: + browser = await p.chromium.launch(headless=False) + context = await browser.new_context(viewport={"width": 1920, "height": 1080}) + page = await context.new_page() + page.set_default_timeout(60000) + + # 应用 stealth 模式绕过反爬虫检测 + if stealth_async: + await stealth_async(page) + else: + # 如果没有 stealth,手动设置一些反爬虫对抗 + await page.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + """) + + pdf_content: Optional[bytes] = None + + async def on_response(response): + nonlocal pdf_content + if "application/pdf" in response.headers.get("content-type", ""): + pdf_content = await response.body() + + page.on("response", on_response) + + # 先用较宽松的等待条件加载页面,避免卡在 Cloudflare + try: + await page.goto(url, wait_until="domcontentloaded", timeout=5000) + except: + print("⚠ 页面加载超时,但继续处理...") + await page.wait_for_timeout(5000) + + # 处理 Cloudflare 校验 + print("开始处理 Cloudflare 校验...") + await page.wait_for_timeout(3000) + + # Cloudflare 可能需要连续点击多次,最多尝试5次 + max_cloudflare_attempts = 5 + + for attempt in range(max_cloudflare_attempts): + # 检查是否已获取到 PDF,如果已获取则无需继续验证 + if pdf_content: + print("✓ 已获取到 PDF 内容,停止验证框处理") + break + + print(f"\nCloudflare 验证尝试 {attempt + 1}/{max_cloudflare_attempts}") + success = await handle_cloudflare_with_image(page) + + if success: + print("✓ 成功处理一次验证框") + # 等待新验证框出现或页面刷新 + await page.wait_for_timeout(2000) + # 检查是否还有验证框,如果没有则说明验证完成 + # 这里简单地继续尝试,直到达到最大次数 + if attempt < max_cloudflare_attempts - 1: + print(" 检查是否还有验证框...") + await page.wait_for_timeout(1000) + else: + print("⚠ 未找到验证框,可能已完成验证或验证框已消失") + break + + print("✓ Cloudflare 验证处理完成") + await page.wait_for_timeout(2000) + + # 如果尚未获取 PDF,继续等待响应 + if not pdf_content: + print("等待 PDF 响应...") + try: + await page.wait_for_response( + lambda response: "application/pdf" in response.headers.get("content-type", ""), + timeout=15000 + ) + except Exception as e: + print(f"⚠ 等待 PDF 响应超时: {e}") + + if pdf_content: + with open(save_path, "wb") as f: + f.write(pdf_content) + await browser.close() + return True, "" + else: + await browser.close() + return False, "未能获取 PDF 内容" + except Exception as e: + print(f"异常: {e}") + return False, str(e) + finally: + try: + await browser.close() + except: + pass + + +async def handle_cloudflare_with_image(page: Page) -> bool: + """ + 使用图像识别方式处理 Cloudflare 验证框 + 支持模板匹配和颜色识别两种方式 + """ + # 在尝试之前先等待2秒,让验证框完全加载 + await page.wait_for_timeout(2000) + + max_retries = 5 + + for retry in range(max_retries): + print(f"图像识别方式尝试第 {retry + 1}/{max_retries} 次") + + try: + # 方式1: 通过模板图像识别(如果有模板文件) + success = await try_template_matching() + if success: + print(" ✓ 模板匹配方式成功") + await page.wait_for_timeout(5000) + return True + + print(f" 等待后重试... ({retry + 1}/{max_retries})") + await page.wait_for_timeout(2000) + + except Exception as e: + print(f" ✗ 图像处理异常: {e}") + await page.wait_for_timeout(2000) + + return False + + +async def try_template_matching() -> bool: + """ + 通过模板匹配查找并点击验证框 + 使用 pyautogui.locateOnScreen 直接在屏幕上定位验证框模板 + """ + template_paths = [ + 'apps/resm/cloudflare_checkbox2.png', + ] + + # pyautogui 定位的准确度阈值(0.0-1.0,越高越严格) + ACCURACY = 0.6 + + for template_path in template_paths: + if not os.path.exists(template_path): + print(f" 模板文件不存在: {template_path}") + continue + + try: + print(f" 尝试在屏幕上定位模板: {template_path} (confidence={ACCURACY})") + + # 直接在屏幕上查找模板,使用 confidence 参数 + loc = pyautogui.locateOnScreen(template_path, confidence=ACCURACY) + + if loc: + # loc 是 (left, top, width, height) 或 (x, y, w, h) + # pyautogui.center(loc) 返回中心坐标 + center_x, center_y = pyautogui.center(loc) + print(f" 找到验证框位置: ({center_x}, {center_y})") + print(f" 模板匹配区域: {loc}") + pyautogui.click(center_x, center_y, clicks=1, interval=0.1) + return True + else: + print(f" 未找到模板 (confidence={ACCURACY})") + return False + + except Exception as e: + # 捕获所有异常,包括 ImageNotFoundException + error_type = type(e).__name__ + if "ImageNotFoundException" in error_type: + print(f" 模板匹配异常: {error_type} - 屏幕上找不到模板,停止尝试") + else: + print(f" 模板匹配异常: {error_type} - {e}") + return False + + return False diff --git a/apps/resm/tasks.py b/apps/resm/tasks.py index 86bbeca..e6d6896 100644 --- a/apps/resm/tasks.py +++ b/apps/resm/tasks.py @@ -11,7 +11,10 @@ from lxml import etree from celery import current_app from datetime import datetime, timedelta import random -from django.db.models import Q +from .d_oaurl import download_from_url_playwright +import asyncio +import sys +import os # config.email = "caoqianming@foxmail.com" config.email = "caoqianming@ctc.ac.cn" @@ -28,6 +31,25 @@ ELSEVIER_HEADERS = { "X-ELS-APIKey": ELSEVIER_APIKEY, } + +def run_async(coro): + """ + 跨平台运行异步任务,解决 Windows 上 asyncio subprocess 问题 + """ + if sys.platform == 'win32': + # Windows 上需要使用 ProactorEventLoop 来支持 subprocess + policy = asyncio.WindowsProactorEventLoopPolicy() + asyncio.set_event_loop_policy(policy) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(coro) + finally: + loop.close() + else: + # Unix/Linux/Mac 上使用默认方式 + return asyncio.run(coro) + @shared_task(base=CustomTask) def get_paper_meta_from_openalex(publication_year:int, keywords:str="", search:str="", end_year:int=None): cache_key = f"openalex_cursor_{publication_year}_{keywords}{search}" @@ -368,7 +390,18 @@ def save_pdf_from_oa_url(paper:Paper): else: paper.save_fail_reason("oa_url_not_pdf") return "oa_url_not_pdf" - return f"oa_url_pdf_error: {res.status_code}" + elif res.status_code == 403: + paper_path = paper.init_paper_path("pdf") + is_ok, err_msg = run_async(download_from_url_playwright(paper.oa_url, paper_path)) + if is_ok: + paper.has_fulltext = True + paper.has_fulltext_pdf = True + paper.save(update_fields=["has_fulltext", "has_fulltext_pdf", "update_time"]) + return "success" + else: + paper.save_fail_reason(f"oa_url_pdf_play_error: {err_msg}") + return f"oa_url_pdf_play_error: {err_msg}" + return f"oa_url_pdf_oerror: {res.status_code}" def save_pdf_from_openalex(paper:Paper): if cache.get("openalex_api_exceed"): diff --git a/requirements.txt b/requirements.txt index 797ffa9..f797502 100755 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,8 @@ xlwt==1.3.0 openpyxl==3.1.5 cron-descriptor==1.2.35 docxtpl==0.16.7 +playwright==1.58.0 +playwright-stealth==2.0.1 +pyautogui==0.9.54 +pillow>=10.0.0 +opencv-python>=4.8.0