“桌面自动化”解救“浏览器自动化”受阻(反爬虫检测)(pywinauto、pyautogui、playwright)
桌面自动化拯救浏览器自动化
使用playwright自动化访问或爬取某些网页时,网站往往要求用户先登录;而一些网站一旦检测到用户是通过浏览器自动化工具访问的,就会阻止其进入。怎么办?解决办法其实很简单:使用pywinauto、pyautogui等桌面自动化工具直接操控真实的浏览器(其中保存有用户的登录信息),这样就可以绕过网站的各种反爬虫检测,顺利进入目标网站。然后把网页内容以文件的形式保存到本地,再通过playwright的set_content等功能直接加载这个本地网页文件,其后的内容获取流程就和一般的playwright抓取网页内容的流程完全一样了。
一般的playwright爬虫容易被网站的反爬虫检测识别,从而被拒绝获取内容:
使用桌面自动化工具可解决上述难题
实现
import os
import pyperclip
import pywinauto
import pyautogui
import time
import subprocess
from playwright.sync_api import sync_playwright####################################
# Default install location of Chrome on Windows; override via ``chrome_path``.
_CHROME_EXE = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# Criteria pywinauto uses to locate the top-level Chrome window.
_CHROME_WINDOW = {"title_re": ".*Chrome.*", "class_name": "Chrome_WidgetWin_1"}

# Extraction script used when the caller does not supply one.
_DEFAULT_JS = """()=>{return document.body.innerText;}"""


def _close_chrome_gracefully() -> None:
    """Close a running Chrome politely so the next launch shows no restore prompt."""
    try:
        app = pywinauto.Application(backend="uia").connect(timeout=5, **_CHROME_WINDOW)
        window = app.window(**_CHROME_WINDOW)
        if window.exists():
            window.set_focus()
            window.type_keys("%{F4}")  # Alt+F4
            time.sleep(2)
    except Exception as e:
        # Best effort: usually this just means no Chrome window was open.
        print(f"尝试正常关闭Chrome时出错: {e}")

    # Ask any leftover processes to exit. /f is deliberately omitted so Chrome
    # can clean up after itself instead of being killed mid-write.
    try:
        subprocess.run(["taskkill", "/im", "chrome.exe", "/t"], check=False)
    except OSError as e:
        print(f"执行taskkill时出错: {e}")
    time.sleep(3)


def _wait_for_saved_file(path: str, timeout: int) -> bool:
    """Poll once per second until *path* exists with a stable, non-zero size.

    Returns True as soon as two consecutive polls see the same non-zero size,
    otherwise whether the file exists at all after *timeout* seconds.
    """
    last_size = -1
    for waited in range(timeout):
        print(f'已经等待{waited}秒')
        time.sleep(1)
        try:
            size = os.path.getsize(path)
        except OSError:
            # Not created yet; keep waiting.
            continue
        if size > 0 and size == last_size:
            return True
        last_size = size
    return os.path.isfile(path)


def _evaluate_saved_page(html: str, js: str):
    """Load *html* into a Playwright page and return the result of running *js*."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context(ignore_https_errors=True)
            page = context.new_page()
            page.set_content(html)
            page.wait_for_load_state("domcontentloaded")
            try:
                return page.evaluate(js)
            except Exception as e:
                print(f"执行js脚本时出错: {e}")
                return ""
        finally:
            # Close the browser even when loading the content fails.
            browser.close()


def chrome_browse(
    url: str,
    js: str = "",
    *,
    chrome_path: str = _CHROME_EXE,
    download_dir: str = "",
    scroll_times: int = 20,
    save_timeout: int = 20,
) -> str:
    """Open *url* in a real Chrome window, save the page, and extract content from it.

    Chrome is driven through desktop automation (pywinauto / pyautogui) rather
    than a browser-automation protocol. The page is saved to disk via Ctrl+S,
    and the saved HTML is then loaded into Playwright with ``set_content`` so
    that *js* can be evaluated against it.

    Args:
        url: Address to open in Chrome.
        js: JavaScript function source passed to ``page.evaluate``. When empty,
            the page's ``document.body.innerText`` is returned.
        chrome_path: Path to the Chrome executable.
        download_dir: Folder to save the page into. When empty, the current
            user's ``Downloads`` folder is used.
        scroll_times: Number of mouse-wheel scrolls performed before saving
            (one second apart).
        save_timeout: Maximum number of seconds to wait for the saved file.

    Returns:
        The value produced by *js*, or ``""`` when the saved file cannot be
        read or the script fails.

    Note:
        The clipboard is overwritten, and keyboard/mouse input is sent to the
        foreground window while this runs. The "Save as type" option of the
        save dialog is not set here; the browser's remembered choice is used.
    """
    _close_chrome_gracefully()

    # Launch Chrome with flags that suppress start-up dialogs and expose the
    # renderer's accessibility tree.
    subprocess.Popen([
        chrome_path,
        "--force-renderer-accessibility",
        "--disable-features=TranslateUI,BlinkGenPropertyTrees",
        "--disable-session-crashed-bubble",  # no "Chrome didn't shut down correctly" bubble
        "--no-default-browser-check",
        "--no-first-run",
        "--disable-prompt-on-repost",
        "--disable-background-mode",
    ])
    time.sleep(3)

    app = pywinauto.Application(backend="uia").connect(**_CHROME_WINDOW)
    time.sleep(3)
    chrome_window = app.window(**_CHROME_WINDOW)
    time.sleep(1)

    # Focus the address bar (Ctrl+L) and paste the URL from the clipboard;
    # pasting avoids an active IME mangling typed characters.
    chrome_window.set_focus()
    chrome_window.type_keys("^l")
    time.sleep(1)
    pyperclip.copy(url)
    pyautogui.hotkey("ctrl", "v")
    time.sleep(1)
    chrome_window.type_keys("{ENTER}")
    time.sleep(3)

    # Scroll down with the mouse wheel, pausing a second between scrolls.
    pyautogui.moveTo(100, 300)
    for i in range(scroll_times):
        pyautogui.scroll(-2000)
        print(f'滚屏{i}次')
        time.sleep(1)

    # Open the "Save as" dialog.
    chrome_window.type_keys("^s")
    time.sleep(3)

    # A timestamp keeps the file name unique across runs.
    file_name = f"test.{time.time()}.html"
    target_dir = download_dir or os.path.join(os.path.expanduser("~"), "Downloads")
    html_filename = os.path.abspath(os.path.join(target_dir, file_name))

    # Paste the absolute path so the file is written where it is read back
    # from, regardless of which folder the dialog opened in.
    pyperclip.copy(html_filename)
    pyautogui.press("backspace")  # clear the pre-filled name first
    pyautogui.hotkey("ctrl", "v")
    print(f'文件名: {file_name}')
    time.sleep(1)
    pyautogui.press("enter")
    time.sleep(1)

    if not _wait_for_saved_file(html_filename, save_timeout):
        print(f"等待文件保存超时: {html_filename}")

    chrome_window.close()
    print('chrome窗口已关闭')
    print(html_filename)

    # errors="replace" keeps the page usable even if it contains a few bytes
    # that are not valid UTF-8.
    try:
        with open(html_filename, "r", encoding="utf-8", errors="replace") as file:
            content = file.read()
    except OSError as e:
        print(f"读取保存的网页文件时出错: {e}")
        return ""

    result = _evaluate_saved_page(content, js or _DEFAULT_JS)
    print(result)
    return result
if __name__ == "__main__":
    # Demo run: collect the text of the second cell of every table row.
    # The script contains a backslash escape ("\n"), so it is written as a raw
    # string; without the r-prefix it would have to be spelled "\\n".
    extract_js = r"""()=>{let ls=document.querySelectorAll("tr td:nth-child(2)");let r="";for(let l of ls){r+=l.innerText+"\n";}return r;}"""
    page_url = "https://example.com"
    print(chrome_browse(page_url, js=extract_js))