Python爬虫第9课:验证码识别与自动化处理
目录
- 课程目标
- 1. 验证码基础知识
- 1.1 验证码类型
- 1.2 验证码识别流程
- 2. OCR文字识别技术
- 2.1 Tesseract OCR
- 2.2 PaddleOCR集成
- 3. 图像验证码处理
- 3.1 点击验证码
- 3.2 拼图验证码
- 4. 滑块验证码破解
- 4.1 基础滑块验证码
- 4.2 高级滑块验证码
- 5. 第三方验证码服务
- 5.1 验证码识别API
- 5.2 集成验证码服务到爬虫
- 6. 实战案例:电商网站登录
- 6.1 综合验证码处理
- 7. 练习与作业
- 7.1 基础练习
- 7.2 进阶练习
- 7.3 实战项目
- 8. 常见问题与解决方案
- 8.1 识别准确率问题
- 8.2 反检测问题
- 8.3 成本控制
- 9. 下节预告
专栏导读
🌸 欢迎来到Python办公自动化专栏—Python处理办公问题,解放您的双手
🏳️‍🌈 个人博客主页:请点击——> 个人的博客主页 求收藏
🏳️‍🌈 Github主页:请点击——> Github主页 求Star⭐
🏳️‍🌈 知乎主页:请点击——> 知乎主页 求关注
🏳️‍🌈 CSDN博客主页:请点击——> CSDN的博客主页 求关注
👍 该系列文章专栏:请点击——>Python办公自动化专栏 求订阅
🕷 此外还有爬虫专栏:请点击——>Python爬虫基础专栏 求订阅
📕 此外还有python基础专栏:请点击——>Python基础学习专栏 求订阅
文章作者技术和水平有限,如果文中出现错误,希望大家能指正🙏
❤️ 欢迎各位佬关注! ❤️
课程目标
- 了解各种验证码类型和识别技术
- 掌握OCR文字识别技术
- 学会处理图像验证码
- 掌握滑块验证码破解技术
- 了解第三方验证码服务
1. 验证码基础知识
1.1 验证码类型
"""
常见验证码类型:
1. 文字验证码 - 数字、字母、汉字
2. 图像验证码 - 点击指定图像
3. 滑块验证码 - 拖动滑块完成拼图
4. 行为验证码 - 模拟人类行为
5. 语音验证码 - 听音识别
6. 短信验证码 - 手机验证
"""
import cv2
import numpy as np
import requests
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
import base64
import io
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class CaptchaHandler:
    """Download captcha images and clean them up before OCR."""

    def __init__(self):
        # On Windows, point pytesseract at the Tesseract binary if it is not on PATH:
        # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_captcha(self, url, save_path=None):
        """Fetch a captcha image over HTTP.

        Args:
            url: Address of the captcha image.
            save_path: Optional file path; when given, the raw bytes are also
                written there.

        Returns:
            A PIL image, or None when the download or decoding fails.
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            if save_path:
                with open(save_path, 'wb') as f:
                    f.write(response.content)

            return Image.open(io.BytesIO(response.content))
        except Exception as e:
            print(f"下载验证码失败:{e}")
            return None

    def preprocess_image(self, image, threshold=128):
        """Convert to grayscale, boost contrast and sharpness, then binarise.

        Args:
            image: PIL image in any mode.
            threshold: Gray level below which a pixel becomes black.

        Returns:
            A mode-'1' (bilevel) PIL image.
        """
        if image.mode != 'L':
            image = image.convert('L')

        image = ImageEnhance.Contrast(image).enhance(2.0)
        image = ImageEnhance.Sharpness(image).enhance(2.0)

        return image.point(lambda x: 0 if x < threshold else 255, '1')

    def remove_noise(self, image):
        """Remove speckle noise with morphology and a median filter.

        Args:
            image: PIL image; bilevel output of preprocess_image is accepted.

        Returns:
            A mode-'L' PIL image.
        """
        # A mode-'1' image becomes a boolean array, which the OpenCV filters
        # reject; go through 8-bit grayscale (0/255) first.
        if image.mode != 'L':
            image = image.convert('L')
        img_array = np.array(image)

        kernel = np.ones((2, 2), np.uint8)
        img_array = cv2.morphologyEx(img_array, cv2.MORPH_CLOSE, kernel)
        img_array = cv2.morphologyEx(img_array, cv2.MORPH_OPEN, kernel)
        img_array = cv2.medianBlur(img_array, 3)

        return Image.fromarray(img_array)


# Usage example
handler = CaptchaHandler()

# Download the captcha, then run it through the preprocessing pipeline
captcha_url = "https://example.com/captcha.jpg"
image = handler.download_captcha(captcha_url)

if image:
    # Binarise first, de-noise second
    processed_image = handler.preprocess_image(image)
    cleaned_image = handler.remove_noise(processed_image)

    # Keep the cleaned image on disk for inspection
    cleaned_image.save("processed_captcha.jpg")
1.2 验证码识别流程
class CaptchaRecognizer:
    """Recognise text captchas with Tesseract, whole-image or per character."""

    # Characters Tesseract is allowed to emit.
    _WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    # Single-character substitutions applied to every OCR result.
    # NOTE(review): the mapping is unconditional and mixes directions
    # (digit 0 -> letter O, but letters l/I/S -> digits); confirm this suits
    # the target captcha alphabet.
    _SUBSTITUTIONS = str.maketrans({'0': 'O', 'l': '1', 'I': '1', 'S': '5'})

    def __init__(self):
        self.handler = CaptchaHandler()

        # Treat the image as a single word, restricted to alphanumerics
        self.ocr_config = f'--psm 8 -c tessedit_char_whitelist={self._WHITELIST}'

        # Accepted glyph size, in pixels, when segmenting
        self.char_width_range = (10, 30)
        self.char_height_range = (20, 40)

    def _load_clean_image(self, image_path_or_url):
        """Load an image from a URL or local path and preprocess it.

        Returns the cleaned PIL image, or None when it cannot be loaded.
        """
        # Only real URL schemes go to the downloader, so a local file whose
        # name merely starts with "http" is still opened from disk.
        if image_path_or_url.startswith(('http://', 'https://')):
            image = self.handler.download_captcha(image_path_or_url)
        else:
            try:
                image = Image.open(image_path_or_url)
            except OSError as e:
                print(f"读取验证码图片失败:{e}")
                image = None

        if image is None:
            return None

        processed_image = self.handler.preprocess_image(image)
        return self.handler.remove_noise(processed_image)

    def recognize_text_captcha(self, image_path_or_url):
        """Recognise a text captcha in one OCR pass.

        Args:
            image_path_or_url: Local file path or http(s) URL.

        Returns:
            The cleaned text, or None when loading or OCR fails.
        """
        cleaned_image = self._load_clean_image(image_path_or_url)
        if cleaned_image is None:
            return None

        try:
            text = pytesseract.image_to_string(cleaned_image, config=self.ocr_config).strip()
        except Exception as e:
            print(f"OCR识别失败:{e}")
            return None
        return self.clean_ocr_result(text)

    def clean_ocr_result(self, text):
        """Drop non-alphanumeric characters and apply the fixed substitutions."""
        alnum_only = ''.join(c for c in text if c.isalnum())
        return alnum_only.translate(self._SUBSTITUTIONS)

    @staticmethod
    def _column_runs(column_counts):
        """Return (start, end) index pairs of consecutive non-zero columns."""
        runs = []
        start = None
        for i, count in enumerate(column_counts):
            if count > 0 and start is None:
                start = i
            elif count == 0 and start is not None:
                runs.append((start, i))
                start = None

        # A glyph touching the right edge is closed at the image border
        if start is not None:
            runs.append((start, len(column_counts)))
        return runs

    def segment_characters(self, image):
        """Split a binarised image into per-character crops.

        Columns are grouped by a vertical projection of black (0) pixels;
        groups whose width falls outside char_width_range are discarded.

        Returns:
            A list of PIL images, left to right.
        """
        img_array = np.array(image)
        vertical_projection = np.sum(img_array == 0, axis=0)

        min_width, max_width = self.char_width_range
        return [
            image.crop((start, 0, end, image.height))
            for start, end in self._column_runs(vertical_projection)
            if min_width <= end - start <= max_width
        ]

    def recognize_segmented_captcha(self, image_path_or_url):
        """Recognise a captcha by segmenting it and reading each glyph.

        Args:
            image_path_or_url: Local file path or http(s) URL.

        Returns:
            The cleaned text (possibly empty), or None when the image cannot
            be loaded.
        """
        cleaned_image = self._load_clean_image(image_path_or_url)
        if cleaned_image is None:
            return None

        # psm 10 tells Tesseract to expect a single character
        single_char_config = f'--psm 10 -c tessedit_char_whitelist={self._WHITELIST}'

        recognized = []
        for i, char_img in enumerate(self.segment_characters(cleaned_image)):
            try:
                char_text = pytesseract.image_to_string(char_img, config=single_char_config).strip()
            except Exception as e:
                print(f"字符{i}识别失败:{e}")
                continue
            if char_text:
                recognized.append(char_text[0])  # keep only the first character

        return self.clean_ocr_result(''.join(recognized))


# Usage example
recognizer = CaptchaRecognizer()

# Whole-image recognition
captcha_text = recognizer.recognize_text_captcha("captcha.jpg")
print(f"识别结果:{captcha_text}")

# Per-character recognition after segmentation
segmented_result = recognizer.recognize_segmented_captcha("captcha.jpg")
print(f"分割识别结果:{segmented_result}")
2. OCR文字识别技术
2.1 Tesseract OCR
import pytesseract
from PIL import Image
import cv2
import numpy as np


class TesseractOCR:
    """Tesseract wrapper that tries several character-set configurations."""

    def __init__(self, tesseract_path=None):
        """Optionally point pytesseract at a specific Tesseract binary."""
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        # Tesseract command-line options per captcha type
        self.configs = {
            'digits_only': '--psm 8 -c tessedit_char_whitelist=0123456789',
            'letters_only': '--psm 8 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
            'alphanumeric': '--psm 8 -c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
            'chinese': '--psm 8 -l chi_sim',
            'single_char': '--psm 10',
            'single_word': '--psm 8',
            'single_line': '--psm 7',
        }

    def enhance_image_for_ocr(self, image):
        """Denoise, binarise and upscale an image for OCR.

        Args:
            image: PIL image (any mode) or an OpenCV array that is grayscale,
                BGR or BGRA.

        Returns:
            A single-channel array, three times the input size.
        """
        if isinstance(image, Image.Image):
            # Normalise palette/grayscale/alpha images before the channel swap
            image = cv2.cvtColor(np.array(image.convert('RGB')), cv2.COLOR_RGB2BGR)

        if image.ndim == 2:
            gray = image  # already grayscale
        elif image.shape[2] == 4:
            gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        binary = cv2.adaptiveThreshold(
            blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        kernel = np.ones((2, 2), np.uint8)
        cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
        cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

        # Upscaling the glyphs before OCR
        height, width = cleaned.shape
        return cv2.resize(cleaned, (width * 3, height * 3), interpolation=cv2.INTER_CUBIC)

    def recognize_with_multiple_configs(self, image, config_types=None):
        """Run OCR once per requested configuration.

        Args:
            image: PIL image or OpenCV array.
            config_types: Keys of self.configs to try; unknown keys are
                skipped. Defaults to alphanumeric, digits_only, letters_only.

        Returns:
            Dict mapping configuration name to its non-empty alphanumeric
            result.
        """
        if config_types is None:
            config_types = ['alphanumeric', 'digits_only', 'letters_only']

        enhanced_image = self.enhance_image_for_ocr(image)

        results = {}
        for config_type in config_types:
            config = self.configs.get(config_type)
            if config is None:
                continue
            try:
                text = pytesseract.image_to_string(enhanced_image, config=config).strip()
            except Exception as e:
                print(f"配置 {config_type} 识别失败:{e}")
                continue

            text = ''.join(c for c in text if c.isalnum())
            if text:
                results[config_type] = text
        return results

    def get_best_result(self, results):
        """Pick one result: alphanumeric > digits_only > letters_only > any other.

        Returns:
            The chosen text, or None when every result is empty.
        """
        if not results:
            return None

        for config_type in ('alphanumeric', 'digits_only', 'letters_only'):
            if results.get(config_type):
                return results[config_type]

        # Fall back to the first non-empty result of any other configuration
        return next((text for text in results.values() if text), None)

    def recognize_captcha(self, image_path_or_array):
        """Recognise a captcha from a file path, PIL image or OpenCV array.

        Returns:
            Dict with 'best_result' (str or None) and 'all_results' (dict).
            An unreadable file path yields an empty result.
        """
        if isinstance(image_path_or_array, str):
            image = cv2.imread(image_path_or_array)
            if image is None:
                # cv2.imread signals a missing or undecodable file with None
                print(f"无法读取图像:{image_path_or_array}")
                return {'best_result': None, 'all_results': {}}
        else:
            # PIL images are converted inside enhance_image_for_ocr
            image = image_path_or_array

        results = self.recognize_with_multiple_configs(image)
        return {
            'best_result': self.get_best_result(results),
            'all_results': results,
        }


# Usage example
ocr = TesseractOCR()

# Run the multi-configuration recognition
result = ocr.recognize_captcha("captcha.jpg")
print(f"最佳识别结果:{result['best_result']}")
print(f"所有结果:{result['all_results']}")
2.2 PaddleOCR集成
try:
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
except ImportError:
    PADDLEOCR_AVAILABLE = False
    print("PaddleOCR未安装,请运行:pip install paddleocr")


class PaddleOCRRecognizer:
    """Captcha recognition backed by PaddleOCR."""

    def __init__(self, use_angle_cls=True, lang='ch'):
        """Create the PaddleOCR engine.

        Raises:
            ImportError: If the paddleocr package is not installed.
        """
        if not PADDLEOCR_AVAILABLE:
            raise ImportError("PaddleOCR未安装")

        self.ocr = PaddleOCR(
            use_angle_cls=use_angle_cls,
            lang=lang,
            show_log=False,
        )

    @staticmethod
    def _parse_results(results):
        """Flatten the raw OCR output into dicts with text, confidence, bbox.

        Each entry of results[0] is read as (bbox, (text, confidence)).
        Returns an empty list when nothing was recognised.
        """
        if not results or not results[0]:
            return []
        return [
            {'text': line[1][0], 'confidence': line[1][1], 'bbox': line[0]}
            for line in results[0]
        ]

    def recognize_captcha(self, image_path_or_array, min_confidence=0.5):
        """Recognise a captcha and return its alphanumeric text.

        Args:
            image_path_or_array: Input forwarded to PaddleOCR.
            min_confidence: Lines at or below this confidence are dropped.

        Returns:
            The concatenated alphanumeric text, or None when nothing was
            recognised or the engine failed.
        """
        try:
            lines = self._parse_results(self.ocr.ocr(image_path_or_array, cls=True))
            if not lines:
                return None

            full_text = ''.join(
                item['text'] for item in lines if item['confidence'] > min_confidence
            )
            return ''.join(c for c in full_text if c.isalnum())
        except Exception as e:
            print(f"PaddleOCR识别失败:{e}")
            return None

    def recognize_with_details(self, image_path_or_array):
        """Return every recognised line with its confidence and bounding box.

        Returns:
            A list of dicts with 'text', 'confidence' and 'bbox', or None
            when nothing was recognised or the engine failed.
        """
        try:
            lines = self._parse_results(self.ocr.ocr(image_path_or_array, cls=True))
            return lines or None
        except Exception as e:
            print(f"PaddleOCR详细识别失败:{e}")
            return None


# Usage example (only when PaddleOCR is installed)
if PADDLEOCR_AVAILABLE:
    # English model
    paddle_ocr = PaddleOCRRecognizer(lang='en')

    # Text only
    result = paddle_ocr.recognize_captcha("captcha.jpg")
    print(f"PaddleOCR识别结果:{result}")

    # Text plus confidence per line
    detailed = paddle_ocr.recognize_with_details("captcha.jpg")
    if detailed:
        for item in detailed:
            print(f"文字:{item['text']}, 置信度:{item['confidence']:.2f}")
3. 图像验证码处理
3.1 点击验证码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
import random


class ClickCaptchaHandler:
    """Solve click/selection captchas through a Selenium-driven Chrome."""

    def __init__(self, driver_path=None):
        self.driver = None
        self.driver_path = driver_path

    def init_driver(self):
        """Start Chrome and hide the navigator.webdriver flag."""
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-blink-features=AutomationControlled')

        if self.driver_path:
            # Recent Selenium 4 releases take the driver path via a Service
            # object rather than as a positional argument.
            from selenium.webdriver.chrome.service import Service
            self.driver = webdriver.Chrome(
                service=Service(executable_path=self.driver_path), options=options
            )
        else:
            self.driver = webdriver.Chrome(options=options)

        hide_webdriver = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        # Register the patch for every future document; a plain execute_script
        # only affects the page loaded right now and is lost on navigation.
        self.driver.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument', {'source': hide_webdriver}
        )
        self.driver.execute_script(hide_webdriver)

    def solve_click_captcha(self, captcha_selector, target_text):
        """Click every captcha image whose alt/title contains target_text.

        Returns:
            True when at least one image was clicked, False otherwise or on
            error.
        """
        try:
            # Wait for the captcha container to be present
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, captcha_selector))
            )
            captcha_images = self.driver.find_elements(
                By.CSS_SELECTOR, f"{captcha_selector} img"
            )

            clicked = 0
            for img in captcha_images:
                img_alt = img.get_attribute('alt') or img.get_attribute('title') or ""
                if target_text.lower() in img_alt.lower():
                    self.human_like_click(img)
                    clicked += 1
                    time.sleep(random.uniform(0.5, 1.5))

            # Success is only reported when something was actually clicked
            return clicked > 0
        except Exception as e:
            print(f"点击验证码处理失败:{e}")
            return False

    def human_like_click(self, element):
        """Click the element at a random point, after a short hesitation."""
        size = element.size

        # Random offset so the click is not always at the same point
        offset_x = random.randint(-size['width']//4, size['width']//4)
        offset_y = random.randint(-size['height']//4, size['height']//4)

        actions = ActionChains(self.driver)
        actions.move_to_element_with_offset(element, offset_x, offset_y)
        # pause() is queued into the chain; sleeping here would only delay
        # building the chain, not the pointer itself.
        actions.pause(random.uniform(0.1, 0.3))
        actions.click()
        actions.perform()

    def solve_image_selection_captcha(self, instruction_selector, images_selector):
        """Read the instruction text and click every matching image.

        Returns:
            True when at least one image was clicked, False otherwise or on
            error.
        """
        try:
            instruction_element = self.driver.find_element(By.CSS_SELECTOR, instruction_selector)
            instruction = instruction_element.text
            print(f"验证码指令:{instruction}")

            keywords = self.extract_keywords(instruction)
            image_elements = self.driver.find_elements(By.CSS_SELECTOR, images_selector)

            clicked = 0
            for img in image_elements:
                if self.should_click_image(img, keywords):
                    self.human_like_click(img)
                    clicked += 1
                    time.sleep(random.uniform(0.3, 0.8))

            return clicked > 0
        except Exception as e:
            print(f"图像选择验证码处理失败:{e}")
            return False

    def extract_keywords(self, instruction):
        """Map Chinese terms found in the instruction to search keywords.

        Returns:
            For each known term present, its English synonyms followed by the
            term itself, in mapping order.
        """
        keyword_mapping = {
            '汽车': ['car', 'vehicle', 'automobile'],
            '交通灯': ['traffic light', 'traffic signal'],
            '自行车': ['bicycle', 'bike'],
            '公交车': ['bus'],
            '摩托车': ['motorcycle'],
            '人行横道': ['crosswalk', 'zebra crossing'],
            '红绿灯': ['traffic light'],
            '桥梁': ['bridge'],
            '山': ['mountain', 'hill'],
            '树': ['tree'],
            '房子': ['house', 'building'],
            '动物': ['animal'],
            '猫': ['cat'],
            '狗': ['dog'],
        }

        keywords = []
        for chinese, english_list in keyword_mapping.items():
            if chinese in instruction:
                keywords.extend(english_list)
                keywords.append(chinese)
        return keywords

    def should_click_image(self, img_element, keywords):
        """Return True when alt, title or class contains any keyword."""
        attributes = (img_element.get_attribute(name) or "" for name in ('alt', 'title', 'class'))
        all_text = ' '.join(attributes).lower()
        return any(keyword.lower() in all_text for keyword in keywords)

    def close_driver(self):
        """Quit the browser if one was started."""
        if self.driver:
            self.driver.quit()


# Usage example
click_handler = ClickCaptchaHandler()
click_handler.init_driver()

try:
    # Open the page that shows the captcha
    click_handler.driver.get("https://example.com/captcha-page")

    # Attempt the image-selection captcha
    success = click_handler.solve_image_selection_captcha(
        instruction_selector=".captcha-instruction",
        images_selector=".captcha-image",
    )

    print("验证码解决成功" if success else "验证码解决失败")
finally:
    click_handler.close_driver()
3.2 拼图验证码
import cv2
import numpy as np
from PIL import Image
import requests


class JigsawCaptchaHandler:
    """Locate the gap of a jigsaw captcha and drag the slider onto it."""

    def __init__(self):
        self.session = requests.Session()

    def download_images(self, background_url, piece_url):
        """Download the background and the puzzle piece.

        Returns:
            (background, piece) as PIL images, or (None, None) on any failure.
        """
        try:
            images = []
            for url in (background_url, piece_url):
                # Bounded wait, and fail on HTTP errors instead of decoding an error page
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                images.append(Image.open(io.BytesIO(response.content)))
            return images[0], images[1]
        except Exception as e:
            print(f"下载图片失败:{e}")
            return None, None

    def find_gap_position(self, background_image, piece_image):
        """Find the gap by matching edge maps of the piece and background.

        Returns:
            (x, score): x coordinate of the best match and its normalised
            correlation score.
        """
        # convert('RGB') keeps palette, grayscale and alpha images from
        # breaking the colour conversion.
        bg_cv = cv2.cvtColor(np.array(background_image.convert('RGB')), cv2.COLOR_RGB2BGR)
        piece_cv = cv2.cvtColor(np.array(piece_image.convert('RGB')), cv2.COLOR_RGB2BGR)

        bg_gray = cv2.cvtColor(bg_cv, cv2.COLOR_BGR2GRAY)
        piece_gray = cv2.cvtColor(piece_cv, cv2.COLOR_BGR2GRAY)

        bg_edges = cv2.Canny(bg_gray, 50, 150)
        piece_edges = cv2.Canny(piece_gray, 50, 150)

        result = cv2.matchTemplate(bg_edges, piece_edges, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, max_loc = cv2.minMaxLoc(result)
        return max_loc[0], max_val

    def find_gap_by_difference(self, background_image, puzzle_image):
        """Find the gap as the largest region where two images differ.

        Returns:
            (x, y, w, h) of the largest differing region, or None when the
            images do not differ.
        """
        bg_array = np.array(background_image.convert('RGB'))
        puzzle_array = np.array(puzzle_image.convert('RGB'))

        # Bring both images to the background's size before comparing
        if bg_array.shape != puzzle_array.shape:
            puzzle_array = cv2.resize(puzzle_array, (bg_array.shape[1], bg_array.shape[0]))

        diff = cv2.absdiff(bg_array, puzzle_array)
        diff_gray = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
        _, binary = cv2.threshold(diff_gray, 30, 255, cv2.THRESH_BINARY)

        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        if not contours:
            return None

        # The largest contour is assumed to be the gap
        largest_contour = max(contours, key=cv2.contourArea)
        return cv2.boundingRect(largest_contour)

    def calculate_drag_distance(self, gap_x, piece_width):
        """Return the drag distance: gap_x minus half the piece width, with
        a random offset of up to 5 px either way, never negative."""
        drag_distance = gap_x - piece_width // 2
        drag_distance += random.randint(-5, 5)
        return max(0, drag_distance)

    def generate_drag_path(self, distance):
        """Plan absolute x positions for a drag of the given distance.

        Large steps cover the first 70 %, small steps the rest, followed by
        one to three corrections of up to 2 px. Timing is applied when the
        path is replayed, so planning itself does not sleep.

        Returns:
            A list of integer positions; empty when distance is 0.
        """
        path = []
        current = 0

        # Fast phase
        while current < distance * 0.7:
            current += random.uniform(2, 5)
            path.append(int(current))

        # Slow phase
        while current < distance:
            current += random.uniform(0.5, 2)
            path.append(int(current))

        # Final corrections around the end point
        for _ in range(random.randint(1, 3)):
            adjustment = random.randint(-2, 2)
            if path:
                path.append(path[-1] + adjustment)

        return path

    def solve_jigsaw_captcha(self, driver, slider_selector, background_selector, piece_selector):
        """Detect the gap and drag the slider to it.

        Returns:
            True when the drag was performed, False when the images could not
            be downloaded or any step raised.
        """
        try:
            slider = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, slider_selector))
            )
            background_elem = driver.find_element(By.CSS_SELECTOR, background_selector)
            piece_elem = driver.find_element(By.CSS_SELECTOR, piece_selector)

            bg_url = background_elem.get_attribute('src')
            piece_url = piece_elem.get_attribute('src')

            bg_image, piece_image = self.download_images(bg_url, piece_url)
            if bg_image is None or piece_image is None:
                return False

            gap_x, confidence = self.find_gap_position(bg_image, piece_image)
            print(f"找到缺口位置:x={gap_x}, 置信度={confidence:.2f}")

            piece_width = piece_image.width
            drag_distance = self.calculate_drag_distance(gap_x, piece_width)
            print(f"计算拖拽距离:{drag_distance}px")

            # End exactly on the target after the planned corrections
            waypoints = self.generate_drag_path(drag_distance) + [drag_distance]

            actions = ActionChains(driver)
            actions.click_and_hold(slider)
            previous = 0
            for position in waypoints:
                # Way-points are absolute; move_by_offset wants the increment
                step = int(position - previous)
                previous = position
                if step:
                    actions.move_by_offset(step, 0)
                    actions.pause(random.uniform(0.01, 0.03))

            # Short hesitation before letting go; queued so it happens during the drag
            actions.pause(random.uniform(0.1, 0.3))
            actions.release()
            actions.perform()
            return True
        except Exception as e:
            print(f"拼图验证码解决失败:{e}")
            return False


# Usage example
jigsaw_handler = JigsawCaptchaHandler()

# Drive it from Selenium
from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get("https://example.com/jigsaw-captcha")

    success = jigsaw_handler.solve_jigsaw_captcha(
        driver=driver,
        slider_selector=".slider-button",
        background_selector=".captcha-background",
        piece_selector=".captcha-piece",
    )

    print("拼图验证码解决成功" if success else "拼图验证码解决失败")
finally:
    driver.quit()
4. 滑块验证码破解
4.1 基础滑块验证码
class SliderCaptchaHandler:
    """Solve basic slider captchas with a human-like drag."""

    def __init__(self):
        self.session = requests.Session()

    def detect_slider_gap(self, background_image, slider_image=None):
        """Locate the gap in the background image.

        With a slider image, template matching is used and the score is the
        correlation value. Without one, the edge contour whose area is
        closest to 2000 px² (among roughly square contours of 500-5000 px²)
        is chosen and the score is fixed at 1.0.

        Returns:
            (x, score), or (None, 0.0) when no candidate is found.
        """
        bg_cv = cv2.cvtColor(np.array(background_image.convert('RGB')), cv2.COLOR_RGB2BGR)
        bg_gray = cv2.cvtColor(bg_cv, cv2.COLOR_BGR2GRAY)

        if slider_image is not None:
            slider_cv = cv2.cvtColor(np.array(slider_image.convert('RGB')), cv2.COLOR_RGB2BGR)
            slider_gray = cv2.cvtColor(slider_cv, cv2.COLOR_BGR2GRAY)
            result = cv2.matchTemplate(bg_gray, slider_gray, cv2.TM_CCOEFF_NORMED)
            _, max_val, _, max_loc = cv2.minMaxLoc(result)
            return max_loc[0], max_val

        edges = cv2.Canny(bg_gray, 50, 150)
        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        gap_candidates = []
        for contour in contours:
            area = cv2.contourArea(contour)
            if 500 < area < 5000:
                x, y, w, h = cv2.boundingRect(contour)
                # Keep only roughly square shapes
                if 0.8 < w / h < 1.5:
                    gap_candidates.append((x, y, w, h, area))

        if not gap_candidates:
            return None, 0.0

        best = min(gap_candidates, key=lambda c: abs(c[4] - 2000))
        return best[0], 1.0

    def generate_human_like_trajectory(self, distance):
        """Plan absolute x positions that accelerate, cruise and decelerate.

        Timing is applied when the trajectory is replayed, so planning does
        not sleep.

        Returns:
            Non-decreasing positions ending exactly at distance; an empty
            list when distance is not positive.
        """
        if distance <= 0:
            return []

        trajectory = []
        current = 0
        acceleration_distance = distance * 0.6
        deceleration_start = distance * 0.8

        while current < distance:
            if current < acceleration_distance:
                speed = random.uniform(1, 3) * (current / acceleration_distance + 0.5)
            elif current < deceleration_start:
                speed = random.uniform(2, 4)
            else:
                remaining = distance - current
                speed = max(0.5, remaining / 10)

            current += speed
            trajectory.append(min(int(current), distance))

        # int() truncation can leave the last point short of the target
        if trajectory[-1] != distance:
            trajectory.append(distance)
        return trajectory

    def add_trajectory_noise(self, trajectory):
        """Attach hand-shake noise: x within +/-0.5 px, y within +/-2 px.

        Returns:
            A list of (x, y) way-points, one per input position.
        """
        noisy_trajectory = []
        for pos in trajectory:
            y_offset = random.randint(-2, 2)
            x_offset = random.uniform(-0.5, 0.5)
            noisy_trajectory.append((pos + x_offset, y_offset))
        return noisy_trajectory

    @staticmethod
    def _relative_moves(waypoints, reverse=False):
        """Turn absolute (x, y) way-points into integer (dx, dy) moves.

        Rounding is applied to the absolute positions, so rounding errors do
        not accumulate; way-points that do not move the pointer are dropped.
        """
        moves = []
        prev_x = prev_y = 0
        for x_pos, y_pos in waypoints:
            target_x = round(-x_pos if reverse else x_pos)
            target_y = round(y_pos)
            dx, dy = target_x - prev_x, target_y - prev_y
            if dx or dy:
                moves.append((dx, dy))
            prev_x, prev_y = target_x, target_y
        return moves

    def solve_slider_captcha(self, driver, slider_selector, background_selector=None):
        """Drag the slider to the detected gap, or to 70 % of its container.

        Returns:
            The result of check_slider_success, or False when any step raised.
        """
        try:
            slider = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, slider_selector))
            )
            slider_location = slider.location

            gap_x = None
            if background_selector:
                try:
                    bg_element = driver.find_element(By.CSS_SELECTOR, background_selector)
                    bg_url = bg_element.get_attribute('src')
                    if bg_url:
                        response = self.session.get(bg_url, timeout=10)
                        bg_image = Image.open(io.BytesIO(response.content))
                        gap_x, confidence = self.detect_slider_gap(bg_image)
                        print(f"检测到缺口位置:{gap_x}, 置信度:{confidence:.2f}")
                except Exception as e:
                    print(f"背景图片分析失败:{e}")

            if gap_x is None:
                # Fallback: assume the gap sits at 70 % of the container width
                container_width = driver.execute_script(
                    "return arguments[0].parentElement.offsetWidth", slider
                )
                gap_x = container_width * 0.7

            # NOTE(review): gap_x is measured in image pixels while the slider
            # location is in page coordinates - confirm the two share a scale.
            drag_distance = gap_x - slider_location['x']
            print(f"计算拖拽距离:{drag_distance}px")

            trajectory = self.generate_human_like_trajectory(abs(drag_distance))
            noisy_trajectory = self.add_trajectory_noise(trajectory)
            # End exactly on the target and back on the baseline
            noisy_trajectory.append((abs(drag_distance), 0))

            actions = ActionChains(driver)
            actions.click_and_hold(slider)
            for dx, dy in self._relative_moves(noisy_trajectory, reverse=drag_distance < 0):
                actions.move_by_offset(dx, dy)
                # Queued pauses shape the drag; sleeping while building the
                # chain would not.
                actions.pause(random.uniform(0.005, 0.02))

            actions.pause(random.uniform(0.1, 0.3))
            actions.release()
            actions.perform()

            # Give the page time to show the verdict
            time.sleep(2)
            return self.check_slider_success(driver)
        except Exception as e:
            print(f"滑块验证码解决失败:{e}")
            return False

    @staticmethod
    def _first_displayed(driver, selectors):
        """Return True when the first match of any selector is displayed."""
        for selector in selectors:
            try:
                if driver.find_element(By.CSS_SELECTOR, selector).is_displayed():
                    return True
            except Exception:
                # Missing or stale element: try the next selector
                continue
        return False

    def check_slider_success(self, driver):
        """Report whether the page shows a success or a failure marker.

        Returns:
            True for a visible success marker, False for a visible failure
            marker, and True when neither is found.
        """
        try:
            success_indicators = [
                ".captcha-success",
                ".verify-success",
                ".slider-success",
                "[class*='success']",
            ]
            if self._first_displayed(driver, success_indicators):
                return True

            failure_indicators = [
                ".captcha-error",
                ".verify-failed",
                ".slider-failed",
                "[class*='error']",
                "[class*='failed']",
            ]
            if self._first_displayed(driver, failure_indicators):
                return False

            # No explicit verdict on the page: treated as success
            return True
        except Exception as e:
            print(f"检查验证结果失败:{e}")
            return False


# Usage example
slider_handler = SliderCaptchaHandler()

driver = webdriver.Chrome()
try:
    driver.get("https://example.com/slider-captcha")

    success = slider_handler.solve_slider_captcha(
        driver=driver,
        slider_selector=".slider-button",
        background_selector=".captcha-background",
    )

    print("滑块验证码解决成功" if success else "滑块验证码解决失败")
finally:
    driver.quit()
4.2 高级滑块验证码
class AdvancedSliderHandler:
    """Slider solver that adapts its drag style to the captcha image."""

    def __init__(self):
        self.session = requests.Session()

        # Gap detection and result checking live in the basic handler; this
        # class previously called them on itself, where they do not exist.
        self._basic_handler = SliderCaptchaHandler()

        # Drag profiles, indexed by select_strategy
        self.trajectory_templates = [
            # Fast and direct
            {'type': 'direct', 'acceleration': 0.8, 'max_speed': 5, 'noise_level': 0.1},
            # Careful
            {'type': 'careful', 'acceleration': 0.3, 'max_speed': 2, 'noise_level': 0.3},
            # Hesitant
            {'type': 'hesitant', 'acceleration': 0.5, 'max_speed': 3, 'noise_level': 0.5},
        ]

    def detect_slider_gap(self, background_image, slider_image=None):
        """Delegate gap detection to SliderCaptchaHandler."""
        return self._basic_handler.detect_slider_gap(background_image, slider_image)

    def check_slider_success(self, driver):
        """Delegate the success check to SliderCaptchaHandler."""
        return self._basic_handler.check_slider_success(driver)

    def analyze_captcha_difficulty(self, background_image):
        """Classify the background as 'easy', 'medium' or 'hard'.

        The score combines edge density (weight 0.6) with the variance of
        the gray-level histogram divided by 10000 (weight 0.4).
        """
        bg_cv = cv2.cvtColor(np.array(background_image), cv2.COLOR_RGB2BGR)
        bg_gray = cv2.cvtColor(bg_cv, cv2.COLOR_BGR2GRAY)

        edges = cv2.Canny(bg_gray, 50, 150)
        edge_density = np.sum(edges > 0) / edges.size

        hist = cv2.calcHist([bg_gray], [0], None, [256], [0, 256])
        color_variance = np.var(hist)

        # NOTE(review): the histogram holds raw pixel counts, so this term may
        # dominate the score for typical image sizes - confirm the scaling.
        difficulty_score = edge_density * 0.6 + (color_variance / 10000) * 0.4

        if difficulty_score > 0.3:
            return 'hard'
        if difficulty_score > 0.15:
            return 'medium'
        return 'easy'

    def select_strategy(self, difficulty):
        """Return the drag strategy for a difficulty; unknown values get 'medium'."""
        strategies = {
            'easy': {
                'template': self.trajectory_templates[0],
                'retry_count': 1,
                'precision_adjustment': 0,
            },
            'medium': {
                'template': self.trajectory_templates[1],
                'retry_count': 2,
                'precision_adjustment': 2,
            },
            'hard': {
                'template': self.trajectory_templates[2],
                'retry_count': 3,
                'precision_adjustment': 5,
            },
        }
        return strategies.get(difficulty, strategies['medium'])

    def generate_advanced_trajectory(self, distance, template, precision_adjustment=0):
        """Plan absolute x positions following a drag profile.

        Args:
            distance: Non-negative drag length in pixels.
            template: Dict with 'acceleration', 'max_speed', 'noise_level'.
            precision_adjustment: Number of small closing steps; the main
                phase stops this many pixels short of the target.

        Returns:
            Non-decreasing positions ending exactly at distance; an empty
            list when distance is not positive. Planning does not sleep;
            delays are applied while the drag is executed.
        """
        if distance <= 0:
            return []

        trajectory = []
        current = 0
        velocity = 0
        acceleration = template['acceleration']
        max_speed = template['max_speed']
        noise_level = template['noise_level']

        while current < distance - precision_adjustment:
            remaining_distance = distance - current

            if remaining_distance > distance * 0.7:
                # Speed up
                velocity += acceleration * random.uniform(0.8, 1.2)
                velocity = min(velocity, max_speed)
            elif remaining_distance > distance * 0.3:
                # Cruise
                velocity = max_speed * random.uniform(0.8, 1.0)
            else:
                # Slow down, but keep moving
                velocity = max(velocity * 0.85, 0.5)

            noise = random.uniform(-noise_level, noise_level)
            current += max(0.1, velocity + noise)
            trajectory.append(min(current, distance))

        # Small closing steps
        for _ in range(precision_adjustment):
            current += random.uniform(0.5, 1.5)
            trajectory.append(min(current, distance))

        # The closing steps may stop short; always finish on the target
        if not trajectory or trajectory[-1] != distance:
            trajectory.append(distance)
        return trajectory

    def add_behavioral_patterns(self, trajectory):
        """Attach vertical jitter and occasional backtracks.

        Returns:
            A list of (x, y) way-points; jitter is +/-2 px in the last fifth
            of the drag and +/-1 px elsewhere.
        """
        enhanced_trajectory = []
        total = len(trajectory)

        for i, pos in enumerate(trajectory):
            jitter = 2 if i > total * 0.8 else 1
            y_offset = random.randint(-jitter, jitter)

            # Rarely step back a little before continuing
            if random.random() < 0.02 and i > 0:
                backtrack = random.uniform(0.5, 2.0)
                enhanced_trajectory.append((pos - backtrack, y_offset))

            enhanced_trajectory.append((pos, y_offset))

        return enhanced_trajectory

    def _fetch_background(self, driver, config):
        """Download the background image named by config, or return None."""
        selector = config.get('background_selector')
        if not selector:
            return None
        bg_url = driver.find_element(By.CSS_SELECTOR, selector).get_attribute('src')
        if not bg_url:
            return None
        response = self.session.get(bg_url, timeout=10)
        return Image.open(io.BytesIO(response.content))

    def solve_advanced_slider(self, driver, config):
        """Try up to three times to solve the slider captcha.

        Args:
            driver: Selenium WebDriver on the captcha page.
            config: Dict with 'slider_selector' and optionally
                'background_selector'.

        Returns:
            True as soon as one attempt is verified, False otherwise.
        """
        max_attempts = 3

        for attempt in range(max_attempts):
            try:
                print(f"尝试第 {attempt + 1} 次解决滑块验证码")

                slider = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, config['slider_selector']))
                )

                difficulty = 'medium'  # used when the image cannot be analysed
                bg_image = None
                try:
                    bg_image = self._fetch_background(driver, config)
                    if bg_image is not None:
                        difficulty = self.analyze_captcha_difficulty(bg_image)
                        print(f"验证码难度:{difficulty}")
                except Exception as e:
                    print(f"难度分析失败:{e}")

                strategy = self.select_strategy(difficulty)

                gap_x = self.detect_gap_position(driver, config, bg_image)
                if gap_x is None:
                    print("无法检测缺口位置,使用默认策略")
                    container = slider.find_element(By.XPATH, "./..")
                    container_width = container.size['width']
                    gap_x = container_width * 0.75

                slider_location = slider.location
                drag_distance = gap_x - slider_location['x']
                print(f"拖拽距离:{drag_distance}px")

                trajectory = self.generate_advanced_trajectory(
                    abs(drag_distance),
                    strategy['template'],
                    strategy['precision_adjustment'],
                )
                enhanced_trajectory = self.add_behavioral_patterns(trajectory)

                success = self.execute_drag(driver, slider, enhanced_trajectory, drag_distance < 0)

                if success:
                    print("滑块验证码解决成功")
                    return True

                print(f"第 {attempt + 1} 次尝试失败")
                if attempt < max_attempts - 1:
                    time.sleep(random.uniform(1, 3))  # back off before retrying
            except Exception as e:
                print(f"第 {attempt + 1} 次尝试出错:{e}")
                continue

        print("所有尝试均失败")
        return False

    def detect_gap_position(self, driver, config, bg_image=None):
        """Return the gap's x coordinate, or None when it is not found.

        Args:
            driver: Selenium WebDriver on the captcha page.
            config: Dict that may contain 'background_selector'.
            bg_image: Already-downloaded background; fetched when omitted.
        """
        try:
            if bg_image is None:
                bg_image = self._fetch_background(driver, config)
            if bg_image is None:
                return None

            gap_x, confidence = self.detect_slider_gap(bg_image)
            return gap_x if confidence > 0.6 else None
        except Exception as e:
            print(f"缺口检测失败:{e}")
            return None

    @staticmethod
    def _to_pointer_moves(trajectory, reverse=False):
        """Turn absolute (x, y) way-points into integer (dx, dy) moves.

        Pointer moves are integer pixels, so fractional increments would be
        truncated one by one and the slider would stop short. Rounding the
        absolute positions keeps the total exact. One move is returned per
        way-point, possibly (0, 0).
        """
        moves = []
        prev_x = prev_y = 0
        for x_pos, y_offset in trajectory:
            target_x = round(-x_pos if reverse else x_pos)
            target_y = round(y_offset)
            moves.append((target_x - prev_x, target_y - prev_y))
            prev_x, prev_y = target_x, target_y
        return moves

    def execute_drag(self, driver, slider, trajectory, reverse=False):
        """Replay a trajectory on the slider and check the result.

        Args:
            driver: Selenium WebDriver.
            slider: The slider element to grab.
            trajectory: (x, y) way-points relative to the grab point.
            reverse: Drag to the left instead of the right.

        Returns:
            The result of check_slider_success, or False when the drag raised.
        """
        try:
            # Hover first, then press
            ActionChains(driver).move_to_element(slider).perform()
            time.sleep(random.uniform(0.1, 0.3))

            ActionChains(driver).click_and_hold(slider).perform()
            time.sleep(random.uniform(0.05, 0.15))

            for i, (dx, dy) in enumerate(self._to_pointer_moves(trajectory, reverse)):
                if dx or dy:
                    ActionChains(driver).move_by_offset(dx, dy).perform()

                delay = random.uniform(0.005, 0.02)
                if i % 10 == 0:  # a slightly longer pause every tenth step
                    delay += random.uniform(0.01, 0.03)
                time.sleep(delay)

            # Hesitate briefly before releasing
            time.sleep(random.uniform(0.1, 0.4))
            ActionChains(driver).release().perform()

            # Give the page time to show the verdict
            time.sleep(2)
            return self.check_slider_success(driver)
        except Exception as e:
            print(f"拖拽执行失败:{e}")
            return False


# Usage example
advanced_handler = AdvancedSliderHandler()

config = {
    'slider_selector': '.slider-button',
    'background_selector': '.captcha-background',
    'success_selector': '.verify-success',
    'failure_selector': '.verify-failed',
}

driver = webdriver.Chrome()
try:
    driver.get("https://example.com/advanced-slider")

    success = advanced_handler.solve_advanced_slider(driver, config)

    print("高级滑块验证码解决成功" if success else "高级滑块验证码解决失败")
finally:
    driver.quit()
5. 第三方验证码服务
5.1 验证码识别API
import requests
import base64
import time
import json


class CaptchaService:
    """Client for third-party captcha-solving HTTP APIs.

    Endpoint URLs are configured for 2captcha, anti-captcha and
    DeathByCaptcha, but only the 2captcha protocol is implemented in this
    class. An operation requested for any other ``service_type`` returns an
    ``{'error': ...}`` dict, unless a subclass supplies the matching
    ``_get_*`` / ``_solve_*`` handler method.

    Every public method returns a dict: ``{'result': ...}`` (or
    ``{'balance': ...}``) on success and ``{'error': ...}`` on failure, so
    callers can always test ``'result' in reply``.
    """

    # Seconds allowed for any single HTTP request; without a timeout a
    # stalled connection would block the caller forever.
    REQUEST_TIMEOUT = 30
    # Seconds to sleep between two polls for a pending answer.
    POLL_INTERVAL = 5

    # service_type -> name of the method implementing each operation.
    # Handlers are looked up with getattr so that a missing implementation
    # yields an error dict instead of an AttributeError.
    _BALANCE_HANDLERS = {
        '2captcha': '_get_2captcha_balance',
        'anticaptcha': '_get_anticaptcha_balance',
        'deathbycaptcha': '_get_dbc_balance',
    }
    _TEXT_HANDLERS = {
        '2captcha': '_solve_2captcha_text',
        'anticaptcha': '_solve_anticaptcha_text',
        'deathbycaptcha': '_solve_dbc_text',
    }
    _RECAPTCHA_V2_HANDLERS = {
        '2captcha': '_solve_2captcha_recaptcha_v2',
    }
    _HCAPTCHA_HANDLERS = {
        '2captcha': '_solve_2captcha_hcaptcha',
    }

    def __init__(self, api_key, service_type='2captcha'):
        """Store the credentials and the per-service endpoint table.

        Args:
            api_key: API key issued by the solving service.
            service_type: One of the keys of ``self.services``.
        """
        self.api_key = api_key
        self.service_type = service_type

        # API endpoints per service. The 2captcha endpoints use HTTPS because
        # the API key travels in every request.
        # NOTE(review): the DeathByCaptcha endpoints are left on plain HTTP —
        # confirm HTTPS availability before using them.
        self.services = {
            '2captcha': {
                'submit_url': 'https://2captcha.com/in.php',
                'result_url': 'https://2captcha.com/res.php',
                'balance_url': 'https://2captcha.com/res.php',
            },
            'anticaptcha': {
                'submit_url': 'https://api.anti-captcha.com/createTask',
                'result_url': 'https://api.anti-captcha.com/getTaskResult',
                'balance_url': 'https://api.anti-captcha.com/getBalance',
            },
            'deathbycaptcha': {
                'submit_url': 'http://api.dbcapi.me/api/captcha',
                'result_url': 'http://api.dbcapi.me/api/captcha/{captcha_id}',
                'balance_url': 'http://api.dbcapi.me/api/user',
            },
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _dispatch(self, handlers, operation, *args, **kwargs):
        """Call the handler registered for the current service type.

        Returns an error dict when the service type is unknown or the
        handler method does not exist on this object.
        """
        method_name = handlers.get(self.service_type)
        handler = getattr(self, method_name, None) if method_name else None
        if handler is None:
            return {
                'error': f"UNSUPPORTED_SERVICE: {operation} is not implemented "
                         f"for service_type '{self.service_type}'"
            }
        return handler(*args, **kwargs)

    @staticmethod
    def _parse_2captcha_reply(text):
        """Split a 2captcha plain-text reply into ``(ok, payload)``.

        ``'OK|<payload>'`` yields ``(True, '<payload>')``; the payload is
        everything after the FIRST ``|`` so answers containing ``|`` are not
        truncated. Any other reply yields ``(False, text)``.
        """
        status, separator, payload = text.partition('|')
        if status == 'OK' and separator:
            return True, payload
        return False, text

    def _load_image_base64(self, image_path_or_url):
        """Return the image at a local path or http(s) URL, base64-encoded."""
        if image_path_or_url.startswith('http'):
            response = requests.get(image_path_or_url,
                                    timeout=self.REQUEST_TIMEOUT)
            # Do not submit an HTML error page as if it were the captcha.
            response.raise_for_status()
            raw = response.content
        else:
            with open(image_path_or_url, 'rb') as f:
                raw = f.read()
        return base64.b64encode(raw).decode()

    def _submit_2captcha(self, submit_data, wait_timeout=120):
        """Submit a task to 2captcha and wait for its answer.

        Args:
            submit_data: Form fields for the submit endpoint.
            wait_timeout: Seconds to keep polling for the answer.

        Returns:
            ``{'result': answer}`` or ``{'error': message}``.
        """
        # Broad catch on purpose: the class contract is to report every
        # failure as an error dict rather than raise.
        try:
            response = requests.post(
                self.services['2captcha']['submit_url'],
                data=submit_data,
                timeout=self.REQUEST_TIMEOUT,
            )
            ok, payload = self._parse_2captcha_reply(response.text)
            if not ok:
                return {'error': payload or 'EMPTY_RESPONSE'}
            return self._wait_for_2captcha_result(payload, timeout=wait_timeout)
        except Exception as e:
            return {'error': str(e)}

    # ------------------------------------------------------------------
    # Balance
    # ------------------------------------------------------------------
    def get_balance(self):
        """Return ``{'balance': float}`` or ``{'error': message}``."""
        return self._dispatch(self._BALANCE_HANDLERS, 'get_balance')

    def _get_2captcha_balance(self):
        """Query the 2captcha account balance."""
        try:
            response = requests.get(
                self.services['2captcha']['balance_url'],
                params={'key': self.api_key, 'action': 'getbalance'},
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.text.startswith('ERROR'):
                return {'error': response.text}
            return {'balance': float(response.text)}
        except Exception as e:
            return {'error': str(e)}

    # ------------------------------------------------------------------
    # Polling
    # ------------------------------------------------------------------
    def _wait_for_2captcha_result(self, captcha_id, timeout=120):
        """Poll 2captcha until the task is solved, fails, or times out.

        Args:
            captcha_id: Task id returned by the submit endpoint.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            ``{'result': answer}``, ``{'error': api_error}`` or
            ``{'error': 'TIMEOUT'}``.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # Sleep first: an answer is never ready right after submission.
            time.sleep(self.POLL_INTERVAL)

            response = requests.get(
                self.services['2captcha']['result_url'],
                params={'key': self.api_key, 'action': 'get', 'id': captcha_id},
                timeout=self.REQUEST_TIMEOUT,
            )
            reply = response.text

            # 'CAPCHA' (sic) is the literal status string sent by the API.
            if reply == 'CAPCHA_NOT_READY':
                continue
            if reply.startswith('ERROR'):
                return {'error': reply}
            ok, payload = self._parse_2captcha_reply(reply)
            if ok:
                return {'result': payload}
            # Any other reply is treated as transient; keep polling.

        return {'error': 'TIMEOUT'}

    # ------------------------------------------------------------------
    # Text captcha
    # ------------------------------------------------------------------
    def solve_text_captcha(self, image_path_or_url, **kwargs):
        """Solve an image/text captcha.

        Args:
            image_path_or_url: Local file path or http(s) URL of the image.
            **kwargs: Optional hints: ``numeric``, ``min_len``, ``max_len``,
                ``phrase``, ``case_sensitive``.

        Returns:
            ``{'result': text}`` or ``{'error': message}``.
        """
        return self._dispatch(self._TEXT_HANDLERS, 'solve_text_captcha',
                              image_path_or_url, **kwargs)

    def _solve_2captcha_text(self, image_path_or_url, **kwargs):
        """Solve a text captcha through 2captcha's base64 upload method."""
        try:
            image_data = self._load_image_base64(image_path_or_url)
        except Exception as e:
            return {'error': str(e)}

        submit_data = {
            'key': self.api_key,
            'method': 'base64',
            'body': image_data,
        }

        # Boolean hints are sent as '1' only when truthy.
        for option, field in (('numeric', 'numeric'),
                              ('phrase', 'phrase'),
                              ('case_sensitive', 'regsense')):
            if kwargs.get(option):
                submit_data[field] = '1'
        # Length hints are forwarded as strings.
        for option in ('min_len', 'max_len'):
            if kwargs.get(option):
                submit_data[option] = str(kwargs[option])

        return self._submit_2captcha(submit_data)

    # ------------------------------------------------------------------
    # reCAPTCHA v2
    # ------------------------------------------------------------------
    def solve_recaptcha_v2(self, site_key, page_url, **kwargs):
        """Solve a reCAPTCHA v2 challenge.

        Args:
            site_key: The widget's ``data-sitekey`` value.
            page_url: URL of the page hosting the widget.
            **kwargs: Optional ``invisible`` flag and ``data_s`` value.

        Returns:
            ``{'result': token}`` or ``{'error': message}``.
        """
        return self._dispatch(self._RECAPTCHA_V2_HANDLERS, 'solve_recaptcha_v2',
                              site_key, page_url, **kwargs)

    def _solve_2captcha_recaptcha_v2(self, site_key, page_url, **kwargs):
        """Solve reCAPTCHA v2 through 2captcha."""
        submit_data = {
            'key': self.api_key,
            'method': 'userrecaptcha',
            'googlekey': site_key,
            'pageurl': page_url,
        }
        if kwargs.get('invisible'):
            submit_data['invisible'] = '1'
        if kwargs.get('data_s'):
            submit_data['data-s'] = kwargs['data_s']

        # reCAPTCHA usually takes longer than an image captcha.
        return self._submit_2captcha(submit_data, wait_timeout=300)

    # ------------------------------------------------------------------
    # hCaptcha
    # ------------------------------------------------------------------
    def solve_hcaptcha(self, site_key, page_url):
        """Solve an hCaptcha challenge.

        Only dispatched when ``service_type`` is ``'2captcha'``, so an API key
        belonging to another provider is never sent to 2captcha.

        Returns:
            ``{'result': token}`` or ``{'error': message}``.
        """
        return self._dispatch(self._HCAPTCHA_HANDLERS, 'solve_hcaptcha',
                              site_key, page_url)

    def _solve_2captcha_hcaptcha(self, site_key, page_url):
        """Solve hCaptcha through 2captcha."""
        submit_data = {
            'key': self.api_key,
            'method': 'hcaptcha',
            'sitekey': site_key,
            'pageurl': page_url,
        }
        return self._submit_2captcha(submit_data, wait_timeout=300)


# Usage example
captcha_service = CaptchaService(
    api_key='your_api_key',
    service_type='2captcha',
)

# Check the account balance
balance = captcha_service.get_balance()
print(f"账户余额:{balance}")

# Solve a text captcha stored in a local image file
text_result = captcha_service.solve_text_captcha(
    'captcha.jpg',
    numeric=False,
    min_len=4,
    max_len=6,
)
print(f"文字验证码结果:{text_result}")

# Solve a reCAPTCHA v2 challenge
recaptcha_result = captcha_service.solve_recaptcha_v2(
    site_key='6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-',
    page_url='https://example.com/recaptcha',
)
print(f"reCAPTCHA结果:{recaptcha_result}")
5.2 集成验证码服务到爬虫
class CaptchaIntegratedSpider:
    """Selenium-driven crawler that hands captchas to a solving service.

    ``captcha_service`` must provide ``solve_text_captcha(image)`` and
    ``solve_recaptcha_v2(site_key, page_url)``, each returning a dict with a
    ``'result'`` key on success or an ``'error'`` key on failure.
    """

    # Script that stores a solved reCAPTCHA token in the page. The token is
    # passed as ``arguments[0]`` rather than interpolated into the source, so
    # quotes or backslashes in it cannot break or alter the script.
    _RECAPTCHA_INJECT_JS = """
        var token = arguments[0];
        document.getElementById('g-recaptcha-response').innerHTML = token;
        if (typeof grecaptcha !== 'undefined') {
            grecaptcha.getResponse = function() { return token; };
        }
    """

    def __init__(self, captcha_service):
        """Keep the solving service and create an HTTP session.

        The browser is not started here; call ``init_driver`` first.
        """
        self.captcha_service = captcha_service
        self.session = requests.Session()
        self.driver = None

    def init_driver(self):
        """Start the Chrome browser used for crawling."""
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=options)

    def _find_optional(self, selector):
        """Return the first element matching ``selector``, or ``None``.

        Any lookup failure counts as "not present"; this keeps the
        best-effort detection semantics without a bare ``except``.
        """
        try:
            return self.driver.find_element(By.CSS_SELECTOR, selector)
        except Exception:
            return None

    def _recaptcha_site_key(self):
        """Return the page's reCAPTCHA site key, or ``None`` if absent."""
        widget = self._find_optional('.g-recaptcha')
        if widget is None:
            return None
        try:
            return widget.get_attribute('data-sitekey')
        except Exception:
            return None

    def handle_text_captcha(self, captcha_selector, input_selector):
        """Solve a text captcha and type the answer into its input field.

        Args:
            captcha_selector: CSS selector of the captcha ``<img>``.
            input_selector: CSS selector of the answer input.

        Returns:
            True when an answer was obtained and typed, otherwise False.
        """
        try:
            # Locate the captcha image
            captcha_img = self.driver.find_element(By.CSS_SELECTOR, captcha_selector)
            captcha_url = captcha_img.get_attribute('src')

            # Ask the third-party service for the answer
            result = self.captcha_service.solve_text_captcha(captcha_url)

            # The service may return None instead of a dict.
            if result and 'result' in result:
                input_field = self.driver.find_element(By.CSS_SELECTOR, input_selector)
                input_field.clear()
                input_field.send_keys(result['result'])
                return True

            error = (result or {}).get('error', '未知错误')
            print(f"验证码识别失败:{error}")
            return False
        except Exception as e:
            print(f"处理文字验证码失败:{e}")
            return False

    def handle_recaptcha(self, site_key):
        """Solve the page's reCAPTCHA v2 and inject the response token.

        Args:
            site_key: The widget's ``data-sitekey`` value.

        Returns:
            True when a token was obtained and injected, otherwise False.
        """
        try:
            current_url = self.driver.current_url
            result = self.captcha_service.solve_recaptcha_v2(site_key, current_url)

            if result and 'result' in result:
                # Hand the token over as a script argument (see the
                # _RECAPTCHA_INJECT_JS comment).
                self.driver.execute_script(self._RECAPTCHA_INJECT_JS, result['result'])
                return True

            error = (result or {}).get('error', '未知错误')
            print(f"reCAPTCHA解决失败:{error}")
            return False
        except Exception as e:
            print(f"处理reCAPTCHA失败:{e}")
            return False

    def crawl_with_captcha_handling(self, url, captcha_config):
        """Open ``url``, solve any configured captcha, then extract data.

        Args:
            url: Page to crawl.
            captcha_config: Dict with optional keys ``'text_captcha'``
                (a dict with ``'selector'`` and ``'input_selector'``),
                ``'recaptcha'`` (truthy to enable) and ``'submit_selector'``.

        Returns:
            The dict produced by ``extract_data``, or ``None`` on failure.
        """
        try:
            self.driver.get(url)
            # Fixed wait for the page to load
            time.sleep(3)

            captcha_detected = False

            # Text captcha
            text_config = captcha_config.get('text_captcha')
            if text_config and self._find_optional(text_config['selector']) is not None:
                captcha_detected = True
                print("检测到文字验证码,正在处理...")
                solved = self.handle_text_captcha(
                    text_config['selector'],
                    text_config['input_selector'],
                )
                if not solved:
                    return None

            # reCAPTCHA
            if captcha_config.get('recaptcha'):
                site_key = self._recaptcha_site_key()
                if site_key:
                    captcha_detected = True
                    print("检测到reCAPTCHA,正在处理...")
                    if not self.handle_recaptcha(site_key):
                        return None

            # Submit the form only when a captcha was actually handled
            if captcha_detected:
                submit_button = self.driver.find_element(
                    By.CSS_SELECTOR,
                    captcha_config.get('submit_selector', 'input[type="submit"]'),
                )
                submit_button.click()
                # Fixed wait for the navigation that follows the submit
                time.sleep(5)

            return self.extract_data()
        except Exception as e:
            print(f"爬取失败:{e}")
            return None

    def extract_data(self):
        """Return title, URL and capture timestamp of the current page.

        Returns:
            A dict with ``'title'``, ``'url'`` and ``'timestamp'``, or
            ``None`` if the browser could not be queried.
        """
        try:
            # Use driver.title: WebElement.text only returns rendered text,
            # and <title> is not rendered, so the element lookup gave ''.
            return {
                'title': self.driver.title,
                'url': self.driver.current_url,
                'timestamp': time.time(),
            }
        except Exception as e:
            print(f"数据提取失败:{e}")
            return None

    def close(self):
        """Quit the browser; safe to call more than once."""
        if self.driver:
            self.driver.quit()
            self.driver = None


# Usage example
captcha_service = CaptchaService(api_key='your_api_key')
spider = CaptchaIntegratedSpider(captcha_service)
spider.init_driver()

try:
    # Which captcha kinds to look for, and how to submit the form afterwards.
    captcha_config = {
        'text_captcha': {
            'selector': '.captcha-image',
            'input_selector': '#captcha-input',
        },
        'recaptcha': True,
        'submit_selector': '#submit-btn',
    }

    result = spider.crawl_with_captcha_handling(
        'https://example.com/protected-page',
        captcha_config,
    )

    if not result:
        print("爬取失败")
    else:
        print(f"爬取成功:{result}")
finally:
    # Release the browser whether or not the crawl succeeded.
    spider.close()
6. 实战案例:电商网站登录
6.1 综合验证码处理
class EcommerceLoginBot:
    """Log in to a site through Selenium, handling the captcha it shows.

    Text captchas are tried with the local recognizer first and the optional
    third-party service second; slider captchas go to the slider handler;
    reCAPTCHA requires the third-party service; click captchas are only
    detected, not solved.
    """

    # CSS selectors probed in order for each kind of page element.
    TEXT_CAPTCHA_SELECTORS = (
        '.captcha-image',
        '#captcha-img',
        'img[src*="captcha"]',
        'img[src*="verify"]',
    )
    CAPTCHA_INPUT_SELECTORS = (
        '#captcha-input',
        'input[name*="captcha"]',
        'input[name*="verify"]',
        'input[placeholder*="验证码"]',
    )
    SLIDER_SELECTORS = (
        '.slider-button',
        '.slide-verify',
        '[class*="slider"]',
        '[id*="slider"]',
    )
    CLICK_CAPTCHA_SELECTORS = (
        '.click-captcha',
        '.image-captcha',
        '[class*="click"]',
    )
    LOGIN_SUCCESS_SELECTORS = (
        '.user-info',
        '.logout-button',
        '[class*="welcome"]',
        '#user-menu',
    )
    LOGIN_ERROR_SELECTORS = (
        '.error-message',
        '.login-error',
        '[class*="error"]',
    )
    # URL fragments taken as proof of a successful login.
    LOGGED_IN_URL_MARKERS = ('dashboard', 'profile', 'account')

    # Script that stores a solved reCAPTCHA token in the page. The token is
    # passed as ``arguments[0]`` rather than interpolated into the source, so
    # quotes or backslashes in it cannot break or alter the script.
    _RECAPTCHA_INJECT_JS = """
        var token = arguments[0];
        document.getElementById('g-recaptcha-response').innerHTML = token;
        if (typeof grecaptcha !== 'undefined') {
            grecaptcha.getResponse = function() { return token; };
        }
    """

    def __init__(self, captcha_service_config=None):
        """Create the captcha handlers.

        Args:
            captcha_service_config: Optional keyword arguments for
                ``CaptchaService``; when omitted no third-party service is
                used.
        """
        self.driver = None
        self.captcha_handlers = {}

        # Local handlers
        self.text_recognizer = CaptchaRecognizer()
        self.slider_handler = AdvancedSliderHandler()

        if captcha_service_config:
            self.captcha_service = CaptchaService(**captcha_service_config)
        else:
            self.captcha_service = None

    def init_driver(self):
        """Start the Chrome browser."""
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-blink-features=AutomationControlled')

        # Set the user agent
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

        self.driver = webdriver.Chrome(options=options)

        # Hide the automation flag
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    def _first_displayed(self, selectors):
        """Return the first DISPLAYED element matching any selector, or None.

        A selector that matches nothing, or whose lookup fails for any other
        reason, is skipped. Hidden matches are skipped too, so a hidden
        element is never returned.
        """
        for selector in selectors:
            try:
                element = self.driver.find_element(By.CSS_SELECTOR, selector)
                if element.is_displayed():
                    return element
            except Exception:
                continue
        return None

    def login_with_captcha_handling(self, login_url, username, password):
        """Fill in the login form, solve any captcha and submit.

        Args:
            login_url: URL of the login page.
            username: Value typed into the field named ``username``.
            password: Value typed into the field named ``password``.

        Returns:
            True when the login appears to have succeeded, otherwise False.
        """
        try:
            self.driver.get(login_url)
            time.sleep(3)

            # Enter the credentials
            username_field = self.driver.find_element(By.NAME, 'username')
            password_field = self.driver.find_element(By.NAME, 'password')

            username_field.clear()
            username_field.send_keys(username)
            password_field.clear()
            password_field.send_keys(password)

            # Detect and solve whatever captcha is present
            if not self.detect_and_solve_captcha():
                print("验证码处理失败")
                return False

            # Submit the login form
            login_button = self.driver.find_element(
                By.CSS_SELECTOR, 'button[type="submit"], input[type="submit"]')
            login_button.click()

            # Fixed wait for the login result
            time.sleep(5)

            return self.check_login_success()
        except Exception as e:
            print(f"登录失败:{e}")
            return False

    def detect_and_solve_captcha(self):
        """Solve the first captcha kind detected on the page.

        Kinds are checked in the order text, slider, click, reCAPTCHA.

        Returns:
            The solver's result for the detected kind, or True when no
            captcha is present.
        """
        strategies = (
            (self.detect_text_captcha, self.solve_text_captcha),
            (self.detect_slider_captcha, self.solve_slider_captcha),
            (self.detect_click_captcha, self.solve_click_captcha),
            (self.detect_recaptcha, self.solve_recaptcha),
        )
        for detect, solve in strategies:
            if detect():
                return solve()
        # No captcha on the page
        return True

    def detect_text_captcha(self):
        """Return True when a visible text-captcha image is on the page."""
        return self._first_displayed(self.TEXT_CAPTCHA_SELECTORS) is not None

    def solve_text_captcha(self):
        """Recognize the text captcha and type the answer.

        Returns:
            True when an answer was typed into an input field, else False.
        """
        try:
            # Only a visible image is used.
            captcha_img = self._first_displayed(self.TEXT_CAPTCHA_SELECTORS)
            if captcha_img is None:
                return False

            captcha_url = captcha_img.get_attribute('src')

            # Try local recognition first
            result = self.text_recognizer.recognize_text_captcha(captcha_url)

            # Fall back to the third-party service
            if not result and self.captcha_service:
                service_result = self.captcha_service.solve_text_captcha(captcha_url)
                # The service may return None instead of a dict.
                if service_result and 'result' in service_result:
                    result = service_result['result']

            if not result:
                return False

            # Type the answer into the first input field that accepts it
            for selector in self.CAPTCHA_INPUT_SELECTORS:
                try:
                    input_field = self.driver.find_element(By.CSS_SELECTOR, selector)
                    input_field.clear()
                    input_field.send_keys(result)
                    return True
                except Exception:
                    continue

            return False
        except Exception as e:
            print(f"文字验证码处理失败:{e}")
            return False

    def detect_slider_captcha(self):
        """Return True when a visible slider-captcha element is on the page."""
        return self._first_displayed(self.SLIDER_SELECTORS) is not None

    def solve_slider_captcha(self):
        """Delegate the slider captcha to the slider handler."""
        try:
            config = {
                'slider_selector': '.slider-button, .slide-verify, [class*="slider"], [id*="slider"]',
                'background_selector': '.captcha-background, [class*="bg"], [class*="background"]',
            }
            return self.slider_handler.solve_advanced_slider(self.driver, config)
        except Exception as e:
            print(f"滑块验证码处理失败:{e}")
            return False

    def detect_click_captcha(self):
        """Return True when a visible click-captcha element is on the page."""
        return self._first_displayed(self.CLICK_CAPTCHA_SELECTORS) is not None

    def solve_click_captcha(self):
        """Report that click captchas are not handled; always returns False."""
        try:
            # Click captchas need image recognition that is not implemented
            # here, so the attempt is reported as failed.
            print("检测到点击验证码,需要人工处理")
            return False
        except Exception as e:
            print(f"点击验证码处理失败:{e}")
            return False

    def detect_recaptcha(self):
        """Return True when a visible reCAPTCHA widget is on the page."""
        return self._first_displayed(('.g-recaptcha',)) is not None

    def solve_recaptcha(self):
        """Solve reCAPTCHA v2 through the third-party service.

        Returns:
            True when a token was obtained and injected, otherwise False
            (including when no service is configured).
        """
        try:
            if not self.captcha_service:
                print("需要配置第三方验证码服务来处理reCAPTCHA")
                return False

            recaptcha_element = self.driver.find_element(By.CSS_SELECTOR, '.g-recaptcha')
            site_key = recaptcha_element.get_attribute('data-sitekey')
            if not site_key:
                return False

            result = self.captcha_service.solve_recaptcha_v2(
                site_key, self.driver.current_url)
            if result and 'result' in result:
                # Hand the token over as a script argument (see the
                # _RECAPTCHA_INJECT_JS comment).
                self.driver.execute_script(self._RECAPTCHA_INJECT_JS, result['result'])
                return True

            return False
        except Exception as e:
            print(f"reCAPTCHA处理失败:{e}")
            return False

    def check_login_success(self):
        """Decide from the URL and page elements whether login succeeded.

        Returns:
            True when the URL contains a logged-in marker or a success
            indicator is visible; otherwise False. The first visible error
            message, if any, is printed.
        """
        try:
            # URL change
            current_url = self.driver.current_url
            if any(marker in current_url for marker in self.LOGGED_IN_URL_MARKERS):
                return True

            # Success indicators
            if self._first_displayed(self.LOGIN_SUCCESS_SELECTORS) is not None:
                return True

            # Error messages: report the first visible, non-empty one
            for indicator in self.LOGIN_ERROR_SELECTORS:
                try:
                    element = self.driver.find_element(By.CSS_SELECTOR, indicator)
                    if element.is_displayed() and element.text:
                        print(f"登录错误:{element.text}")
                        return False
                except Exception:
                    continue

            return False
        except Exception as e:
            print(f"检查登录状态失败:{e}")
            return False

    def close(self):
        """Quit the browser; safe to call more than once."""
        if self.driver:
            self.driver.quit()
            self.driver = None


# Usage example
# Keyword arguments forwarded to CaptchaService by the bot.
captcha_config = {
    'api_key': 'your_2captcha_api_key',
    'service_type': '2captcha',
}

login_bot = EcommerceLoginBot(captcha_config)
login_bot.init_driver()

try:
    success = login_bot.login_with_captcha_handling(
        login_url='https://example-ecommerce.com/login',
        username='your_username',
        password='your_password',
    )

    if not success:
        print("登录失败!")
    else:
        print("登录成功!")
        # Continue with further actions here
finally:
    # Release the browser whether or not the login succeeded.
    login_bot.close()
7. 练习与作业
7.1 基础练习
- 文字验证码识别
  - 实现一个简单的数字验证码识别器
  - 处理带噪声的字母验证码
  - 比较不同OCR引擎的效果
- 图像预处理
  - 实现验证码图像去噪算法
  - 练习二值化和形态学操作
  - 字符分割技术实现
7.2 进阶练习
- 滑块验证码
  - 实现基础滑块验证码破解
  - 优化拖拽轨迹生成算法
  - 处理复杂背景的缺口检测
- 验证码服务集成
  - 集成多个第三方验证码服务
  - 实现服务切换和负载均衡
  - 成本优化策略
7.3 实战项目
- 多平台登录系统
  - 支持多种验证码类型
  - 自动重试和错误处理
  - 成功率统计和优化
- 验证码识别服务
  - 构建本地验证码识别API
  - 支持批量处理
  - 性能监控和优化
8. 常见问题与解决方案
8.1 识别准确率问题
- 图像预处理优化
- 多引擎结果融合
- 训练自定义模型
8.2 反检测问题
- 行为模拟优化
- 请求频率控制
- 环境指纹管理
8.3 成本控制
- 本地识别优先
- 服务商选择策略
- 缓存机制实现
9. 下节预告
下一课我们将学习**《Python爬虫第10课:分布式爬虫架构与Scrapy-Redis》**,内容包括:
- 分布式爬虫基础概念
- Scrapy-Redis框架详解
- Redis在爬虫中的应用
- 分布式任务调度
- 数据去重和存储
- 监控和管理系统
验证码识别是爬虫技术中的重要环节,掌握这些技术将大大提升你的爬虫能力!
def solve_text_captcha(self, image_path_or_url, **kwargs):
    """Solve an image/text captcha with the configured service.

    NOTE(review): this function and ``_solve_2captcha_text`` read
    ``self.service_type``, ``self.api_key`` and ``self.services`` — presumably
    they are methods of ``CaptchaService``; confirm their placement.

    The handler is looked up by name, so a service without an implementation
    yields an error dict instead of an AttributeError or an implicit ``None``.

    Args:
        image_path_or_url: Local file path or http(s) URL of the image.
        **kwargs: Optional hints forwarded to the handler: ``numeric``,
            ``min_len``, ``max_len``, ``phrase``, ``case_sensitive``.

    Returns:
        ``{'result': text}`` on success, ``{'error': message}`` otherwise.
    """
    handler_names = {
        '2captcha': '_solve_2captcha_text',
        'anticaptcha': '_solve_anticaptcha_text',
        'deathbycaptcha': '_solve_dbc_text',
    }
    method_name = handler_names.get(self.service_type)
    handler = getattr(self, method_name, None) if method_name else None
    if handler is None:
        return {
            'error': "UNSUPPORTED_SERVICE: solve_text_captcha is not implemented "
                     f"for service_type '{self.service_type}'"
        }
    return handler(image_path_or_url, **kwargs)


def _solve_2captcha_text(self, image_path_or_url, **kwargs):
    """Solve a text captcha through 2captcha's base64 upload method.

    Args:
        image_path_or_url: Local file path or http(s) URL of the image.
        **kwargs: Optional hints: ``numeric``, ``min_len``, ``max_len``,
            ``phrase``, ``case_sensitive``.

    Returns:
        ``{'result': text}`` on success, ``{'error': message}`` otherwise.
    """
    # Seconds allowed per HTTP request; without it a stalled connection
    # would block the caller forever.
    request_timeout = 30

    # Broad catch on purpose: every failure is reported as an error dict.
    try:
        # Load the image bytes
        if image_path_or_url.startswith('http'):
            response = requests.get(image_path_or_url, timeout=request_timeout)
            # Do not submit an HTML error page as if it were the captcha.
            response.raise_for_status()
            raw_image = response.content
        else:
            with open(image_path_or_url, 'rb') as f:
                raw_image = f.read()
        image_data = base64.b64encode(raw_image).decode()

        # Build the submission
        submit_data = {
            'key': self.api_key,
            'method': 'base64',
            'body': image_data,
        }

        # Boolean hints are sent as '1' only when truthy.
        for option, field in (('numeric', 'numeric'),
                              ('phrase', 'phrase'),
                              ('case_sensitive', 'regsense')):
            if kwargs.get(option):
                submit_data[field] = '1'
        # Length hints are forwarded as strings.
        for option in ('min_len', 'max_len'):
            if kwargs.get(option):
                submit_data[option] = str(kwargs[option])

        response = requests.post(
            self.services['2captcha']['submit_url'],
            data=submit_data,
            timeout=request_timeout,
        )

        # A successful submission replies 'OK|<captcha_id>'; anything else is
        # returned verbatim as the error.
        status, separator, captcha_id = response.text.partition('|')
        if status != 'OK' or not separator:
            return {'error': response.text or 'EMPTY_RESPONSE'}

        # Wait for the answer
        return self._wait_for_2captcha_result(captcha_id)
    except Exception as e:
        return {'error': str(e)}
结尾
希望对初学者有帮助;致力于办公自动化的小小程序员一枚
希望能得到大家的【❤️一个免费关注❤️】感谢!
求个 🤞 关注 🤞 +❤️ 喜欢 ❤️ +👍 收藏 👍
此外还有办公自动化专栏,欢迎大家订阅:Python办公自动化专栏
此外还有爬虫专栏,欢迎大家订阅:Python爬虫基础专栏
此外还有Python基础专栏,欢迎大家订阅:Python基础学习专栏