A Hands-On Guide to Scraping 趣易百影院 (Quyibai Cinema) Data with DrissionPage
1. Introduction: The Evolution of Scraping Tools and DrissionPage's Advantages
In the data-driven era, web scrapers have become an essential tool for harvesting information from the internet. Traditional stacks such as requests + BeautifulSoup are lightweight and efficient but struggle with dynamically rendered pages, while Selenium can drive a real browser yet suffers from performance bottlenecks and fiddly configuration. DrissionPage, an emerging Python automation tool, neatly combines the efficiency of requests with the rendering power of Selenium, offering a fresh approach to web data collection.
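The heart of that fusion is DrissionPage's WebPage object, which can switch between a full browser mode and a requests-style session mode at runtime. A minimal sketch (the URLs are placeholders):

from DrissionPage import WebPage

page = WebPage()                      # starts in browser ('d') mode
page.get('https://example.com/list')  # render a JavaScript-heavy listing
page.change_mode()                    # switch to session ('s') mode
page.get('https://example.com/api')   # plain HTTP request, no rendering cost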
Using Quyibai Cinema (a hypothetical target site) as the example, this article walks through the entire workflow from environment setup to data storage, digs into advanced topics such as anti-scraping countermeasures and distributed architecture, and delivers a complete solution fit for production use.
2. Environment Setup and Basic Configuration
2.1 Installation and Verification
pip install DrissionPage pandas fake_useragent redis sqlalchemy --upgrade
Verify the installation:
from DrissionPage import ChromiumPage, SessionPage
import pandas as pd
from importlib.metadata import version

print("DrissionPage version:", version("DrissionPage"))
2.2 Advanced Browser Configuration
from DrissionPage import ChromiumOptions
from fake_useragent import UserAgent


def create_browser_config(proxy=None, headless=False):
    """Create a browser configuration object."""
    co = ChromiumOptions()
    # Basic settings
    co.headless(headless)
    co.set_argument("--disable-gpu")
    co.set_argument("--no-sandbox")
    co.set_argument("--disable-dev-shm-usage")
    # Anti-detection settings
    co.set_argument("--disable-blink-features=AutomationControlled")
    co.set_argument("--disable-web-security")
    co.set_argument("--allow-running-insecure-content")
    co.set_argument("--disable-notifications")
    # Randomize the User-Agent
    ua = UserAgent().chrome
    co.set_user_agent(ua)
    # Proxy settings
    if proxy:
        co.set_proxy(proxy)
    # Ignore certificate errors
    co.ignore_certificate_errors(True)
    return co


# Apply the configuration
browser_config = create_browser_config(headless=True)
page = ChromiumPage(browser_config)
3. Basic Operations: Element Location and Data Extraction
3.1 Smart Page Navigation and Wait Mechanisms
import time


def safe_navigate(page, url, timeout=30, retries=3):
    """Navigate to a page safely, with retries and layered waits."""
    for attempt in range(retries):
        try:
            page.get(url, timeout=timeout)
            # Combine several wait strategies
            page.wait.load_start(timeout=10)
            page.wait.doc_loaded(timeout=15)
            # Check whether the key elements are already present
            if page.eles('.movie-item'):
                return True
            else:
                page.wait.ele_loaded('.movie-item', timeout=10)
                return True
        except Exception as e:
            print(f"Navigation attempt {attempt + 1} failed: {e}")
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # exponential backoff
    return False


# Usage example
safe_navigate(page, "https://www.yiibai.com/movie")
3.2 Robust Element Location Helpers
def safe_element_locator(parent, selector, selector_type='css', timeout=5, default=None):
    """Locate an element without raising on failure."""
    try:
        if selector_type == 'css':
            element = parent.ele(selector, timeout=timeout)
        elif selector_type == 'xpath':
            element = parent.ele(f'xpath:{selector}', timeout=timeout)
        else:
            raise ValueError("Unsupported selector type")
        return element if element else default
    except Exception as e:
        print(f"Element location failed: {selector}, error: {e}")
        return default


def extract_text(element, selector, default="", strip=True):
    """Safely extract text from a child element."""
    target = safe_element_locator(element, selector)
    if not target:
        return default
    text = target.text
    return text.strip() if strip and text else text


def extract_attribute(element, selector, attr, default=""):
    """Safely extract an attribute from a child element."""
    target = safe_element_locator(element, selector)
    if not target:
        return default
    return target.attr(attr) or default
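Together these helpers let parsing code degrade gracefully instead of raising. A short usage sketch, assuming the hypothetical '.movie-item' and '.title' selectors used throughout this article:

for item in page.eles('.movie-item'):
    title = extract_text(item, '.title')
    link = extract_attribute(item, 'tag:a', 'href', default=None)
    print(title, link)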
4. Advanced Techniques: Dynamic Loading and Distributed Crawling
4.1 Handling Dynamically Loaded Content
def handle_dynamic_content(page, max_scroll=10, scroll_delay=1):
    """Handle infinite-scroll content loading."""
    last_height = page.run_js("return document.body.scrollHeight")
    scroll_attempts = 0
    while scroll_attempts < max_scroll:
        # Scroll to the bottom of the page
        page.run_js("window.scrollTo(0, document.body.scrollHeight)")
        page.wait.doc_loaded(timeout=scroll_delay)
        # Check whether new content was loaded
        new_height = page.run_js("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        scroll_attempts += 1
        time.sleep(scroll_delay)


def monitor_network_requests(page, url_pattern, timeout=30):
    """Listen for a specific network request and return its response body."""
    try:
        page.listen.start(url_pattern)
        packet = page.listen.wait(timeout=timeout)
        if packet and packet.response:
            return packet.response.body
    except Exception as e:
        print(f"Network listening failed: {e}")
    finally:
        page.listen.stop()
    return None
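One pitfall with network monitoring: listening must start before the action that fires the request, so trigger the scroll (or navigation) inside the listening window. A sketch using the raw listen API, where '/api/movies' is a hypothetical endpoint pattern:

page.listen.start('/api/movies')  # begin capturing before triggering the request
page.run_js("window.scrollTo(0, document.body.scrollHeight)")  # fire the lazy-load XHR
packet = page.listen.wait(timeout=20)
if packet and packet.response:
    print(packet.response.body)   # body is parsed for JSON responses
page.listen.stop()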
4.2 Implementing a Distributed Crawler Architecture
import redis
import json
from multiprocessing import Pool, Manager
import hashlib


class DistributedSpider:
    def __init__(self, redis_host='localhost', redis_port=6379):
        self.redis_conn = redis.Redis(host=redis_host, port=redis_port, db=0)
        self.task_queue = "movie:tasks"
        self.result_queue = "movie:results"
        self.visited_set = "movie:visited"

    def generate_task_id(self, url):
        """Generate a task ID from the URL."""
        return hashlib.md5(url.encode()).hexdigest()

    def add_task(self, url, priority=0):
        """Add a task to the queue if it has not been visited."""
        task_id = self.generate_task_id(url)
        if not self.redis_conn.sismember(self.visited_set, task_id):
            task_data = {'url': url, 'priority': priority, 'task_id': task_id}
            self.redis_conn.zadd(self.task_queue, {json.dumps(task_data): priority})

    def get_task(self):
        """Pop the highest-priority task from the queue."""
        task = self.redis_conn.zrange(self.task_queue, 0, 0)
        if task:
            self.redis_conn.zrem(self.task_queue, task[0])
            return json.loads(task[0])
        return None

    def mark_visited(self, task_id):
        """Mark a task as processed."""
        self.redis_conn.sadd(self.visited_set, task_id)

    def save_result(self, result):
        """Push a result onto the result queue."""
        self.redis_conn.rpush(self.result_queue, json.dumps(result))
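A worker is then just a loop that pops tasks, scrapes, and pushes results; the Pool imported above fans that loop out across processes. A minimal sketch, where the result dict stands in for the real parsing logic of section 5 (each worker opens its own Redis connection, since connections cannot be shared across processes):

def worker(_):
    spider = DistributedSpider()
    while True:
        task = spider.get_task()
        if task is None:
            break  # queue drained
        result = {'url': task['url']}  # stand-in for real parsing logic
        spider.save_result(result)
        spider.mark_visited(task['task_id'])


if __name__ == '__main__':
    seed = DistributedSpider()
    for p in range(1, 6):
        seed.add_task(f"https://www.yiibai.com/movie?page={p}")
    with Pool(4) as pool:
        pool.map(worker, range(4))  # four parallel workers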
5. Case Study: Scraping Quyibai Cinema Data
5.1 An Enhanced Spider Implementation
import pandas as pd
import re
from datetime import datetime
import time
import random
import hashlib
from sqlalchemy import create_engine, Column, String, Integer, Float, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Movie(Base):
    __tablename__ = 'movies'

    id = Column(String(32), primary_key=True)
    title = Column(String(200), nullable=False)
    link = Column(String(500))
    rating = Column(Float)
    director = Column(String(100))
    actors = Column(Text)
    year = Column(Integer)
    release_date = Column(DateTime)
    description = Column(Text)
    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)


class EnhancedMovieSpider:
    def __init__(self, db_url='sqlite:///movies.db'):
        self.browser = ChromiumPage(create_browser_config(headless=True))
        self.session_page = SessionPage()
        self.engine = create_engine(db_url)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def clean_rating(self, rating_text):
        """Clean rating text into a float."""
        if not rating_text:
            return None
        # Extract the numeric part with a regular expression
        match = re.search(r'(\d+\.\d+|\d+)', rating_text)
        if match:
            return float(match.group(1))
        return None

    def clean_year(self, year_text):
        """Clean year text into an int."""
        if not year_text:
            return None
        match = re.search(r'(\d{4})', year_text)
        if match:
            return int(match.group(1))
        return None

    def parse_list_page(self, page_url):
        """Parse a list page into movie records."""
        if not safe_navigate(self.browser, page_url):
            return []
        handle_dynamic_content(self.browser)
        movies = []
        items = self.browser.eles('.movie-item')
        for item in items:
            try:
                title = extract_text(item, '.title')
                link = extract_attribute(item, 'tag:a', 'href')
                rating_text = extract_text(item, '.rating')
                movie_data = {
                    'title': title,
                    'link': self._make_absolute_url(link),
                    'rating': self.clean_rating(rating_text),
                    'source_url': page_url,
                    'crawl_time': datetime.now(),
                }
                if movie_data['title'] and movie_data['link']:
                    movies.append(movie_data)
            except Exception as e:
                print(f"Failed to parse movie item: {e}")
                continue
        return movies

    def parse_detail_page(self, detail_url):
        """Parse a detail page with the lightweight SessionPage."""
        try:
            self.session_page.get(detail_url, timeout=15)
            return {
                'director': extract_text(self.session_page, '.director'),
                'actors': extract_text(self.session_page, '.actors'),
                'year': self.clean_year(extract_text(self.session_page, '.year')),
                'description': extract_text(self.session_page, '.description'),
                'release_date': self._parse_release_date(
                    extract_text(self.session_page, '.release-date')),
            }
        except Exception as e:
            print(f"Failed to parse detail page {detail_url}: {e}")
            return {}

    def _parse_release_date(self, date_text):
        """Parse the release date, trying several formats."""
        try:
            if date_text:
                for fmt in ('%Y-%m-%d', '%Y/%m/%d', '%Y年%m月%d日'):
                    try:
                        return datetime.strptime(date_text, fmt)
                    except ValueError:
                        continue
        except Exception:
            pass
        return None

    def _make_absolute_url(self, url):
        """Convert a relative URL to an absolute one."""
        if url.startswith('http'):
            return url
        return f"https://www.yiibai.com{url}"

    def save_to_database(self, movie_data):
        """Insert or update a movie record."""
        session = self.Session()
        try:
            # Generate a unique ID from the link
            movie_id = hashlib.md5(movie_data['link'].encode()).hexdigest()
            # Check whether the record already exists
            existing = session.query(Movie).filter_by(id=movie_id).first()
            if existing:
                # Update the existing record
                existing.rating = movie_data.get('rating')
                existing.updated_at = datetime.now()
            else:
                # Create a new record
                movie = Movie(
                    id=movie_id,
                    title=movie_data['title'],
                    link=movie_data['link'],
                    rating=movie_data.get('rating'),
                    director=movie_data.get('director'),
                    actors=movie_data.get('actors'),
                    year=movie_data.get('year'),
                    release_date=movie_data.get('release_date'),
                    description=movie_data.get('description'),
                )
                session.add(movie)
            session.commit()
        except Exception as e:
            session.rollback()
            print(f"Database save failed: {e}")
        finally:
            session.close()

    def export_to_excel(self, filename='movies.xlsx'):
        """Export all records to an Excel file."""
        try:
            session = self.Session()
            movies = session.query(Movie).all()
            data = []
            for movie in movies:
                data.append({
                    'Title': movie.title,
                    'Rating': movie.rating,
                    'Director': movie.director,
                    'Cast': movie.actors,
                    'Year': movie.year,
                    'Release date': movie.release_date,
                    'Synopsis': movie.description,
                    'Link': movie.link,
                })
            df = pd.DataFrame(data)
            df.to_excel(filename, index=False, engine='openpyxl')
        except Exception as e:
            print(f"Excel export failed: {e}")

    def run(self, start_page=1, end_page=5):
        """Run the spider across a range of list pages."""
        base_url = "https://www.yiibai.com/movie?page={}"
        for page_num in range(start_page, end_page + 1):
            print(f"Crawling page {page_num}...")
            try:
                # Fetch the list page
                list_url = base_url.format(page_num)
                movies = self.parse_list_page(list_url)
                for movie in movies:
                    try:
                        # Fetch the detail page
                        detail_data = self.parse_detail_page(movie['link'])
                        movie.update(detail_data)
                        # Persist to the database
                        self.save_to_database(movie)
                        # Random delay to avoid hammering the server
                        time.sleep(random.uniform(1, 3))
                    except Exception as e:
                        print(f"Failed to process movie detail {movie['link']}: {e}")
                        continue
                # Delay between pages
                time.sleep(random.uniform(2, 5))
            except Exception as e:
                print(f"Failed to process page {page_num}: {e}")
                continue
        # Export the final results
        self.export_to_excel()
        self.browser.quit()


if __name__ == "__main__":
    spider = EnhancedMovieSpider()
    spider.run(start_page=1, end_page=5)
5.2 Integrating Advanced Anti-Bot Countermeasures
class AdvancedAntiAntiSpider(EnhancedMovieSpider):
    def __init__(self, proxy_pool=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.proxy_pool = proxy_pool or []
        self.current_proxy_index = 0
        self.ua_generator = UserAgent()

    def rotate_proxy(self):
        """Rotate to the next proxy in the pool."""
        if not self.proxy_pool:
            return None
        self.current_proxy_index = (self.current_proxy_index + 1) % len(self.proxy_pool)
        return self.proxy_pool[self.current_proxy_index]

    def rotate_user_agent(self):
        """Rotate the User-Agent."""
        new_ua = self.ua_generator.chrome
        self.browser.set.user_agent(new_ua)
        return new_ua

    def execute_with_retry(self, func, max_retries=3, *args, **kwargs):
        """Execute a request with a retry mechanism."""
        for attempt in range(max_retries):
            try:
                # Rotate the proxy and UA on each retry
                if attempt > 0:
                    self.rotate_proxy()
                    self.rotate_user_agent()
                    time.sleep(2 ** attempt)  # exponential backoff
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt == max_retries - 1:
                    raise
        return None

    def detect_and_handle_captcha(self):
        """Detect and handle a CAPTCHA."""
        # Check for common CAPTCHA elements
        captcha_elements = [
            ('css', '#captcha'),
            ('css', '.captcha-container'),
            ('xpath', '//*[contains(text(), "验证码")]'),
            ('xpath', '//*[contains(text(), "CAPTCHA")]'),
        ]
        for selector_type, selector in captcha_elements:
            if safe_element_locator(self.browser, selector, selector_type):
                print("CAPTCHA detected, attempting to handle it...")
                return self.solve_captcha()
        return True

    def solve_captcha(self):
        """Solve a CAPTCHA, falling back through several methods."""
        try:
            # Method 1: OCR recognition
            try:
                captcha_img = safe_element_locator(self.browser, '#captcha-img')
                if captcha_img:
                    img_data = captcha_img.get_screenshot(as_bytes=True)
                    captcha_text = self.ocr_captcha(img_data)
                    input_field = safe_element_locator(self.browser, '#captcha-input')
                    if input_field and captcha_text:
                        input_field.input(captcha_text)
                        return True
            except Exception:
                pass
            # Method 2: a third-party CAPTCHA-solving service
            try:
                captcha_img = safe_element_locator(self.browser, '#captcha-img')
                if captcha_img:
                    img_data = captcha_img.get_screenshot(as_bytes=True)
                    captcha_text = self.third_party_captcha_service(img_data)
                    input_field = safe_element_locator(self.browser, '#captcha-input')
                    if input_field and captcha_text:
                        input_field.input(captcha_text)
                        return True
            except Exception:
                pass
            # Method 3: wait for manual input
            print("Please solve the CAPTCHA manually, then press Enter to continue...")
            input()
            return True
        except Exception as e:
            print(f"CAPTCHA handling failed: {e}")
            return False

    def ocr_captcha(self, image_data):
        """Recognize a CAPTCHA with OCR."""
        try:
            import ddddocr
            ocr = ddddocr.DdddOcr()
            return ocr.classification(image_data)
        except Exception:
            return None

    def third_party_captcha_service(self, image_data):
        """Call a third-party CAPTCHA-solving service.

        Uses the TTShitu platform as an example; replace with the
        API calls of whichever service you actually use.
        """
        try:
            import requests
            import base64
            # Build the request payload
            data = {
                'username': 'your_username',
                'password': 'your_password',
                'typeid': '1000',  # CAPTCHA type
                'image': base64.b64encode(image_data).decode(),
            }
            response = requests.post('http://api.ttshitu.com/predict', json=data)
            result = response.json()
            if result['success']:
                return result['data']['result']
        except Exception:
            pass
        return None
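The retry wrapper and CAPTCHA detection slot into a crawl like this; a sketch that is not wired into run() above:

spider = AdvancedAntiAntiSpider(proxy_pool=['http://proxy1.example.com:8080'])

# Wrap a fragile fetch in the retry helper (proxy/UA rotate between attempts)
movies = spider.execute_with_retry(spider.parse_list_page,
                                   3,  # max_retries
                                   "https://www.yiibai.com/movie?page=1")

# After navigation, bail out early if a CAPTCHA still blocks the page
if not spider.detect_and_handle_captcha():
    print("CAPTCHA could not be solved; stopping")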
6. Performance Optimization and Monitoring
6.1 A Performance-Monitoring Decorator
import time
from functools import wraps

import psutil


def monitor_performance(func):
    """Decorator that records execution time and memory usage."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        start_memory = psutil.Process().memory_info().rss
        try:
            result = func(*args, **kwargs)
            end_time = time.time()
            end_memory = psutil.Process().memory_info().rss
            # Record the performance data
            performance_data = {
                'function': func.__name__,
                'execution_time': end_time - start_time,
                'memory_used': end_memory - start_memory,
                'timestamp': datetime.now(),
            }
            # Could be persisted to a database or log file instead
            print(f"Performance data: {performance_data}")
            return result
        except Exception:
            end_time = time.time()
            print(f"Function {func.__name__} failed after {end_time - start_time:.2f}s")
            raise
    return wrapper


# Usage example
@monitor_performance
def parse_list_page(self, page_url):
    # Original implementation goes here
    pass
6.2 A Resource-Management Context Manager
from contextlib import contextmanager


@contextmanager
def browser_context(config=None):
    """Context manager that guarantees the browser is closed."""
    browser = None
    try:
        browser = ChromiumPage(config or create_browser_config())
        yield browser
    finally:
        if browser:
            try:
                browser.quit()
            except Exception:
                pass


# Usage example
with browser_context() as browser:
    browser.get("https://www.yiibai.com")
    # Perform operations here
7. Summary and Outlook
This article has covered the full workflow of scraping Quyibai Cinema data with DrissionPage and assembled a production-ready solution. The key improvements include:
Greater robustness: comprehensive exception handling and retry mechanisms
Stronger anti-bot evasion: proxy rotation, User-Agent randomization, CAPTCHA handling, and other advanced features
Distributed support: a Redis-backed distributed crawler architecture
Data persistence: database storage with incremental updates
Performance monitoring: integrated performance tracking and resource management
Directions for Extension
Real-time data pipelines: integrate Kafka or RabbitMQ for real-time data processing
Machine learning integration: apply NLP techniques to analyze the sentiment of movie reviews
Containerized deployment: package the crawler as a Docker container
Automated scheduling: orchestrate crawl jobs with Apache Airflow
Front-end display: build a web UI to present the scraped results and analytics
With its distinctive design and powerful feature set, DrissionPage gives Python scraper developers a genuinely new option. Using the hands-on solution in this article, readers can quickly build an efficient, stable data collection system and extend it flexibly to suit their own needs.
# Final usage example
if __name__ == "__main__":
    # Configure the proxy pool
    proxy_pool = [
        'http://proxy1.example.com:8080',
        'http://proxy2.example.com:8080',
        # ...more proxies
    ]
    # Create the enhanced spider instance
    spider = AdvancedAntiAntiSpider(
        proxy_pool=proxy_pool,
        db_url='sqlite:///movies.db',
    )
    # Run the spider
    try:
        spider.run(start_page=1, end_page=10)
        print("Crawl finished")
    except KeyboardInterrupt:
        print("Crawl interrupted by user")
    except Exception as e:
        print(f"Error during crawl: {e}")
    finally:
        # Make sure resources are released
        if hasattr(spider, 'browser') and spider.browser:
            spider.browser.quit()
With these refinements in place, we have a complete, robust, and extensible data collection system that can cope with complex network conditions and anti-bot mechanisms, providing a solid technical foundation for real projects.