"""
使用 DrissionPage 爬取 YouTube 视频评论,并插入 MySQL
优化版:复用浏览器、精准选择器、滚动加载、异常处理
"""import pandas as pd
from DrissionPage import ChromiumPage
import pymysql
import time
import logging
# Spreadsheet that lists the videos to scrape, and which sheet to read from it.
file_path = 'youtube.xlsx'
sheet_name = 0

# Keyword arguments later unpacked into pymysql.connect().
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file kept out of version control.
db_config = dict(
    host='localhost',
    user='root',
    password='123457',
    database='youtube_db',
    charset='utf8mb4',
    autocommit=True,
)

# Root logger: INFO and above, timestamped.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Load the spreadsheet and pull the non-empty entries of the link column;
# each entry is later passed to page.get().
df = pd.read_excel(file_path, sheet_name=sheet_name)
href_series = df['yt-lockup-view-model__content-image href']
video_urls = href_series.dropna().tolist()

# A single browser instance is created here and reused for every video.
page = ChromiumPage()
logging.info("浏览器已启动")
# Open the MySQL connection and the cursor shared by the rest of the script.
try:
    connection = pymysql.connect(**db_config)
    cursor = connection.cursor()
    logging.info("数据库连接成功")
except Exception as e:
    logging.error(f"数据库连接失败: {e}")
    # The browser was started before this point; close it so a failed
    # connection does not leave a stray browser process behind.
    page.quit()
    # `exit()` is a helper injected by the `site` module and is not always
    # available; raising SystemExit has the same effect everywhere.
    raise SystemExit(1)
create_table_sql = """
CREATE TABLE IF NOT EXISTS commentes (id INT AUTO_INCREMENT PRIMARY KEY,video_url VARCHAR(255) NOT NULL,author VARCHAR(255) NOT NULL,comment TEXT,created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -- 移除末尾的逗号
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
"""
cursor.execute(create_table_sql)
connection.commit()
logging.info("数据表检查/创建完成")
# Parameterized insert, built once instead of on every iteration.
# NOTE: the table as created by this script has no unique key besides the
# auto-increment `id`, so the ON DUPLICATE KEY clause never fires and
# re-running the script stores the same comments again.
insert_sql = (
    "INSERT INTO commentes (video_url, author, comment) "
    "VALUES (%s, %s, %s) "
    "ON DUPLICATE KEY UPDATE created_at = CURRENT_TIMESTAMP"
)


def _collect_comments(url):
    """Return (url, author, text) rows for the comments currently on the page.

    Author and text elements are paired by position. A comment whose text
    cannot be read is logged at DEBUG level and skipped, so one bad element
    does not discard the rest of the video's comments.
    """
    author_elements = page.eles('@id=author-text')
    content_elements = page.eles('@id=content-text')
    rows = []
    for author_elem, content_elem in zip(author_elements, content_elements):
        try:
            rows.append((url, author_elem.text.strip(), content_elem.text.strip()))
        except Exception as e:
            logging.debug(f"解析单条评论失败: {e}")
    return rows


try:
    total = len(video_urls)
    for idx, url in enumerate(video_urls, start=1):
        logging.info(f"正在处理第 {idx}/{total} 个视频: {url}")
        try:
            page.get(url)
            time.sleep(3)

            # Scroll a little before looking for comments.
            # NOTE(review): this assumes the comment section is only rendered
            # once it scrolls into view - confirm against the live page.
            page.scroll.down(500)
            if not page.ele('#content-text', timeout=10):
                logging.warning(f"评论区未找到: {url}")
                continue
            time.sleep(3)

            # Keep scrolling so that further batches of comments get loaded.
            for _ in range(15):
                page.scroll.down(1500)
                time.sleep(2.5)

            items = _collect_comments(url)
            logging.info(f"从 {url} 采集到 {len(items)} 条有效评论")

            if items:
                cursor.executemany(insert_sql, items)
                logging.info(f"✅ 成功插入 {len(items)} 条评论")
        except Exception as e:
            # One failing video must not stop the whole run.
            logging.error(f"处理视频失败 {url}: {e}")
            connection.rollback()

        # Short pause between videos.
        time.sleep(2)
finally:
    # Release everything even if the run is interrupted; the browser is
    # closed even when closing the database handles fails.
    try:
        cursor.close()
        connection.close()
    finally:
        page.quit()
    logging.info("任务完成,资源已释放")