Bilibili Comment Crawler in Practice: From Data Collection to Sentiment Analysis
1. Bilibili Comment Data Structure
1.1 API Endpoint Analysis
Bilibili comments are served through a RESTful API. The main endpoint:
https://api.bilibili.com/x/v2/reply?type=1&oid={aid}&pn={page}
Key parameters:
- type: comment area type (1 = video, 11 = article, 17 = dynamic post)
- oid: object ID (the aid for videos, dynamic_id for dynamic posts)
- pn: page number (starting from 1)
- ps: page size (default 20, maximum 49)
- sort: sort order (0 = by time, 2 = by popularity)
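A minimal request putting these parameters together (a sketch; the oid is illustrative, and the full header set from section 2.1 is assumed for real use):

import requests

# Fetch the first page of a video's comments, sorted by popularity
resp = requests.get(
    'https://api.bilibili.com/x/v2/reply',
    params={'type': 1, 'oid': 987654321, 'pn': 1, 'ps': 20, 'sort': 2},
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.bilibili.com',
    },
)
print(resp.json()['data']['page'])  # pagination metadata: num, size, count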
1.2 Response Structure
{
  "code": 0,
  "data": {
    "page": {
      "num": 1,
      "size": 20,
      "count": 2500,
      "acount": 2500
    },
    "replies": [
      {
        "rpid": 123456789,       // comment ID
        "oid": 987654321,        // video aid
        "type": 1,
        "mid": 111222333,        // user ID
        "root": 0,               // root comment ID (0 = top-level comment)
        "parent": 0,             // parent comment ID
        "dialog": 0,             // dialog ID
        "count": 15,             // reply count
        "rcount": 15,            // actual reply count
        "like": 520,             // like count
        "ctime": 1634567890,     // publish timestamp
        "content": {
          "message": "comment text",
          "emote": {             // emote data
            "[doge]": {"id": 26, "url": "https://..."}
          }
        },
        "member": {
          "uname": "username",
          "avatar": "avatar URL",
          "level_info": {
            "current_level": 6   // user level
          },
          "vip": {"vipStatus": 1, "vipType": 2}
        },
        "replies": [...]         // second-level comments (at most 3 shown inline)
      }
    ]
  }
}
1.3 Fetching Second-Level Comments
Second-level (reply-to-reply) comments require a separate request:
https://api.bilibili.com/x/v2/reply/reply?type=1&oid={aid}&root={root_rpid}&pn={page}
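A minimal sketch of paging through one comment's replies with this endpoint (headers is the dict from section 2.1; the page size and page cap are assumptions):

import requests

def fetch_sub_replies(oid, root_rpid, max_pages=5):
    """Collect second-level replies under a single root comment."""
    replies = []
    for pn in range(1, max_pages + 1):
        resp = requests.get(
            'https://api.bilibili.com/x/v2/reply/reply',
            params={'type': 1, 'oid': oid, 'root': root_rpid, 'pn': pn, 'ps': 20},
            headers=headers,  # headers dict from section 2.1
        ).json()
        page = (resp.get('data') or {}).get('replies') or []
        if not page:
            break  # no more replies
        replies.extend(page)
    return replies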
2. Technical Challenges and Solutions
2.1 Anti-Crawling Mechanisms
Challenge 1: Wbi signature verification
Since 2023, Bilibili has required a Wbi signature on some endpoints. Generating it takes three steps:
- Fetch img_key and sub_key (from the user info endpoint)
- Shuffle the combined key according to a fixed permutation table
- Compute the MD5 signature over the sorted query parameters
import hashlib
import time
import urllib.parse
from functools import reduce

import requests

def get_wbi_keys():
    """Fetch the latest img_key and sub_key."""
    headers = {'User-Agent': 'Mozilla/5.0 ...'}
    url = 'https://api.bilibili.com/x/web-interface/nav'
    resp = requests.get(url, headers=headers).json()
    img_url = resp['data']['wbi_img']['img_url']
    sub_url = resp['data']['wbi_img']['sub_url']
    img_key = img_url.split('/')[-1].split('.')[0]
    sub_key = sub_url.split('/')[-1].split('.')[0]
    return img_key, sub_key

def gen_wbi_sign(params, img_key, sub_key):
    """Generate the Wbi signature for a parameter dict."""
    mixin_key_enc_tab = [
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
        33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
        61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
        36, 20, 34, 44, 52
    ]
    orig = img_key + sub_key
    mixin_key = reduce(lambda s, i: s + orig[i], mixin_key_enc_tab, '')[:32]
    # Sort parameters by key, then sign the URL-encoded query string
    params['wts'] = int(time.time())
    sorted_params = sorted(params.items())
    query = urllib.parse.urlencode(sorted_params)
    sign = hashlib.md5((query + mixin_key).encode()).hexdigest()
    return query + '&w_rid=' + sign
Challenge 2: Cookie and User-Agent checks
Required request headers:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.bilibili.com',
    'Origin': 'https://www.bilibili.com',
    'Cookie': 'buvid3=...; SESSDATA=...'  # obtained after logging in
}
2.2 Rate Limiting and Concurrency Control
Problem: a burst of requests in a short window triggers HTTP 429 errors or an IP ban.
Solution:
import asyncio
import aiohttp
from asyncio import Semaphore

class BilibiliCrawler:
    def __init__(self, max_concurrent=5, delay=1.0):
        self.semaphore = Semaphore(max_concurrent)
        self.delay = delay

    async def fetch_with_limit(self, session, url):
        # Cap concurrency with the semaphore and pause between requests
        async with self.semaphore:
            await asyncio.sleep(self.delay)
            async with session.get(url, headers=headers) as resp:  # headers dict from section 2.1
                return await resp.json()

    async def crawl_comments(self, oid, max_pages=10):
        async with aiohttp.ClientSession() as session:
            tasks = []
            for pn in range(1, max_pages + 1):
                url = f'https://api.bilibili.com/x/v2/reply?type=1&oid={oid}&pn={pn}'
                tasks.append(self.fetch_with_limit(session, url))
            results = await asyncio.gather(*tasks, return_exceptions=True)
            return results
2.3 Ensuring Data Completeness
Challenge: the reported comment total does not match what can actually be retrieved.
The Bilibili API has the following limits:
- A single video returns at most the first 5,000 top-level comments
- Second-level comments are only partially shown inline and must be fetched separately
- Some comments are folded or deleted
Strategy:
- Crawl twice, sorted by time and by popularity (a sketch follows the storage class below)
- Deduplicate the merged results to maximize coverage
- Record crawl progress so interrupted runs can resume
import sqlite3

class CommentStorage:
    def __init__(self, db_path='bilibili.db'):
        self.conn = sqlite3.connect(db_path)
        self.create_table()

    def create_table(self):
        self.conn.execute('''
            CREATE TABLE IF NOT EXISTS comments (
                rpid INTEGER PRIMARY KEY,
                oid INTEGER,
                content TEXT,
                like_count INTEGER,
                reply_count INTEGER,
                ctime INTEGER,
                mid INTEGER,
                uname TEXT,
                user_level INTEGER,
                is_vip INTEGER,
                UNIQUE(rpid)
            )
        ''')
        self.conn.commit()

    def save_comment(self, comment_data):
        try:
            self.conn.execute('''
                INSERT OR IGNORE INTO comments VALUES (?,?,?,?,?,?,?,?,?,?)
            ''', comment_data)
            self.conn.commit()
        except Exception as e:
            print(f"Save failed: {e}")
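The extract_comment_data helper used later in the article is never defined; here is a sketch matching the column order of the table above, together with the dual-sort crawl from the strategy list. Both are illustrative, and INSERT OR IGNORE (rpid is the primary key) handles deduplication:

def extract_comment_data(reply):
    """Flatten an API reply dict into the column order of the comments table."""
    member = reply['member']
    return (
        reply['rpid'],
        reply['oid'],
        reply['content']['message'],
        reply['like'],
        reply['rcount'],
        reply['ctime'],
        reply['mid'],
        member['uname'],
        member['level_info']['current_level'],
        member['vip']['vipStatus'],
    )

def crawl_both_sorts(oid, storage, max_pages=50):
    """Crawl pages sorted by time (0) and by popularity (2)."""
    for sort in (0, 2):
        for pn in range(1, max_pages + 1):
            resp = requests.get(
                'https://api.bilibili.com/x/v2/reply',
                params={'type': 1, 'oid': oid, 'pn': pn, 'sort': sort},
                headers=headers,  # headers dict from section 2.1
            ).json()
            replies = (resp.get('data') or {}).get('replies') or []
            if not replies:
                break
            for reply in replies:
                storage.save_comment(extract_comment_data(reply))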
2.4 Cursor-Based Comment Loading
Problem: comments on some popular videos load dynamically and require cursor-based pagination.
The newer endpoint uses a cursor model:
def crawl_with_cursor(oid, cursor=None):
    url = 'https://api.bilibili.com/x/v2/reply/main'
    params = {
        'type': 1,
        'oid': oid,
        'mode': 3,  # sort by popularity
        'next': cursor if cursor else 0
    }
    # Attach the Wbi signature
    signed_params = gen_wbi_sign(params, img_key, sub_key)
    response = requests.get(url + '?' + signed_params, headers=headers)
    return response.json()
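A sketch of the full pagination loop, assuming the response exposes data.cursor.next and data.cursor.is_end (field names taken from community API documentation and liable to change):

def crawl_all_with_cursor(oid, max_pages=100):
    """Follow the cursor until the API signals the end of the comment list."""
    all_replies, cursor = [], None
    for _ in range(max_pages):
        resp = crawl_with_cursor(oid, cursor)
        data = resp.get('data') or {}
        all_replies.extend(data.get('replies') or [])
        cursor_info = data.get('cursor') or {}
        if cursor_info.get('is_end'):  # assumed field name
            break
        cursor = cursor_info.get('next')
    return all_replies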
3. Sentiment Analysis in Practice
3.1 Data Preprocessing
import re

import jieba

class CommentPreprocessor:
    def __init__(self):
        # Load stopwords
        with open('stopwords.txt', 'r', encoding='utf-8') as f:
            self.stopwords = set(f.read().splitlines())
        # Filter Bilibili-specific emotes such as [doge]
        self.emote_pattern = re.compile(r'\[.*?\]')

    def clean_text(self, text):
        # Strip emotes
        text = self.emote_pattern.sub('', text)
        # Strip URLs
        text = re.sub(r'http[s]?://\S+', '', text)
        # Strip @mentions
        text = re.sub(r'@\S+', '', text)
        # Strip punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        return text.strip()

    def tokenize(self, text):
        cleaned = self.clean_text(text)
        words = jieba.cut(cleaned)
        return [w for w in words if w not in self.stopwords and len(w) > 1]
3.2 Sentiment Models
Option 1: lexicon-based sentiment analysis
class SentimentAnalyzer:
    def __init__(self):
        self.load_lexicon()

    def load_lexicon(self):
        """Load the sentiment lexicons."""
        self.positive_words = set(line.strip() for line in open('positive.txt', encoding='utf-8'))
        self.negative_words = set(line.strip() for line in open('negative.txt', encoding='utf-8'))
        self.degree_words = {
            '非常': 2.0, '特别': 2.0, '十分': 1.8,
            '很': 1.5, '比较': 1.2, '稍微': 0.8
        }
        self.negation_words = {'不', '没', '无', '非', '莫'}

    def analyze(self, words):
        score = 0
        i = 0
        while i < len(words):
            word = words[i]
            degree = 1.0
            negation = 1
            # Check for a preceding degree adverb
            if i > 0 and words[i-1] in self.degree_words:
                degree = self.degree_words[words[i-1]]
            # Check for a preceding negation word
            if i > 0 and words[i-1] in self.negation_words:
                negation = -1
            # Accumulate the sentiment score
            if word in self.positive_words:
                score += degree * negation
            elif word in self.negative_words:
                score -= degree * negation
            i += 1
        return 1 if score > 0 else (-1 if score < 0 else 0)
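Usage on pre-tokenized input, assuming '好看' appears in positive.txt; the second call shows a preceding negation word flipping the polarity:

analyzer = SentimentAnalyzer()
print(analyzer.analyze(['非常', '好看']))  # degree adverb amplifies the positive hit -> 1
print(analyzer.analyze(['不', '好看']))    # negation flips the polarity -> -1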
Option 2: pre-trained transformer model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class BertSentiment:
    # NOTE: the default checkpoint name is illustrative; use a Chinese
    # sentiment-classification model whose labels match the mapping below.
    def __init__(self, model_name='uer/roberta-base-finetuned-chinanews-chinese'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def predict(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        label = torch.argmax(probs, dim=1).item()
        # 0=negative, 1=neutral, 2=positive
        sentiment_map = {0: -1, 1: 0, 2: 1}
        return sentiment_map[label], probs[0][label].item()
3.3 Multi-Dimensional Analysis
import numpy as np
from collections import Counter

class AdvancedAnalyzer:
    def __init__(self):
        self.sentiment_model = BertSentiment()
        self.preprocessor = CommentPreprocessor()

    def analyze_video_comments(self, comments):
        """Aggregate analysis of a video's comments."""
        results = {
            'total': len(comments),
            'sentiments': {'positive': 0, 'neutral': 0, 'negative': 0},
            'avg_sentiment_score': 0,
            'hot_words': [],
            'time_trend': [],
            'user_engagement': {}
        }
        all_words = []
        sentiment_scores = []
        for comment in comments:
            # Per-comment sentiment
            text = comment['content']['message']
            sentiment, score = self.sentiment_model.predict(text)
            if sentiment == 1:
                results['sentiments']['positive'] += 1
            elif sentiment == -1:
                results['sentiments']['negative'] += 1
            else:
                results['sentiments']['neutral'] += 1
            sentiment_scores.append(sentiment * score)
            # Word frequencies
            words = self.preprocessor.tokenize(text)
            all_words.extend(words)
        # Average sentiment score
        results['avg_sentiment_score'] = np.mean(sentiment_scores)
        # Top words
        word_freq = Counter(all_words)
        results['hot_words'] = word_freq.most_common(20)
        return results
4. Application Scenarios
4.1 Public Opinion Monitoring
class VideoMonitor:
    def __init__(self, video_ids):
        self.video_ids = video_ids
        self.analyzer = AdvancedAnalyzer()

    def monitor(self):
        """Monitor comment sentiment across the watched videos."""
        for vid in self.video_ids:
            comments = self.crawl_latest_comments(vid)
            analysis = self.analyzer.analyze_video_comments(comments)
            # Alert on a high share of negative comments
            if analysis['sentiments']['negative'] / analysis['total'] > 0.4:
                self.send_alert(vid, analysis)

    def send_alert(self, vid, data):
        print(f"⚠️ Video {vid} has an unusually high share of negative comments: {data}")
4.2 Hot Topic Mining
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from collections import Counter

class TopicExtractor:
    def extract_topics(self, comments, n_topics=5):
        """Extract hot topics from comments via TF-IDF and KMeans."""
        preprocessor = CommentPreprocessor()
        texts = [' '.join(preprocessor.tokenize(c['content']['message'])) for c in comments]
        # TF-IDF features
        vectorizer = TfidfVectorizer(max_features=200)
        features = vectorizer.fit_transform(texts)
        # KMeans clustering
        kmeans = KMeans(n_clusters=n_topics, random_state=42)
        labels = kmeans.fit_predict(features)
        # Keywords per topic
        topics = []
        for i in range(n_topics):
            indices = np.where(labels == i)[0]
            topic_words = Counter()
            for idx in indices:
                words = texts[idx].split()
                topic_words.update(words)
            topics.append({
                'topic_id': i,
                'keywords': topic_words.most_common(10),
                'comment_count': len(indices)
            })
        return topics
4.3 User Profiling
class UserProfiler:
    def __init__(self):
        # The sentiment model was missing in the original snippet
        self.sentiment_model = BertSentiment()

    def analyze_active_users(self, comments):
        """Profile the most active commenters."""
        user_stats = {}
        for comment in comments:
            mid = comment['mid']
            if mid not in user_stats:
                user_stats[mid] = {
                    'username': comment['member']['uname'],
                    'level': comment['member']['level_info']['current_level'],
                    'is_vip': comment['member']['vip']['vipStatus'],
                    'comment_count': 0,
                    'total_likes': 0,
                    'avg_sentiment': []
                }
            user_stats[mid]['comment_count'] += 1
            user_stats[mid]['total_likes'] += comment['like']
            # Per-user sentiment tendency
            sentiment, _ = self.sentiment_model.predict(comment['content']['message'])
            user_stats[mid]['avg_sentiment'].append(sentiment)
        # Rank by activity
        top_users = sorted(user_stats.items(), key=lambda x: x[1]['comment_count'], reverse=True)[:10]
        return top_users
4.4 Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud

class CommentVisualizer:
    def plot_sentiment_distribution(self, analysis_result):
        """Pie chart of the sentiment distribution."""
        sentiments = analysis_result['sentiments']
        labels = ['Positive', 'Neutral', 'Negative']
        sizes = [sentiments['positive'], sentiments['neutral'], sentiments['negative']]
        colors = ['#66c2a5', '#8da0cb', '#fc8d62']
        plt.figure(figsize=(8, 6))
        plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
        plt.title('Comment Sentiment Distribution')
        plt.savefig('sentiment_distribution.png', dpi=300, bbox_inches='tight')

    def generate_wordcloud(self, hot_words):
        """Word cloud of the top comment terms."""
        word_freq = dict(hot_words)
        wc = WordCloud(
            font_path='simhei.ttf',  # a CJK font is required for Chinese text
            width=1200,
            height=600,
            background_color='white',
            max_words=100
        ).generate_from_frequencies(word_freq)
        plt.figure(figsize=(12, 6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig('wordcloud.png', dpi=300, bbox_inches='tight')
5. End-to-End Example
import asyncio

async def main():
    # Initialize crawler, storage, and analyzer
    crawler = BilibiliCrawler(max_concurrent=5, delay=1.5)
    storage = CommentStorage()
    analyzer = AdvancedAnalyzer()

    # Crawl comments
    video_id = 'BV1xx411c7mD'  # example video
    oid = bv_to_aid(video_id)  # the BV ID must be converted to the numeric aid (see below)
    print(f"Crawling comments for video {video_id}...")
    results = await crawler.crawl_comments(oid, max_pages=50)

    # Persist the data
    all_comments = []
    for result in results:
        if isinstance(result, dict) and result.get('code') == 0:
            comments = result['data']['replies']
            for comment in comments:
                storage.save_comment(extract_comment_data(comment))
                all_comments.append(comment)
    print(f"Fetched {len(all_comments)} comments in total")

    # Sentiment analysis
    print("Running sentiment analysis...")
    analysis = analyzer.analyze_video_comments(all_comments)

    print(f"\n=== Results ===")
    print(f"Total comments: {analysis['total']}")
    print(f"Positive: {analysis['sentiments']['positive']} "
          f"({analysis['sentiments']['positive']/analysis['total']*100:.1f}%)")
    print(f"Neutral: {analysis['sentiments']['neutral']} "
          f"({analysis['sentiments']['neutral']/analysis['total']*100:.1f}%)")
    print(f"Negative: {analysis['sentiments']['negative']} "
          f"({analysis['sentiments']['negative']/analysis['total']*100:.1f}%)")
    print(f"\nAverage sentiment score: {analysis['avg_sentiment_score']:.3f}")
    print(f"\nTop 10 words:")
    for word, count in analysis['hot_words'][:10]:
        print(f"  {word}: {count}")

if __name__ == '__main__':
    asyncio.run(main())
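The bv_to_aid helper is left undefined above. One simple option (a sketch; the BV/AV conversion can also be computed offline with the encoding algorithm) is to resolve the aid through the video detail endpoint:

import requests

def bv_to_aid(bvid):
    """Resolve a BV ID to its numeric aid via the video detail endpoint."""
    resp = requests.get(
        'https://api.bilibili.com/x/web-interface/view',
        params={'bvid': bvid},
        headers=headers,  # headers dict from section 2.1
    ).json()
    return resp['data']['aid']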
6. Caveats and Best Practices
Compliance
- Respect Bilibili's robots.txt
- Throttle requests to avoid stressing the servers
- Do not use the data commercially or in ways that violate user privacy
Performance
- Use asynchronous requests for throughput
- Crawl incrementally to avoid refetching old data
- Set sensible request timeouts and a retry policy (a sketch follows this list)
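A minimal timeout-and-backoff sketch for the synchronous requests path (retry counts and delays are illustrative):

import time
import requests

def get_with_retry(url, params=None, max_retries=3, timeout=10):
    """GET with a timeout and exponential backoff; backs off harder on HTTP 429."""
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, params=params, headers=headers, timeout=timeout)
            if resp.status_code == 429:
                time.sleep(2 ** attempt * 5)  # rate limited: wait much longer
                continue
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # transient error: brief backoff
    raise RuntimeError('retry budget exhausted (rate limited)')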
Data quality
- Filter spam and bot comments
- Handle emotes and special characters
- Remove duplicate comments
Long-term maintenance
- API endpoints may change; update the crawler periodically
- Anti-crawling measures may be tightened; keep monitoring
- Refresh sentiment lexicons and models regularly
Conclusion
A Bilibili comment crawler touches several technical areas: API analysis, anti-crawling countermeasures, data processing, and sentiment analysis. With a sound architecture and suitable algorithms, it can collect data efficiently and analyze it in depth, supporting content operations, opinion monitoring, and similar use cases.
In practice, balance technical ambition against compliance requirements, and keep the crawler stable and maintainable.