Bilibili Comment Crawler in Practice: From Data Collection to Sentiment Analysis
1. Bilibili Comment Data Structure
1.1 API Endpoint Analysis
Bilibili comments are served through a RESTful API. The main endpoint:
https://api.bilibili.com/x/v2/reply?type=1&oid={aid}&pn={page}
Key parameters:
- type: comment area type (1 = video, 11 = article, 17 = dynamic post)
- oid: object ID (the aid for videos, dynamic_id for dynamic posts)
- pn: page number (starting from 1)
- ps: page size (default 20, maximum 49)
- sort: sort order (0 = by time, 2 = by popularity)
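A minimal request putting these parameters together (a sketch; the oid is illustrative, and the full header set from section 2.1 is assumed for real use):

import requests

# Fetch the first page of a video's comments, sorted by popularity
resp = requests.get(
    'https://api.bilibili.com/x/v2/reply',
    params={'type': 1, 'oid': 987654321, 'pn': 1, 'ps': 20, 'sort': 2},
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.bilibili.com',
    },
)
print(resp.json()['data']['page'])  # pagination metadata: num, size, count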
1.2 Response Structure
{
  "code": 0,
  "data": {
    "page": {
      "num": 1,
      "size": 20,
      "count": 2500,
      "acount": 2500
    },
    "replies": [
      {
        "rpid": 123456789,       // comment ID
        "oid": 987654321,        // video aid
        "type": 1,
        "mid": 111222333,        // user ID
        "root": 0,               // root comment ID (0 = top-level comment)
        "parent": 0,             // parent comment ID
        "dialog": 0,             // dialog ID
        "count": 15,             // reply count
        "rcount": 15,            // actual reply count
        "like": 520,             // like count
        "ctime": 1634567890,     // publish timestamp
        "content": {
          "message": "comment text",
          "emote": {             // emote data
            "[doge]": {"id": 26, "url": "https://..."}
          }
        },
        "member": {
          "uname": "username",
          "avatar": "avatar URL",
          "level_info": {
            "current_level": 6   // user level
          },
          "vip": {"vipStatus": 1, "vipType": 2}
        },
        "replies": [...]         // second-level comments (at most 3 shown inline)
      }
    ]
  }
}
1.3 Fetching Second-Level Comments
Second-level (reply-to-reply) comments require a separate request:
https://api.bilibili.com/x/v2/reply/reply?type=1&oid={aid}&root={root_rpid}&pn={page}
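A minimal sketch of paging through one comment's replies with this endpoint (headers is the dict from section 2.1; the page size and page cap are assumptions):

import requests

def fetch_sub_replies(oid, root_rpid, max_pages=5):
    """Collect second-level replies under a single root comment."""
    replies = []
    for pn in range(1, max_pages + 1):
        resp = requests.get(
            'https://api.bilibili.com/x/v2/reply/reply',
            params={'type': 1, 'oid': oid, 'root': root_rpid, 'pn': pn, 'ps': 20},
            headers=headers,  # headers dict from section 2.1
        ).json()
        page = (resp.get('data') or {}).get('replies') or []
        if not page:
            break  # no more replies
        replies.extend(page)
    return replies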
2. Technical Challenges and Solutions
2.1 Anti-Crawling Mechanisms
Challenge 1: Wbi signature verification
Since 2023, Bilibili has required a Wbi signature on some endpoints. Generating it takes three steps:
- Fetch img_key and sub_key (from the user info endpoint)
- Shuffle the combined key according to a fixed permutation table
- Compute the MD5 signature over the sorted query parameters
import hashlib
import time
import urllib.parse
from functools import reduce

import requests

def get_wbi_keys():
    """Fetch the latest img_key and sub_key."""
    headers = {'User-Agent': 'Mozilla/5.0 ...'}
    url = 'https://api.bilibili.com/x/web-interface/nav'
    resp = requests.get(url, headers=headers).json()
    img_url = resp['data']['wbi_img']['img_url']
    sub_url = resp['data']['wbi_img']['sub_url']
    img_key = img_url.split('/')[-1].split('.')[0]
    sub_key = sub_url.split('/')[-1].split('.')[0]
    return img_key, sub_key

def gen_wbi_sign(params, img_key, sub_key):
    """Generate the Wbi signature for a parameter dict."""
    mixin_key_enc_tab = [
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
        33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
        61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
        36, 20, 34, 44, 52
    ]
    orig = img_key + sub_key
    mixin_key = reduce(lambda s, i: s + orig[i], mixin_key_enc_tab, '')[:32]
    # Sort parameters by key, then sign the URL-encoded query string
    params['wts'] = int(time.time())
    sorted_params = sorted(params.items())
    query = urllib.parse.urlencode(sorted_params)
    sign = hashlib.md5((query + mixin_key).encode()).hexdigest()
    return query + '&w_rid=' + sign
Challenge 2: Cookie and User-Agent checks
Required request headers:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.bilibili.com',
    'Origin': 'https://www.bilibili.com',
    'Cookie': 'buvid3=...; SESSDATA=...'  # obtained after logging in
}
2.2 Rate Limiting and Concurrency Control
Problem: a burst of requests in a short window triggers HTTP 429 errors or an IP ban.
Solution:
import asyncio
import aiohttp
from asyncio import Semaphore

class BilibiliCrawler:
    def __init__(self, max_concurrent=5, delay=1.0):
        self.semaphore = Semaphore(max_concurrent)
        self.delay = delay

    async def fetch_with_limit(self, session, url):
        # Cap concurrency with the semaphore and pause between requests
        async with self.semaphore:
            await asyncio.sleep(self.delay)
            async with session.get(url, headers=headers) as resp:  # headers dict from section 2.1
                return await resp.json()

    async def crawl_comments(self, oid, max_pages=10):
        async with aiohttp.ClientSession() as session:
            tasks = []
            for pn in range(1, max_pages + 1):
                url = f'https://api.bilibili.com/x/v2/reply?type=1&oid={oid}&pn={pn}'
                tasks.append(self.fetch_with_limit(session, url))
            results = await asyncio.gather(*tasks, return_exceptions=True)
            return results
2.3 Ensuring Data Completeness
Challenge: the reported comment total does not match what can actually be retrieved.
The Bilibili API has the following limits:
- A single video returns at most the first 5,000 top-level comments
- Second-level comments are only partially shown inline and must be fetched separately
- Some comments are folded or deleted
Strategy:
- Crawl twice, sorted by time and by popularity (a sketch follows the storage class below)
- Deduplicate the merged results to maximize coverage
- Record crawl progress so interrupted runs can resume
import sqlite3

class CommentStorage:
    def __init__(self, db_path='bilibili.db'):
        self.conn = sqlite3.connect(db_path)
        self.create_table()

    def create_table(self):
        self.conn.execute('''
            CREATE TABLE IF NOT EXISTS comments (
                rpid INTEGER PRIMARY KEY,
                oid INTEGER,
                content TEXT,
                like_count INTEGER,
                reply_count INTEGER,
                ctime INTEGER,
                mid INTEGER,
                uname TEXT,
                user_level INTEGER,
                is_vip INTEGER,
                UNIQUE(rpid)
            )
        ''')
        self.conn.commit()

    def save_comment(self, comment_data):
        try:
            self.conn.execute('''
                INSERT OR IGNORE INTO comments VALUES (?,?,?,?,?,?,?,?,?,?)
            ''', comment_data)
            self.conn.commit()
        except Exception as e:
            print(f"Save failed: {e}")
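The extract_comment_data helper used later in the article is never defined; here is a sketch matching the column order of the table above, together with the dual-sort crawl from the strategy list. Both are illustrative, and INSERT OR IGNORE (rpid is the primary key) handles deduplication:

def extract_comment_data(reply):
    """Flatten an API reply dict into the column order of the comments table."""
    member = reply['member']
    return (
        reply['rpid'],
        reply['oid'],
        reply['content']['message'],
        reply['like'],
        reply['rcount'],
        reply['ctime'],
        reply['mid'],
        member['uname'],
        member['level_info']['current_level'],
        member['vip']['vipStatus'],
    )

def crawl_both_sorts(oid, storage, max_pages=50):
    """Crawl pages sorted by time (0) and by popularity (2)."""
    for sort in (0, 2):
        for pn in range(1, max_pages + 1):
            resp = requests.get(
                'https://api.bilibili.com/x/v2/reply',
                params={'type': 1, 'oid': oid, 'pn': pn, 'sort': sort},
                headers=headers,  # headers dict from section 2.1
            ).json()
            replies = (resp.get('data') or {}).get('replies') or []
            if not replies:
                break
            for reply in replies:
                storage.save_comment(extract_comment_data(reply))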
2.4 Cursor-Based Comment Loading
Problem: comments on some popular videos load dynamically and require cursor-based pagination.
The newer endpoint uses a cursor model:
def crawl_with_cursor(oid, cursor=None):
    url = 'https://api.bilibili.com/x/v2/reply/main'
    params = {
        'type': 1,
        'oid': oid,
        'mode': 3,  # sort by popularity
        'next': cursor if cursor else 0
    }
    # Attach the Wbi signature
    signed_params = gen_wbi_sign(params, img_key, sub_key)
    response = requests.get(url + '?' + signed_params, headers=headers)
    return response.json()
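A sketch of the full pagination loop, assuming the response exposes data.cursor.next and data.cursor.is_end (field names taken from community API documentation and liable to change):

def crawl_all_with_cursor(oid, max_pages=100):
    """Follow the cursor until the API signals the end of the comment list."""
    all_replies, cursor = [], None
    for _ in range(max_pages):
        resp = crawl_with_cursor(oid, cursor)
        data = resp.get('data') or {}
        all_replies.extend(data.get('replies') or [])
        cursor_info = data.get('cursor') or {}
        if cursor_info.get('is_end'):  # assumed field name
            break
        cursor = cursor_info.get('next')
    return all_replies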
3. Sentiment Analysis in Practice
3.1 Data Preprocessing
import re

import jieba

class CommentPreprocessor:
    def __init__(self):
        # Load stopwords
        with open('stopwords.txt', 'r', encoding='utf-8') as f:
            self.stopwords = set(f.read().splitlines())
        # Filter Bilibili-specific emotes such as [doge]
        self.emote_pattern = re.compile(r'\[.*?\]')

    def clean_text(self, text):
        # Strip emotes
        text = self.emote_pattern.sub('', text)
        # Strip URLs
        text = re.sub(r'http[s]?://\S+', '', text)
        # Strip @mentions
        text = re.sub(r'@\S+', '', text)
        # Strip punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        return text.strip()

    def tokenize(self, text):
        cleaned = self.clean_text(text)
        words = jieba.cut(cleaned)
        return [w for w in words if w not in self.stopwords and len(w) > 1]
3.2 Sentiment Models
Option 1: lexicon-based sentiment analysis
class SentimentAnalyzer:
    def __init__(self):
        self.load_lexicon()

    def load_lexicon(self):
        """Load the sentiment lexicons."""
        self.positive_words = set(line.strip() for line in open('positive.txt', encoding='utf-8'))
        self.negative_words = set(line.strip() for line in open('negative.txt', encoding='utf-8'))
        self.degree_words = {
            '非常': 2.0, '特别': 2.0, '十分': 1.8,
            '很': 1.5, '比较': 1.2, '稍微': 0.8
        }
        self.negation_words = {'不', '没', '无', '非', '莫'}

    def analyze(self, words):
        score = 0
        i = 0
        while i < len(words):
            word = words[i]
            degree = 1.0
            negation = 1
            # Check for a preceding degree adverb
            if i > 0 and words[i-1] in self.degree_words:
                degree = self.degree_words[words[i-1]]
            # Check for a preceding negation word
            if i > 0 and words[i-1] in self.negation_words:
                negation = -1
            # Accumulate the sentiment score
            if word in self.positive_words:
                score += degree * negation
            elif word in self.negative_words:
                score -= degree * negation
            i += 1
        return 1 if score > 0 else (-1 if score < 0 else 0)
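Usage on pre-tokenized input, assuming '好看' appears in positive.txt; the second call shows a preceding negation word flipping the polarity:

analyzer = SentimentAnalyzer()
print(analyzer.analyze(['非常', '好看']))  # degree adverb amplifies the positive hit -> 1
print(analyzer.analyze(['不', '好看']))    # negation flips the polarity -> -1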
Option 2: pre-trained transformer model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class BertSentiment:
    # NOTE: the default checkpoint name is illustrative; use a Chinese
    # sentiment-classification model whose labels match the mapping below.
    def __init__(self, model_name='uer/roberta-base-finetuned-chinanews-chinese'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def predict(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        label = torch.argmax(probs, dim=1).item()
        # 0=negative, 1=neutral, 2=positive
        sentiment_map = {0: -1, 1: 0, 2: 1}
        return sentiment_map[label], probs[0][label].item()
3.3 Multi-Dimensional Analysis
import numpy as np
from collections import Counter

class AdvancedAnalyzer:
    def __init__(self):
        self.sentiment_model = BertSentiment()
        self.preprocessor = CommentPreprocessor()

    def analyze_video_comments(self, comments):
        """Aggregate analysis of a video's comments."""
        results = {
            'total': len(comments),
            'sentiments': {'positive': 0, 'neutral': 0, 'negative': 0},
            'avg_sentiment_score': 0,
            'hot_words': [],
            'time_trend': [],
            'user_engagement': {}
        }
        all_words = []
        sentiment_scores = []
        for comment in comments:
            # Per-comment sentiment
            text = comment['content']['message']
            sentiment, score = self.sentiment_model.predict(text)
            if sentiment == 1:
                results['sentiments']['positive'] += 1
            elif sentiment == -1:
                results['sentiments']['negative'] += 1
            else:
                results['sentiments']['neutral'] += 1
            sentiment_scores.append(sentiment * score)
            # Word frequencies
            words = self.preprocessor.tokenize(text)
            all_words.extend(words)
        # Average sentiment score
        results['avg_sentiment_score'] = np.mean(sentiment_scores)
        # Top words
        word_freq = Counter(all_words)
        results['hot_words'] = word_freq.most_common(20)
        return results
4. Application Scenarios
4.1 Public Opinion Monitoring
class VideoMonitor:
    def __init__(self, video_ids):
        self.video_ids = video_ids
        self.analyzer = AdvancedAnalyzer()

    def monitor(self):
        """Monitor comment sentiment across the watched videos."""
        for vid in self.video_ids:
            comments = self.crawl_latest_comments(vid)
            analysis = self.analyzer.analyze_video_comments(comments)
            # Alert on a high share of negative comments
            if analysis['sentiments']['negative'] / analysis['total'] > 0.4:
                self.send_alert(vid, analysis)

    def send_alert(self, vid, data):
        print(f"⚠️ Video {vid} has an unusually high share of negative comments: {data}")
4.2 Hot Topic Mining
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from collections import Counter

class TopicExtractor:
    def extract_topics(self, comments, n_topics=5):
        """Extract hot topics from comments via TF-IDF and KMeans."""
        preprocessor = CommentPreprocessor()
        texts = [' '.join(preprocessor.tokenize(c['content']['message'])) for c in comments]
        # TF-IDF features
        vectorizer = TfidfVectorizer(max_features=200)
        features = vectorizer.fit_transform(texts)
        # KMeans clustering
        kmeans = KMeans(n_clusters=n_topics, random_state=42)
        labels = kmeans.fit_predict(features)
        # Keywords per topic
        topics = []
        for i in range(n_topics):
            indices = np.where(labels == i)[0]
            topic_words = Counter()
            for idx in indices:
                words = texts[idx].split()
                topic_words.update(words)
            topics.append({
                'topic_id': i,
                'keywords': topic_words.most_common(10),
                'comment_count': len(indices)
            })
        return topics
4.3 User Profiling
class UserProfiler:
    def __init__(self):
        # The sentiment model was missing in the original snippet
        self.sentiment_model = BertSentiment()

    def analyze_active_users(self, comments):
        """Profile the most active commenters."""
        user_stats = {}
        for comment in comments:
            mid = comment['mid']
            if mid not in user_stats:
                user_stats[mid] = {
                    'username': comment['member']['uname'],
                    'level': comment['member']['level_info']['current_level'],
                    'is_vip': comment['member']['vip']['vipStatus'],
                    'comment_count': 0,
                    'total_likes': 0,
                    'avg_sentiment': []
                }
            user_stats[mid]['comment_count'] += 1
            user_stats[mid]['total_likes'] += comment['like']
            # Per-user sentiment tendency
            sentiment, _ = self.sentiment_model.predict(comment['content']['message'])
            user_stats[mid]['avg_sentiment'].append(sentiment)
        # Rank by activity
        top_users = sorted(user_stats.items(), key=lambda x: x[1]['comment_count'], reverse=True)[:10]
        return top_users
4.4 Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud

class CommentVisualizer:
    def plot_sentiment_distribution(self, analysis_result):
        """Pie chart of the sentiment distribution."""
        sentiments = analysis_result['sentiments']
        labels = ['Positive', 'Neutral', 'Negative']
        sizes = [sentiments['positive'], sentiments['neutral'], sentiments['negative']]
        colors = ['#66c2a5', '#8da0cb', '#fc8d62']
        plt.figure(figsize=(8, 6))
        plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
        plt.title('Comment Sentiment Distribution')
        plt.savefig('sentiment_distribution.png', dpi=300, bbox_inches='tight')

    def generate_wordcloud(self, hot_words):
        """Word cloud of the top comment terms."""
        word_freq = dict(hot_words)
        wc = WordCloud(
            font_path='simhei.ttf',  # a CJK font is required for Chinese text
            width=1200,
            height=600,
            background_color='white',
            max_words=100
        ).generate_from_frequencies(word_freq)
        plt.figure(figsize=(12, 6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig('wordcloud.png', dpi=300, bbox_inches='tight')
5. End-to-End Example
import asyncio

async def main():
    # Initialize crawler, storage, and analyzer
    crawler = BilibiliCrawler(max_concurrent=5, delay=1.5)
    storage = CommentStorage()
    analyzer = AdvancedAnalyzer()

    # Crawl comments
    video_id = 'BV1xx411c7mD'  # example video
    oid = bv_to_aid(video_id)  # the BV ID must be converted to the numeric aid (see below)
    print(f"Crawling comments for video {video_id}...")
    results = await crawler.crawl_comments(oid, max_pages=50)

    # Persist the data
    all_comments = []
    for result in results:
        if isinstance(result, dict) and result.get('code') == 0:
            comments = result['data']['replies']
            for comment in comments:
                storage.save_comment(extract_comment_data(comment))
                all_comments.append(comment)
    print(f"Fetched {len(all_comments)} comments in total")

    # Sentiment analysis
    print("Running sentiment analysis...")
    analysis = analyzer.analyze_video_comments(all_comments)

    print(f"\n=== Results ===")
    print(f"Total comments: {analysis['total']}")
    print(f"Positive: {analysis['sentiments']['positive']} "
          f"({analysis['sentiments']['positive']/analysis['total']*100:.1f}%)")
    print(f"Neutral: {analysis['sentiments']['neutral']} "
          f"({analysis['sentiments']['neutral']/analysis['total']*100:.1f}%)")
    print(f"Negative: {analysis['sentiments']['negative']} "
          f"({analysis['sentiments']['negative']/analysis['total']*100:.1f}%)")
    print(f"\nAverage sentiment score: {analysis['avg_sentiment_score']:.3f}")
    print(f"\nTop 10 words:")
    for word, count in analysis['hot_words'][:10]:
        print(f"  {word}: {count}")

if __name__ == '__main__':
    asyncio.run(main())
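The bv_to_aid helper is left undefined above. One simple option (a sketch; the BV/AV conversion can also be computed offline with the encoding algorithm) is to resolve the aid through the video detail endpoint:

import requests

def bv_to_aid(bvid):
    """Resolve a BV ID to its numeric aid via the video detail endpoint."""
    resp = requests.get(
        'https://api.bilibili.com/x/web-interface/view',
        params={'bvid': bvid},
        headers=headers,  # headers dict from section 2.1
    ).json()
    return resp['data']['aid']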
6. Caveats and Best Practices
Compliance
- Respect Bilibili's robots.txt
- Throttle requests to avoid stressing the servers
- Do not use the data commercially or in ways that violate user privacy
Performance
- Use asynchronous requests for throughput
- Crawl incrementally to avoid refetching old data
- Set sensible request timeouts and a retry policy (a sketch follows this list)
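A minimal timeout-and-backoff sketch for the synchronous requests path (retry counts and delays are illustrative):

import time
import requests

def get_with_retry(url, params=None, max_retries=3, timeout=10):
    """GET with a timeout and exponential backoff; backs off harder on HTTP 429."""
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, params=params, headers=headers, timeout=timeout)
            if resp.status_code == 429:
                time.sleep(2 ** attempt * 5)  # rate limited: wait much longer
                continue
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # transient error: brief backoff
    raise RuntimeError('retry budget exhausted (rate limited)')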
Data quality
- Filter spam and bot comments
- Handle emotes and special characters
- Remove duplicate comments
Long-term maintenance
- API endpoints may change; update the crawler periodically
- Anti-crawling measures may be tightened; keep monitoring
- Refresh sentiment lexicons and models regularly
Conclusion
A Bilibili comment crawler touches several technical areas: API analysis, anti-crawling countermeasures, data processing, and sentiment analysis. With a sound architecture and suitable algorithms, it can collect data efficiently and analyze it in depth, supporting content operations, opinion monitoring, and similar use cases.
In practice, balance technical ambition against compliance requirements, and keep the crawler stable and maintainable.