当前位置：首页 > news >正文

机器学习案例——对好评和差评进行预测

news 2025/8/18 14:02:37

一、概述

本项目旨在构建一个中文评论情感分类系统，能够自动判断用户评论的情感倾向。我们使用两种标注明确的评论数据：

优质评价（标记为1）：包含正面情感的评论，如"质量非常好，物超所值"、"服务态度很棒，下次还会来"等
差评（标记为0）：包含负面情感的评论，如"完全不符合描述，非常失望"、"客服态度极差，再也不买了"等

项目目标是通过机器学习技术，训练一个能够准确区分这两种评论的分类模型。具体应用场景包括：

电商平台的商品评价自动分析
社交媒体舆情监控
客户服务反馈分类

二、数据准备与预处理

1. 数据加载与初步检查

import pandas as pd# 读取数据文件，指定编码格式为GBK以兼容中文
# The two review files are read as GBK text into a single 'content' column:
# positive reviews first, then negative reviews.
goodop, badop = (
    pd.read_table(path, encoding='gbk', names=['content'])
    for path in ('优质评价.txt', '差评.txt')
)

# The stop-word table is UTF-8 and is parsed with the pure-Python engine.
stopwords = pd.read_csv(
    'StopwordsCN.txt',
    names=['stopword'],
    engine='python',
    encoding='utf-8',
)

# Quick sanity check: row counts of each loaded table, one per output line.
print(
    f"优质评价数量：{len(goodop)}",
    f"差评数量：{len(badop)}",
    f"停用词数量：{len(stopwords)}",
    sep="\n",
)

2. 中文分词处理

使用jieba进行中文分词，采用精确模式：

import jieba# 添加自定义词典（可选）
# Optional: register a user dictionary containing domain-specific vocabulary.
jieba.load_userdict('user_dict.txt')


def get_words(comment):
    """Tokenize every review in *comment* with jieba.

    Uses ``jieba.lcut`` with its default arguments (the article refers to
    this as accurate mode).

    Args:
        comment: DataFrame with a ``content`` column holding review text.

    Returns:
        A list of token lists, one per retained review. Tokens are stripped
        of surrounding whitespace and empty tokens are removed. Reviews that
        yield fewer than two tokens are dropped, as are rows whose content
        is not a string.
    """
    words = []
    for content in comment.content.values.tolist():
        # pandas may turn cells such as "NA" or "null" into float NaN;
        # jieba cannot tokenize those, so skip anything that is not text.
        if not isinstance(content, str):
            continue
        stripped = (token.strip() for token in jieba.lcut(content))
        result = [token for token in stripped if token]
        # Keep only reviews that produced at least two tokens.
        if len(result) > 1:
            words.append(result)
    return words


# Tokenize the positive and the negative reviews separately.
goodwords, badwords = (get_words(frame) for frame in (goodop, badop))

# Example: show the tokens produced for the first positive review.
print("示例分词结果：", goodwords[0])

3. 停用词过滤

def stop_words(comment, stopword):
    """Remove stop words and single-character tokens from tokenized reviews.

    Args:
        comment: Iterable of token lists, one list per review.
        stopword: Collection of stop words to discard.

    Returns:
        A list of filtered token lists. A token is kept only if it is not a
        stop word and is longer than one character. Reviews left with no
        tokens after filtering are dropped.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the original list costs O(len(stopword)) for every single token.
    blocked = set(stopword)

    filtered_words = []
    for text in comment:
        filtered_text = [
            word for word in text
            if len(word) > 1 and word not in blocked
        ]
        if filtered_text:
            filtered_words.append(filtered_text)
    return filtered_words


# Build the stop-word list.
# Optional extra stop words appended to the ones loaded from file.
additional_stopwords = ['这个', '那个', '一些']

stop = stopwords['stopword'].values.tolist()
stop += additional_stopwords  # in-place extension of the same list

# Apply the filter to the positive tokens first, then the negative ones.
good, bad = (stop_words(tokens, stop) for tokens in (goodwords, badwords))

# Example: show the first negative review after filtering.
print("过滤后示例：", bad[0])

三、特征工程

1. 构建标注数据集

# Wrap each filtered token list in a DataFrame row and attach its class
# label: 1 for positive reviews, 0 for negative reviews.
good_df = pd.DataFrame({'content': good, 'label': 1})
bad_df = pd.DataFrame({'content': bad, 'label': 0})

# Stack both frames, shuffle all rows with a fixed seed, and renumber the
# index from zero.
train = (
    pd.concat([good_df, bad_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report how many rows each label has.
print("数据分布：", train['label'].value_counts(), sep="\n")

2. 划分训练集和测试集

from sklearn.model_selection import train_test_split# 将分词列表转换为字符串形式
def prepare_text_data(text_list):
    """Join each token list in *text_list* into one space-separated string.

    Returns a list with one string per input token list, in input order.
    """
    return list(map(' '.join, text_list))


# Split into a training set and a test set (80% train, 20% test).
# Each keyword argument sits on its own line so that the inline comments
# cannot swallow the arguments that follow them.
x_train, x_test, y_train, y_test = train_test_split(
    train.content.values,
    train.label.values,
    random_state=42,  # fixed seed so the split is reproducible
    test_size=0.2,  # hold out 20% of the rows for testing
    stratify=train.label.values,  # keep the class ratio the same in both parts
)

# Check the size of each part.
print(f"训练集大小：{len(x_train)}")
print(f"测试集大小：{len(x_test)}")

3. 文本向量化

from sklearn.feature_extraction.text import CountVectorizer# 准备训练文本数据
train_text = prepare_text_data(x_train)

# Bag-of-words vectorizer. Every option is on its own line so that the
# inline comments cannot hide the arguments that follow them.
vectorizer = CountVectorizer(
    max_features=5000,  # cap the vocabulary size
    ngram_range=(1, 3),  # use unigrams, bigrams and trigrams
    min_df=5,  # ignore terms that occur in fewer than 5 documents
    max_df=0.8,  # ignore terms that occur in more than 80% of documents
)

# Learn the vocabulary from the training text only and encode that text.
vec_train = vectorizer.fit_transform(train_text)

# Number of columns in the encoded matrix, i.e. the vocabulary size.
print(f"特征数量：{vec_train.shape[1]}")

四、模型构建与训练

1. 处理类别不平衡

from imblearn.over_sampling import SMOTE

print("处理前类别分布：", pd.Series(y_train).value_counts())

# Oversample with SMOTE. Only the training matrix is resampled here; the
# test data is not passed to the sampler.
smote = SMOTE(
    random_state=0,  # fixed seed for reproducible synthetic samples
    sampling_strategy='auto',  # resample every class except the majority
    k_neighbors=5,  # neighbours used when synthesising new samples
)
vec_train_res, y_train_res = smote.fit_resample(vec_train, y_train)

print("处理后类别分布：", pd.Series(y_train_res).value_counts())

2. 训练朴素贝叶斯分类器

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV# 初始化分类器
classifier = MultinomialNB()

# Hyper-parameter grid. Each entry is on its own line so that the inline
# comments cannot hide the entries that follow them.
param_grid = {
    'alpha': [0.1, 0.5, 1.0],  # smoothing parameter
    'fit_prior': [True, False],  # whether to learn class prior probabilities
}

# NOTE(review): the search is fitted on the already-oversampled matrix, so
# synthetic samples can end up in the validation folds and the
# cross-validated F1 may be optimistic. Resampling inside each fold (for
# example with an imblearn pipeline) would avoid that.
grid_search = GridSearchCV(
    classifier,
    param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1',  # select the model by F1 score
    n_jobs=-1,  # use all available CPU cores
)
grid_search.fit(vec_train_res, y_train_res)

# Keep the best estimator found by the search.
best_classifier = grid_search.best_estimator_
print("最佳参数：", grid_search.best_params_)

五、模型评估

1. 训练集评估

from sklearn import metrics# 训练集预测
# Predict on the (resampled) training matrix and summarise the result.
y_pred = best_classifier.predict(vec_train_res)
conf_matrix = metrics.confusion_matrix(y_train_res, y_pred)

# Per-class precision / recall / F1 report.
print(
    '训练集评估结果：',
    metrics.classification_report(y_train_res, y_pred),
    sep='\n',
)

# The confusion matrix is printed as text, not plotted.
print("混淆矩阵：", conf_matrix, sep='\n')

2. 测试集评估

# Encode the held-out reviews with the vectorizer fitted on the training text.
test_text = prepare_text_data(x_test)
vec_test = vectorizer.transform(test_text)

# Hard predictions and the classification report for the test set.
y_test_pred = best_classifier.predict(vec_test)
print(
    '测试集评估结果：',
    metrics.classification_report(y_test, y_test_pred),
    sep='\n',
)

# Area under the ROC curve, computed from the second probability column.
y_score = best_classifier.predict_proba(vec_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
print(f"ROC AUC: {roc_auc:.4f}")

六、模型应用：预测新评论

def predict_sentiment(text):
    """Predict the sentiment of a single raw review.

    The text is tokenized with jieba, stripped and filtered the same way as
    the training data (stop words and single-character tokens removed),
    encoded with the fitted ``vectorizer`` and classified with
    ``best_classifier``.

    Args:
        text: Review text to classify.

    Returns:
        A dict with the keys ``label`` ('好评' when the predicted class is 1,
        otherwise '差评'), ``confidence`` (the largest class probability),
        ``positive_prob`` and ``negative_prob``.
    """
    # Set lookup avoids scanning the whole stop-word list for every token.
    blocked = set(stop)

    # Strip tokens as the training tokenizer does, then apply the same filter.
    tokens = (word.strip() for word in jieba.lcut(text))
    filtered = [word for word in tokens if len(word) > 1 and word not in blocked]

    # The vectorizer expects space-separated tokens, one string per document.
    prepared = ' '.join(filtered)
    vectorized = vectorizer.transform([prepared])

    prediction = best_classifier.predict(vectorized)
    # Probability columns: index 0 is read as the negative class and index 1
    # as the positive class (assumes the labels are 0 and 1 — TODO confirm).
    proba = best_classifier.predict_proba(vectorized)[0]

    result = {
        'label': '好评' if prediction[0] == 1 else '差评',
        'confidence': max(proba),
        'positive_prob': proba[1],
        'negative_prob': proba[0],
    }
    return result


# Try the model on a few new reviews.
test_comments = [
    "这款产品非常好用，我非常满意！",
    "质量太差了，完全不符合描述",
    "一般般吧，没什么特别的感觉",
]

# Print the predicted label, its confidence and both class probabilities for
# each sample review, followed by a blank separator line.
for comment in test_comments:
    result = predict_sentiment(comment)
    print(f'评论: "{comment}"')
    print(f'预测结果: {result["label"]} (置信度: {result["confidence"]:.2f})')
    print(f'正面概率: {result["positive_prob"]:.4f}, 负面概率: {result["negative_prob"]:.4f}')
    print()