当前位置：首页 > news >正文

实验二-决策树-葡萄酒

news 2025/10/21 10:32:23

# 导入所需库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import graphviz
import matplotlib.pyplot as plt# 解决matplotlib中文显示问题
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']  # 支持中文的字体列表
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示异常问题# 读取葡萄酒质量数据集（需确保文件路径正确）
data = pd.read_csv("D:data\Wine_Quality_Data.csv")# 查看数据基本信息（观察数据结构、缺失值等）
print("数据前5行：")
print(data.head())
print("\n数据基本信息：")
print(data.info())
print("\n数据描述性统计：")
print(data.describe())# 数据类型转换：将颜色特征（white/red）转换为整数（0/1）
color_map = {'white': 0, 'red': 1}
data['color'] = data['color'].map(color_map).astype(int)  # 用map显式映射，替换np.int为内置int# 提取特征列和标签列
# 特征列：除color外的所有列；标签列：color（预测葡萄酒颜色）
feature_cols = [x for x in data.columns if x != 'color']
feature = data[feature_cols]  # 特征矩阵
target = data['color']        # 标签向量# 拆分数据集：测试集占比30%，随机种子设为42（保证结果可复现）
Xtrain, Xtest, ytrain, ytest = train_test_split(feature, target, test_size=0.3, random_state=42
)# 检查数据集分布一致性（比较训练集与测试集标签占比）
print("\n训练集标签分布（颜色：样本数）：")
print(ytrain.value_counts(normalize=True))  # normalize=True显示占比
print("\n测试集标签分布（颜色：样本数）：")
print(ytest.value_counts(normalize=True))# 3.1 基础模型训练与测试（无参数限制）
# 创建决策树分类器实例
dt_base = tree.DecisionTreeClassifier(random_state=42)  # 设random_state确保结果稳定
# 用训练集训练模型
dt_base.fit(Xtrain, ytrain)
# 测试集预测与正确率计算
score_base = dt_base.score(Xtest, ytest)
print(f"\n基础模型测试集正确率：{score_base:.4f}")# 3.2 详细性能评估（训练集+测试集的准确率、查准率、查全率、F1值）
# 定义性能评估函数
def measure_error(y_true, y_pred, label):"""计算分类模型的4个关键指标y_true：真实标签，y_pred：预测标签，label：结果名称（如train/test）返回包含4个指标的Series"""return pd.Series({'accuracy': accuracy_score(y_true, y_pred),  # 准确率'precision': precision_score(y_true, y_pred, zero_division=0),  # 查准率（避免0除错误）'recall': recall_score(y_true, y_pred, zero_division=0),        # 查全率'f1': f1_score(y_true, y_pred, zero_division=0)                # F1值}, name=label)# 分别在训练集和测试集上预测
ytrain_pred_base = dt_base.predict(Xtrain)
ytest_pred_base = dt_base.predict(Xtest)# 整合训练集与测试集性能指标
train_test_error = pd.concat([measure_error(ytrain, ytrain_pred_base, 'train'),measure_error(ytest, ytest_pred_base, 'test')
], axis=1)  # axis=1表示按列拼接print("\n训练集与测试集性能指标对比：")
print(train_test_error)# 定义中文特征名（与feature_cols顺序对应）
feature_name_cn = ['非挥发性酸', '挥发性酸', '柠檬酸', '剩余糖分', '氯化物','游离二氧化硫', '总二氧化硫', '密度', 'pH', '硫酸盐', '酒精', '质量'
]# 生成树状图数据
dot_data = tree.export_graphviz(dt_base,                  # 训练好的决策树模型feature_names=feature_name_cn,  # 特征中文名称class_names=['白葡萄酒', '红葡萄酒'],  # 标签中文名称（0=白，1=红）filled=True,              # 节点填充颜色（按类别区分）rounded=True              # 节点边框圆角
)# 绘制并显示树状图（需提前安装GraphViz，且配置环境变量）
graph = graphviz.Source(dot_data)
graph.render("wine_color_decision_tree")  # 保存树状图为PDF文件
graph.view()  # 打开树状图查看# 5.1 调试max_depth（最大树深度）：寻找最优深度
test_scores = []  # 存储不同深度的测试集正确率
depth_range = range(1, 11)  # 深度从1到10for depth in depth_range:# 构建带指定深度的决策树dt_tune = tree.DecisionTreeClassifier(max_depth=depth,criterion='entropy',    # 用信息增益计算不纯度random_state=42,splitter='best'         # 优先选择重要特征分枝)dt_tune.fit(Xtrain, ytrain)# 记录测试集正确率test_scores.append(dt_tune.score(Xtest, ytest))# 绘制max_depth与测试集正确率的关系图
plt.figure(figsize=(8, 4))
plt.plot(depth_range, test_scores, color='red', marker='o', label='测试集正确率')
plt.xlabel('max_depth（最大树深度）')
plt.ylabel('测试集正确率')
plt.title('max_depth对决策树性能的影响')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()# 输出最优深度及对应正确率
best_depth = depth_range[test_scores.index(max(test_scores))]
best_score = max(test_scores)
print(f"\n最优max_depth：{best_depth}，对应测试集正确率：{best_score:.4f}")# 5.2 调试criterion（不纯度计算方式）：对比gini与entropy
# 构建两种criterion的模型
dt_gini = tree.DecisionTreeClassifier(criterion='gini', random_state=42, max_depth=best_depth)
dt_entropy = tree.DecisionTreeClassifier(criterion='entropy', random_state=42, max_depth=best_depth)# 训练并评估
dt_gini.fit(Xtrain, ytrain)
dt_entropy.fit(Xtrain, ytrain)score_gini = dt_gini.score(Xtest, ytest)
score_entropy = dt_entropy.score(Xtest, ytest)print(f"\ncriterion='gini'的测试集正确率：{score_gini:.4f}")
print(f"criterion='entropy'的测试集正确率：{score_entropy:.4f}")

数据前5行：
fixed_acidity volatile_acidity citric_acid ... alcohol quality color
0 7.4 0.70 0.00 ... 9.4 5 red
1 7.8 0.88 0.00 ... 9.8 5 red
2 7.8 0.76 0.04 ... 9.8 5 red
3 11.2 0.28 0.56 ... 9.8 6 red
4 7.4 0.70 0.00 ... 9.4 5 red

[5 rows x 13 columns]

数据基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed_acidity 6497 non-null float64
1 volatile_acidity 6497 non-null float64
2 citric_acid 6497 non-null float64
3 residual_sugar 6497 non-null float64
4 chlorides 6497 non-null float64
5 free_sulfur_dioxide 6497 non-null float64
6 total_sulfur_dioxide 6497 non-null float64
7 density 6497 non-null float64
8 pH 6497 non-null float64
9 sulphates 6497 non-null float64
10 alcohol 6497 non-null float64
11 quality 6497 non-null int64
12 color 6497 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB
None

数据描述性统计：
fixed_acidity volatile_acidity ... alcohol quality
count 6497.000000 6497.000000 ... 6497.000000 6497.000000
mean 7.215307 0.339666 ... 10.491801 5.818378
std 1.296434 0.164636 ... 1.192712 0.873255
min 3.800000 0.080000 ... 8.000000 3.000000
25% 6.400000 0.230000 ... 9.500000 5.000000
50% 7.000000 0.290000 ... 10.300000 6.000000
75% 7.700000 0.400000 ... 11.300000 6.000000
max 15.900000 1.580000 ... 14.900000 9.000000

[8 rows x 12 columns]

训练集标签分布（颜色：样本数）：
color
0 0.758742
1 0.241258
Name: proportion, dtype: float64

测试集标签分布（颜色：样本数）：
color
0 0.742564
1 0.257436
Name: proportion, dtype: float64

基础模型测试集正确率：0.9862

训练集与测试集性能指标对比：
train test
accuracy 0.999560 0.986154
precision 1.000000 0.975952
recall 0.998177 0.970120
f1 0.999088 0.973027