运用平均值填充后的数据进行模型预测
1.代码实现:
# Load the mean-imputed train/test sets and split features from the target.
# (Fixed: the original had the first read_excel fused onto the import line,
# and a duplicate `from sklearn import metrics` fused after `result_data = {}`.)
import pandas as pd
from sklearn import metrics

train_data = pd.read_excel(r'E:\pythonProject3\矿物分类系统\temp_data\训练数据集[平均值填充].xlsx')
test_data = pd.read_excel(r'E:\pythonProject3\矿物分类系统\temp_data\测试数据集[平均值填充].xlsx')

# Features (X) vs. target column (y = '矿物类型', the mineral type label).
train_data_x = train_data.drop(['矿物类型'], axis=1)
train_data_y = train_data['矿物类型']
test_data_x = test_data.drop(['矿物类型'], axis=1)
test_data_y = test_data['矿物类型']

# Accumulates per-model evaluation metrics for the final JSON report.
result_data = {}
# --- Logistic Regression: grid-search hyper-parameters, then fit/evaluate ---
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # inverse regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 500],
    'multi_class': ['auto', 'ovr', 'multinomial'],
}
logreg = LogisticRegression()
grid_search = GridSearchCV(logreg, param_grid, cv=5)  # 5-fold CV grid search
grid_search.fit(train_data_x, train_data_y)
print("Best parameters set found on development set:")
print(grid_search.best_params_)

# Refit with hand-picked parameters (presumably the best from an earlier run).
# NOTE(review): penalty='none' was removed in scikit-learn >= 1.2; newer
# versions require penalty=None instead — confirm the sklearn version in use.
LR_result = {}  # per-class recall and overall accuracy for this model
lr = LogisticRegression(C=0.001, max_iter=100, penalty='none', solver='newton-cg')
lr.fit(train_data_x, train_data_y)

train_predict = lr.predict(train_data_x)
print('线性回归训练集预测结果:')
print(metrics.classification_report(train_data_y, train_predict, digits=2))

test_predict = lr.predict(test_data_x)
print('线性回归测试集预测结果:')
print(metrics.classification_report(test_data_y, test_predict, digits=2))

# Extract metrics by token position in the text report.
# Fragile: indices 6/11/16/21/25 assume exactly 4 classes in the report.
a = metrics.classification_report(test_data_y, test_predict, digits=2)
b = a.split()
print(b[25])
LR_result['recall 0'] = float(b[6])
LR_result['recall_1'] = float(b[11])
LR_result['recall_2'] = float(b[16])
LR_result['recall_3'] = float(b[21])
LR_result['acc'] = float(b[25])
result_data['LR'] = LR_result

# Persist the cumulative results collected so far.
import json
result = {}
result['mean fill'] = result_data
with open(r'temp_data/平均值填充result.json', 'w', encoding='utf-8') as file:
    json.dump(result, file, ensure_ascii=False, indent=4)

from sklearn.svm import SVC
# --- SVM: grid-search hyper-parameters, then fit/evaluate ---
from sklearn.model_selection import GridSearchCV

param_grid = {
    'kernel': ['rbf', 'linear'],               # RBF for non-linear, linear for high-dim data
    'C': [1, 10, 50, 100],                     # regularization strength
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # RBF bandwidth (sample influence range)
    'class_weight': [None, 'balanced'],        # handle class imbalance
}
svm = SVC(probability=True, random_state=42)
grid_search = GridSearchCV(svm, param_grid, cv=5)  # 5-fold CV grid search
grid_search.fit(train_data_x, train_data_y)
print("Best parameters set found on development set:")
print(grid_search.best_params_)

# Refit with hand-picked parameters (presumably the best from an earlier run).
svm_result = {}
svm = SVC(C=10, class_weight=None, gamma=1, kernel='rbf')
svm.fit(train_data_x, train_data_y)

train_predict = svm.predict(train_data_x)
print('svm训练集预测结果:')
print(metrics.classification_report(train_data_y, train_predict, digits=2))

test_predict = svm.predict(test_data_x)
print('svm测试集预测结果:')
print(metrics.classification_report(test_data_y, test_predict, digits=2))

# Token-position parsing of the report; assumes exactly 4 classes.
a = metrics.classification_report(test_data_y, test_predict, digits=2)
b = a.split()
print(b[25])
svm_result['recall 0'] = float(b[6])
svm_result['recall_1'] = float(b[11])
svm_result['recall_2'] = float(b[16])
svm_result['recall_3'] = float(b[21])
svm_result['acc'] = float(b[25])
result_data['svm'] = svm_result

# Persist the cumulative results.
# Fixed: the original built result1 but dumped the unrelated `result` dict.
import json
result1 = {}
result1['mean fill'] = result_data
with open(r'temp_data/平均值填充result.json', 'w', encoding='utf-8') as file:
    json.dump(result1, file, ensure_ascii=False, indent=4)

from sklearn.ensemble import RandomForestClassifier
# --- Random Forest: fixed hyper-parameters (no grid search) ---
RF_result = {}
rfc = RandomForestClassifier(n_estimators=250, max_depth=20,
                             max_features='sqrt', min_samples_split=3)
rfc.fit(train_data_x, train_data_y)

train_predict = rfc.predict(train_data_x)
print('随机森林训练集预测结果:')
print(metrics.classification_report(train_data_y, train_predict, digits=2))

test_predict = rfc.predict(test_data_x)
print('随机森林测试集预测结果:')
print(metrics.classification_report(test_data_y, test_predict, digits=2))

# Token-position parsing of the report; assumes exactly 4 classes.
a = metrics.classification_report(test_data_y, test_predict, digits=2)
b = a.split()
print(b[25])
RF_result['recall 0'] = float(b[6])
RF_result['recall_1'] = float(b[11])
RF_result['recall_2'] = float(b[16])
RF_result['recall_3'] = float(b[21])
RF_result['acc'] = float(b[25])
result_data['RF'] = RF_result

# Persist the cumulative results.
# Fixed: the original built result2 but dumped the unrelated `result` dict.
import json
result2 = {}
result2['mean fill'] = result_data
with open(r'temp_data/平均值填充result.json', 'w', encoding='utf-8') as file:
    json.dump(result2, file, ensure_ascii=False, indent=4)

from sklearn.naive_bayes import GaussianNB
# --- Gaussian Naive Bayes: no hyper-parameters to tune ---
GNB_result = {}
gnb = GaussianNB()
gnb.fit(train_data_x, train_data_y)

train_predict = gnb.predict(train_data_x)
# Fixed label: the original printed a literal 'n' ('GNB的train:n') because
# the newline escape was missing.
print('GNB的train:\n', metrics.classification_report(train_data_y, train_predict))

test_predict = gnb.predict(test_data_x)
print('GNB的test:\n', metrics.classification_report(test_data_y, test_predict))

# Token-position parsing of the report; assumes exactly 4 classes.
a = metrics.classification_report(test_data_y, test_predict, digits=2)
b = a.split()
print(b[25])
GNB_result['recall 0'] = float(b[6])
GNB_result['recall_1'] = float(b[11])
GNB_result['recall_2'] = float(b[16])
GNB_result['recall_3'] = float(b[21])
GNB_result['acc'] = float(b[25])
result_data['GNB'] = GNB_result

# Persist the cumulative results.
# Fixed: the original built result3 but dumped the unrelated `result` dict.
import json
result3 = {}
result3['mean fill'] = result_data
with open(r'temp_data/平均值填充result.json', 'w', encoding='utf-8') as file:
    json.dump(result3, file, ensure_ascii=False, indent=4)

import xgboost as xgb
# --- XGBoost classifier ---
XGBoost_result = {}
# Fixed typo: 'n_estimatprs' -> 'n_estimators'; the misspelled kwarg was not
# a real parameter, so the intended 200 trees never took effect.
# NOTE(review): num_class=5 while the report parsing below assumes 4 classes
# — confirm the actual number of mineral types in the label column.
xgb_model = xgb.XGBClassifier(learning_rate=0.05, n_estimators=200, num_class=5,
                              max_depth=7, min_child_weight=1, gamma=0,
                              subsample=0.6, colsample_bytree=0.8,
                              objective='multi:softmax', seed=0)
xgb_model.fit(train_data_x, train_data_y)

train_predict = xgb_model.predict(train_data_x)
# Fixed label: missing newline escape ('XGBoost的train:n' -> ':\n').
print('XGBoost的train:\n', metrics.classification_report(train_data_y, train_predict))

test_predict = xgb_model.predict(test_data_x)
print('XGBoost的test:\n', metrics.classification_report(test_data_y, test_predict))

# Token-position parsing of the report; assumes exactly 4 classes.
a = metrics.classification_report(test_data_y, test_predict, digits=2)
b = a.split()
print(b[25])
XGBoost_result['recall 0'] = float(b[6])
XGBoost_result['recall_1'] = float(b[11])
XGBoost_result['recall_2'] = float(b[16])
XGBoost_result['recall_3'] = float(b[21])
XGBoost_result['acc'] = float(b[25])
result_data['XGBoost'] = XGBoost_result

# Persist the cumulative results.
# Fixed: the original built result4 but dumped the unrelated `result` dict.
import json
result4 = {}
result4['mean fill'] = result_data
with open(r'temp_data/平均值填充result.json', 'w', encoding='utf-8') as file:
    json.dump(result4, file, ensure_ascii=False, indent=4)

from sklearn.ensemble import AdaBoostClassifier
# --- AdaBoost classifier ---
AdaBoost_result = {}
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.3)
ada.fit(train_data_x, train_data_y)

train_predict = ada.predict(train_data_x)
# Fixed label: missing newline escape ('AdaBoost的train:n' -> ':\n').
print('AdaBoost的train:\n', metrics.classification_report(train_data_y, train_predict))

test_predict = ada.predict(test_data_x)
print('AdaBoost的test:\n', metrics.classification_report(test_data_y, test_predict))

# Token-position parsing of the report; assumes exactly 4 classes.
a = metrics.classification_report(test_data_y, test_predict, digits=2)
b = a.split()
print(b[25])
AdaBoost_result['recall 0'] = float(b[6])
AdaBoost_result['recall_1'] = float(b[11])
AdaBoost_result['recall_2'] = float(b[16])
AdaBoost_result['recall_3'] = float(b[21])
AdaBoost_result['acc'] = float(b[25])
result_data['AdaBoost'] = AdaBoost_result

# Persist the final cumulative results.
# Fixed: the original built result5 but dumped the unrelated `result` dict.
import json
result5 = {}
result5['mean fill'] = result_data
with open(r'temp_data/平均值填充result.json', 'w', encoding='utf-8') as file:
    json.dump(result5, file, ensure_ascii=False, indent=4)
代码的核心逻辑是:
- 读取预处理后的训练集和测试集(采用平均值填充缺失值)
- 分割特征数据(X)和目标变量(y,即 "矿物类型")
- 依次训练多种分类模型(逻辑回归、SVM、随机森林等)
- 对每个模型进行性能评估,提取关键指标(召回率、准确率)
- 将所有模型的结果汇总并保存到 JSON 文件
2. 关键代码解析
2.1 数据准备
import pandas as pd
from sklearn import metrics

# 读取训练集和测试集(Excel文件)
train_data = pd.read_excel(r'E:\pythonProject3\矿物分类系统\temp_data\训练数据集[平均值填充].xlsx')
test_data = pd.read_excel(r'E:\pythonProject3\矿物分类系统\temp_data\测试数据集[平均值填充].xlsx')

# 分割特征(X)和目标变量(y)
train_data_x = train_data.drop(['矿物类型'], axis=1) # 训练特征
train_data_y = train_data['矿物类型'] # 训练标签(矿物类型)
test_data_x = test_data.drop(['矿物类型'], axis=1) # 测试特征
test_data_y = test_data['矿物类型'] # 测试标签
result_data = {} # 用于存储所有模型的结果
2.2 模型训练与评估(以逻辑回归为例)
所有模型的处理逻辑一致,以第一个模型(逻辑回归)为例解析:
# 1. 超参数网格搜索(GridSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 定义超参数候选范围
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # 正则化强度(越小正则化越强)
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # 优化器
    'max_iter': [100, 200, 500],  # 最大迭代次数
    'multi_class': ['auto', 'ovr', 'multinomial'],  # 多分类策略
}

# 网格搜索:5折交叉验证(cv=5)寻找最佳参数
logreg = LogisticRegression()
grid_search = GridSearchCV(logreg, param_grid, cv=5)
grid_search.fit(train_data_x, train_data_y)
print("最佳参数:", grid_search.best_params_)  # 输出找到的最佳参数

# 2. 用最佳参数训练模型并评估
LR_result = {}  # 存储当前模型的结果

# 用最佳参数初始化模型(这里手动指定了参数,可能是网格搜索后确定的最优值)
lr = LogisticRegression(C=0.001, max_iter=100, penalty='none', solver='newton-cg')
lr.fit(train_data_x, train_data_y)  # 训练模型

# 在训练集上预测并打印评估报告
train_predict = lr.predict(train_data_x)
print('线性回归训练集预测结果:')
print(metrics.classification_report(train_data_y, train_predict, digits=2))

# 在测试集上预测并打印评估报告
test_predict = lr.predict(test_data_x)
print('线性回归测试集预测结果:')
print(metrics.classification_report(test_data_y, test_predict, digits=2))

# 3. 提取关键指标并保存
a = metrics.classification_report(test_data_y, test_predict, digits=2)  # 评估报告字符串
b = a.split()  # 分割字符串为列表(用于提取数值)

# 提取4个类别的召回率(recall)和总体准确率(acc)
LR_result['recall 0'] = float(b[6]) # 类别0的召回率
LR_result['recall_1'] = float(b[11]) # 类别1的召回率
LR_result['recall_2'] = float(b[16]) # 类别2的召回率
LR_result['recall_3'] = float(b[21]) # 类别3的召回率
LR_result['acc'] = float(b[25])  # 总体准确率

result_data['LR'] = LR_result  # 将当前模型结果加入总结果字典
2.3 其他模型的处理
代码后续依次训练了以下模型,流程与逻辑回归一致:
- SVM(支持向量机):使用
SVC
,通过网格搜索优化kernel
(核函数)、C
(正则化)等参数。 - 随机森林:使用
RandomForestClassifier
,直接指定参数(未用网格搜索)。 - 朴素贝叶斯:使用
GaussianNB
(高斯朴素贝叶斯),无复杂超参数。 - XGBoost:使用
XGBClassifier
,指定学习率、树深度等参数(注意代码中n_estimatprs
是拼写错误,应为n_estimators
)。 - AdaBoost:使用
AdaBoostClassifier
,优化迭代次数和学习率。
2.4 结果保存
每个模型训练后,都会将结果更新到result_data
,最终通过 JSON 保存:
import json
result = {}
result['mean fill'] = result_data # 标记当前结果是"平均值填充"策略下的结果
with open(r'temp_data/平均值填充result.json', 'w', encoding='utf-8') as file:
    json.dump(result, file, ensure_ascii=False, indent=4)  # 保存为JSON文件
3. 总结
这段代码实现了一个完整的矿物类型分类实验流程,通过多种机器学习算法进行分类,并对比各模型性能。核心目标是评估不同算法在 "平均值填充" 预处理策略下的分类效果,为矿物分类系统选择最优模型提供依据。