当前位置：首页 > news >正文

3.机器学习-分类模型-线性模型

news 2025/8/26 5:44:18

| 模型名称 | 特点 | 适用场景 | sklearn 类 |
| --- | --- | --- | --- |
| 逻辑回归 (Logistic Regression) | 线性分类，输出概率，可正则化防止过拟合 | 二分类/多分类，特征线性可分 | `linear_model.LogisticRegression` |
| 感知机 (Perceptron) | 简单线性分类器，无概率输出 | 线性可分数据 | `linear_model.Perceptron` |

一、逻辑回归

# Logistic regression: tune, fit and evaluate a churn classifier.
# The metrics below (f1_score with its default binary averaging, ROC AUC on
# the positive-class probability) assume the target column is binary.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

# Display options
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font for plot text
plt.rcParams['axes.unicode_minus'] = False    # draw the minus sign as an ASCII hyphen
pd.set_option('display.max_rows', None)       # no limit on printed rows
pd.set_option('display.max_columns', None)    # no limit on printed columns
pd.set_option('display.max_colwidth', None)   # no limit on printed column width
pd.set_option('display.width', None)          # no limit on total display width

# Load the data
target_column = "是否流失"
data = pd.read_excel("股票客户流失.xlsx")

# Preprocessing
# 4.1 Fill missing values with the column mean (numeric columns only;
#     non-numeric columns have no mean and are left untouched).
print("缺失值统计:\n", data.isnull().sum())
data = data.fillna(data.mean(numeric_only=True))

# 4.2 Remove outliers.
# Z-scores are computed on the numeric FEATURES only. The target is excluded
# on purpose: with a rare positive class (< 10 % of rows) the label itself
# would have |z| > 3 and every positive sample would be thrown away.
numeric_data = data.drop(columns=target_column).select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_data))
threshold = 3  # flag values more than 3 standard deviations from the mean
outliers = (z_scores > threshold).any(axis=1)  # True for rows with any flagged value
print("检测到的异常值行索引:\n", data[outliers].index.tolist())  # row labels of the outliers
print(data[outliers])
data = data[~outliers]  # drop the outlier rows

# Split features and target
X = data.drop(target_column, axis=1)
y = data[target_column]

# 4.3 Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 4.4 Standardise: fit the scaler on the training set only, then apply it to both
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # inverse of the regularisation strength
    'penalty': ['l1', 'l2'],       # regularisation type
    'solver': ['liblinear'],       # solver that supports both l1 and l2
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
    scoring='accuracy',  # select the best combination by accuracy
    n_jobs=-1,           # use all available CPU cores
)
grid_search.fit(X_train, y_train)

# Report the best parameter combination
print("最佳参数组合:", grid_search.best_params_)

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# ROC AUC needs a continuous score; hard 0/1 predictions reduce the curve to
# a single point. Column 1 is the probability of the larger class label.
y_score = best_model.predict_proba(X_test)[:, 1]

# Test-set metrics
print("测试集准确率:", accuracy_score(y_test, y_pred))
print("测试集 F1 Score:", f1_score(y_test, y_pred))
print("测试集 ROC AUC Score:", roc_auc_score(y_test, y_score))
print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))
print("分类报告:\n", classification_report(y_test, y_pred))

# Cross-validate the selected model on the training set
cv_scores = cross_val_score(best_model, X_train, y_train, cv=KFold(n_splits=5, random_state=42, shuffle=True), scoring='accuracy')
print(f"5折交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

二、感知机

# Perceptron: tune, fit and evaluate a churn classifier.
# The metrics below (f1_score with its default binary averaging, ROC AUC on
# the decision score) assume the target column is binary.
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

# Display options
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font for plot text
plt.rcParams['axes.unicode_minus'] = False    # draw the minus sign as an ASCII hyphen
pd.set_option('display.max_rows', None)       # no limit on printed rows
pd.set_option('display.max_columns', None)    # no limit on printed columns
pd.set_option('display.max_colwidth', None)   # no limit on printed column width
pd.set_option('display.width', None)          # no limit on total display width

# Load the data
target_column = "是否流失"
data = pd.read_excel("股票客户流失.xlsx")

# Preprocessing
# 4.1 Fill missing values with the column mean (numeric columns only;
#     non-numeric columns have no mean and are left untouched).
print("缺失值统计:\n", data.isnull().sum())
data = data.fillna(data.mean(numeric_only=True))

# 4.2 Remove outliers.
# Z-scores are computed on the numeric FEATURES only. The target is excluded
# on purpose: with a rare positive class (< 10 % of rows) the label itself
# would have |z| > 3 and every positive sample would be thrown away.
numeric_data = data.drop(columns=target_column).select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_data))
threshold = 3  # flag values more than 3 standard deviations from the mean
outliers = (z_scores > threshold).any(axis=1)  # True for rows with any flagged value
print("检测到的异常值行索引:\n", data[outliers].index.tolist())  # row labels of the outliers
print(data[outliers])
data = data[~outliers]  # drop the outlier rows

# Split features and target
X = data.drop(target_column, axis=1)
y = data[target_column]

# 4.3 Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 4.4 Standardise: fit the scaler on the training set only, then apply it to both
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the perceptron model
model = Perceptron(random_state=42)

# Parameter grid
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],            # regularisation strength
    'max_iter': [100, 500, 1000],                   # maximum number of passes over the data
    'eta0': [0.01, 0.1, 1.0],                       # learning rate
    'penalty': ['l1', 'l2', 'elasticnet', None],    # regularisation type
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
    scoring='accuracy',  # select the best combination by accuracy
    n_jobs=-1,           # use all available CPU cores
)
grid_search.fit(X_train, y_train)

# Report the best parameter combination
print("最佳参数组合:", grid_search.best_params_)

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# ROC AUC needs a continuous score; hard 0/1 predictions reduce the curve to
# a single point. Perceptron has no predict_proba, so rank the samples by
# their signed distance to the separating hyperplane instead.
y_score = best_model.decision_function(X_test)

# Test-set metrics
print("测试集准确率:", accuracy_score(y_test, y_pred))
print("测试集 F1 Score:", f1_score(y_test, y_pred))
print("测试集 ROC AUC Score:", roc_auc_score(y_test, y_score))
print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))
print("分类报告:\n", classification_report(y_test, y_pred))

# Cross-validate the selected model on the training set
cv_scores = cross_val_score(best_model, X_train, y_train, cv=KFold(n_splits=5, random_state=42, shuffle=True), scoring='accuracy')
print(f"5折交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

查看全文

http://www.dtcms.com/a/238009.html