DAY 19 Common Feature Selection Algorithms
@浙大疏锦行
1. Variance Filtering
2. Pearson Correlation Filtering
3. Lasso Filtering
4. Tree-Model Importance
5. SHAP Importance
6. Recursive Feature Elimination (RFE)
1. Variance Filtering
# Input: feature matrix X, variance threshold `threshold`
# Output: filtered feature matrix X_selected
import numpy as np

def variance_selection(X, threshold):
    # Compute the variance of each feature
    variances = np.var(X, axis=0)
    # Keep the indices of features whose variance exceeds the threshold
    selected_indices = np.where(variances > threshold)[0]
    # Slice the feature matrix down to the selected columns
    X_selected = X[:, selected_indices]
    return X_selected
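Variance filtering simply drops near-constant columns. scikit-learn ships the same idea as VarianceThreshold; below is a minimal sketch, where the random data and the 0.05 threshold are made-up values for illustration only.

from sklearn.feature_selection import VarianceThreshold
import numpy as np

# Hypothetical data: 100 samples, 10 features drawn uniformly from [0, 1)
X = np.random.rand(100, 10)

# Keep only features whose variance exceeds 0.05
selector = VarianceThreshold(threshold=0.05)
X_selected = selector.fit_transform(X)

print(X_selected.shape)        # shape after filtering
print(selector.get_support())  # boolean mask over the original columns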
2. Pearson Correlation Filtering
# Input: feature matrix X, target variable y, correlation threshold `threshold`
# Output: filtered feature matrix X_selected
import numpy as np

def pearson_correlation_selection(X, y, threshold):
    correlations = []
    for i in range(X.shape[1]):
        # Pearson correlation between feature i and the target
        corr = np.corrcoef(X[:, i], y)[0, 1]
        correlations.append(np.abs(corr))
    # Keep features whose absolute correlation exceeds the threshold
    selected_indices = np.where(np.array(correlations) > threshold)[0]
    # Slice the feature matrix down to the selected columns
    X_selected = X[:, selected_indices]
    return X_selected
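A quick sanity check of the function above on synthetic data (the coefficients, noise level, and 0.3 threshold are arbitrary choices for illustration): only the two columns that actually drive y should survive the filter.

import numpy as np

# Hypothetical data: y depends only on features 0 and 3
rng = np.random.default_rng(42)
X = rng.normal(size=(200, 8))
y = 2.0 * X[:, 0] - 1.5 * X[:, 3] + rng.normal(scale=0.1, size=200)

X_selected = pearson_correlation_selection(X, y, threshold=0.3)
print(X_selected.shape)  # typically (200, 2): the two informative columns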
3. Lasso Filtering
# Input: feature matrix X, target variable y, regularization strength alpha, number of features to keep k
# Output: filtered feature matrix X_selected
from sklearn.linear_model import Lasso
import numpy as np

def lasso_selection(X, y, alpha, k):
    # Fit a Lasso model
    lasso = Lasso(alpha=alpha)
    lasso.fit(X, y)
    # Coefficients of the fitted model
    coefficients = lasso.coef_
    # Indices of the k features with the largest absolute coefficients
    top_k_indices = np.argsort(np.abs(coefficients))[-k:]
    # Slice the feature matrix down to the selected columns
    X_selected = X[:, top_k_indices]
    return X_selected
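The same top-k-by-coefficient idea can also be expressed with scikit-learn's SelectFromModel. The sketch below standardizes the features first, since Lasso penalizes all coefficients equally and is therefore scale-sensitive; the synthetic data, alpha=0.1, and max_features=5 are illustrative assumptions, not recommendations.

import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# Hypothetical regression data: y depends only on features 0 and 5
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 20))
y = X[:, 0] - 2.0 * X[:, 5] + rng.normal(scale=0.1, size=200)

X_scaled = StandardScaler().fit_transform(X)  # Lasso is scale-sensitive

selector = SelectFromModel(
    Lasso(alpha=0.1),
    max_features=5,      # keep at most 5 features
    threshold=-np.inf,   # rank purely by |coefficient|, ignore the default threshold
)
X_selected = selector.fit_transform(X_scaled, y)
print(X_selected.shape)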
4. Tree-Model Importance
# Input: feature matrix X, target variable y, number of features to keep k
# Output: filtered feature matrix X_selected
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def tree_importance_selection(X, y, k):
    # Fit a random forest
    model = RandomForestClassifier()
    model.fit(X, y)
    # Impurity-based feature importance scores
    importances = model.feature_importances_
    # Indices of the k most important features
    top_k_indices = np.argsort(importances)[-k:]
    # Slice the feature matrix down to the selected columns
    X_selected = X[:, top_k_indices]
    return X_selected
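The same selection can be wrapped in SelectFromModel. Below is a minimal sketch on a synthetic classification task; the dataset sizes, n_estimators=200, and max_features=5 are assumptions for illustration. Keep in mind that impurity-based importances tend to favor high-cardinality features, so sklearn.inspection.permutation_importance is a common cross-check.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import make_classification

# Hypothetical classification data with 5 informative features out of 20
X, y = make_classification(n_samples=300, n_features=20, n_informative=5, random_state=0)

selector = SelectFromModel(
    RandomForestClassifier(n_estimators=200, random_state=0),
    max_features=5,      # keep the 5 highest-importance features
    threshold=-np.inf,   # rank purely by importance score
)
X_selected = selector.fit_transform(X, y)
print(X_selected.shape)  # (300, 5)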
5. SHAP Importance
# Input: feature matrix X, target variable y, number of features to keep k
# Output: filtered feature matrix X_selected
import shap
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def shap_importance_selection(X, y, k):
    # Fit a random forest
    model = RandomForestClassifier()
    model.fit(X, y)
    # Build a SHAP explainer and compute SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    # Mean absolute SHAP value per feature
    mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
    # For classifiers the SHAP values may carry an extra class dimension,
    # i.e. (n_samples, n_features, n_classes); reduce over that axis too
    if mean_abs_shap.ndim > 1:
        mean_abs_shap = mean_abs_shap.mean(axis=-1)
    # Indices of the k features with the largest mean |SHAP|
    top_k_indices = np.argsort(mean_abs_shap)[-k:]
    # Slice the feature matrix down to the selected columns
    X_selected = X[:, top_k_indices]
    return X_selected
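A minimal usage sketch of shap_importance_selection on a small synthetic dataset (all sizes here are arbitrary). Computing SHAP values for every row can be slow on large datasets, so in practice the explainer is often run on a random subsample of X rather than the full matrix.

from sklearn.datasets import make_classification

# Hypothetical data: 300 samples, 15 features, 4 of them informative
X, y = make_classification(n_samples=300, n_features=15, n_informative=4, random_state=0)

X_selected = shap_importance_selection(X, y, k=4)
print(X_selected.shape)  # (300, 4)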
6. Recursive Feature Elimination (RFE)
# Input: feature matrix X, target variable y, model `estimator`, number of features to keep n_features_to_select
# Output: filtered feature matrix X_selected
from sklearn.feature_selection import RFE

def rfe_selection(X, y, estimator, n_features_to_select):
    # Build the RFE selector
    selector = RFE(estimator, n_features_to_select=n_features_to_select)
    # Fit and perform recursive feature elimination
    selector = selector.fit(X, y)
    # Keep only the selected columns
    X_selected = selector.transform(X)
    return X_selected
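A usage sketch of rfe_selection with a concrete estimator (logistic regression here; the dataset and the choice of 5 features are assumptions for illustration). When the right number of features is unknown, scikit-learn's RFECV variant chooses n_features_to_select by cross-validation instead.

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

# Hypothetical classification data with 5 informative features out of 20
X, y = make_classification(n_samples=300, n_features=20, n_informative=5, random_state=0)

X_selected = rfe_selection(X, y, LogisticRegression(max_iter=1000), n_features_to_select=5)
print(X_selected.shape)  # (300, 5)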