Parkinson's Disease Classification with XGBoost
Contents
1. Importing Required Packages
2. Data Loading
3. Feature Engineering
4. Building the Model
5. Evaluation and Visualization
6. Program Flow
7. Complete Code
1. Importing Required Packages
# Imports
import numpy as np  # numerical computing
import pandas as pd  # data handling
from sklearn.preprocessing import MinMaxScaler  # feature scaling
from xgboost import XGBClassifier  # XGBoost classifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold  # splitting and hyperparameter search
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, roc_auc_score)  # evaluation metrics
import matplotlib.pyplot as plt  # plotting
import seaborn as sns  # statistical visualization
import joblib  # model persistence
from datetime import datetime  # timestamp generation
2. Data Loading
def load_data(path):
    """Load and preprocess the data."""
    df = pd.read_csv(path)  # read the CSV file
    # Data-quality assertion
    assert 'status' in df.columns, "The data must contain a 'status' column"
    # Print key statistics (for debugging)
    print(f"Class distribution:\n{df['status'].value_counts()}")
    print(f"\nMissing-value counts:\n{df.isnull().sum()}")
    # Fill missing values with the median (more robust to outliers than the mean);
    # numeric_only=True keeps the string 'name' column from raising a TypeError
    df = df.fillna(df.median(numeric_only=True))
    return df
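A quick smoke test of the loader, assuming the UCI Parkinson's dataset has been downloaded to ./data/parkinsons.data (the standard file contains a name column, 22 voice features, and the status label):

df = load_data('./data/parkinsons.data')
print(df.shape)             # (195, 24) for the standard UCI file
print(df['status'].mean())  # fraction of Parkinson's-positive rows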
3. Feature Engineering
def feature_engineering(df):
    """Feature processing."""
    # Drop the label column and the irrelevant column (patient name)
    features = df.drop(['status', 'name'], axis=1)
    labels = df['status'].values
    # MinMax-scale to the [-1, 1] range
    scaler = MinMaxScaler(feature_range=(-1, 1))
    features_scaled = scaler.fit_transform(features)
    return features_scaled, labels, scaler  # return the scaler for later inference
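The fitted scaler is returned on purpose: at inference time, new samples must be transformed with the training-set statistics, never re-fitted. A minimal sketch, where new_sample is a hypothetical DataFrame with the same feature columns as the training data:

# Scale unseen data with the scaler fitted during training;
# calling fit_transform here would silently leak test-time statistics
new_sample_scaled = scaler.transform(new_sample)  # new_sample: hypothetical, same columns as training
prediction = model.predict(new_sample_scaled)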
4. Building the Model
def optimize_model(X_train, y_train):
    """Tune XGBoost with grid search."""
    # Expanded parameter grid (based on literature and experiments)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],  # finer learning-rate settings
        'max_depth': [3, 5, 7],              # tree depth range
        'min_child_weight': [1, 3],          # minimum child weight
        'gamma': [0, 0.1],                   # minimum loss reduction for a split
        'subsample': [0.7, 0.9],             # row sampling ratio
        'colsample_bytree': [0.7, 0.9],      # feature sampling ratio
        'reg_alpha': [0, 0.1],               # L1 regularization
        'reg_lambda': [0.1, 1],              # L2 regularization
        'n_estimators': [100, 200]           # number of trees
    }
    # Stratified K-fold cross-validation (preserves class distribution)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Base model; early stopping is omitted because GridSearchCV supplies no
    # eval_set at fit time -- the number of trees is tuned via n_estimators instead
    base_model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',  # log-loss as the training metric
        random_state=39
    )
    # Grid-search configuration
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',  # optimize for AUC
        n_jobs=-1,          # use all CPU cores
        verbose=1           # print progress
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_
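Note the search cost: the grid above spans 3 x 3 x 2 x 2 x 2 x 2 x 2 x 2 x 2 = 1152 parameter combinations, or 5760 model fits under 5-fold cross-validation. If that is too slow, a randomized search over the same grid is a common substitute inside optimize_model; a sketch, with n_iter chosen arbitrarily:

from sklearn.model_selection import RandomizedSearchCV

# Sample a subset of the 1152 combinations instead of trying them all
random_search = RandomizedSearchCV(
    estimator=base_model,            # the same XGBClassifier configured above
    param_distributions=param_grid,  # plain lists are sampled uniformly
    n_iter=100,                      # number of sampled settings (an assumption; tune to budget)
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)
print(random_search.best_params_)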
5. Evaluation and Visualization
def evaluate_model(model, X_test, y_test, feature_names):
    """Evaluate the model and visualize the results."""
    # Generate predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class
    # Print the classification report
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
    # Report the AUC score
    print(f"\nAUC score: {roc_auc_score(y_test, y_proba):.4f}")
    # Confusion-matrix heatmap
    plt.figure(figsize=(6, 4))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Healthy', "Parkinson's"],
                yticklabels=['Healthy', "Parkinson's"])
    plt.title('Confusion matrix')
    plt.show()
    # Feature-importance plot; feature_names is passed in rather than read
    # from a global df, which the original version implicitly depended on
    plt.figure(figsize=(10, 6))
    feat_imp = pd.Series(model.feature_importances_, index=feature_names)
    feat_imp.nlargest(15).plot(kind='barh')
    plt.title('Top 15 feature importances')
    plt.tight_layout()
    plt.show()
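The single AUC number summarizes ranking quality; if the curve itself is wanted, a small extra block inside evaluate_model (reusing y_test and y_proba from above) would be:

from sklearn.metrics import roc_curve

# Plot the ROC curve from the test-set probabilities
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_proba):.3f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance diagonal
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend()
plt.show()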
6. Program Flow
# Main program flow
if __name__ == "__main__":
    # Load the data
    df = load_data('./data/parkinsons.data')
    # Feature engineering
    X, y, scaler = feature_engineering(df)
    # Train/test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,  # preserve the class ratio
        random_state=39
    )
    # Hyperparameter optimization
    print("\nStarting parameter optimization...")
    best_model, best_params = optimize_model(X_train, y_train)
    print(f"\nBest parameters: {best_params}")
    # Model evaluation
    feature_names = df.drop(['status', 'name'], axis=1).columns
    evaluate_model(best_model, X_test, y_test, feature_names)
    # Save the model (with a timestamp)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    model_path = f"parkinson_model_v{timestamp}.pkl"
    joblib.dump({'model': best_model, 'scaler': scaler}, model_path)
    print(f"\nModel saved to: {model_path}")
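Because the model and the scaler are saved together in one dict, a later process can restore both in a single call. A minimal loading sketch, where the filename is hypothetical (use whatever path joblib.dump printed) and X_new is a hypothetical matrix of unscaled features:

# Restore the saved bundle and score new, unscaled data
bundle = joblib.load('parkinson_model_v20240101_1200.pkl')  # hypothetical filename
model, scaler = bundle['model'], bundle['scaler']
X_new_scaled = scaler.transform(X_new)  # X_new: hypothetical unscaled feature matrix
probs = model.predict_proba(X_new_scaled)[:, 1]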
7. Complete Code
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime

# 1. Data loading and preprocessing
def load_data(path):
    """Load and preprocess the data."""
    df = pd.read_csv(path)
    # Data-quality check
    assert 'status' in df.columns, "The data must contain a 'status' column"
    print(f"Class distribution:\n{df['status'].value_counts()}")
    print(f"\nMissing-value counts:\n{df.isnull().sum()}")
    # Identify numeric and non-numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
    print(f"\nNumeric columns: {list(numeric_cols)}")
    print(f"Non-numeric columns: {list(non_numeric_cols)}")
    # Fill missing values with the median, numeric columns only
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    return df

# 2. Feature engineering
def feature_engineering(df):
    """Feature processing."""
    # Separate features and label
    features = df.drop(['status', 'name'], axis=1)  # drop the label and the irrelevant name column
    labels = df['status'].values
    # Scale the features
    scaler = MinMaxScaler(feature_range=(-1, 1))
    features_scaled = scaler.fit_transform(features)
    return features_scaled, labels, scaler

# 3. Model optimization
def optimize_model(X_train, y_train):
    """Tune XGBoost with grid search."""
    # Refined parameter grid (based on literature and experiments)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],  # finer learning rates
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3],
        'gamma': [0, 0.1],  # minimum loss reduction required for a split
        'subsample': [0.7, 0.9],
        'colsample_bytree': [0.7, 0.9],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [0.1, 1],
        'n_estimators': [100, 200]
    }
    # Stratified K-fold cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Base model; early stopping is omitted because GridSearchCV passes no
    # eval_set at fit time, so n_estimators is tuned through the grid instead
    base_model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=39
    )
    # Grid-search configuration
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',  # AUC as the selection metric
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# 4. Evaluation and visualization
def evaluate_model(model, X_test, y_test, feature_names):
    """Evaluate the model and visualize the results."""
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
    print(f"\nAUC score: {roc_auc_score(y_test, y_proba):.4f}")
    # Confusion-matrix heatmap
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion matrix')
    plt.show()
    # Feature importances (names passed in instead of read from a global df)
    plt.figure(figsize=(10, 6))
    feat_imp = pd.Series(model.feature_importances_, index=feature_names)
    feat_imp.nlargest(15).plot(kind='barh')
    plt.title('Top 15 feature importances')
    plt.show()

# Main flow
if __name__ == "__main__":
    # Load the data
    df = load_data('./data/parkinsons.data')
    # Feature engineering
    X, y, scaler = feature_engineering(df)
    # Train/test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=39
    )
    # Hyperparameter optimization
    print("\nStarting parameter optimization...")
    best_model, best_params = optimize_model(X_train, y_train)
    print(f"\nBest parameters: {best_params}")
    # Model evaluation
    feature_names = df.drop(['status', 'name'], axis=1).columns
    evaluate_model(best_model, X_test, y_test, feature_names)
    # Save the model (with a timestamp)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    model_path = f"parkinson_model_v{timestamp}.pkl"
    joblib.dump({'model': best_model, 'scaler': scaler}, model_path)
    print(f"\nModel saved to: {model_path}")