python训练营day23
知识回顾:
- 转换器(transformer)和估计器(estimator)的概念
- 管道工程
- ColumnTransformer和Pipeline类
作业:
整理全部逻辑的先后顺序,看看能否制作出适用于所有机器学习任务的通用 pipeline
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier # 示例模型
from sklearn.metrics import accuracy_score# 1. 数据加载
def load_data(train_path, test_path):
    """Read the training and test CSV data into DataFrames.

    Args:
        train_path: Location of the training CSV; passed straight to
            ``pd.read_csv``.
        test_path: Location of the test CSV; passed straight to
            ``pd.read_csv``.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    return train_data, test_data


# 2. Data preprocessing
def preprocess_data(train_data, test_data, target_column='target'):
    """Split features from the target and build an unfitted preprocessor.

    Args:
        train_data: Training DataFrame; must contain ``target_column``.
        test_data: Test DataFrame; ``target_column`` is removed if present.
        target_column: Name of the label column. Defaults to ``'target'``.

    Returns:
        A ``(preprocessor, X_train, y_train, X_test)`` tuple, where
        ``preprocessor`` is an unfitted ``ColumnTransformer``.

    Raises:
        KeyError: If ``target_column`` is missing from ``train_data``.
    """
    # Separate the features from the target variable.
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]
    # The test set may not carry the target, so a missing column is ignored.
    X_test = test_data.drop(columns=[target_column], errors='ignore')

    # Column groups are derived from the training features only.
    # NOTE: columns of any other dtype (e.g. bool, datetime, int32) are not
    # assigned to either transformer below.
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

    # Numeric columns: fill missing values with the median, then standardize.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical columns: fill missing values with the most frequent value,
    # then one-hot encode, ignoring categories not seen during fitting.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

    return preprocessor, X_train, y_train, X_test


# 3. Model training
def train_model(preprocessor, X_train, y_train, classifier=None):
    """Chain the preprocessor with a classifier and fit the result.

    Args:
        preprocessor: Transformer used as the first pipeline step.
        X_train: Training features.
        y_train: Training labels.
        classifier: Estimator used as the final pipeline step. When ``None``
            (the default), ``RandomForestClassifier(random_state=42)`` is
            used as an example model.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    if classifier is None:
        # Example model; the fixed seed keeps runs reproducible.
        classifier = RandomForestClassifier(random_state=42)

    # Keeping preprocessing inside the pipeline means it is fitted on the
    # training data only and re-applied automatically at predict time.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    model_pipeline.fit(X_train, y_train)
    return model_pipeline


# 4. Model evaluation
def evaluate_model(model_pipeline, X_test, y_test=None):
    """Print the accuracy of the fitted pipeline on the test set.

    Args:
        model_pipeline: Fitted model exposing ``predict``.
        X_test: Test features.
        y_test: True test labels, or ``None`` when the test set has none.

    Returns:
        The accuracy score, or ``None`` when ``y_test`` is ``None`` and no
        evaluation is possible.
    """
    if y_test is None:
        # Without labels there is nothing to score against.
        print("测试集没有目标变量,无法评估模型。")
        return None

    y_pred = model_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型准确率: {accuracy:.2f}")
    return accuracy


# 5. Main entry point
def main(train_path='train.csv', test_path='test.csv'):
    """Run the workflow: load data, preprocess, train, then evaluate.

    Args:
        train_path: Path of the training CSV. Defaults to ``'train.csv'``.
        test_path: Path of the test CSV. Defaults to ``'test.csv'``.
    """
    train_data, test_data = load_data(train_path, test_path)
    preprocessor, X_train, y_train, X_test = preprocess_data(train_data, test_data)
    model_pipeline = train_model(preprocessor, X_train, y_train)

    # Score the model only when the test set carries the target column;
    # passing None makes evaluate_model report that it cannot evaluate.
    y_test = test_data['target'] if 'target' in test_data.columns else None
    evaluate_model(model_pipeline, X_test, y_test)


if __name__ == "__main__":
    main()
@浙大疏锦行