# Python check-in, Day 22: Kaggle Titanic survival prediction
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline# 加载数据
# Read the training and test CSVs; the relative paths are resolved against
# the current working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)


# Feature-engineering function
def feature_engineering(df):
    """Add family-based features to a passenger DataFrame.

    The new columns are written onto ``df`` itself (no copy is made), and
    that same object is returned so the call can be used in an assignment.

    Args:
        df: DataFrame containing the ``SibSp`` and ``Parch`` columns.

    Returns:
        The same ``df`` object, now with two extra columns:
        ``FamilySize`` (``SibSp + Parch + 1``) and ``IsAlone``
        (1 where ``FamilySize == 1``, otherwise 0).

    Raises:
        KeyError: If ``SibSp`` or ``Parch`` is missing from ``df``.
    """
    # Family size; the +1 presumably counts the passenger themselves.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    # Flag passengers travelling alone as an integer 0/1 column.
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    return df


# Apply feature engineering
# Run the same transformation over both frames (train first, then test) so
# the columns listed in numeric_cols are available in each of them.
train_data, test_data = map(feature_engineering, (train_data, test_data))

# Define the preprocessing steps
# Column groups routed to the two preprocessing branches.
categorical_cols = ['Pclass', 'Sex', 'Embarked']
numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize', 'IsAlone']

# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ],
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing
# on categories that were not present during fit.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ],
)

# Combine the two branches, each applied to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
)

# Full pipeline: preprocessing followed by a random forest with a fixed seed.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                random_state=42,
            ),
        ),
    ],
)

# Prepare the training data
# Split the training frame into the feature matrix and the target column.
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']

# Train the model: fits the preprocessing steps and the classifier together.
model.fit(X_train, y_train)

# Predict on the test set. The whole frame is passed in; the pipeline's
# ColumnTransformer picks out the columns it was configured with.
predictions = model.predict(test_data)

# Build the submission file: one row per test passenger.
output = pd.DataFrame({
    'PassengerId': test_data.PassengerId,
    'Survived': predictions,
})
# index=False keeps the DataFrame's row index out of the CSV.
output.to_csv('submission.csv', index=False)
print("提交文件已保存为 submission.csv")
# @浙大疏锦行