Kaggle-Predicting Optimal Fertilizers-(多分类+xgboost)
Predicting Optimal Fertilizers
题意:
给出土壤的特性,预测出3种最佳的肥料
数据处理:
1.特征分为数值型和类别型两种。类别型特征不能随意映射成数字(否则会引入并不存在的大小顺序),因此采用独热编码;catboost可以直接处理category类型。
2.构造一些相关土壤特性特征
3.label是类别型(肥料名称字符串),而xgboost要求标签是从0开始的整数编码,因此需要先用LabelEncoder对label编码,预测出结果之后再解码回肥料名称。
建立模型:
1.使用分层K折交叉验证训练模型,代码支持xgboost、catboost、lightgbm三种模型(示例中使用xgboost)。
代码:
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score# 忽略警告信息
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def init():
    """Raise pandas display limits so wide frames print without truncation."""
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_colwidth', 1000)
    pd.set_option("display.max_rows", 1000)
    pd.set_option("display.max_columns", 1000)


def load_data(path_train, path_test):
    """Read the train and test CSV files and print their shapes.

    Args:
        path_train: Path of the training CSV.
        path_test: Path of the test CSV.

    Returns:
        Tuple ``(df_train, df_test)`` of DataFrames.
    """
    df_train = pd.read_csv(path_train)
    df_test = pd.read_csv(path_test)
    print(f"Train shape: {df_train.shape}, Test shape: {df_test.shape}")
    return df_train, df_test


def feature_engineering(df_all):
    """Add derived soil features and one-hot encode the categorical columns.

    Reads the columns ``Nitrogen``, ``Phosphorous``, ``Potassium``,
    ``Soil Type`` and ``Crop Type``. The derived columns are written onto
    ``df_all`` itself; the returned frame is a new object in which
    ``Soil Type`` / ``Crop Type`` are replaced by their dummy columns.

    Args:
        df_all: Feature frame (train and test rows stacked).

    Returns:
        DataFrame with the engineered and one-hot encoded features.
    """
    # Weighted fertility index over the three nutrient columns.
    df_all['Fertility_Index'] = (
        0.4 * df_all['Nitrogen'] / 100
        + 0.3 * df_all['Phosphorous'] / 50
        + 0.3 * df_all['Potassium'] / 150
    )
    # Nitrogen / phosphorous ratio; the epsilon avoids division by zero.
    df_all['N_P_ratio'] = df_all['Nitrogen'] / (df_all['Phosphorous'] + 1e-6)
    # Potassium minus the mean of nitrogen and phosphorous.
    df_all['K_deficit'] = (
        df_all['Potassium'] - (df_all['Nitrogen'] + df_all['Phosphorous']) / 2
    )
    # Integer code for the crop type, kept alongside the one-hot columns.
    df_all['Crop_Type_Code'] = LabelEncoder().fit_transform(df_all['Crop Type'])
    category_data = pd.get_dummies(df_all[['Soil Type', 'Crop Type']])
    df_all = pd.concat(
        [df_all.drop(['Soil Type', 'Crop Type'], axis=1), category_data],
        axis=1,
    )
    return df_all


def prepare_data(df_train, df_test):
    """Stack train and test, run feature engineering, and split them again.

    Train and test are processed together so both end up with the same
    one-hot columns.

    Args:
        df_train: Training frame with ``id`` and ``Fertilizer Name`` columns.
        df_test: Test frame with an ``id`` column.

    Returns:
        Tuple ``(X_train, Y_train, X_test)`` where ``Y_train`` is the
        label-encoded ``Fertilizer Name`` column as an integer array.
    """
    df_all = pd.concat(
        [
            df_train.drop(['id', 'Fertilizer Name'], axis=1),
            df_test.drop(['id'], axis=1),
        ],
        axis=0,
    ).reset_index(drop=True)
    df_all = feature_engineering(df_all)
    n_train = len(df_train)
    X_train = df_all[:n_train]
    Y_train = LabelEncoder().fit_transform(df_train['Fertilizer Name'])
    X_test = df_all[n_train:]
    return X_train, Y_train, X_test


def _build_xgb():
    """Create the XGBoost classifier used for one CV fold."""
    return XGBClassifier(
        max_depth=12,
        colsample_bytree=0.467,
        subsample=0.86,
        n_estimators=8000,
        learning_rate=0.03,
        gamma=0.26,
        max_delta_step=4,
        reg_alpha=2.7,
        reg_lambda=1.4,
        early_stopping_rounds=500,
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,
        tree_method='hist',
        device='cuda',
    )


def _build_cat():
    """Create the CatBoost classifier used for one CV fold."""
    return CatBoostClassifier(
        iterations=8000,
        learning_rate=0.03,
        depth=10,
        loss_function='MultiClass',
        eval_metric='MultiClass',
        random_seed=42,
        od_type='Iter',
        od_wait=500,
        verbose=100,
        task_type="GPU",
    )


def _build_lgb():
    """Create the LightGBM classifier used for one CV fold."""
    return LGBMClassifier(
        n_estimators=8000,
        learning_rate=0.03,
        num_leaves=255,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.7,
        class_weight='balanced',
        metric='multi_logloss',
        early_stopping_rounds=500,
        random_state=42,
        verbosity=-1,
    )


# Dispatch table: model_type string -> factory for a fresh estimator.
_MODEL_BUILDERS = {
    'xgb': _build_xgb,
    'cat': _build_cat,
    'lgb': _build_lgb,
}


def _fit_kwargs(model_type):
    """Return the backend-specific keyword arguments that control fit logging."""
    if model_type == 'lgb':
        # LightGBM 4.x removed ``verbose`` from ``fit()``; periodic logging is
        # requested through a callback instead.
        from lightgbm import log_evaluation
        return {'callbacks': [log_evaluation(period=100)]}
    return {'verbose': 100}


def train_model(X_train, Y_train, model_type='xgb', n_splits=5):
    """Train one model per fold with stratified K-fold cross-validation.

    Args:
        X_train: Feature DataFrame (rows are selected with ``.iloc``).
        Y_train: Integer-encoded labels, one per row of ``X_train``.
        model_type: ``'xgb'``, ``'cat'`` or ``'lgb'``.
        n_splits: Number of CV folds.

    Returns:
        Tuple ``(models, scores)``: the fitted model of each fold and its
        validation accuracy.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Fail fast instead of hitting an unbound ``model`` inside the fold loop.
    if model_type not in _MODEL_BUILDERS:
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            f"expected one of {sorted(_MODEL_BUILDERS)}"
        )
    build_model = _MODEL_BUILDERS[model_type]
    Y_train = np.asarray(Y_train)

    models = []
    scores = []
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, Y_train)):
        print(f"\nFold {fold + 1}/{n_splits}")
        x_tr, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = Y_train[train_idx], Y_train[val_idx]

        model = build_model()
        model.fit(x_tr, y_tr, eval_set=[(x_val, y_val)], **_fit_kwargs(model_type))

        # CatBoost returns multiclass predictions with shape (n, 1); flatten so
        # every backend yields a 1-D label vector.
        val_pred = np.asarray(model.predict(x_val)).ravel()
        score = accuracy_score(y_val, val_pred)
        print(f"Validation Accuracy: {score:.4f}")

        models.append(model)
        scores.append(score)

    print(f"\nAverage CV Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")
    return models, scores


def predict_test(models, X_test):
    """Average the class probabilities predicted by all fold models.

    The output shape comes from the models' own ``predict_proba`` results, so
    the function does not depend on any module-level variable.

    Args:
        models: Non-empty sequence of fitted models exposing ``predict_proba``.
        X_test: Feature frame to predict on.

    Returns:
        Array of shape ``(n_samples, n_classes)`` with the mean probabilities.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("predict_test requires at least one fitted model")
    pred_proba = None
    for model in models:
        fold_proba = np.asarray(model.predict_proba(X_test), dtype=float)
        pred_proba = fold_proba if pred_proba is None else pred_proba + fold_proba
    return pred_proba / len(models)


def generate_submission(df_test, pred_proba, le, output_path='submission.csv'):
    """Write the top-3 predicted labels per row to a submission CSV.

    Args:
        df_test: Test frame providing the ``id`` column.
        pred_proba: Array of shape ``(n_samples, n_classes)``.
        le: Fitted label encoder; ``le.classes_[k]`` is the name of class ``k``.
        output_path: Destination CSV path.
    """
    # Indices of the three most probable classes, most probable first.
    pred_top3 = np.argsort(np.asarray(pred_proba), axis=1)[:, -3:][:, ::-1]
    top3_labels = np.asarray(le.classes_)[pred_top3]
    submission = pd.DataFrame({
        'id': df_test['id'],
        'Fertilizer Name': [' '.join(row) for row in top3_labels],
    })
    submission.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")


if __name__ == '__main__':
    init()
    # Step 1: load the data.
    df_train, df_test = load_data('train.csv', 'test.csv')
    # Step 2: build the feature matrices and encoded labels.
    X_train, Y_train, X_test = prepare_data(df_train, df_test)
    # Step 3: train the models (supports xgb/cat/lgb).
    models, scores = train_model(X_train, Y_train, model_type='xgb', n_splits=5)
    # Step 4: predict on the test set.
    pred_proba = predict_test(models, X_test)
    # Step 5: write the submission file. Refitting on the same column yields
    # the same sorted class order that prepare_data used for Y_train.
    le = LabelEncoder()
    le.fit(df_train['Fertilizer Name'])
    generate_submission(df_test, pred_proba, le)
#AI生成版本0.34190