DRW Kaggle Competition: Regression Solution #2
Source Code
final_solution
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr


def feature_engineering(df):
    # Nonlinear / interaction features on selected X-columns, plus order-flow features
    # built from bid_qty, ask_qty, buy_qty, sell_qty and volume (the order-book size
    # and trade-volume columns of the dataset).
    #10
    df['exp_856P868P855P289'] = np.exp(df['X446'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_860P868P855P289'] = np.exp(df['X3'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_598P868P855P289'] = np.exp(df['X66'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_612P868P855P289'] = np.exp(df['X1'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_289P855P21'] = np.exp(df['X287'] + df['X37'] + df['X21'])
    df['868xexp_289M125'] = df['X76'] * np.exp(df['X287'] - df['X19'])
    #9
    df['exp_603P868P855P289'] = np.exp(df['X25'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_174P868P855P289'] = np.exp(df['X174'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_465P868P855P289'] = np.exp(df['X465'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_125P862P289M125'] = np.exp(df['X19'] + df['X123'] + df['X287'] - df['X19'])
    df['exp_168P868P855P289'] = np.exp(df['X168'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_855P289M125'] = np.exp(df['X37'] + df['X287'] - df['X19'])
    df['exp_302P289M125'] = np.exp(df['X298'] + df['X287'] - df['X19'])
    df['289xexp_289M125'] = df['X287'] * np.exp(df['X287'] - df['X19'])
    #8
    df['exp_862P868P855P289'] = np.exp(df['X123'] + df['X76'] + df['X37'] + df['X287'])
    df['868x868x855x289'] = df['X76'] * df['X76'] * df['X37'] * df['X287']
    df['385xexp_289M125'] = df['X385'] * np.exp(df['X287'] - df['X19'])
    df['exp_862P289M125'] = np.exp(df['X123'] + df['X287'] - df['X19'])
    df['exp_786P289M125'] = np.exp(df['X453'] + df['X287'] - df['X19'])
    df['exp_856P289M125'] = np.exp(df['X446'] + df['X287'] - df['X19'])
    df['852x868x855x289'] = df['X594'] * df['X76'] * df['X37'] * df['X287']
    df['465x862x465'] = df['X465'] * df['X465'] * df['X123']
    df['540x881'] = df['X476'] * df['X443']
    # Order-book / trade interaction features
    df['bid_ask_interaction'] = df['bid_qty'] * df['ask_qty']
    df['bid_buy_interaction'] = df['bid_qty'] * df['buy_qty']
    df['bid_sell_interaction'] = df['bid_qty'] * df['sell_qty']
    df['ask_buy_interaction'] = df['ask_qty'] * df['buy_qty']
    df['ask_sell_interaction'] = df['ask_qty'] * df['sell_qty']
    df['volume_weighted_sell'] = df['sell_qty'] * df['volume']
    df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + 1e-10)
    df['selling_pressure'] = df['sell_qty'] / (df['volume'] + 1e-10)
    df['log_volume'] = np.log1p(df['volume'])
    df['effective_spread_proxy'] = np.abs(df['buy_qty'] - df['sell_qty']) / (df['volume'] + 1e-10)
    df['bid_ask_imbalance'] = (df['bid_qty'] - df['ask_qty']) / (df['bid_qty'] + df['ask_qty'] + 1e-10)
    df['order_flow_imbalance'] = (df['buy_qty'] - df['sell_qty']) / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    df['liquidity_ratio'] = (df['bid_qty'] + df['ask_qty']) / (df['volume'] + 1e-10)
    df['ask_buy_interaction_x_X293'] = df['X293'] * df['ask_buy_interaction']
    # Price Pressure Indicators
    df['net_order_flow'] = df['buy_qty'] - df['sell_qty']
    df['normalized_net_flow'] = df['net_order_flow'] / (df['volume'] + 1e-10)
    df['buying_pressure'] = df['buy_qty'] / (df['volume'] + 1e-10)
    df['volume_weighted_buy'] = df['buy_qty'] * df['volume']
    # Liquidity Depth Measures
    df['total_depth'] = df['bid_qty'] + df['ask_qty']
    df['depth_imbalance'] = (df['bid_qty'] - df['ask_qty']) / (df['total_depth'] + 1e-10)
    df['relative_spread'] = np.abs(df['bid_qty'] - df['ask_qty']) / (df['total_depth'] + 1e-10)
    df['log_depth'] = np.log1p(df['total_depth'])
    # Order Flow Toxicity Proxies
    df['kyle_lambda'] = np.abs(df['net_order_flow']) / (df['volume'] + 1e-10)
    df['flow_toxicity'] = np.abs(df['order_flow_imbalance']) * df['volume']
    df['aggressive_flow_ratio'] = (df['buy_qty'] + df['sell_qty']) / (df['total_depth'] + 1e-10)
    # Market Activity Indicators
    df['volume_depth_ratio'] = df['volume'] / (df['total_depth'] + 1e-10)
    df['activity_intensity'] = (df['buy_qty'] + df['sell_qty']) / (df['volume'] + 1e-10)
    df['log_buy_qty'] = np.log1p(df['buy_qty'])
    df['log_sell_qty'] = np.log1p(df['sell_qty'])
    df['log_bid_qty'] = np.log1p(df['bid_qty'])
    df['log_ask_qty'] = np.log1p(df['ask_qty'])
    # Microstructure Volatility Proxies
    df['realized_spread_proxy'] = 2 * np.abs(df['net_order_flow']) / (df['volume'] + 1e-10)
    df['price_impact_proxy'] = df['net_order_flow'] / (df['total_depth'] + 1e-10)
    df['quote_volatility_proxy'] = np.abs(df['depth_imbalance'])
    # Complex Interaction Terms
    df['flow_depth_interaction'] = df['net_order_flow'] * df['total_depth']
    df['imbalance_volume_interaction'] = df['order_flow_imbalance'] * df['volume']
    df['depth_volume_interaction'] = df['total_depth'] * df['volume']
    df['buy_sell_spread'] = np.abs(df['buy_qty'] - df['sell_qty'])
    df['bid_ask_spread'] = np.abs(df['bid_qty'] - df['ask_qty'])
    # Information Asymmetry Measures
    df['trade_informativeness'] = df['net_order_flow'] / (df['bid_qty'] + df['ask_qty'] + 1e-10)
    df['execution_shortfall_proxy'] = df['buy_sell_spread'] / (df['volume'] + 1e-10)
    df['adverse_selection_proxy'] = df['net_order_flow'] / (df['total_depth'] + 1e-10) * df['volume']
    # Market Efficiency Indicators
    df['fill_probability'] = df['volume'] / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    df['execution_rate'] = (df['buy_qty'] + df['sell_qty']) / (df['total_depth'] + 1e-10)
    df['market_efficiency'] = df['volume'] / (df['bid_ask_spread'] + 1e-10)
    # Non-linear Transformations
    df['sqrt_volume'] = np.sqrt(df['volume'])
    df['sqrt_depth'] = np.sqrt(df['total_depth'])
    df['volume_squared'] = df['volume'] ** 2
    df['imbalance_squared'] = df['order_flow_imbalance'] ** 2
    # Relative Measures
    df['bid_ratio'] = df['bid_qty'] / (df['total_depth'] + 1e-10)
    df['ask_ratio'] = df['ask_qty'] / (df['total_depth'] + 1e-10)
    df['buy_ratio'] = df['buy_qty'] / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    df['sell_ratio'] = df['sell_qty'] / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    # Market Stress Indicators
    df['liquidity_consumption'] = (df['buy_qty'] + df['sell_qty']) / (df['total_depth'] + 1e-10)
    df['market_stress'] = df['volume'] / (df['total_depth'] + 1e-10) * np.abs(df['order_flow_imbalance'])
    df['depth_depletion'] = df['volume'] / (df['bid_qty'] + df['ask_qty'] + 1e-10)
    # Directional Indicators
    df['net_buying_ratio'] = df['net_order_flow'] / (df['volume'] + 1e-10)
    df['directional_volume'] = df['net_order_flow'] * np.log1p(df['volume'])
    df['signed_volume'] = np.sign(df['net_order_flow']) * df['volume']
    # etc
    df['sqrt_volume_div_log_volume'] = df['sqrt_volume'] / (df['log_volume'] + 1e-6)
    df['sqrt_volume_div_activity_intensity'] = df['sqrt_volume'] / (df['activity_intensity'] + 1e-6)
    df['sqrt_volume_mul_fill_probability'] = df['sqrt_volume'] * df['fill_probability']
    df['volume_div_sqrt_volume'] = df['volume'] / (df['sqrt_volume'] + 1e-6)
    df['sqrt_volume_div_fill_probability'] = df['sqrt_volume'] / (df['fill_probability'] + 1e-6)
    df['sqrt_volume_mul_activity_intensity'] = df['sqrt_volume'] * df['activity_intensity']
    df['sqrt_volume_div_log_sell_qty'] = df['sqrt_volume'] / (df['log_sell_qty'] + 1e-6)
    df['log_buy_qty_mul_sqrt_volume'] = df['log_buy_qty'] * df['sqrt_volume']
    df['sqrt_volume_mul_log_buy_qty'] = df['sqrt_volume'] * df['log_buy_qty']
    df['log_volume_mul_sqrt_volume'] = df['log_volume'] * df['sqrt_volume']
    df['log_sell_qty_mul_X598'] = df['log_sell_qty'] * df['X66']
    df['log_buy_qty_mul_X598'] = df['log_buy_qty'] * df['X66']
    df['log_volume_mul_X598'] = df['log_volume'] * df['X66']
    df['sqrt_volume_mul_X856'] = df['sqrt_volume'] * df['X446']
    df['log_sell_qty_mul_X302'] = df['log_sell_qty'] * df['X298']
    df['log_volume_mul_X302'] = df['log_volume'] * df['X298']
    df['log_buy_qty_mul_X302'] = df['log_buy_qty'] * df['X298']
    df['log_sell_qty_mul_X292'] = df['log_sell_qty'] * df['X292']
    # (Recomputed below; identical to the definitions earlier in this function.)
    df['bid_ask_interaction'] = df['bid_qty'] * df['ask_qty']
    df['bid_buy_interaction'] = df['bid_qty'] * df['buy_qty']
    df['bid_sell_interaction'] = df['bid_qty'] * df['sell_qty']
    df['ask_buy_interaction'] = df['ask_qty'] * df['buy_qty']
    df['ask_sell_interaction'] = df['ask_qty'] * df['sell_qty']
    # Clean up: replace infinities from exp/division features, then fill NaNs with 0.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    return df


class Config:
    TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    SUBMISSION_PATH = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
    # Raw columns read from the parquet files (duplicates are removed below).
    FEATURES = [
        "X287", "X446", "X66", "X123", "X385", "X594", "X25", "X3", "X231",
        "X415", "X345", "X37", "X174", "X298", "X178", "X168", "X1",
        "bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume",
        "X210", "X421", "X92", "X465", "X105", "X287", "X19", "X21",
        "X76", "X453", "X293", "X504", "X476", "X486", "X123", "X443",
        "X329", "X79", "X292", "X244", "X329",
    ]
    # Columns actually fed to the model (raw + engineered features).
    SELECTED_FEATURES = [
        "X287", "X446", "X66", "X123", "X385", "X25", "X3", "X231",
        "X415", "X345", "X37", "X174", "X298", "X178", "X168", "X1",
        "buy_qty", "sell_qty", "volume", "X210", "X421", "X92",
        "X292", "X244", "X329",
        "ask_buy_interaction_x_X293", "868xexp_289M125", "exp_786P289M125",
        "exp_856P289M125", "exp_612P868P855P289", "exp_598P868P855P289",
        "exp_855P289M125", "385xexp_289M125", "465x862x465", "540x881",
        "exp_125P862P289M125",
        "bid_ask_interaction", "bid_buy_interaction", "bid_sell_interaction",
        "ask_buy_interaction", "ask_sell_interaction", "log_volume",
        "net_order_flow", "normalized_net_flow", "buying_pressure",
        "volume_weighted_buy", "total_depth", "depth_imbalance",
        "relative_spread", "log_depth", "kyle_lambda", "flow_toxicity",
        "aggressive_flow_ratio", "volume_depth_ratio", "activity_intensity",
        "log_buy_qty", "log_sell_qty", "log_bid_qty", "log_ask_qty",
        "realized_spread_proxy", "price_impact_proxy", "quote_volatility_proxy",
        "flow_depth_interaction", "imbalance_volume_interaction",
        "depth_volume_interaction", "trade_informativeness",
        "execution_shortfall_proxy", "adverse_selection_proxy",
        "fill_probability", "execution_rate", "market_efficiency",
        "sqrt_volume", "sqrt_depth", "volume_squared", "imbalance_squared",
        "bid_ratio", "ask_ratio", "buy_ratio", "sell_ratio",
        "liquidity_consumption", "market_stress", "depth_depletion",
        "net_buying_ratio", "directional_volume", "signed_volume",
        # "sqrt_volume_div_activity_intensity",
        "sqrt_volume_mul_fill_probability",
        "volume_div_sqrt_volume",
        # "sqrt_volume_div_fill_probability",
        # "sqrt_volume_mul_activity_intensity",
        # "sqrt_volume_div_log_sell_qty",
        "log_buy_qty_mul_sqrt_volume",
        "sqrt_volume_mul_log_buy_qty",
        "log_volume_mul_sqrt_volume",
        # "log_sell_qty_mul_X598",
        # "log_buy_qty_mul_X598",
        # "log_volume_mul_X598",
        "sqrt_volume_mul_X856",
        "log_sell_qty_mul_X302",
        "log_volume_mul_X302",
        "log_buy_qty_mul_X302",
        "log_sell_qty_mul_X292",
    ]
    LABEL_COLUMN = "X683"
    N_FOLDS = 3
    RANDOM_STATE = 42


XGB_PARAMS = {
    "tree_method": "hist",
    "device": "gpu",
    "colsample_bylevel": 0.4778,
    "colsample_bynode": 0.3628,
    "colsample_bytree": 0.7107,
    "gamma": 1.7095,
    "learning_rate": 0.02213,
    "max_depth": 20,
    "max_leaves": 12,
    "min_child_weight": 16,
    "n_estimators": 1667,
    "subsample": 0.06567,
    "reg_alpha": 39.3524,
    "reg_lambda": 75.4484,
    "verbosity": 0,
    "random_state": Config.RANDOM_STATE,
    "n_jobs": -1,
}

LEARNERS = [
    {"name": "xgb", "Estimator": XGBRegressor, "params": XGB_PARAMS},
]


def create_time_decay_weights(n: int, decay: float = 0.9) -> np.ndarray:
    # Geometric time-decay weights: oldest sample -> decay, newest -> 1.0,
    # then rescaled so the mean weight is 1.
    positions = np.arange(n)
    normalized = positions / (n - 1)
    weights = decay ** (1.0 - normalized)
    return weights * n / weights.sum()


def load_data():
    train_df = pd.read_parquet(Config.TRAIN_PATH, columns=Config.FEATURES + [Config.LABEL_COLUMN])
    test_df = pd.read_parquet(Config.TEST_PATH, columns=Config.FEATURES)
    submission_df = pd.read_csv(Config.SUBMISSION_PATH)
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)
    print(f"Loaded data - Train: {train_df.shape}, Test: {test_df.shape}, Submission: {submission_df.shape}")
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True), submission_df


# Config.FEATURES += ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]
Config.FEATURES = list(set(Config.FEATURES))  # remove duplicates


# ---- Data slicing ----
def get_model_slices(n_samples: int):
    # Training windows defined by a cutoff index; each is duplicated with an
    # "*_outlier_adj" variant flag.
    base_slices = [
        {"name": "full_data", "cutoff": 0, "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_90pct", "cutoff": int(0.10 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_85pct", "cutoff": int(0.15 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_80pct", "cutoff": int(0.20 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_50pct", "cutoff": int(0.50 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "oldest_25pct", "cutoff": int(0.25 * n_samples), "is_oldest": True, "outlier_adjusted": False},
    ]
    # Duplicate slices with outlier adjustment
    outlier_adjusted_slices = []
    for slice_info in base_slices:
        adjusted_slice = slice_info.copy()
        adjusted_slice["name"] = f"{slice_info['name']}_outlier_adj"
        adjusted_slice["outlier_adjusted"] = True
        outlier_adjusted_slices.append(adjusted_slice)
    return base_slices + outlier_adjusted_slices


def train_and_evaluate(train_df, test_df):
    # Trains one model per (learner, slice, fold) and collects OOF / test predictions.
    n_samples = len(train_df)
    model_slices = get_model_slices(n_samples)
    oof_preds = {
        learner["name"]: {s["name"]: np.zeros(n_samples) for s in model_slices}
        for learner in LEARNERS
    }
    test_preds = {
        learner["name"]: {s["name"]: np.zeros(len(test_df)) for s in model_slices}
        for learner in LEARNERS
    }
    # Dictionary for storing trained models (learner_name -> slice_name -> list of models per fold)
    trained_models = {
        learner["name"]: {s["name"]: [] for s in model_slices}
        for learner in LEARNERS
    }
    full_weights = create_time_decay_weights(n_samples)
    kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1):
        print(f"\n--- Fold {fold}/{Config.N_FOLDS} ---")
        X_valid = train_df.iloc[valid_idx][Config.SELECTED_FEATURES]
        y_valid = train_df.iloc[valid_idx][Config.LABEL_COLUMN]
        for s in model_slices:
            cutoff = s["cutoff"]
            slice_name = s["name"]
            subset = train_df.iloc[cutoff:].reset_index(drop=True)
            rel_idx = train_idx[train_idx >= cutoff] - cutoff
            X_train = subset.iloc[rel_idx][Config.SELECTED_FEATURES]
            y_train = subset.iloc[rel_idx][Config.LABEL_COLUMN]
            sw = create_time_decay_weights(len(subset))[rel_idx] if cutoff > 0 else full_weights[train_idx]
            print(f" Training slice: {slice_name}, samples: {len(X_train)}")
            for learner in LEARNERS:
                model = learner["Estimator"](**learner["params"])
                model.fit(X_train, y_train, sample_weight=sw, eval_set=[(X_valid, y_valid)], verbose=False)
                # Store the trained model
                trained_models[learner["name"]][slice_name].append(model)
                mask = valid_idx >= cutoff
                if mask.any():
                    idxs = valid_idx[mask]
                    oof_preds[learner["name"]][slice_name][idxs] = model.predict(train_df.iloc[idxs][Config.SELECTED_FEATURES])
                if cutoff > 0 and (~mask).any():
                    oof_preds[learner["name"]][slice_name][valid_idx[~mask]] = oof_preds[learner["name"]]["full_data"][valid_idx[~mask]]
                test_preds[learner["name"]][slice_name] += model.predict(test_df[Config.SELECTED_FEATURES])
    # Normalize test predictions
    for learner_name in test_preds:
        for slice_name in test_preds[learner_name]:
            test_preds[learner_name][slice_name] /= Config.N_FOLDS
    return oof_preds, test_preds, model_slices, trained_models


manual_weights = {
    "full_data": 1,
    "last_90pct": 1,
    "last_85pct": 1,
    "last_80pct": 1,
    "last_50pct": 1,
    "oldest_25pct": 1,
}


def ensemble_and_submit(train_df, oof_preds, test_preds, submission_df, manual_weights=None):
    learner_ensembles = {}
    # Set per-slice weights
    weights = manual_weights if manual_weights is not None else {s: 1.0 for s in next(iter(oof_preds.values())).keys()}
    total_weight = sum(weights.values())
    for learner_name in oof_preds:
        oof_weighted = sum(weights[s] / total_weight * oof_preds[learner_name][s]
                           for s in weights if s in oof_preds[learner_name])
        test_weighted = sum(weights[s] / total_weight * test_preds[learner_name][s]
                            for s in weights if s in test_preds[learner_name])
        score_weighted = pearsonr(train_df[Config.LABEL_COLUMN], oof_weighted)[0]
        print(f"{learner_name.upper()} Weighted Ensemble Pearson: {score_weighted:.4f}")
        learner_ensembles[learner_name] = {
            "oof_weighted": oof_weighted,
            "test_weighted": test_weighted,
        }
    # Average across learners
    final_oof = np.mean([le["oof_weighted"] for le in learner_ensembles.values()], axis=0)
    final_test = np.mean([le["test_weighted"] for le in learner_ensembles.values()], axis=0)
    final_score = pearsonr(train_df[Config.LABEL_COLUMN], final_oof)[0]
    print(f"\nFINAL ensemble across learners (weighted): {final_score:.4f}")
    submission_df["prediction"] = final_test
    submission_df.to_csv("submission.csv", index=False)
    print("Saved: submission.csv")


if __name__ == "__main__":
    train_df, test_df, submission_df = load_data()
    oof_preds, test_preds, model_slices, trained_models = train_and_evaluate(train_df, test_df)
    ensemble_and_submit(train_df, oof_preds, test_preds, submission_df, manual_weights)
Feature Engineering Techniques Explained
An analysis of the core code extracted from the notebook shows that the script covers four main areas: feature engineering, data preprocessing, model-training tricks, and model ensembling. A detailed summary follows:
🧩 I. Feature Engineering
1. Feature construction approach
The code defines a feature_engineering(df) function that augments the input features with a large number of nonlinear combinations and interaction terms. The main ideas are:
- Exponential features: a large number of np.exp() features are constructed, for example:
  df['exp_856P868P855P289'] = np.exp(df['X446'] + df['X76'] + df['X37'] + df['X287'])
  👉 Intent: amplify nonlinear relationships and make the model more sensitive to multiplicative growth between features.
- Difference and product interaction terms, for example:
  df['868xexp_289M125'] = df['X76'] * np.exp(df['X287'] - df['X19'])
  df['852x868x855x289'] = df['X594'] * df['X76'] * df['X37'] * df['X287']
  👉 Shows a strong focus on cross-feature effects (core variables such as X76, X37 and X287 appear again and again).
- Higher-order polynomial combinations, such as:
  df['465x862x465'] = df['X465'] * df['X465'] * df['X123']
  👉 An attempt to strengthen nonlinear relationships through repeated powers of the same variable.
2. Variable selection pattern
- Only a subset of variables (X37, X76, X287, X19, X465, etc.) is reused to build the high-value engineered features.
- These core features were most likely short-listed through correlation analysis or model feature importance; a hypothetical sketch of such a screening step follows.
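The notebook does not show how the short list was produced, so the following is only an illustrative sketch of an importance-based screening step; the function name rank_features_by_importance, the top_k parameter and the model settings are assumptions, not part of the original solution.

import pandas as pd
from xgboost import XGBRegressor

def rank_features_by_importance(train_df: pd.DataFrame, feature_cols, label_col, top_k=40):
    # Fit a quick baseline model on the raw columns only (hypothetical settings).
    model = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.05, random_state=42)
    model.fit(train_df[feature_cols], train_df[label_col])
    # Rank columns by XGBoost's built-in importance and keep the strongest ones.
    importance = pd.Series(model.feature_importances_, index=list(feature_cols))
    return importance.sort_values(ascending=False).head(top_k)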
🧹 II. Data Preprocessing
From the structure of the script:
- Scaling / missing values: there is no explicit standardization step, which is fine because the tree models used (XGBoost, LightGBM) handle raw numeric scales directly. At the end of feature_engineering(), infinities are replaced with NaN and all NaNs are filled with 0.
- K-Fold cross-validation:
  from sklearn.model_selection import KFold
  kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)
  👉 Used for a robust estimate of generalization. Note that the script uses 3 sequential (unshuffled) folds, which respects the time ordering of the data.
- Data splitting: training and validation run fold by fold through the loop for train_idx, valid_idx in kf.split(train_df); the actual loop from the script is shown below.
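For reference, this is the fold loop as it appears in train_and_evaluate(), condensed; the per-slice training inside the loop is omitted here.

kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1):
    # Validation block for this fold, evaluated on the engineered feature set
    X_valid = train_df.iloc[valid_idx][Config.SELECTED_FEATURES]
    y_valid = train_df.iloc[valid_idx][Config.LABEL_COLUMN]
    ...  # per-slice training (see the full script above)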
⚙️ III. Model Training Tricks
The code applies several modeling and optimization tricks:
1. Model types
- XGBoost (XGBRegressor)
- LightGBM (LGBMRegressor): imported, but only the XGBoost learner is actually registered in LEARNERS in the final script.
Both are gradient-boosted tree models, well suited to nonlinear features and complex interaction terms.
2. Hyperparameter choices
No automatic tuning tool appears in the script itself, but the key parameters are set explicitly in the module-level XGB_PARAMS dictionary (their precision suggests they likely came from an earlier tuning run); the main entries are listed below.
👉 The pattern is a low learning rate with many estimators, aggressive row/column subsampling, and strong L1/L2 regularization, i.e. a deliberately conservative, overfitting-resistant training strategy.
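For reference, these are the key entries of XGB_PARAMS as defined in the script above:

XGB_PARAMS = {
    "tree_method": "hist", "device": "gpu",
    "learning_rate": 0.02213, "n_estimators": 1667,
    "max_depth": 20, "max_leaves": 12, "min_child_weight": 16,
    "subsample": 0.06567, "colsample_bytree": 0.7107,
    "colsample_bylevel": 0.4778, "colsample_bynode": 0.3628,
    "gamma": 1.7095, "reg_alpha": 39.3524, "reg_lambda": 75.4484,
    "verbosity": 0, "random_state": Config.RANDOM_STATE, "n_jobs": -1,
}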
3. Evaluation metric
The script uses:
from scipy.stats import pearsonr
i.e. the Pearson correlation coefficient between predictions and the target, rather than the more traditional RMSE or MAE. This is common in finance and signal-prediction tasks, where what matters is the correlation between the predicted and realized signal; the snippet below shows how it is applied to the out-of-fold predictions.
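This is how the script scores the weighted out-of-fold ensemble inside ensemble_and_submit():

score_weighted = pearsonr(train_df[Config.LABEL_COLUMN], oof_weighted)[0]
print(f"{learner_name.upper()} Weighted Ensemble Pearson: {score_weighted:.4f}")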
🧠 IV. Model Ensembling
The latter part of the script blends many models. It is a two-level weighted-averaging ensemble rather than a trained stacking meta-model; the structure is:
- First level (base models): one XGBoost model per data slice per fold (12 slices x 3 folds per learner). The out-of-fold predictions and test predictions of each slice are stored separately.
- Second level (combiner): instead of training a meta-regressor on the first-level predictions (classic stacking would look like meta_model.fit(oof_train, y_train); final_pred = meta_model.predict(oof_test)), the script combines the per-slice predictions with fixed manual weights and then averages across learners.
- Why ensemble at all:
  - Combines what models trained on different time windows learn.
  - Reduces the risk of any single model overfitting.
  - Improves overall stability and generalization (the actual weighting code is shown below).
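The per-slice weighted averaging, as written in ensemble_and_submit():

weights = manual_weights if manual_weights is not None else {s: 1.0 for s in next(iter(oof_preds.values())).keys()}
total_weight = sum(weights.values())
oof_weighted = sum(weights[s] / total_weight * oof_preds[learner_name][s]
                   for s in weights if s in oof_preds[learner_name])
test_weighted = sum(weights[s] / total_weight * test_preds[learner_name][s]
                    for s in weights if s in test_preds[learner_name])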
🧾 Summary Table
Module | Method / Trick | Notes |
---|---|---|
Feature engineering | Exponential transforms, product interactions, difference terms, power combinations | Strengthen nonlinearity and high-order interactions |
Data preprocessing | 3-fold sequential KFold cross-validation, no explicit normalization | Robust evaluation; tree models handle raw feature scales |
Model training | XGBoost with a low learning rate and strong regularization | Controls overfitting, improves robustness |
Model ensembling | Weighted average over time slices, then mean across learners | Blending multiple models/windows improves performance |
Evaluation metric | Pearson correlation coefficient (pearsonr) | Emphasizes linear correlation between predictions and the target |
Training Pipeline
The functions extracted from the notebook show a complete, well-structured code pipeline. The main functions are:
- load_data() → data loading
- feature_engineering() → feature engineering
- create_time_decay_weights() → time-decay sample weights
- get_model_slices() → data slicing / training-window preparation
- train_and_evaluate() → model training and validation
- ensemble_and_submit() → ensembling and submission output
- get_xgb_config_dict() → model parameter configuration (in the final script above this role is played by the module-level XGB_PARAMS dictionary)
- flatten() → data flattening helper (appears in the notebook but not in the final script shown above)
🧭 I. End-to-End Logic
Below is the complete data flow and training logic from start to finish 👇
1️⃣ Data loading
Function: load_data()
- Reads the training and test sets from Parquet files (only the columns in Config.FEATURES plus the label column) and the sample submission from CSV.
- Applies feature_engineering() to both train and test before returning them, and resets the row index.
- Output: train_df, test_df, submission_df (the function body is shown below).
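For reference, load_data() as defined in the script (the shape print statement is omitted):

def load_data():
    train_df = pd.read_parquet(Config.TRAIN_PATH, columns=Config.FEATURES + [Config.LABEL_COLUMN])
    test_df = pd.read_parquet(Config.TEST_PATH, columns=Config.FEATURES)
    submission_df = pd.read_csv(Config.SUBMISSION_PATH)
    # Feature engineering is applied once to each frame right after loading.
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True), submission_df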
2️⃣ Feature engineering
Function: feature_engineering(df)
- Expands the raw features:
  - Exponential combinations, product interactions, difference terms, and higher-order power features.
  - Core features (X37, X76, X287, X19, etc.) are combined repeatedly to strengthen their nonlinear relationships.
  - Microstructure-style features (order-flow imbalance, depth, Kyle's lambda proxy, and so on) are built from bid_qty, ask_qty, buy_qty, sell_qty and volume.
- Output: the same df with the new columns appended (infinities and NaNs replaced with 0).
3️⃣ Time-decay sample weights
Function: create_time_decay_weights(n, decay=0.9)
- Assigns each sample a weight that decays geometrically with age: the newest sample gets decay**0 = 1.0 and the oldest gets decay**1 = 0.9, after which the weights are rescaled to have mean 1.
- Passed as sample_weight during model fitting so that recent samples count more (appropriate for time-series prediction).
- Output: a NumPy array of per-sample weights (the actual implementation is shown below).
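The actual implementation from the script:

def create_time_decay_weights(n: int, decay: float = 0.9) -> np.ndarray:
    positions = np.arange(n)
    normalized = positions / (n - 1)
    weights = decay ** (1.0 - normalized)   # oldest sample -> decay, newest -> 1.0
    return weights * n / weights.sum()      # rescale so the mean weight is 1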
4️⃣ Data slicing
Function: get_model_slices(n_samples)
- This function does not perform the KFold split itself (that happens inside train_and_evaluate() with KFold(n_splits=3, shuffle=False)). Instead it defines a set of training windows by cutoff index: the full data, the most recent 90%, 85%, 80% and 50% of rows, and an oldest_25pct slice.
- Each slice is duplicated with an "_outlier_adj" variant, giving 12 slices in total (the flag is carried along but not acted on in the training code shown).
- Purpose: train one model per window so the ensemble mixes long-history and recent-history views of the data (the base slice definitions are shown below).
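The base slice definitions from get_model_slices(), condensed (the outlier_adjusted flag is dropped here for brevity):

base_slices = [
    {"name": "full_data",    "cutoff": 0,                     "is_oldest": False},
    {"name": "last_90pct",   "cutoff": int(0.10 * n_samples), "is_oldest": False},
    {"name": "last_85pct",   "cutoff": int(0.15 * n_samples), "is_oldest": False},
    {"name": "last_80pct",   "cutoff": int(0.20 * n_samples), "is_oldest": False},
    {"name": "last_50pct",   "cutoff": int(0.50 * n_samples), "is_oldest": False},
    {"name": "oldest_25pct", "cutoff": int(0.25 * n_samples), "is_oldest": True},
]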
5️⃣ Model training and evaluation
Function: train_and_evaluate()
Core steps:
- Instantiates each learner in LEARNERS with its parameter dict (in the final script that is a single XGBRegressor configured with XGB_PARAMS; LGBMRegressor is imported but not registered).
- Within each of the 3 folds, and for each of the 12 data slices:
  - Trains a model on the slice's portion of the training fold, using the time-decay sample weights.
  - Writes the model's predictions for the fold's validation rows into that slice's out-of-fold (OOF) array, and accumulates its test-set predictions (later divided by the number of folds).
- The Pearson correlation, pearsonr(y_true, y_pred)[0], is computed afterwards on the blended OOF predictions.
- Final output: per-learner, per-slice OOF predictions, fold-averaged test predictions, and the trained models (a condensed view of the inner loop is shown below).
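A condensed view of the inner loop of train_and_evaluate(), taken from the script with the OOF bookkeeping omitted:

for s in model_slices:
    cutoff = s["cutoff"]
    subset = train_df.iloc[cutoff:].reset_index(drop=True)
    rel_idx = train_idx[train_idx >= cutoff] - cutoff
    X_train = subset.iloc[rel_idx][Config.SELECTED_FEATURES]
    y_train = subset.iloc[rel_idx][Config.LABEL_COLUMN]
    # Time-decay weights are recomputed for the slice, or reused for the full data
    sw = create_time_decay_weights(len(subset))[rel_idx] if cutoff > 0 else full_weights[train_idx]
    for learner in LEARNERS:
        model = learner["Estimator"](**learner["params"])
        model.fit(X_train, y_train, sample_weight=sw, eval_set=[(X_valid, y_valid)], verbose=False)
        test_preds[learner["name"]][s["name"]] += model.predict(test_df[Config.SELECTED_FEATURES])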
6️⃣ Ensembling and submission
Function: ensemble_and_submit()
The blending logic is:
- Collect the per-slice OOF and test predictions produced during training.
- Combine the slices with the fixed manual_weights dictionary (all weights equal to 1 here, i.e. a simple average) instead of training a second-level meta-model. Because manual_weights only lists the six base slices, the "_outlier_adj" duplicates are effectively excluded from the blend.
- Average the weighted predictions across learners, report the OOF Pearson score, and write the submission file.
Output: submission.csv (the final lines of the function are shown below).
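The final blending and submission lines from ensemble_and_submit():

final_oof = np.mean([le["oof_weighted"] for le in learner_ensembles.values()], axis=0)
final_test = np.mean([le["test_weighted"] for le in learner_ensembles.values()], axis=0)
final_score = pearsonr(train_df[Config.LABEL_COLUMN], final_oof)[0]
submission_df["prediction"] = final_test
submission_df.to_csv("submission.csv", index=False)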
⚙️ II. Overall Execution Flow
In pseudocode:
def main():
    # load_data() internally applies feature_engineering() to both train and test
    train_df, test_df, submission_df = load_data()
    # train_and_evaluate() internally calls get_model_slices() and create_time_decay_weights()
    oof_preds, test_preds, model_slices, trained_models = train_and_evaluate(train_df, test_df)
    # Weighted averaging across slices and learners, then writes submission.csv
    ensemble_and_submit(train_df, oof_preds, test_preds, submission_df, manual_weights)
📊 III. Data Flow and Training Pipeline Diagram
┌──────────────────────────┐
│       Data loading       │
│       load_data()        │
└────────────┬─────────────┘
             │
             ▼
┌──────────────────────────┐
│   Feature engineering    │
│  feature_engineering()   │
└────────────┬─────────────┘
             │
             ▼
┌──────────────────────────┐
│    Time-decay weights    │
│ create_time_decay_weights│
└────────────┬─────────────┘
             │
             ▼
┌──────────────────────────┐
│  Slices + KFold split    │
│    get_model_slices()    │
└────────────┬─────────────┘
             │
             ▼
┌──────────────────────────┐
│  Model training & eval   │
│   train_and_evaluate()   │
│ → XGBoost per slice/fold │
│ → Pearson correlation    │
└────────────┬─────────────┘
             │
             ▼
┌──────────────────────────┐
│  Ensemble & submission   │
│  ensemble_and_submit()   │
│ → weighted slice average │
│ → write submission.csv   │
└──────────────────────────┘
✅ IV. Key Takeaways
Stage | Function | Main role | Key technique |
---|---|---|---|
Data loading | load_data | Read raw Parquet/CSV data | pandas |
Feature engineering | feature_engineering | Build complex nonlinear interaction features | Exponential transforms, product terms |
Time weighting | create_time_decay_weights | Time-decay sample weights | Geometric decay weights |
Data slicing | get_model_slices | Training windows combined with 3-fold CV | Guards against overfitting |
Model training | train_and_evaluate | XGBoost per slice and fold | Cross-validated multi-model training |
Ensembling | ensemble_and_submit | Weighted average over slices and learners | Blending for robustness |