
DRW Kaggle Competition: Regression Solution, Part 2

Source code

final_solution

import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr


def feature_engineering(df):
    # 10
    df['exp_856P868P855P289'] = np.exp(df['X446'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_860P868P855P289'] = np.exp(df['X3'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_598P868P855P289'] = np.exp(df['X66'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_612P868P855P289'] = np.exp(df['X1'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_289P855P21'] = np.exp(df['X287'] + df['X37'] + df['X21'])
    df['868xexp_289M125'] = df['X76'] * np.exp(df['X287'] - df['X19'])
    # 9
    df['exp_603P868P855P289'] = np.exp(df['X25'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_174P868P855P289'] = np.exp(df['X174'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_465P868P855P289'] = np.exp(df['X465'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_125P862P289M125'] = np.exp(df['X19'] + df['X123'] + df['X287'] - df['X19'])
    df['exp_168P868P855P289'] = np.exp(df['X168'] + df['X76'] + df['X37'] + df['X287'])
    df['exp_855P289M125'] = np.exp(df['X37'] + df['X287'] - df['X19'])
    df['exp_302P289M125'] = np.exp(df['X298'] + df['X287'] - df['X19'])
    df['289xexp_289M125'] = df['X287'] * np.exp(df['X287'] - df['X19'])
    # 8
    df['exp_862P868P855P289'] = np.exp(df['X123'] + df['X76'] + df['X37'] + df['X287'])
    df['868x868x855x289'] = df['X76'] * df['X76'] * df['X37'] * df['X287']
    df['385xexp_289M125'] = df['X385'] * np.exp(df['X287'] - df['X19'])
    df['exp_862P289M125'] = np.exp(df['X123'] + df['X287'] - df['X19'])
    df['exp_786P289M125'] = np.exp(df['X453'] + df['X287'] - df['X19'])
    df['exp_856P289M125'] = np.exp(df['X446'] + df['X287'] - df['X19'])
    df['852x868x855x289'] = df['X594'] * df['X76'] * df['X37'] * df['X287']
    df['465x862x465'] = df['X465'] * df['X465'] * df['X123']
    df['540x881'] = df['X476'] * df['X443']
    df['bid_ask_interaction'] = df['ask_qty'] * df['ask_qty']
    df['bid_buy_interaction'] = df['ask_qty'] * df['buy_qty']
    df['bid_sell_interaction'] = df['ask_qty'] * df['sell_qty']
    df['ask_buy_interaction'] = df['ask_qty'] * df['buy_qty']
    df['ask_sell_interaction'] = df['ask_qty'] * df['sell_qty']
    df['volume_weighted_sell'] = df['sell_qty'] * df['volume']
    df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + 1e-10)
    df['selling_pressure'] = df['sell_qty'] / (df['volume'] + 1e-10)
    df['log_volume'] = np.log1p(df['volume'])
    df['effective_spread_proxy'] = np.abs(df['buy_qty'] - df['sell_qty']) / (df['volume'] + 1e-10)
    df['bid_ask_imbalance'] = (df['ask_qty'] - df['ask_qty']) / (df['ask_qty'] + df['ask_qty'] + 1e-10)
    df['order_flow_imbalance'] = (df['buy_qty'] - df['sell_qty']) / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    df['liquidity_ratio'] = (df['ask_qty'] + df['ask_qty']) / (df['volume'] + 1e-10)
    df['ask_buy_interaction_x_X293'] = df['X293'] * df['ask_buy_interaction']
    # Price Pressure Indicators
    df['net_order_flow'] = df['buy_qty'] - df['sell_qty']
    df['normalized_net_flow'] = df['net_order_flow'] / (df['volume'] + 1e-10)
    df['buying_pressure'] = df['buy_qty'] / (df['volume'] + 1e-10)
    df['volume_weighted_buy'] = df['buy_qty'] * df['volume']
    # Liquidity Depth Measures
    df['total_depth'] = df['ask_qty'] + df['ask_qty']
    df['depth_imbalance'] = (df['ask_qty'] - df['ask_qty']) / (df['total_depth'] + 1e-10)
    df['relative_spread'] = np.abs(df['ask_qty'] - df['ask_qty']) / (df['total_depth'] + 1e-10)
    df['log_depth'] = np.log1p(df['total_depth'])
    # Order Flow Toxicity Proxies
    df['kyle_lambda'] = np.abs(df['net_order_flow']) / (df['volume'] + 1e-10)
    df['flow_toxicity'] = np.abs(df['order_flow_imbalance']) * df['volume']
    df['aggressive_flow_ratio'] = (df['buy_qty'] + df['sell_qty']) / (df['total_depth'] + 1e-10)
    # Market Activity Indicators
    df['volume_depth_ratio'] = df['volume'] / (df['total_depth'] + 1e-10)
    df['activity_intensity'] = (df['buy_qty'] + df['sell_qty']) / (df['volume'] + 1e-10)
    df['log_buy_qty'] = np.log1p(df['buy_qty'])
    df['log_sell_qty'] = np.log1p(df['sell_qty'])
    df['log_bid_qty'] = np.log1p(df['ask_qty'])
    df['log_ask_qty'] = np.log1p(df['ask_qty'])
    # Microstructure Volatility Proxies
    df['realized_spread_proxy'] = 2 * np.abs(df['net_order_flow']) / (df['volume'] + 1e-10)
    df['price_impact_proxy'] = df['net_order_flow'] / (df['total_depth'] + 1e-10)
    df['quote_volatility_proxy'] = np.abs(df['depth_imbalance'])
    # Complex Interaction Terms
    df['flow_depth_interaction'] = df['net_order_flow'] * df['total_depth']
    df['imbalance_volume_interaction'] = df['order_flow_imbalance'] * df['volume']
    df['depth_volume_interaction'] = df['total_depth'] * df['volume']
    df['buy_sell_spread'] = np.abs(df['buy_qty'] - df['sell_qty'])
    df['bid_ask_spread'] = np.abs(df['ask_qty'] - df['ask_qty'])
    # Information Asymmetry Measures
    df['trade_informativeness'] = df['net_order_flow'] / (df['ask_qty'] + df['ask_qty'] + 1e-10)
    df['execution_shortfall_proxy'] = df['buy_sell_spread'] / (df['volume'] + 1e-10)
    df['adverse_selection_proxy'] = df['net_order_flow'] / (df['total_depth'] + 1e-10) * df['volume']
    # Market Efficiency Indicators
    df['fill_probability'] = df['volume'] / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    df['execution_rate'] = (df['buy_qty'] + df['sell_qty']) / (df['total_depth'] + 1e-10)
    df['market_efficiency'] = df['volume'] / (df['bid_ask_spread'] + 1e-10)
    # Non-linear Transformations
    df['sqrt_volume'] = np.sqrt(df['volume'])
    df['sqrt_depth'] = np.sqrt(df['total_depth'])
    df['volume_squared'] = df['volume'] ** 2
    df['imbalance_squared'] = df['order_flow_imbalance'] ** 2
    # Relative Measures
    df['bid_ratio'] = df['ask_qty'] / (df['total_depth'] + 1e-10)
    df['ask_ratio'] = df['ask_qty'] / (df['total_depth'] + 1e-10)
    df['buy_ratio'] = df['buy_qty'] / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    df['sell_ratio'] = df['sell_qty'] / (df['buy_qty'] + df['sell_qty'] + 1e-10)
    # Market Stress Indicators
    df['liquidity_consumption'] = (df['buy_qty'] + df['sell_qty']) / (df['total_depth'] + 1e-10)
    df['market_stress'] = df['volume'] / (df['total_depth'] + 1e-10) * np.abs(df['order_flow_imbalance'])
    df['depth_depletion'] = df['volume'] / (df['ask_qty'] + df['ask_qty'] + 1e-10)
    # Directional Indicators
    df['net_buying_ratio'] = df['net_order_flow'] / (df['volume'] + 1e-10)
    df['directional_volume'] = df['net_order_flow'] * np.log1p(df['volume'])
    df['signed_volume'] = np.sign(df['net_order_flow']) * df['volume']
    # etc
    df['sqrt_volume_div_log_volume'] = df['sqrt_volume'] / (df['log_volume'] + 1e-6)
    df['sqrt_volume_div_activity_intensity'] = df['sqrt_volume'] / (df['activity_intensity'] + 1e-6)
    df['sqrt_volume_mul_fill_probability'] = df['sqrt_volume'] * df['fill_probability']
    df['volume_div_sqrt_volume'] = df['volume'] / (df['sqrt_volume'] + 1e-6)
    df['sqrt_volume_div_fill_probability'] = df['sqrt_volume'] / (df['fill_probability'] + 1e-6)
    df['sqrt_volume_mul_activity_intensity'] = df['sqrt_volume'] * df['activity_intensity']
    df['sqrt_volume_div_log_sell_qty'] = df['sqrt_volume'] / (df['log_sell_qty'] + 1e-6)
    df['log_buy_qty_mul_sqrt_volume'] = df['log_buy_qty'] * df['sqrt_volume']
    df['sqrt_volume_mul_log_buy_qty'] = df['sqrt_volume'] * df['log_buy_qty']
    df['log_volume_mul_sqrt_volume'] = df['log_volume'] * df['sqrt_volume']
    df['log_sell_qty_mul_X598'] = df['log_sell_qty'] * df['X66']
    df['log_buy_qty_mul_X598'] = df['log_buy_qty'] * df['X66']
    df['log_volume_mul_X598'] = df['log_volume'] * df['X66']
    df['sqrt_volume_mul_X856'] = df['sqrt_volume'] * df['X446']
    df['log_sell_qty_mul_X302'] = df['log_sell_qty'] * df['X298']
    df['log_volume_mul_X302'] = df['log_volume'] * df['X298']
    df['log_buy_qty_mul_X302'] = df['log_buy_qty'] * df['X298']
    df['log_sell_qty_mul_X292'] = df['log_sell_qty'] * df['X292']
    df['bid_ask_interaction'] = df['ask_qty'] * df['ask_qty']
    df['bid_buy_interaction'] = df['ask_qty'] * df['buy_qty']
    df['bid_sell_interaction'] = df['ask_qty'] * df['sell_qty']
    df['ask_buy_interaction'] = df['ask_qty'] * df['buy_qty']
    df['ask_sell_interaction'] = df['ask_qty'] * df['sell_qty']
    # Clean up overflow/invalid values produced by the exp and ratio features
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    return df


class Config:
    TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    SUBMISSION_PATH = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
    FEATURES = [
        "X287", "X446", "X66", "X123", "X385", "X594", "X25", "X3", "X231",
        "X415", "X345", "X37", "X174", "X298", "X178", "X168", "X1",
        "ask_qty", "ask_qty", "buy_qty", "sell_qty", "volume", "X210", "X421", "X92",
        'X465', 'X105', 'X287', 'X19', 'X21', "X76", "X453", "X293", "X504",
        'X476', 'X486', 'X123', 'X443', 'X329', 'X79', "X292", "X244", "X329"
    ]
    SELECTED_FEATURES = [
        "X287", "X446", "X66", "X123", "X385", "X25", "X3", "X231",
        "X415", "X345", "X37", "X174", "X298", "X178", "X168", "X1",
        "buy_qty", "sell_qty", "volume", "X210", "X421", "X92", "X292", "X244", "X329",
        'ask_buy_interaction_x_X293', '868xexp_289M125', 'exp_786P289M125', 'exp_856P289M125',
        'exp_612P868P855P289', 'exp_598P868P855P289', 'exp_855P289M125', '385xexp_289M125',
        '465x862x465', '540x881', 'exp_125P862P289M125',
        'bid_ask_interaction', 'bid_buy_interaction', 'bid_sell_interaction', 'ask_buy_interaction',
        'ask_sell_interaction', "log_volume", 'net_order_flow', 'normalized_net_flow',
        'buying_pressure', 'volume_weighted_buy', 'total_depth', 'depth_imbalance',
        'relative_spread', 'log_depth', 'kyle_lambda', 'flow_toxicity', 'aggressive_flow_ratio',
        'volume_depth_ratio', 'activity_intensity', 'log_buy_qty', 'log_sell_qty',
        'log_bid_qty', 'log_ask_qty', 'realized_spread_proxy', 'price_impact_proxy',
        'quote_volatility_proxy', 'flow_depth_interaction', 'imbalance_volume_interaction',
        'depth_volume_interaction', 'trade_informativeness',
        'execution_shortfall_proxy', 'adverse_selection_proxy', 'fill_probability',
        'execution_rate', 'market_efficiency', 'sqrt_volume', 'sqrt_depth', 'volume_squared',
        'imbalance_squared', 'bid_ratio', 'ask_ratio', 'buy_ratio', 'sell_ratio',
        'liquidity_consumption', 'market_stress', 'depth_depletion', 'net_buying_ratio',
        'directional_volume', 'signed_volume',
        # "sqrt_volume_div_activity_intensity",
        "sqrt_volume_mul_fill_probability",
        "volume_div_sqrt_volume",
        # "sqrt_volume_div_fill_probability",
        # "sqrt_volume_mul_activity_intensity",
        # "sqrt_volume_div_log_sell_qty",
        "log_buy_qty_mul_sqrt_volume",
        "sqrt_volume_mul_log_buy_qty",
        "log_volume_mul_sqrt_volume",
        # "log_sell_qty_mul_X598",
        # "log_buy_qty_mul_X598",
        # "log_volume_mul_X598",
        "sqrt_volume_mul_X856",
        "log_sell_qty_mul_X302",
        "log_volume_mul_X302",
        "log_buy_qty_mul_X302",
        "log_sell_qty_mul_X292"
    ]
    LABEL_COLUMN = "X683"
    N_FOLDS = 3
    RANDOM_STATE = 42
    XGB_PARAMS = {
        "tree_method": "hist",
        "device": "gpu",
        "colsample_bylevel": 0.4778,
        "colsample_bynode": 0.3628,
        "colsample_bytree": 0.7107,
        "gamma": 1.7095,
        "learning_rate": 0.02213,
        "max_depth": 20,
        "max_leaves": 12,
        "min_child_weight": 16,
        "n_estimators": 1667,
        "subsample": 0.06567,
        "reg_alpha": 39.3524,
        "reg_lambda": 75.4484,
        "verbosity": 0,
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
    }


LEARNERS = [
    {"name": "xgb", "Estimator": XGBRegressor, "params": Config.XGB_PARAMS},
]


def create_time_decay_weights(n: int, decay: float = 0.9) -> np.ndarray:
    positions = np.arange(n)
    normalized = positions / (n - 1)
    weights = decay ** (1.0 - normalized)
    return weights * n / weights.sum()


def load_data():
    train_df = pd.read_parquet(Config.TRAIN_PATH, columns=Config.FEATURES + [Config.LABEL_COLUMN])
    test_df = pd.read_parquet(Config.TEST_PATH, columns=Config.FEATURES)
    submission_df = pd.read_csv(Config.SUBMISSION_PATH)
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)
    print(f"Loaded data - Train: {train_df.shape}, Test: {test_df.shape}, Submission: {submission_df.shape}")
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True), submission_df


# Config.FEATURES += ["ask_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]
Config.FEATURES = list(set(Config.FEATURES))  # remove duplicates


## Data slicing
def get_model_slices(n_samples: int):
    base_slices = [
        {"name": "full_data", "cutoff": 0, "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_90pct", "cutoff": int(0.10 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_85pct", "cutoff": int(0.15 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_80pct", "cutoff": int(0.20 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "last_50pct", "cutoff": int(0.50 * n_samples), "is_oldest": False, "outlier_adjusted": False},
        {"name": "oldest_25pct", "cutoff": int(0.25 * n_samples), "is_oldest": True, "outlier_adjusted": False},
    ]
    # Duplicate slices with outlier adjustment
    outlier_adjusted_slices = []
    for slice_info in base_slices:
        adjusted_slice = slice_info.copy()
        adjusted_slice["name"] = f"{slice_info['name']}_outlier_adj"
        adjusted_slice["outlier_adjusted"] = True
        outlier_adjusted_slices.append(adjusted_slice)
    return base_slices + outlier_adjusted_slices


def train_and_evaluate(train_df, test_df):
    n_samples = len(train_df)
    model_slices = get_model_slices(n_samples)
    oof_preds = {
        learner["name"]: {s["name"]: np.zeros(n_samples) for s in model_slices}
        for learner in LEARNERS
    }
    test_preds = {
        learner["name"]: {s["name"]: np.zeros(len(test_df)) for s in model_slices}
        for learner in LEARNERS
    }
    # Dictionary for the trained models (learner_name -> slice_name -> list of models per fold)
    trained_models = {
        learner["name"]: {s["name"]: [] for s in model_slices}
        for learner in LEARNERS
    }
    full_weights = create_time_decay_weights(n_samples)
    kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1):
        print(f"\n--- Fold {fold}/{Config.N_FOLDS} ---")
        X_valid = train_df.iloc[valid_idx][Config.SELECTED_FEATURES]
        y_valid = train_df.iloc[valid_idx][Config.LABEL_COLUMN]
        for s in model_slices:
            cutoff = s["cutoff"]
            slice_name = s["name"]
            subset = train_df.iloc[cutoff:].reset_index(drop=True)
            rel_idx = train_idx[train_idx >= cutoff] - cutoff
            X_train = subset.iloc[rel_idx][Config.SELECTED_FEATURES]
            y_train = subset.iloc[rel_idx][Config.LABEL_COLUMN]
            sw = create_time_decay_weights(len(subset))[rel_idx] if cutoff > 0 else full_weights[train_idx]
            print(f"  Training slice: {slice_name}, samples: {len(X_train)}")
            for learner in LEARNERS:
                model = learner["Estimator"](**learner["params"])
                model.fit(X_train, y_train, sample_weight=sw, eval_set=[(X_valid, y_valid)], verbose=False)
                # Store the fitted model
                trained_models[learner["name"]][slice_name].append(model)
                mask = valid_idx >= cutoff
                if mask.any():
                    idxs = valid_idx[mask]
                    oof_preds[learner["name"]][slice_name][idxs] = model.predict(train_df.iloc[idxs][Config.SELECTED_FEATURES])
                if cutoff > 0 and (~mask).any():
                    oof_preds[learner["name"]][slice_name][valid_idx[~mask]] = oof_preds[learner["name"]]["full_data"][valid_idx[~mask]]
                test_preds[learner["name"]][slice_name] += model.predict(test_df[Config.SELECTED_FEATURES])
    # Normalize test predictions
    for learner_name in test_preds:
        for slice_name in test_preds[learner_name]:
            test_preds[learner_name][slice_name] /= Config.N_FOLDS
    return oof_preds, test_preds, model_slices, trained_models


manual_weights = {
    "full_data": 1,
    "last_90pct": 1,
    "last_85pct": 1,
    "last_80pct": 1,
    "last_50pct": 1,
    "oldest_25pct": 1,
}


def ensemble_and_submit(train_df, oof_preds, test_preds, submission_df, manual_weights=None):
    learner_ensembles = {}
    # Per-slice weights
    weights = manual_weights if manual_weights is not None else {s: 1.0 for s in next(iter(oof_preds.values())).keys()}
    total_weight = sum(weights.values())
    for learner_name in oof_preds:
        oof_weighted = sum(
            weights[s] / total_weight * oof_preds[learner_name][s]
            for s in weights if s in oof_preds[learner_name]
        )
        test_weighted = sum(
            weights[s] / total_weight * test_preds[learner_name][s]
            for s in weights if s in test_preds[learner_name]
        )
        score_weighted = pearsonr(train_df[Config.LABEL_COLUMN], oof_weighted)[0]
        print(f"{learner_name.upper()} Weighted Ensemble Pearson: {score_weighted:.4f}")
        learner_ensembles[learner_name] = {
            "oof_weighted": oof_weighted,
            "test_weighted": test_weighted,
        }
    # Average across learners
    final_oof = np.mean([le["oof_weighted"] for le in learner_ensembles.values()], axis=0)
    final_test = np.mean([le["test_weighted"] for le in learner_ensembles.values()], axis=0)
    final_score = pearsonr(train_df[Config.LABEL_COLUMN], final_oof)[0]
    print(f"\nFINAL ensemble across learners (weighted): {final_score:.4f}")
    submission_df["prediction"] = final_test
    submission_df.to_csv("submission.csv", index=False)
    print("Saved: submission.csv")


if __name__ == "__main__":
    train_df, test_df, submission_df = load_data()
    oof_preds, test_preds, model_slices, trained_models = train_and_evaluate(train_df, test_df)
    ensemble_and_submit(train_df, oof_preds, test_preds, submission_df, manual_weights)

Feature Engineering Techniques Explained

The core code of the notebook above breaks down into four parts: feature engineering, data preprocessing, model training tricks, and model ensembling. A detailed summary follows:


🧩 Part 1: Feature Engineering

1. Feature construction approach

The script defines a feature_engineering(df) function that builds a large number of non-linear combinations and interaction features from the input columns. The main ideas include:

  • Exponential features: many np.exp()-based features are constructed, for example:

    df['exp_856P868P855P289'] = np.exp(df['X446'] + df['X76'] + df['X37'] + df['X287'])
    

    👉 Intent: amplify non-linear relationships and increase sensitivity to multiplicative growth across features (see the numerical-stability note after this list).

  • Difference and product interaction terms, for example:

    df['868xexp_289M125'] = df['X76'] * np.exp(df['X287'] - df['X19'])
    df['852x868x855x289'] = df['X594'] * df['X76'] * df['X37'] * df['X287']
    

    👉 Shows a strong emphasis on cross-feature interactions (core variables such as X76, X37, and X287 appear repeatedly).

  • Higher-order polynomial combinations, such as:

    df['465x862x465'] = df['X465'] * df['X465'] * df['X123']
    

    👉 An attempt to strengthen non-linear behaviour through repeated powers of the same variable.
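One practical concern with these exp() and product features is numerical overflow. The script guards against it only at the end of feature_engineering(), by turning infinities into NaN and filling with 0. A minimal sketch of that guard, plus an optional clipping variant that is purely illustrative (not part of the original script):

    import numpy as np
    import pandas as pd

    def stabilize(df: pd.DataFrame) -> pd.DataFrame:
        # Same cleanup as the original feature_engineering(): +/-inf -> NaN -> 0.
        return df.replace([np.inf, -np.inf], np.nan).fillna(0)

    def safe_exp(s: pd.Series, clip: float = 30.0) -> pd.Series:
        # Hypothetical safer variant: clip the exponent before exponentiating
        # (the clip bound of 30 is an assumption, not taken from the script).
        return np.exp(s.clip(-clip, clip))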

2. Variable selection pattern

  • Only a subset of variables (e.g. X37, X76, X287, X19, X465) is reused to build the high-value features.
  • These core columns were most likely selected via correlation analysis or model-based feature importance; a sketch of one possible screening step follows.
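The article does not show how the core columns were chosen. A plausible (hypothetical) screening step would rank candidate columns by importance from a quick XGBoost fit and keep the top ones; the function name, settings, and top_k threshold below are illustrative only:

    import pandas as pd
    from xgboost import XGBRegressor

    def screen_features(train_df: pd.DataFrame, candidates: list[str],
                        label: str, top_k: int = 40) -> list[str]:
        # Shallow probe model used only to rank candidate columns (illustrative settings).
        probe = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.05,
                             subsample=0.8, colsample_bytree=0.8, verbosity=0)
        probe.fit(train_df[candidates], train_df[label])
        ranked = (pd.Series(probe.feature_importances_, index=candidates)
                    .sort_values(ascending=False))
        return ranked.head(top_k).index.tolist()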

🧹 Part 2: Data Preprocessing

Looking at the overall structure:

  • Scaling / missing values: there is no explicit normalization step; tree models such as XGBoost and LightGBM handle raw numeric features directly. Infinite and missing values are replaced with 0 at the end of feature_engineering().

  • K-Fold cross-validation

    from sklearn.model_selection import KFold
    kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)   # N_FOLDS = 3 in the script

    👉 Used for a robust estimate of generalization; shuffle=False keeps the rows in time order.

  • Fold iteration: the script loops over for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1) to train and validate per fold.
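Because shuffle=False, each validation fold is a contiguous block of rows in time order, which is what you want for a market dataset. A quick standalone check (not part of the original script) that demonstrates this sklearn behaviour:

    import numpy as np
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=3, shuffle=False)
    X = np.arange(12).reshape(-1, 1)          # stand-in for the training frame
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
        # With shuffle=False the validation indices are consecutive: [0..3], [4..7], [8..11]
        print(fold, va_idx.min(), va_idx.max())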


⚙️ Part 3: Model Training Tricks

The script applies several modeling and optimization techniques:

1. Model types

  • XGBoost (XGBRegressor) — the only model actually registered in LEARNERS
  • LightGBM (LGBMRegressor) — imported but not used in the final LEARNERS list

Both are gradient-boosted tree models, well suited to non-linear features and complex interaction terms.

2. Hyperparameter choices

No automatic tuning tool appears in the script; the key hyperparameters are set explicitly in Config.XGB_PARAMS (likely the result of an offline search). The values are reproduced below.

👉 A low learning rate (0.022) combined with many estimators (1667), aggressive subsampling, and strong L1/L2 regularization points to a deliberately conservative, stable training strategy.
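For reference, these are the values actually set in Config.XGB_PARAMS in the script above:

    XGB_PARAMS = {
        "tree_method": "hist", "device": "gpu",
        "learning_rate": 0.02213, "n_estimators": 1667,      # low LR, many boosting rounds
        "max_depth": 20, "max_leaves": 12,                   # deep but leaf-capped trees
        "min_child_weight": 16,
        "subsample": 0.06567,                                # very aggressive row subsampling
        "colsample_bytree": 0.7107, "colsample_bylevel": 0.4778, "colsample_bynode": 0.3628,
        "gamma": 1.7095, "reg_alpha": 39.3524, "reg_lambda": 75.4484,  # heavy regularization
        "verbosity": 0, "random_state": 42, "n_jobs": -1,
    }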

3. Evaluation metric

The script imports:

from scipy.stats import pearsonr

Predictions are scored with the Pearson correlation coefficient against the true labels, rather than the more traditional RMSE or MAE. This choice is common in finance and signal-prediction tasks. A minimal scoring example follows.
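A self-contained example of how the script scores its out-of-fold predictions (the values are made up for illustration):

    import numpy as np
    from scipy.stats import pearsonr

    y_true = np.array([0.10, -0.05, 0.30, 0.00, -0.20])   # illustrative labels
    y_pred = np.array([0.08, -0.02, 0.25, 0.01, -0.15])   # illustrative OOF predictions

    # pearsonr returns (correlation, p-value); the script keeps only the correlation.
    score = pearsonr(y_true, y_pred)[0]
    print(f"Pearson: {score:.4f}")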


🧠 Part 4: Model Ensembling (Weighted Blending)

The tail of the script implements a multi-model blend. It is not a true two-layer stacking: predictions are combined by weighted averaging rather than by training a meta-model. The structure is:

  1. Base models per time slice
    Within each KFold fold, one XGBRegressor is trained per data slice (full_data, last_90pct, ..., oldest_25pct, plus their _outlier_adj copies).

    • Each slice's out-of-fold (OOF) and test predictions are stored per learner.
  2. Weighted blending
    Slice predictions are combined with the manual_weights dictionary (each base slice weighted 1 by default), and the blended predictions of the different learners are then averaged. A condensed sketch of this step appears after the list.

  3. Benefits of this ensemble

    • Combines models fitted on different time windows of the data.
    • Reduces the overfitting risk of any single model or window.
    • Improves overall stability and generalization, as measured by the OOF Pearson correlation.
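Condensed from ensemble_and_submit() above, the blending step is simply a weighted average over slice predictions per learner, followed by a mean across learners:

    import numpy as np

    def blend(preds_by_slice: dict[str, np.ndarray], weights: dict[str, float]) -> np.ndarray:
        # Weighted average of one learner's per-slice predictions (same logic as the script).
        total = sum(weights.values())
        return sum(weights[s] / total * preds_by_slice[s] for s in weights if s in preds_by_slice)

    # One blended array per learner, then a simple mean across learners, e.g.:
    # final_test = np.mean([blend(test_preds[name], manual_weights) for name in test_preds], axis=0)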

🧾 Summary table

Module | Method / trick | Notes
Feature engineering | Exponential transforms, product interactions, difference terms, power combinations | Strengthens non-linear and high-order interactions
Preprocessing | KFold cross-validation (shuffle=False), no explicit normalization | Robust evaluation; tree models handle raw feature scales
Model training | XGBoost with a low learning rate and heavy regularization | Controls overfitting, improves robustness
Model ensembling | Weighted blending of time-slice models | Combines multiple models to improve performance
Evaluation metric | Pearson correlation coefficient (pearsonr) | Emphasizes linear correlation between predictions and targets

Training Pipeline Explained

The functions in the script show a complete, well-organized code structure. The main components are:

  • load_data() → data loading
  • feature_engineering() → feature engineering
  • create_time_decay_weights() → time-decay sample weights
  • get_model_slices() → time-slice definitions used for training
  • train_and_evaluate() → model training and validation
  • ensemble_and_submit() → ensembling and submission output
  • Config (class) → data paths, feature lists, and model parameters

🧭 Part 1: End-to-End Logic

Below is a walkthrough of the complete data flow and training logic, from start to finish 👇


1️⃣ Data loading

Function: load_data()

  • Reads the training and test sets from Parquet files and the sample submission from CSV, keeping only the columns listed in Config.FEATURES (plus the label column for the training set); see the excerpt after this list.

  • Applies feature_engineering() to both frames and resets their indexes.

  • Output: train_df, test_df, submission_df
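The relevant lines from load_data() in the script above (column-pruned Parquet reads plus feature engineering):

    train_df = pd.read_parquet(Config.TRAIN_PATH, columns=Config.FEATURES + [Config.LABEL_COLUMN])
    test_df = pd.read_parquet(Config.TEST_PATH, columns=Config.FEATURES)
    submission_df = pd.read_csv(Config.SUBMISSION_PATH)

    train_df = feature_engineering(train_df)   # add the engineered columns to both frames
    test_df = feature_engineering(test_df)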


2️⃣ Feature engineering

Function: feature_engineering(df)

  • Expands the raw features:

    • Builds exponential combinations, product interactions, difference terms, and higher-order power features.
    • Core columns (X37, X76, X287, X19, etc.) are combined repeatedly to emphasize their non-linear relationships.
  • Output: df with the newly engineered columns


3️⃣ Time-decay weights

Function: create_time_decay_weights(n, decay=0.9)

  • Assigns each training row a weight based on its position in the time-ordered data: recent rows get weights slightly above 1, older rows slightly below, rescaled so the weights average 1. The actual implementation is shown below.

  • Passed as sample_weight during model fitting so that recent samples influence the model more (appropriate for time-ordered prediction).

  • Output: an array of per-row weights
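The actual implementation from the script, which weights by row position rather than by timestamp:

    import numpy as np

    def create_time_decay_weights(n: int, decay: float = 0.9) -> np.ndarray:
        positions = np.arange(n)
        normalized = positions / (n - 1)        # 0 for the oldest row, 1 for the newest
        weights = decay ** (1.0 - normalized)   # newest rows get weight ~1, oldest ~decay
        return weights * n / weights.sum()      # rescale so the weights average to 1

    # Example: create_time_decay_weights(5) -> oldest weight ≈ 0.95, newest ≈ 1.05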


4️⃣ Data slicing

Function: get_model_slices(n_samples)

  • Defines time-based training slices rather than the KFold split itself: full_data, last_90pct, last_85pct, last_80pct, last_50pct, oldest_25pct, plus an "_outlier_adj" copy of each (see the example after this list).

  • Each slice is described by a cutoff index into the time-ordered training frame.

  • The KFold split (n_splits=3, shuffle=False) happens separately inside train_and_evaluate(); together they provide cross-validated evaluation and more robust models.
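What the function actually returns, for example with n_samples = 1000 (using get_model_slices from the script above):

    slices = get_model_slices(1000)
    # [{'name': 'full_data',    'cutoff': 0,   ...},
    #  {'name': 'last_90pct',   'cutoff': 100, ...},   # drop the oldest 10% of rows
    #  {'name': 'last_85pct',   'cutoff': 150, ...},
    #  {'name': 'last_80pct',   'cutoff': 200, ...},
    #  {'name': 'last_50pct',   'cutoff': 500, ...},
    #  {'name': 'oldest_25pct', 'cutoff': 250, ..., 'is_oldest': True},
    #  ... plus the same six entries with the name suffix '_outlier_adj']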


5️⃣ Model training and evaluation

Function: train_and_evaluate()

Core steps:

  • Instantiates each learner from the LEARNERS list with its parameters (currently only XGBRegressor with Config.XGB_PARAMS).

  • Within each fold:

    • Trains one model per time slice, passing the time-decay sample weights and an eval_set of the fold's validation data.

    • Fills each slice's out-of-fold (OOF) predictions for the validation rows at or beyond the slice cutoff; older validation rows reuse the full_data slice's OOF predictions.

    • Accumulates test-set predictions per slice, divided by N_FOLDS at the end.

  • Model quality is ultimately scored with the Pearson correlation coefficient on the OOF predictions (inside ensemble_and_submit()):

      from scipy.stats import pearsonr
      pearsonr(y_true, oof_pred)[0]

  • Final output: per-learner, per-slice OOF predictions, test predictions, the slice definitions, and the fitted models. A toy illustration of the index bookkeeping follows.
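The trickiest part of the fold loop is the index bookkeeping: training rows must be re-based relative to the slice cutoff, and validation rows older than the cutoff fall back to the full_data model. A self-contained toy version of the same index logic (variable names follow the script, the index values are made up):

    import numpy as np

    cutoff = 4                                          # in the script this comes from the slice
    train_idx = np.array([0, 1, 2, 5, 6, 7, 8, 9])      # fold training rows (toy)
    valid_idx = np.array([3, 4])                        # fold validation rows (toy)

    rel_idx = train_idx[train_idx >= cutoff] - cutoff   # rows usable by this slice, re-based -> [1 2 3 4 5]
    mask = valid_idx >= cutoff                          # validation rows the slice model covers -> [False  True]
    print(rel_idx, valid_idx[mask], valid_idx[~mask])   # older validation rows reuse the full_data OOF predictions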


6️⃣ Model ensembling (weighted blending)

Function: ensemble_and_submit()

The blending logic is roughly:

  • Collect the per-slice OOF and test predictions produced in the training stage.

  • For each learner, take a weighted average of its slice predictions using the manual_weights dictionary (each base slice weighted 1 by default) and print the OOF Pearson score of that blend.

  • Average the blended predictions across learners (with only "xgb" registered, this is effectively a pass-through) and report the final OOF Pearson score. There is no second-layer meta-model; the ensemble is plain weighted blending. A condensed sketch follows this list.

  • Finally, generate the submission file.

Output:

  • submission.csv
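Condensed from the end of ensemble_and_submit(): average the per-learner blended test predictions and write the submission file. The stand-in data and the "id" column below are illustrative only; in the script these objects come from train_and_evaluate() and the sample submission:

    import numpy as np
    import pandas as pd

    # Toy stand-ins for the real objects
    learner_ensembles = {"xgb": {"test_weighted": np.array([0.1, -0.2, 0.05])}}
    submission_df = pd.DataFrame({"id": [0, 1, 2], "prediction": 0.0})

    final_test = np.mean([le["test_weighted"] for le in learner_ensembles.values()], axis=0)
    submission_df["prediction"] = final_test
    submission_df.to_csv("submission.csv", index=False)   # the file uploaded to Kaggle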

⚙️ Part 2: Overall Execution Flow

The pseudocode structure is roughly:

def main():
    # load_data() reads the parquet files and already applies feature_engineering()
    train_df, test_df, submission_df = load_data()
    # Inside train_and_evaluate():
    #   create_time_decay_weights() -> per-row sample weights
    #   get_model_slices()          -> time-based training slices
    #   KFold (shuffle=False)       -> fold-wise training of XGBRegressor per slice
    oof_preds, test_preds, model_slices, trained_models = train_and_evaluate(train_df, test_df)
    # Weighted blending of slice predictions, Pearson scoring, and submission.csv output
    ensemble_and_submit(train_df, oof_preds, test_preds, submission_df, manual_weights)

📊 Part 3: Data Flow and Training Pipeline Diagram

┌────────────────────────────┐
│ Data loading               │
│ load_data()                │
└────────────┬───────────────┘
             │
             ▼
┌────────────────────────────┐
│ Feature engineering        │
│ feature_engineering()      │
└────────────┬───────────────┘
             │
             ▼
┌────────────────────────────┐
│ Time-decay weights         │
│ create_time_decay_weights()│
└────────────┬───────────────┘
             │
             ▼
┌────────────────────────────┐
│ Time slices + KFold split  │
│ get_model_slices()         │
└────────────┬───────────────┘
             │
             ▼
┌────────────────────────────┐
│ Model training/evaluation  │
│ train_and_evaluate()       │
│ → XGBRegressor per slice   │
│ → Pearson correlation      │
└────────────┬───────────────┘
             │
             ▼
┌────────────────────────────┐
│ Ensembling + submission    │
│ ensemble_and_submit()      │
│ → weighted slice blending  │
│ → writes submission.csv    │
└────────────────────────────┘

✅ Part 4: Key Takeaways

Stage | Function | Main role | Key technique
Data loading | load_data | Read the raw Parquet/CSV data | pandas
Feature engineering | feature_engineering | Build complex non-linear interaction features | exponential transforms, product terms
Time weighting | create_time_decay_weights | Time-decay sample weights | position-based decay weights
Data slicing | get_model_slices | Time-based training slices, combined with KFold CV | guards against overfitting
Model training | train_and_evaluate | Cross-validated training of XGBoost models per slice | multi-model cross training
Model ensembling | ensemble_and_submit | Weighted blending of slice models | ensembling for robustness
