ML4T - Chapter 7, Section 5: Predicting Stock Returns with Linear Regression
Contents
1. Load Data
2. Custom MultipleTimeSeriesCV
3. Visualization helper functions
4. Linear Regression with sklearn
5. Comparing Linear, Ridge, and Lasso Regression
6. Ridge Regression
7. Lasso Regression
8. Compare results
1. Load Data
We only analyze the stocks ranked in the top 100 by dollar volume (the raw data must be prepared as described in the earlier posts):

```python
data = data[data.dollar_vol_rank < 100]
```
```python
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

from time import time
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

sns.set_style('darkgrid')
idx = pd.IndexSlice
YEAR = 252

with pd.HDFStore('data.h5') as store:
    data = (store['model_data']
            .dropna()
            .drop(['open', 'close', 'low', 'high'], axis=1))
data.index.names = ['symbol', 'date']
data = data.drop([c for c in data.columns if 'lag' in c], axis=1)

# Select the investment universe
data = data[data.dollar_vol_rank < 100]

# Create model data
y = data.filter(like='target')
X = data.drop(y.columns, axis=1)
X = X.drop(['dollar_vol', 'dollar_vol_rank', 'volume', 'consumer_durables'], axis=1)

# data.info(null_counts=True)  # deprecated; newer pandas uses show_counts
data.info(show_counts=True)
```
```
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 109434 entries, ('AAL', Timestamp('2013-07-26 00:00:00')) to ('YUM', Timestamp('2015-11-03 00:00:00'))
Data columns (total 45 columns):
 #   Column                 Non-Null Count   Dtype
---  ------                 --------------   -----
 0   volume                 109434 non-null  float64
 1   dollar_vol             109434 non-null  float64
 2   dollar_vol_1m          109434 non-null  float64
 3   dollar_vol_rank        109434 non-null  float64
 4   rsi                    109434 non-null  float64
 5   bb_high                109434 non-null  float64
 6   bb_low                 109434 non-null  float64
 7   atr                    109434 non-null  float64
 8   macd                   109434 non-null  float64
 9   return_1d              109434 non-null  float64
 10  return_5d              109434 non-null  float64
 11  return_10d             109434 non-null  float64
 12  return_21d             109434 non-null  float64
 13  return_42d             109434 non-null  float64
 14  return_63d             109434 non-null  float64
 15  target_1d              109434 non-null  float64
 16  target_5d              109434 non-null  float64
 17  target_10d             109434 non-null  float64
 18  target_21d             109434 non-null  float64
 19  year_2014              109434 non-null  bool
 20  year_2015              109434 non-null  bool
 21  year_2016              109434 non-null  bool
 22  year_2017              109434 non-null  bool
 23  month_2                109434 non-null  bool
 24  month_3                109434 non-null  bool
 25  month_4                109434 non-null  bool
 26  month_5                109434 non-null  bool
 27  month_6                109434 non-null  bool
 28  month_7                109434 non-null  bool
 29  month_8                109434 non-null  bool
 30  month_9                109434 non-null  bool
 31  month_10               109434 non-null  bool
 32  month_11               109434 non-null  bool
 33  month_12               109434 non-null  bool
 34  capital_goods          109434 non-null  bool
 35  consumer_durables      109434 non-null  bool
 36  consumer_non-durables  109434 non-null  bool
 37  consumer_services      109434 non-null  bool
 38  energy                 109434 non-null  bool
 39  finance                109434 non-null  bool
 40  health_care            109434 non-null  bool
 41  miscellaneous          109434 non-null  bool
 42  public_utilities       109434 non-null  bool
 43  technology             109434 non-null  bool
 44  transportation         109434 non-null  bool
dtypes: bool(26), float64(19)
memory usage: 19.8+ MB
```
2. Custom MultipleTimeSeriesCV
This splitter only partitions the date sequence into train and test windows; the selected dates are then used to look up the actual observations from the original data.
```python
class MultipleTimeSeriesCV:
    """Generates tuples of train_idx, test_idx pairs.

    Assumes the MultiIndex contains levels 'symbol' and 'date';
    purges overlapping outcomes to avoid look-ahead leakage.
    """

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle

    def split(self, X, y=None, groups=None):
        unique_dates = X.index.get_level_values('date').unique()
        days = sorted(unique_dates, reverse=True)  # most recent first
        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            # The original line read `test_start_idx + + self.lookahead - 1`;
            # the stray extra plus sign is removed here.
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        dates = X.reset_index()[['date']]
        for train_start, train_end, test_start, test_end in split_idx:
            train_idx = dates[(dates.date > days[train_start])
                              & (dates.date <= days[train_end])].index
            test_idx = dates[(dates.date > days[test_start])
                             & (dates.date <= days[test_end])].index
            if self.shuffle:
                # note: shuffling a temporary list leaves train_idx unchanged
                np.random.shuffle(list(train_idx))
            yield train_idx, test_idx

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits
```

```python
# Verify that it works
train_period_length = 63  # 3 months
test_period_length = 10   # 10 days
n_splits = int(3 * YEAR / test_period_length)  # 3 years of test windows
lookahead = 1  # predict 1 day ahead

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          test_period_length=test_period_length,
                          lookahead=lookahead,
                          train_period_length=train_period_length)

i = 0
for train_idx, test_idx in cv.split(X=data):
    train = data.iloc[train_idx]
    train_dates = train.index.get_level_values('date')
    test = data.iloc[test_idx]
    test_dates = test.index.get_level_values('date')
    # df = train.reset_index().append(test.reset_index())  # .append() was removed in pandas 2.0
    df = pd.concat([train.reset_index(), test.reset_index()])
    n = len(df)
    assert n == len(df.drop_duplicates())
    print(train.groupby(level='symbol').size().value_counts().index[0],  # days per symbol in the train fold
          train_dates.min().date(), train_dates.max().date(),            # train start/end dates
          test.groupby(level='symbol').size().value_counts().index[0],
          test_dates.min().date(), test_dates.max().date())
    i += 1
    if i == 10:
        break
```
Each printed row shows: the number of training days per symbol, the train start and end dates; then the number of test days per symbol, the test start and end dates.
3. Visualization helper functions
These helpers are used for the plots below.
```python
# Prediction vs actual scatter plot
def plot_preds_scatter(df, ticker=None):
    if ticker is not None:
        idx = pd.IndexSlice
        df = df.loc[idx[ticker, :], :]
    j = sns.jointplot(x='predicted', y='actuals',
                      robust=True, ci=None,
                      line_kws={'lw': 1, 'color': 'k'},
                      scatter_kws={'s': 1},
                      data=df,
                      kind='reg')
    j.ax_joint.yaxis.set_major_formatter(
        FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
    j.ax_joint.xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{:.1%}'.format(x)))
    j.ax_joint.set_xlabel('Predicted')
    j.ax_joint.set_ylabel('Actuals')


# Daily IC (Information Coefficient) distribution
def plot_ic_distribution(df, ax=None):
    # note: sns.distplot is deprecated in seaborn >= 0.11;
    # sns.histplot(df.ic, kde=True) is the modern equivalent
    if ax is not None:
        sns.distplot(df.ic, ax=ax)
    else:
        ax = sns.distplot(df.ic)
    mean, median = df.ic.mean(), df.ic.median()
    ax.axvline(0, lw=1, ls='--', c='k')
    ax.text(x=.05, y=.9,
            s=f'Mean: {mean:8.2f}\nMedian: {median:5.2f}',
            horizontalalignment='left',
            verticalalignment='center',
            transform=ax.transAxes)
    ax.set_xlabel('Information Coefficient')
    sns.despine()
    plt.tight_layout()


# Rolling daily IC
def plot_rolling_ic(df):
    fig, axes = plt.subplots(nrows=2, sharex=True, figsize=(14, 8))
    rolling_result = df.sort_index().rolling(21).mean().dropna()
    mean_ic = df.ic.mean()
    rolling_result.ic.plot(ax=axes[0],
                           title=f'Information Coefficient (Mean: {mean_ic:.2f})',
                           lw=1)
    axes[0].axhline(0, lw=.5, ls='-', color='k')
    axes[0].axhline(mean_ic, lw=1, ls='--', color='k')
    mean_rmse = df.rmse.mean()
    rolling_result.rmse.plot(ax=axes[1],
                             title=f'Root Mean Squared Error (Mean: {mean_rmse:.2%})',
                             lw=1,
                             ylim=(0, df.rmse.max()))
    axes[1].axhline(df.rmse.mean(), lw=1, ls='--', color='k')
    sns.despine()
    plt.tight_layout()
```
4. Linear Regression with sklearn
Put simply: we fit an OLS model on many features to predict a target variable. Below, we train a linear regression on historical data to predict the 1-day-ahead target (`target_1d`) and evaluate its predictive power with cross-validation, using the IC and RMSE metrics:
Metric | Meaning |
---|---|
IC (Information Coefficient) | Daily Spearman rank correlation between predictions and actuals; measures ranking skill. Higher is better; above 5% is considered decent. |
RMSE (Root Mean Squared Error) | Daily root mean squared error between predictions and actuals; measures accuracy. Lower is better. |
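As a minimal sketch of the per-day scoring used in the loops below (assuming a `preds` frame with `actuals` and `predicted` columns and a `date` index level, as built there):

```python
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

def daily_scores(preds: pd.DataFrame) -> pd.DataFrame:
    """Daily IC (in %) and RMSE for a frame with 'actuals'/'predicted'
    columns indexed by ('symbol', 'date')."""
    by_day = preds.groupby(level='date')
    ic = by_day.apply(lambda x: spearmanr(x.predicted, x.actuals)[0] * 100)
    rmse = by_day.apply(lambda x: np.sqrt(mean_squared_error(x.actuals, x.predicted)))
    return pd.concat([ic.to_frame('ic'), rmse.to_frame('rmse')], axis=1)
```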
The overall IC pools every prediction:

```python
lr_r, lr_p = spearmanr(lr_predictions.actuals, lr_predictions.predicted)
print(f'Information Coefficient (overall): {lr_r:.3%} (p-value: {lr_p:.4%})')
```
- Overall IC: pool all predicted and actual values and compute a single Spearman rank correlation.
- p-value: tests whether this IC is statistically significant (conventionally, significant when < 5%).
- For example, an IC of 1.2% with p = 12.3% would indicate weak, non-significant overall predictive power.
```python
%%time

# Set up cross-validation
train_period_length = 63
test_period_length = 10
n_splits = int(3 * YEAR / test_period_length)
lookahead = 1

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          test_period_length=test_period_length,
                          lookahead=lookahead,
                          train_period_length=train_period_length)

# Run cross-validation with LinearRegression
target = f'target_{lookahead}d'
lr_predictions, lr_scores = [], []
lr = LinearRegression()
for i, (train_idx, test_idx) in enumerate(cv.split(X), 1):
    X_train, y_train = X.iloc[train_idx], y[target].iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y[target].iloc[test_idx]
    lr.fit(X=X_train, y=y_train)
    y_pred = lr.predict(X_test)

    preds = y_test.to_frame('actuals').assign(predicted=y_pred)
    preds_by_day = preds.groupby(level='date')
    scores = pd.concat([preds_by_day.apply(lambda x: spearmanr(x.predicted,
                                                               x.actuals)[0] * 100).to_frame('ic'),
                        preds_by_day.apply(lambda x: np.sqrt(mean_squared_error(y_pred=x.predicted,
                                                                                y_true=x.actuals))).to_frame('rmse')],
                       axis=1)
    lr_scores.append(scores)
    lr_predictions.append(preds)

lr_scores = pd.concat(lr_scores)
lr_predictions = pd.concat(lr_predictions)

# Persist results
lr_scores.to_hdf('data.h5', 'lr/scores')
lr_predictions.to_hdf('data.h5', 'lr/predictions')

lr_scores = pd.read_hdf('data.h5', 'lr/scores')
lr_predictions = pd.read_hdf('data.h5', 'lr/predictions')

# Evaluate results
lr_r, lr_p = spearmanr(lr_predictions.actuals, lr_predictions.predicted)
print(f'Information Coefficient (overall): {lr_r:.3%} (p-value: {lr_p:.4%})')

# Prediction vs actuals scatter
plot_preds_scatter(lr_predictions)
```
Output:

```
Information Coefficient (overall): 1.346% (p-value: 0.0255%)
CPU times: user 23.7 s, sys: 2.9 s, total: 26.6 s
Wall time: 5.73 s
```
My take: the result is weak overall. Although it is statistically significant, the IC is too low to be economically meaningful.
```python
# Daily IC distribution
plot_ic_distribution(lr_scores)

# Rolling daily IC
plot_rolling_ic(lr_scores)
```
5. Comparing Linear, Ridge, and Lasso Regression
Since we will use these other models below, here is some background comparing them.
English | Chinese name | Also known as |
---|---|---|
Linear Regression | 线性回归 | ordinary least squares (OLS) regression |
Ridge Regression | 岭回归 | Tikhonov regularization; L2-regularized regression |
Lasso Regression | 套索回归 | L1-regularized regression; least absolute shrinkage and selection operator |
- The Chinese name 套索 ("lasso") is a sound-and-meaning rendering of the LASSO acronym (Least Absolute Shrinkage and Selection Operator), evoking a lasso that "ropes away" useless variables.
All three are linear models; the key difference is whether the loss function includes a penalty term that constrains the size of the coefficients.
Model | Keyword | Essential difference |
---|---|---|
Linear Regression | no penalty | Minimizes the residual sum of squares; coefficients unconstrained; prone to overfitting |
Ridge Regression | L2 penalty | Adds λ·Σβ² to the loss; shrinks coefficients toward (but not to) zero; mitigates multicollinearity |
Lasso Regression | L1 penalty | Adds λ·Σ\|β\| to the loss; can shrink useless features' coefficients exactly to 0, i.e., automatic feature selection |
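Written out (standard formulations, with $\lambda \ge 0$ controlling the penalty strength):

$$
\begin{aligned}
\text{OLS:} \quad & \min_\beta \ \lVert y - X\beta \rVert_2^2 \\
\text{Ridge:} \quad & \min_\beta \ \lVert y - X\beta \rVert_2^2 + \lambda \lVert \beta \rVert_2^2 \\
\text{Lasso:} \quad & \min_\beta \ \lVert y - X\beta \rVert_2^2 + \lambda \lVert \beta \rVert_1
\end{aligned}
$$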
- Linear Regression: the solution is the unconstrained least-squares point; coefficients can take any value.
- Ridge: the solution is pulled toward the origin by a circular (L2) constraint ⇒ coefficients shrink but stay nonzero.
- Lasso: the solution is pulled toward the origin by a diamond-shaped (L1) constraint whose corners lie on the axes ⇒ some coefficients become exactly 0 (see the sketch after this list).
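A minimal, self-contained sketch on synthetic data (hypothetical coefficients and alphas, not from the chapter) that illustrates this shrinkage behavior:

```python
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Synthetic data: 5 informative features, 5 pure-noise features
rng = np.random.default_rng(42)
X = rng.standard_normal((500, 10))
true_beta = np.array([2., -1.5, 1., .5, -.5, 0., 0., 0., 0., 0.])
y = X @ true_beta + 0.5 * rng.standard_normal(500)

for model in (LinearRegression(),
              Ridge(alpha=10.0),   # L2 penalty: shrinks all coefficients
              Lasso(alpha=0.1)):   # L1 penalty: can zero out the noise features
    model.fit(X, y)
    print(f'{type(model).__name__:<16}', np.round(model.coef_, 2))
# Expected pattern: OLS recovers roughly true_beta; Ridge is slightly
# shrunk everywhere; Lasso drives the five noise coefficients to exactly 0.
```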
6. Ridge Regression
Using time-series cross-validation, we sweep 18 Ridge regularization strengths (alpha), evaluate the model's predictive power (IC and RMSE) at each alpha, and record the coefficients to inform the choice of the optimal alpha.
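For reference, the 18 alphas are a 9-point logarithmic grid combined with the same grid scaled by 5, i.e., equivalently:

```python
import numpy as np

base = np.logspace(-4, 4, 9)                        # 1e-04, 1e-03, ..., 1e+04
ridge_alphas = sorted(list(base) + list(base * 5))  # add the 5x multiples
print(len(ridge_alphas))                            # -> 18
```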
```python
# %%time

# Define cross-validation parameters
ridge_alphas = np.logspace(-4, 4, 9)
ridge_alphas = sorted(list(ridge_alphas) + list(ridge_alphas * 5))

train_period_length = 63
test_period_length = 10
n_splits = int(3 * YEAR / test_period_length)
lookahead = 1

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          test_period_length=test_period_length,
                          lookahead=lookahead,
                          train_period_length=train_period_length)

# Run cross-validation
target = f'target_{lookahead}d'
X = X.drop([c for c in X.columns if 'year' in c], axis=1)

ridge_coeffs, ridge_scores, ridge_predictions = {}, [], []
for alpha in ridge_alphas:
    print(alpha, end=' ', flush=True)
    start = time()
    model = Ridge(alpha=alpha,
                  fit_intercept=False,
                  random_state=42)
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('model', model)])
    coeffs = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X), 1):
        X_train, y_train = X.iloc[train_idx], y[target].iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y[target].iloc[test_idx]
        pipe.fit(X=X_train, y=y_train)
        y_pred = pipe.predict(X_test)

        preds = y_test.to_frame('actuals').assign(predicted=y_pred)
        preds_by_day = preds.groupby(level='date')
        scores = pd.concat([preds_by_day.apply(lambda x: spearmanr(x.predicted,
                                                                   x.actuals)[0] * 100).to_frame('ic'),
                            preds_by_day.apply(lambda x: np.sqrt(mean_squared_error(y_pred=x.predicted,
                                                                                    y_true=x.actuals))).to_frame('rmse')],
                           axis=1)
        ridge_scores.append(scores.assign(alpha=alpha))
        ridge_predictions.append(preds.assign(alpha=alpha))
        coeffs.append(pipe.named_steps['model'].coef_)
    ridge_coeffs[alpha] = np.mean(coeffs, axis=0)
print('\n')
```
Evaluate Ridge Results
```python
# Persist results
ridge_scores = pd.concat(ridge_scores)
ridge_scores.to_hdf('data.h5', 'ridge/scores')

ridge_coeffs = pd.DataFrame(ridge_coeffs, index=X.columns).T
ridge_coeffs.to_hdf('data.h5', 'ridge/coeffs')

ridge_predictions = pd.concat(ridge_predictions)
ridge_predictions.to_hdf('data.h5', 'ridge/predictions')

ridge_scores = pd.read_hdf('data.h5', 'ridge/scores')
ridge_coeffs = pd.read_hdf('data.h5', 'ridge/coeffs')
ridge_predictions = pd.read_hdf('data.h5', 'ridge/predictions')

# Evaluate Ridge results
ridge_r, ridge_p = spearmanr(ridge_predictions.actuals, ridge_predictions.predicted)
print(f'Information Coefficient (overall): {ridge_r:.3%} (p-value: {ridge_p:.4%})')

ridge_scores.groupby('alpha').ic.describe()
```
```python
fig, axes = plt.subplots(ncols=2, sharex=True, figsize=(15, 5))

scores_by_alpha = ridge_scores.groupby('alpha').ic.agg(['mean', 'median'])
best_alpha_mean = scores_by_alpha['mean'].idxmax()
best_alpha_median = scores_by_alpha['median'].idxmax()

ax = sns.lineplot(x='alpha',
                  y='ic',
                  data=ridge_scores,
                  estimator=np.mean,
                  label='Mean',
                  ax=axes[0])
scores_by_alpha['median'].plot(logx=True, ax=axes[0], label='Median')

axes[0].axvline(best_alpha_mean, ls='--', c='k', lw=1, label='Max. Mean')
axes[0].axvline(best_alpha_median, ls='-.', c='k', lw=1, label='Max. Median')
axes[0].legend()
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha')
axes[0].set_ylabel('Information Coefficient')
axes[0].set_title('Cross Validation Performance')

ridge_coeffs.plot(logx=True, legend=False, ax=axes[1], title='Ridge Coefficient Path')
axes[1].axvline(best_alpha_mean, ls='--', c='k', lw=1, label='Max. Mean')
axes[1].axvline(best_alpha_median, ls='-.', c='k', lw=1, label='Max. Median')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Coefficient Value')

fig.suptitle('Ridge Results', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)
```
My take: the left panel shows the IC across alpha values; performance deteriorates once alpha becomes very large. The right panel shows the coefficient path; at very large alpha, all coefficients are abruptly shrunk toward zero.
```python
# Use only the CV folds for the best alpha
best_alpha = ridge_scores.groupby('alpha').ic.mean().idxmax()

fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
plot_ic_distribution(ridge_scores[ridge_scores.alpha == best_alpha], ax=axes[0])
axes[0].set_title('Daily Information Coefficients')

# .tail(10) selects the 10 largest absolute coefficients
# (the original .head(10) selected the smallest)
top_coeffs = ridge_coeffs.loc[best_alpha].abs().sort_values().tail(10).index
top_coeffs.tolist()
ridge_coeffs.loc[best_alpha, top_coeffs].sort_values().plot.barh(ax=axes[1],
                                                                 title='Top 10 Coefficients')
sns.despine()
fig.tight_layout()
```
Further analysis: in the left panel, a distribution concentrated in positive territory means the model gets the direction right most of the time; a wide or negative-leaning distribution indicates an unstable or weak signal. The right panel picks the 10 features with the largest absolute coefficients, i.e., those contributing most to the predictions.
```python
plot_rolling_ic(ridge_scores[ridge_scores.alpha == best_alpha])
```
7. Lasso Regression
The procedure mirrors Ridge regression:
```python
# Define cross-validation parameters
lasso_alphas = np.logspace(-10, -3, 8)

train_period_length = 63
test_period_length = 10
YEAR = 252
n_splits = int(3 * YEAR / test_period_length)  # three years
lookahead = 1

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          test_period_length=test_period_length,
                          lookahead=lookahead,
                          train_period_length=train_period_length)

# Run cross-validation with Lasso regression
target = f'target_{lookahead}d'
scaler = StandardScaler()
X = X.drop([c for c in X.columns if 'year' in c], axis=1)

lasso_coeffs, lasso_scores, lasso_predictions = {}, [], []
for alpha in lasso_alphas:
    print(alpha, end=' ', flush=True)
    model = Lasso(alpha=alpha,
                  fit_intercept=False,  # StandardScaler centers the data
                  random_state=42,
                  tol=1e-3,
                  max_iter=1000,
                  warm_start=True,
                  selection='random')
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('model', model)])
    coeffs = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X), 1):
        t = time()
        X_train, y_train = X.iloc[train_idx], y[target].iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y[target].iloc[test_idx]
        pipe.fit(X=X_train, y=y_train)
        y_pred = pipe.predict(X_test)

        preds = y_test.to_frame('actuals').assign(predicted=y_pred)
        preds_by_day = preds.groupby(level='date')
        scores = pd.concat([preds_by_day.apply(lambda x: spearmanr(x.predicted,
                                                                   x.actuals)[0] * 100).to_frame('ic'),
                            preds_by_day.apply(lambda x: np.sqrt(mean_squared_error(y_pred=x.predicted,
                                                                                    y_true=x.actuals))).to_frame('rmse')],
                           axis=1)
        lasso_scores.append(scores.assign(alpha=alpha))
        lasso_predictions.append(preds.assign(alpha=alpha))
        coeffs.append(pipe.named_steps['model'].coef_)
    lasso_coeffs[alpha] = np.mean(coeffs, axis=0)

# Persist results
lasso_scores = pd.concat(lasso_scores)
lasso_scores.to_hdf('data.h5', 'lasso/scores')

lasso_coeffs = pd.DataFrame(lasso_coeffs, index=X.columns).T
lasso_coeffs.to_hdf('data.h5', 'lasso/coeffs')

lasso_predictions = pd.concat(lasso_predictions)
lasso_predictions.to_hdf('data.h5', 'lasso/predictions')
```
Evaluate Lasso Results
```python
best_alpha = lasso_scores.groupby('alpha').ic.mean().idxmax()
preds = lasso_predictions[lasso_predictions.alpha == best_alpha]

lasso_r, lasso_p = spearmanr(preds.actuals, preds.predicted)
print(f'Information Coefficient (overall): {lasso_r:.3%} (p-value: {lasso_p:.4%})')

lasso_scores.groupby('alpha').ic.agg(['mean', 'median'])
```
```python
# Lasso coefficient path
fig, axes = plt.subplots(ncols=2, sharex=True, figsize=(15, 5))

scores_by_alpha = lasso_scores.groupby('alpha').ic.agg(['mean', 'median'])
best_alpha_mean = scores_by_alpha['mean'].idxmax()
best_alpha_median = scores_by_alpha['median'].idxmax()

# Added: check whether the index is unique
print(lasso_scores.index.is_unique)  # False means the index has duplicates
lasso_scores = lasso_scores.reset_index(drop=True)

ax = sns.lineplot(x='alpha', y='ic', data=lasso_scores, estimator=np.mean, label='Mean', ax=axes[0])
scores_by_alpha['median'].plot(logx=True, ax=axes[0], label='Median')

axes[0].axvline(best_alpha_mean, ls='--', c='k', lw=1, label='Max. Mean')
axes[0].axvline(best_alpha_median, ls='-.', c='k', lw=1, label='Max. Median')
axes[0].legend()
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha')
axes[0].set_ylabel('Information Coefficient')
axes[0].set_title('Cross Validation Performance')

lasso_coeffs.plot(logx=True, legend=False, ax=axes[1], title='Lasso Coefficient Path')
axes[1].axvline(best_alpha_mean, ls='--', c='k', lw=1, label='Max. Mean')
axes[1].axvline(best_alpha_median, ls='-.', c='k', lw=1, label='Max. Median')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Coefficient Value')

fig.suptitle('Lasso Results', fontsize=14)
fig.tight_layout()
fig.subplots_adjust(top=.9)
sns.despine();
```
In the code above, we added the following check:

```python
# Added: check whether the index is unique
print(lasso_scores.index.is_unique)  # False means the index has duplicates
lasso_scores = lasso_scores.reset_index(drop=True)
```

Without it, the plot may fail to execute correctly, because the date index contains duplicates (each date appears once per alpha).
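A toy illustration of the check (hypothetical values, not from the chapter): each date appears once per alpha, so the date index is only unique after it is reset.

```python
import pandas as pd

# Two alphas scored on the same two dates -> duplicate index entries
s = pd.DataFrame({'ic': [1.0, 2.0, 1.5, 2.5],
                  'alpha': [0.1, 0.1, 1.0, 1.0]},
                 index=pd.to_datetime(['2020-01-02', '2020-01-03'] * 2))
print(s.index.is_unique)      # False
s = s.reset_index(drop=True)  # unique RangeIndex; 'alpha' still separates the runs
print(s.index.is_unique)      # True
```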
```python
# Lasso IC distribution and top 10 features
best_alpha = lasso_scores.groupby('alpha').ic.mean().idxmax()

fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
plot_ic_distribution(lasso_scores[lasso_scores.alpha == best_alpha], ax=axes[0])
axes[0].set_title('Daily Information Coefficients')

# .tail(10) selects the 10 largest absolute coefficients
# (the original .head(10) selected the smallest)
top_coeffs = lasso_coeffs.loc[best_alpha].abs().sort_values().tail(10).index
top_coeffs.tolist()
lasso_coeffs.loc[best_alpha, top_coeffs].sort_values().plot.barh(ax=axes[1],
                                                                 title='Top 10 Coefficients')
sns.despine()
fig.tight_layout();
```
My results differ somewhat from the book's; for comparison, see the original notebook:
https://github.com/stefan-jansen/machine-learning-for-trading/blob/main/07_linear_models/05_predicting_stock_returns_with_linear_regression.ipynb
8. Compare results
```python
best_ridge_alpha = ridge_scores.groupby('alpha').ic.mean().idxmax()
best_ridge_preds = ridge_predictions[ridge_predictions.alpha == best_ridge_alpha]
best_ridge_scores = ridge_scores[ridge_scores.alpha == best_ridge_alpha]

best_lasso_alpha = lasso_scores.groupby('alpha').ic.mean().idxmax()
best_lasso_preds = lasso_predictions[lasso_predictions.alpha == best_lasso_alpha]
best_lasso_scores = lasso_scores[lasso_scores.alpha == best_lasso_alpha]

df = pd.concat([lr_scores.assign(Model='Linear Regression'),
                best_ridge_scores.assign(Model='Ridge Regression'),
                best_lasso_scores.assign(Model='Lasso Regression')]).drop('alpha', axis=1)
df.columns = ['IC', 'RMSE', 'Model']

scores = df.groupby('Model').IC.agg(['mean', 'median'])

fig, axes = plt.subplots(ncols=2, figsize=(14, 4), sharey=True, sharex=True)
scores['mean'].plot.barh(ax=axes[0], xlim=(1.85, 2), title='Mean')
scores['median'].plot.barh(ax=axes[1], xlim=(1.8, 2.1), title='Median')
axes[0].set_xlabel('Daily IC')
axes[1].set_xlabel('Daily IC')
fig.suptitle('Daily Information Coefficient by Model', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)
```
Takeaway: when plain linear regression underperforms, the regularized variants, Ridge and Lasso, are worth trying; here, plain Linear Regression does not achieve the highest IC.