根据Task3:尝试使用深度学习方案 - 飞书云文档 (feishu.cn)优化方案详解的教学进行优化
1.进一步优化特征
(1)历史平移特征:通过历史平移获取上个阶段的信息;
(2)差分特征:可以帮助获取相邻阶段的增长差异,描述数据的涨减变化情况。
(3)窗口统计特征:基于窗口范围进行统计,计算均值、最大值、最小值、中位数、方差等信息,并尝试不同大小的窗口进行测试
# Feature engineering on the long-format frame `data` (expects columns 'id'
# and 'target'; one row per id per time step). All features are computed
# per-id via groupby so values never leak across different series.

# --- Lag features: bring the target value from 10..35 steps back ---
target_by_id = data.groupby('id')['target']
for lag in range(10, 36):
    data[f'target_shift{lag}'] = target_by_id.shift(lag)

# --- Lag + difference features: step-to-step change of the 10-step lag ---
# (groupby is re-built here because 'target_shift10' was just created above)
shift10_by_id = data.groupby('id')['target_shift10']
for d in range(1, 4):
    data[f'target_shift10_diff{d}'] = shift10_by_id.diff(d)

# --- Rolling-window statistics on the raw target ---
# closed='left' excludes the current row, so each window only sees the past;
# min_periods=3 yields NaN until at least 3 past observations exist.
for win in [15, 30, 50, 70]:
    roll = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_win{win}_mean'] = roll.mean().values
    data[f'target_win{win}_max'] = roll.max().values
    data[f'target_win{win}_min'] = roll.min().values
    data[f'target_win{win}_std'] = roll.std().values

# --- Rolling-window statistics on the 10-step lag ---
for win in [7, 14, 28, 35, 50, 70]:
    roll = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_shift10_win{win}_mean'] = roll.mean().values
    data[f'target_shift10_win{win}_max'] = roll.max().values
    data[f'target_shift10_win{win}_min'] = roll.min().values
    data[f'target_shift10_win{win}_sum'] = roll.sum().values
    # BUG FIX: original column name was 'target_shift710win{win}_std' — a typo
    # that broke the 'target_shift10_win{win}_*' naming convention.
    data[f'target_shift10_win{win}_std'] = roll.std().values
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
2.通过模型融合改善预测结果
模型融合
尝试使用catboost、xgboost和lightgbm三个模型分别输出三个结果,最终进行加权平均融合
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Split the combined frame back into train/test: rows with a known target
# are training data; rows where target is missing are the test set to predict.
train = data[data['target'].notnull()].reset_index(drop=True)
test = data[data['target'].isnull()].reset_index(drop=True)

# Feature columns: every column except the series key and the label.
train_cols = [col for col in train.columns if col not in ('id', 'target')]
test_cols = [col for col in test.columns if col not in ('id', 'target')]
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    """Train one of {LightGBM, XGBoost, CatBoost} with 5-fold CV.

    Parameters
    ----------
    clf : module or class
        The ``lgb`` / ``xgb`` module, or the ``CatBoostRegressor`` class.
        Which training branch runs is selected by ``clf_name``.
    train_x : pandas.DataFrame
        Training features.
    train_y : pandas.Series
        Training labels; must be positionally aligned with ``train_x``
        (it is indexed with raw fold positions, ``train_y[train_index]``,
        so a default RangeIndex is assumed).
    test_x : pandas.DataFrame
        Test features; predicted by every fold's model and averaged.
    clf_name : str
        One of ``'lgb'``, ``'xgb'``, ``'cat'``.
    seed : int
        Random seed for the KFold shuffle.

    Returns
    -------
    (oof, test_predict, model)
        ``oof``: out-of-fold predictions aligned to ``train_x`` rows;
        ``test_predict``: mean of the per-fold test predictions;
        ``model``: the fitted model from the LAST fold only.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported names (the original
        code would fail later with a confusing NameError instead).
    """
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 1000,
                              valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[])
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        elif clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
        elif clf_name == "cat":
            # BUG FIX: the original dict listed 'random_seed' twice (2023, then 11);
            # Python keeps only the last duplicate, so 2023 was silently ignored.
            # One explicit seed now. od_type/od_wait give early stopping after
            # 100 non-improving iterations, paired with use_best_model below.
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
                      'random_seed': 2023,
                      'od_type': 'Iter', 'od_wait': 100, 'allow_writing_files': False}
            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        else:
            raise ValueError(f"unsupported clf_name: {clf_name!r} (expected 'lgb', 'xgb' or 'cat')")
        oof[valid_index] = val_pred
        # Each fold contributes 1/n_splits of its test prediction -> fold average.
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    return oof, test_predict, model
# Train each of the three GBDT models through the same CV routine.
lgb_oof, lgb_test, lgb_model = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
xgb_oof, xgb_test, xgb_model = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
cat_oof, cat_test, cat_model = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

# Blend by equal-weight averaging of the three test predictions.
# BUG FIX: the original ended with 'print(final_test)import matplotlib.pyplot as plt'
# — two statements fused onto one line, a SyntaxError. The stray import is dropped
# (matplotlib is imported where it is used, in the plotting section) and the
# redundant re-computation of train_cols/test_cols (identical to the earlier
# definitions, with no column changes in between) is removed.
final_test = (lgb_test + xgb_test + cat_test) / 3
print(final_test)
3.对特征重要性进行可视化,根据特征重要性调整特征,尝试改善预测结果(此处以lgb为例)
import matplotlib.pyplot as plt

# Pull the raw importances from the last-fold LightGBM model.
importances = lgb_model.feature_importance()

# Pair every importance with its feature name, largest first.
sorted_importances = sorted(zip(importances, train_cols), reverse=True)

# A tall canvas so the many long feature names stay legible.
plt.figure(figsize=(15, 20))

# One horizontal bar per feature, bar length = importance.
feature_names = [feature for _, feature in sorted_importances]
feature_scores = [score for score, _ in sorted_importances]
plt.barh(feature_names, feature_scores)

# Axis labels and title. NOTE(review): these are CJK strings — matplotlib
# needs a CJK-capable font configured (e.g. via plt.rcParams) or they will
# render as empty boxes; confirm the runtime environment.
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title('特征重要性 (LightGBM)')

# barh draws the first (most important) bar at the bottom; flip the y-axis
# so the most important feature appears at the top instead.
plt.gca().invert_yaxis()

# Rotate x tick labels for readability.
plt.xticks(rotation=45, ha='right')

plt.show()