Task3的主要任务是从整体框架结构和深度学习模型的层面对于这个题目进行更加深层次的理解分析和尝试。
Task2结果
在这里我们先给出Task2进阶代码的运行结果,以供Task3改进对比。
特征优化
Task2中我们提到了特征工程还有差分特征等,所以特征优化可以将历史平移特征、窗口统计特征和差分特征融合起来,使得对于数据特征的处理和提取更加准确。
# Merge training and test rows so lag/window features are computed in one pass.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# Sort by id and dt, both descending; the fresh RangeIndex is what the
# group-wise results below are aligned on.
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# NOTE: groupby().rolling() returns its result grouped by id in ascending
# order, whereas `data` is sorted by descending id.  Assigning `.values`
# positionally would therefore attach statistics to the wrong rows, so every
# rolling result is realigned on the original row index instead (the group-key
# level of the MultiIndex is dropped and pandas aligns on what is left).

# Lag features: the target 10..35 rows earlier within the same id.
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features: 1..3-step differences of the 10-row lag.
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Window statistics of the raw target; closed='left' excludes the current row.
for win in [15, 30, 50, 70]:
    rolled = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left')
    for stat in ('mean', 'max', 'min', 'std'):
        data[f'target_win{win}_{stat}'] = getattr(rolled, stat)().reset_index(level=0, drop=True)

# Lag + window statistics: the same idea applied to the 10-row lag feature.
for win in [7, 14, 28, 35, 50, 70]:
    rolled = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left')
    # The std column now follows the same `target_shift10_win{win}_*` naming
    # as its siblings (it was previously misspelled `target_shift710win...`).
    for stat in ('mean', 'max', 'min', 'sum', 'std'):
        data[f'target_shift10_win{win}_{stat}'] = getattr(rolled, stat)().reset_index(level=0, drop=True)
优化后结果:
模型融合
进行模型融合的前提是有多个模型的输出结果,比如使用catboost、xgboost和lightgbm三个模型分别输出三个结果,这时就可以将三个结果进行融合,最常见的是将结果直接进行加权平均融合。
1、加权平均
!pip install xgboost
!pip install --user catboost
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    """Train one boosting model with 5-fold CV and return OOF and test predictions.

    Args:
        clf: Model entry point matching ``clf_name``: the ``lightgbm`` module
            for ``"lgb"``, the ``xgboost`` module for ``"xgb"``, or a CatBoost
            estimator class (instantiated here) for ``"cat"``.
        train_x: Training features; must support ``.iloc`` row selection.
        train_y: Training targets, one per row of ``train_x``.
        test_x: Test features with the same columns as ``train_x``.
        clf_name: One of ``"lgb"``, ``"xgb"`` or ``"cat"``.
        seed: Random seed for the K-fold shuffle only; each model keeps the
            fixed seed written in its own parameter dict.

    Returns:
        ``(oof, test_predict)``: out-of-fold predictions for every training
        row, and test predictions averaged over the folds.

    Raises:
        ValueError: If ``clf_name`` is not a supported model name.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        # Fail fast: otherwise no branch below runs and the first use of
        # `val_pred` would raise a confusing NameError.
        raise ValueError(f"unsupported clf_name: {clf_name!r}")

    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # Fold indices are positional, so index a plain array rather than relying
    # on the pandas index of `train_y` being 0..n-1.
    y_values = np.asarray(train_y)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, y_values)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_values[train_index], y_values[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread': 16,
                'verbose': -1,
                'device': 'gpu',
            }
            # Take the callbacks from the module passed in as `clf`, so the
            # function does not depend on names imported elsewhere.
            callbacks = [
                clf.log_evaluation(period=200),
                clf.early_stopping(stopping_rounds=100),
            ]
            model = clf.train(
                params,
                train_matrix,
                1000,
                valid_sets=[train_matrix, valid_matrix],
                categorical_feature=[],
                callbacks=callbacks,
            )
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'gpu_hist',
                'seed': 520,
                'nthread': 16,
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(
                xgb_params,
                train_matrix,
                num_boost_round=1000,
                evals=watchlist,
                verbose_eval=200,
                early_stopping_rounds=100,
            )
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        else:  # clf_name == "cat"
            # 'random_seed' used to appear twice in this dict (2023, then 11);
            # only the last value took effect, so that one is kept.
            params = {
                'learning_rate': 0.1,
                'depth': 5,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter',
                'od_wait': 100,
                'random_seed': 11,
                'allow_writing_files': False,
            }
            model = clf(iterations=1000, **params)
            model.fit(
                trn_x,
                trn_y,
                eval_set=(val_x, val_y),
                metric_period=200,
                use_best_model=True,
                cat_features=[],
                verbose=1,
            )
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof[valid_index] = val_pred
        # Each fold contributes an equal share to the final test prediction.
        test_predict += test_pred / kf.n_splits

        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    return oof, test_predict
# The three base models share the same training features, targets and test
# features, so gather the inputs once and reuse them for every call.
fold_inputs = (train[train_cols], train['target'], test[train_cols])

# LightGBM
lgb_oof, lgb_test = cv_model(lgb, *fold_inputs, 'lgb')
# XGBoost
xgb_oof, xgb_test = cv_model(xgb, *fold_inputs, 'xgb')
# CatBoost
cat_oof, cat_test = cv_model(CatBoostRegressor, *fold_inputs, 'cat')

# Blend the three test predictions with equal weights.
final_test = (lgb_test + xgb_test + cat_test) / 3

# Write the submission file to the working directory.
test['target'] = final_test
submission = test[['id', 'dt', 'target']]
submission.to_csv('submit.csv', index=None)
优化后结果:
************************************ 1 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.78632 valid_1's l1: 6.95248
[400] training's l1: 6.62347 valid_1's l1: 6.87885
[600] training's l1: 6.50065 valid_1's l1: 6.8272
[800] training's l1: 6.39807 valid_1's l1: 6.7882
[1000] training's l1: 6.31308 valid_1's l1: 6.76125
Did not meet early stopping. Best iteration is:
[1000] training's l1: 6.31308 valid_1's l1: 6.76125
[6.761251494361233]
************************************ 2 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.79715 valid_1's l1: 6.92185
[400] training's l1: 6.6326 valid_1's l1: 6.8479
[600] training's l1: 6.50564 valid_1's l1: 6.79556
[800] training's l1: 6.40581 valid_1's l1: 6.7603
[1000] training's l1: 6.32145 valid_1's l1: 6.73159
Did not meet early stopping. Best iteration is:
[1000] training's l1: 6.32145 valid_1's l1: 6.73159
[6.761251494361233, 6.7315913230872315]
************************************ 3 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.7964 valid_1's l1: 6.91835
[400] training's l1: 6.63158 valid_1's l1: 6.8433
[600] training's l1: 6.50699 valid_1's l1: 6.79131
[800] training's l1: 6.40766 valid_1's l1: 6.75521
[1000] training's l1: 6.32223 valid_1's l1: 6.72883
Did not meet early stopping. Best iteration is:
[1000] training's l1: 6.32223 valid_1's l1: 6.72883
[6.761251494361233, 6.7315913230872315, 6.728832001672867]
************************************ 4 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.7943 valid_1's l1: 6.92993
[400] training's l1: 6.62444 valid_1's l1: 6.84888
[600] training's l1: 6.50276 valid_1's l1: 6.79908
[800] training's l1: 6.4026 valid_1's l1: 6.76296
[1000] training's l1: 6.31771 valid_1's l1: 6.73371
Did not meet early stopping. Best iteration is:
[997] training's l1: 6.31833 valid_1's l1: 6.73367
[6.761251494361233, 6.7315913230872315, 6.728832001672867, 6.733672011909271]
************************************ 5 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.80152 valid_1's l1: 6.9033
[400] training's l1: 6.63215 valid_1's l1: 6.82452
[600] training's l1: 6.51247 valid_1's l1: 6.778
[800] training's l1: 6.41195 valid_1's l1: 6.74017
[1000] training's l1: 6.32629 valid_1's l1: 6.71202
Did not meet early stopping. Best iteration is:
[1000] training's l1: 6.32629 valid_1's l1: 6.71202
[6.761251494361233, 6.7315913230872315, 6.728832001672867, 6.733672011909271, 6.712020485981799]
************************************ 1 ************************************
[0] train-mae:20.06720 eval-mae:20.09302
[200] train-mae:7.04024 eval-mae:7.13910
[400] train-mae:6.92315 eval-mae:7.06463
[600] train-mae:6.84113 eval-mae:7.02153
[800] train-mae:6.77516 eval-mae:6.98873
[999] train-mae:6.71672 eval-mae:6.96031
[6.960308911029275]
************************************ 2 ************************************
[0] train-mae:20.09680 eval-mae:20.03719
[200] train-mae:7.05168 eval-mae:7.09695
[400] train-mae:6.93349 eval-mae:7.01993
[600] train-mae:6.84741 eval-mae:6.97052
[800] train-mae:6.78573 eval-mae:6.94216
[999] train-mae:6.73247 eval-mae:6.92021
[6.960308911029275, 6.920208228240343]
************************************ 3 ************************************
[0] train-mae:20.08618 eval-mae:20.06633
[200] train-mae:7.05566 eval-mae:7.10080
[400] train-mae:6.92888 eval-mae:7.01809
[600] train-mae:6.84402 eval-mae:6.96830
[800] train-mae:6.77794 eval-mae:6.93574
[999] train-mae:6.72353 eval-mae:6.91141
[6.960308911029275, 6.920208228240343, 6.911406622339947]
************************************ 4 ************************************
[0] train-mae:20.06614 eval-mae:20.10679
[200] train-mae:7.04413 eval-mae:7.09352
[400] train-mae:6.92015 eval-mae:7.01450
[600] train-mae:6.84239 eval-mae:6.97546
[800] train-mae:6.77458 eval-mae:6.94225
[999] train-mae:6.72141 eval-mae:6.91981
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677]
************************************ 5 ************************************
[0] train-mae:20.08397 eval-mae:20.10031
[200] train-mae:7.05085 eval-mae:7.07912
[400] train-mae:6.93037 eval-mae:7.00484
[600] train-mae:6.84745 eval-mae:6.96061
[800] train-mae:6.78121 eval-mae:6.92794
[999] train-mae:6.72759 eval-mae:6.90559
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677, 6.905594216687921]
************************************ 1 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.3205033 test: 46.3576785 best: 46.3576785 (0) total: 266ms remaining: 4m 25s
200: learn: 13.9126247 test: 14.3410950 best: 14.3410950 (200) total: 22.3s remaining: 1m 28s
400: learn: 13.3661375 test: 13.8737376 best: 13.8737376 (400) total: 46s remaining: 1m 8s
600: learn: 13.0598841 test: 13.6529329 best: 13.6529329 (600) total: 1m 8s remaining: 45.7s
800: learn: 12.8496899 test: 13.5192282 best: 13.5192282 (800) total: 1m 31s remaining: 22.7s
999: learn: 12.6834443 test: 13.4166585 best: 13.4165960 (998) total: 1m 54s remaining: 0us
bestTest = 13.41659598
bestIteration = 998
Shrink model to first 999 iterations.
[7.099260258793829]
************************************ 2 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.4033873 test: 46.0217607 best: 46.0217607 (0) total: 99.3ms remaining: 1m 39s
200: learn: 13.9994523 test: 14.0023774 best: 14.0023774 (200) total: 22.7s remaining: 1m 30s
400: learn: 13.4470921 test: 13.5583153 best: 13.5583153 (400) total: 45s remaining: 1m 7s
600: learn: 13.1430679 test: 13.3498180 best: 13.3498180 (600) total: 1m 8s remaining: 45.3s
800: learn: 12.9218966 test: 13.2060609 best: 13.2060609 (800) total: 1m 31s remaining: 22.6s
999: learn: 12.7554102 test: 13.1069390 best: 13.1069390 (999) total: 1m 53s remaining: 0us
bestTest = 13.10693899
bestIteration = 999
[7.099260258793829, 7.060875208529937]
************************************ 3 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.3737008 test: 46.1273708 best: 46.1273708 (0) total: 178ms remaining: 2m 57s
200: learn: 13.9623718 test: 14.1185952 best: 14.1185952 (200) total: 23.3s remaining: 1m 32s
400: learn: 13.4091167 test: 13.6283616 best: 13.6283616 (400) total: 46.1s remaining: 1m 8s
600: learn: 13.1055305 test: 13.4185102 best: 13.4185102 (600) total: 1m 8s remaining: 45.7s
800: learn: 12.8884213 test: 13.2864531 best: 13.2864531 (800) total: 1m 31s remaining: 22.7s
999: learn: 12.7186680 test: 13.1846856 best: 13.1846856 (999) total: 1m 54s remaining: 0us
bestTest = 13.1846856
bestIteration = 999
[7.099260258793829, 7.060875208529937, 7.059477679678101]
************************************ 4 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.2527443 test: 46.6700736 best: 46.6700736 (0) total: 83.2ms remaining: 1m 23s
200: learn: 14.0091015 test: 13.9772248 best: 13.9772248 (200) total: 22.7s remaining: 1m 30s
400: learn: 13.4316130 test: 13.5311924 best: 13.5311924 (400) total: 45.3s remaining: 1m 7s
600: learn: 13.1216918 test: 13.3256936 best: 13.3256936 (600) total: 1m 8s remaining: 45.7s
800: learn: 12.9078973 test: 13.2027070 best: 13.2027070 (800) total: 1m 32s remaining: 22.9s
999: learn: 12.7307205 test: 13.1134695 best: 13.1134695 (999) total: 1m 55s remaining: 0us
bestTest = 13.1134695
bestIteration = 999
[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056]
************************************ 5 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.3127061 test: 46.4800237 best: 46.4800237 (0) total: 92.2ms remaining: 1m 32s
200: learn: 13.9496626 test: 13.9743784 best: 13.9743784 (200) total: 31.5s remaining: 2m 5s
400: learn: 13.4066428 test: 13.5415785 best: 13.5415785 (400) total: 55.5s remaining: 1m 22s
600: learn: 13.1092822 test: 13.3363320 best: 13.3363320 (600) total: 1m 18s remaining: 52s
800: learn: 12.8992729 test: 13.2192934 best: 13.2192934 (800) total: 1m 40s remaining: 25s
999: learn: 12.7321157 test: 13.1257847 best: 13.1257847 (999) total: 2m 3s remaining: 0us
bestTest = 13.12578465
bestIteration = 999
[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056, 7.044158168142322]
2、stacking融合
stacking是一种分层模型集成框架。以两层为例,第一层由多个基学习器组成,其输入为原始训练集,第二层的模型则是以第一层基学习器的输出作为特征加入训练集进行再训练,从而得到完整的stacking模型。
第一层:(类比cv_model函数)
- 划分训练数据为K折(5折为例,每次选择其中四份作为训练集,一份作为验证集);
- 针对各个模型RF、ET、GBDT、XGB,分别进行5次训练,每次训练保留一份样本用作训练时的验证,训练完成后分别对Validation set、Test set进行预测。对于Test set,一个模型会对应5个预测结果,将这5个结果取平均;对于Validation set,一个模型经过5次交叉验证后,所有验证集数据都含有一个预测标签。此步骤结束后:5个验证集(总数相当于训练集全部)在每个模型下分别有一个预测标签,每行数据共有4个标签(4个算法模型),测试集每行数据也拥有四个标签(4个模型分别预测得到的)。
第二层:(类比stack_model函数)
将训练集中四个模型给出的预测标签作为四列新特征、真实标签作为训练目标,构成新的训练集;选取一个训练模型在新的训练集上进行训练,然后对由测试集的四个预测标签组成的新测试集进行预测,作为最终的result。
from sklearn.linear_model import Ridge
def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y):
    """Fit a Ridge meta-model on three base models' predictions (stacking layer 2).

    Args:
        oof_1, oof_2, oof_3: Out-of-fold predictions of the base models as
            pandas objects (for example ``pd.DataFrame(lgb_oof)``), one row
            per training sample.
        predictions_1, predictions_2, predictions_3: The matching test
            predictions of the same base models, in the same order.
        y: True targets, aligned row by row with the OOF inputs.

    Returns:
        ``(oof, predictions)``: meta-model out-of-fold predictions for the
        training rows (the value from the last repeat that validated each row)
        and test predictions averaged over all folds of all repeats.

    Raises:
        ValueError: If the stacked OOF features and ``y`` differ in length.
    """
    n_splits, n_repeats = 5, 2
    total_folds = n_splits * n_repeats

    # Column-stack the base-model outputs into the meta-model feature tables.
    train_stack = pd.concat([oof_1, oof_2, oof_3], axis=1)
    test_stack = pd.concat([predictions_1, predictions_2, predictions_3], axis=1)

    # Fold indices are positional, so work on a plain array of targets.
    y_values = np.asarray(y)
    if train_stack.shape[0] != y_values.shape[0]:
        raise ValueError(
            "OOF features and targets differ in length: "
            f"{train_stack.shape[0]} vs {y_values.shape[0]}"
        )

    from sklearn.model_selection import RepeatedKFold

    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0],))
    scores = []

    folds = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=2021)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack)):
        print("fold n°{}".format(fold_ + 1))
        # .iloc / array indexing: select by position, matching the splitter output.
        trn_data, trn_y = train_stack.iloc[trn_idx], y_values[trn_idx]
        val_data, val_y = train_stack.iloc[val_idx], y_values[val_idx]

        clf = Ridge(random_state=2021)
        clf.fit(trn_data, trn_y)

        oof[val_idx] = clf.predict(val_data)
        predictions += clf.predict(test_stack) / total_folds

        score_single = mean_absolute_error(val_y, oof[val_idx])
        scores.append(score_single)
        # Report progress against the real number of folds (splits x repeats).
        print(f'{fold_+1}/{total_folds}', score_single)

    print('mean: ', np.mean(scores))
    return oof, predictions
# Wrap every base-model prediction vector in a DataFrame so that stack_model
# can concatenate them column-wise.
base_oofs = [pd.DataFrame(vec) for vec in (lgb_oof, xgb_oof, cat_oof)]
base_tests = [pd.DataFrame(vec) for vec in (lgb_test, xgb_test, cat_test)]
stack_oof, stack_pred = stack_model(*base_oofs, *base_tests, train['target'])

# Write the stacked predictions to the submission file.
test['target'] = stack_pred
submission = test[['id', 'dt', 'target']]
submission.to_csv('submit.csv', index=None)
优化后结果:
************************************ 1 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.78631 valid_1's l1: 6.95243
[400] training's l1: 6.61549 valid_1's l1: 6.8724
[600] training's l1: 6.49529 valid_1's l1: 6.82365
[800] training's l1: 6.3934 valid_1's l1: 6.78396
[1000] training's l1: 6.30565 valid_1's l1: 6.75324
Did not meet early stopping. Best iteration is:
[1000] training's l1: 6.30565 valid_1's l1: 6.75324
[6.753240414713993]
************************************ 2 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.79749 valid_1's l1: 6.92171
[400] training's l1: 6.62836 valid_1's l1: 6.84238
[600] training's l1: 6.5102 valid_1's l1: 6.79913
[800] training's l1: 6.40863 valid_1's l1: 6.76193
[1000] training's l1: 6.32057 valid_1's l1: 6.72975
Did not meet early stopping. Best iteration is:
[1000] training's l1: 6.32057 valid_1's l1: 6.72975
[6.753240414713993, 6.729754334868036]
************************************ 3 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.80203 valid_1's l1: 6.92696
[400] training's l1: 6.63223 valid_1's l1: 6.84552
[600] training's l1: 6.51031 valid_1's l1: 6.79671
[800] training's l1: 6.40858 valid_1's l1: 6.75748
[1000] training's l1: 6.32021 valid_1's l1: 6.72623
Did not meet early stopping. Best iteration is:
[1000] training's l1: 6.32021 valid_1's l1: 6.72623
[6.753240414713993, 6.729754334868036, 6.726230013028263]
************************************ 4 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.79139 valid_1's l1: 6.92244
[400] training's l1: 6.6234 valid_1's l1: 6.84798
[600] training's l1: 6.50205 valid_1's l1: 6.79755
[800] training's l1: 6.40186 valid_1's l1: 6.76061
[1000] training's l1: 6.31948 valid_1's l1: 6.7333
Did not meet early stopping. Best iteration is:
[996] training's l1: 6.32045 valid_1's l1: 6.73313
[6.753240414713993, 6.729754334868036, 6.726230013028263, 6.733130077495729]
************************************ 5 ************************************
Training until validation scores don't improve for 100 rounds
[200] training's l1: 6.80002 valid_1's l1: 6.90037
[400] training's l1: 6.63216 valid_1's l1: 6.82357
[600] training's l1: 6.51069 valid_1's l1: 6.77679
[800] training's l1: 6.41138 valid_1's l1: 6.74125
[1000] training's l1: 6.32466 valid_1's l1: 6.71099
Did not meet early stopping. Best iteration is:
[999] training's l1: 6.32494 valid_1's l1: 6.71098
[6.753240414713993, 6.729754334868036, 6.726230013028263, 6.733130077495729, 6.710976025778095]
************************************ 1 ************************************
[0] train-mae:20.06720 eval-mae:20.09302
[200] train-mae:7.04024 eval-mae:7.13910
[400] train-mae:6.92315 eval-mae:7.06463
[600] train-mae:6.84113 eval-mae:7.02153
[800] train-mae:6.77516 eval-mae:6.98873
[999] train-mae:6.71672 eval-mae:6.96031
[6.960308911029275]
************************************ 2 ************************************
[0] train-mae:20.09680 eval-mae:20.03719
[200] train-mae:7.05168 eval-mae:7.09695
[400] train-mae:6.93349 eval-mae:7.01993
[600] train-mae:6.84741 eval-mae:6.97052
[800] train-mae:6.78573 eval-mae:6.94216
[999] train-mae:6.73247 eval-mae:6.92021
[6.960308911029275, 6.920208228240343]
************************************ 3 ************************************
[0] train-mae:20.08618 eval-mae:20.06633
[200] train-mae:7.05566 eval-mae:7.10080
[400] train-mae:6.92888 eval-mae:7.01809
[600] train-mae:6.84402 eval-mae:6.96830
[800] train-mae:6.77794 eval-mae:6.93574
[999] train-mae:6.72353 eval-mae:6.91141
[6.960308911029275, 6.920208228240343, 6.911406622339947]
************************************ 4 ************************************
[0] train-mae:20.06614 eval-mae:20.10679
[200] train-mae:7.04413 eval-mae:7.09352
[400] train-mae:6.92015 eval-mae:7.01450
[600] train-mae:6.84239 eval-mae:6.97546
[800] train-mae:6.77458 eval-mae:6.94225
[999] train-mae:6.72141 eval-mae:6.91981
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677]
************************************ 5 ************************************
[0] train-mae:20.08397 eval-mae:20.10031
[200] train-mae:7.05085 eval-mae:7.07912
[400] train-mae:6.93037 eval-mae:7.00484
[600] train-mae:6.84745 eval-mae:6.96061
[800] train-mae:6.78121 eval-mae:6.92794
[999] train-mae:6.72759 eval-mae:6.90559
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677, 6.905594216687921]
************************************ 1 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.3205033 test: 46.3576785 best: 46.3576785 (0) total: 422ms remaining: 7m 1s
200: learn: 13.9126247 test: 14.3410950 best: 14.3410950 (200) total: 39.8s remaining: 2m 38s
400: learn: 13.3661375 test: 13.8737376 best: 13.8737376 (400) total: 1m 20s remaining: 2m
600: learn: 13.0598841 test: 13.6529329 best: 13.6529329 (600) total: 1m 45s remaining: 1m 9s
800: learn: 12.8496899 test: 13.5192282 best: 13.5192282 (800) total: 2m 13s remaining: 33.2s
999: learn: 12.6834443 test: 13.4166585 best: 13.4165960 (998) total: 2m 54s remaining: 0us
bestTest = 13.41659598
bestIteration = 998
Shrink model to first 999 iterations.
[7.099260258793829]
************************************ 2 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.4033873 test: 46.0217607 best: 46.0217607 (0) total: 267ms remaining: 4m 27s
200: learn: 13.9994523 test: 14.0023774 best: 14.0023774 (200) total: 40.3s remaining: 2m 40s
400: learn: 13.4470921 test: 13.5583153 best: 13.5583153 (400) total: 1m 20s remaining: 2m
600: learn: 13.1430679 test: 13.3498180 best: 13.3498180 (600) total: 2m 1s remaining: 1m 20s
800: learn: 12.9218966 test: 13.2060609 best: 13.2060609 (800) total: 2m 41s remaining: 40.2s
999: learn: 12.7554102 test: 13.1069390 best: 13.1069390 (999) total: 3m 18s remaining: 0us
bestTest = 13.10693899
bestIteration = 999
[7.099260258793829, 7.060875208529937]
************************************ 3 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.3737008 test: 46.1273708 best: 46.1273708 (0) total: 252ms remaining: 4m 11s
200: learn: 13.9623718 test: 14.1185952 best: 14.1185952 (200) total: 40.9s remaining: 2m 42s
400: learn: 13.4091167 test: 13.6283616 best: 13.6283616 (400) total: 1m 22s remaining: 2m 2s
600: learn: 13.1055305 test: 13.4185102 best: 13.4185102 (600) total: 2m 3s remaining: 1m 21s
800: learn: 12.8884213 test: 13.2864531 best: 13.2864531 (800) total: 2m 42s remaining: 40.4s
999: learn: 12.7186680 test: 13.1846856 best: 13.1846856 (999) total: 3m 22s remaining: 0us
bestTest = 13.1846856
bestIteration = 999
[7.099260258793829, 7.060875208529937, 7.059477679678101]
************************************ 4 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.2527443 test: 46.6700736 best: 46.6700736 (0) total: 239ms remaining: 3m 59s
200: learn: 14.0091015 test: 13.9772248 best: 13.9772248 (200) total: 37.7s remaining: 2m 29s
400: learn: 13.4316130 test: 13.5311924 best: 13.5311924 (400) total: 1m 1s remaining: 1m 31s
600: learn: 13.1216918 test: 13.3256936 best: 13.3256936 (600) total: 1m 24s remaining: 56s
800: learn: 12.9078973 test: 13.2027070 best: 13.2027070 (800) total: 1m 47s remaining: 26.7s
999: learn: 12.7307205 test: 13.1134695 best: 13.1134695 (999) total: 2m 10s remaining: 0us
bestTest = 13.1134695
bestIteration = 999
[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056]
************************************ 5 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 46.3127061 test: 46.4800237 best: 46.4800237 (0) total: 88ms remaining: 1m 27s
200: learn: 13.9496626 test: 13.9743784 best: 13.9743784 (200) total: 22.7s remaining: 1m 30s
400: learn: 13.4066428 test: 13.5415785 best: 13.5415785 (400) total: 51.4s remaining: 1m 16s
600: learn: 13.1092822 test: 13.3363320 best: 13.3363320 (600) total: 1m 32s remaining: 1m 1s
800: learn: 12.8992729 test: 13.2192934 best: 13.2192934 (800) total: 2m 12s remaining: 32.9s
999: learn: 12.7321157 test: 13.1257847 best: 13.1257847 (999) total: 2m 52s remaining: 0us
bestTest = 13.12578465
bestIteration = 999
[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056, 7.044158168142322]
fold n°1
1/5 6.719509406634522
fold n°2
2/5 6.72299326754129
fold n°3
3/5 6.746608252531554
fold n°4
4/5 6.725625750031283
fold n°5
5/5 6.720224785722014
fold n°6
6/5 6.7103004355113995
fold n°7
7/5 6.732119822600345
fold n°8
8/5 6.715782457305116
fold n°9
9/5 6.731914243360591
fold n°10
10/5 6.744683998081653
mean: 6.726976241931976
深度学习
1、代码方案
!pip install keras
!pip install tensorflow
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
# Load the training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# 数据预处理
def preprocess_data(df, look_back=100, target_col=3, horizon=10, n_windows=5):
    """Turn a long table of per-id series into fixed-length LSTM arrays.

    For every id, ``n_windows`` training samples are cut from the series: the
    sample starting at row ``s`` (``s = horizon .. horizon + n_windows - 1``)
    uses rows ``s .. s + look_back - 1`` as input and the ``horizon`` rows
    right before ``s`` as target.  One extra inference input per id is built
    from the first ``look_back`` rows.  Inputs shorter than ``look_back`` are
    zero-padded, and every window is reversed row-wise.

    Args:
        df: Table with an ``'id'`` column; the value column is selected by
            position via ``target_col``.
        look_back: Length of every input sequence.
        target_col: Positional index of the value column (default 3).
            NOTE(review): assumes the 4th column holds the target; confirm
            against the CSV layout.
        horizon: Number of rows in each target sequence.
        n_windows: Number of training windows built per id.

    Returns:
        ``(X, Y, OOT)`` float64 arrays of shape ``(samples, look_back)``,
        ``(samples, horizon)`` and ``(n_ids, look_back)``.  Windows whose
        series is too short to supply a full target are skipped.
    """
    def pad_and_reverse(window):
        # Zero-fill up to look_back, then flip; the zeros end up at the front.
        padding = np.zeros(look_back - len(window))
        return np.concatenate([window, padding])[::-1]

    X, Y, OOT = [], [], []
    for _, group in df.groupby('id'):
        series = group.iloc[:, target_col].to_numpy(dtype=np.float64)

        for start in range(horizon, horizon + n_windows):
            targets = series[start - horizon:start]
            if len(targets) < horizon:
                # An incomplete target window would make Y ragged.
                continue
            X.append(pad_and_reverse(series[start:start + look_back]))
            Y.append(targets[::-1])

        # Inference input: the first look_back rows of this id.
        OOT.append(pad_and_reverse(series[:look_back]))

    return (
        np.array(X, dtype=np.float64),
        np.array(Y, dtype=np.float64),
        np.array(OOT, dtype=np.float64),
    )
# 定义模型
def build_model(look_back, n_features, n_output):
    """Assemble and compile the LSTM encoder-decoder used for forecasting.

    Args:
        look_back: Number of time steps in each input sequence.
        n_features: Number of features per time step.
        n_output: Number of time steps to predict.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001,
        mean-squared-error loss).
    """
    network = Sequential()
    stack = (
        # Encoder: reads the input sequence and emits one vector.
        LSTM(100, input_shape=(look_back, n_features)),
        Dropout(0.2),
        # Copy that vector once per output step for the decoder.
        RepeatVector(n_output),
        # Decoder: return_sequences=True keeps one output per step.
        LSTM(100, return_sequences=True),
        # One scalar prediction per output step.
        TimeDistributed(Dense(1)),
    )
    for layer in stack:
        network.add(layer)

    network.compile(optimizer=Adam(0.001), loss='mean_squared_error')
    return network
# Build and train the model.
look_back = 100  # length of each input sequence
n_features = 1   # one feature per time step
n_output = 10    # number of future time steps to predict

# Turn the training table into model arrays.
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Build the network; stop once val_loss has not improved for 5 epochs and
# restore the best weights.
model = build_model(look_back, n_features, n_output)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train with the last 20% of the samples held out for validation.
model.fit(X, Y, epochs=50, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stopping])

# Predict one output sequence per id.
predicted_values = model.predict(OOT)

# OOT rows come out of preprocess_data in the iteration order of
# df.groupby('id'), which is sorted by id.  Take the ids from the same
# iteration rather than from train['id'].unique() (order of appearance), so
# that each prediction row is attached to the id it was computed for.
series_ids = [key for key, _ in train.groupby('id')]

# NOTE(review): output step j is exported as dt = j + 1; confirm this matches
# the order in which preprocess_data lays out its (reversed) target windows.
predictions = [
    [series_id, step + 1, predicted_values[row, step, 0]]
    for row, series_id in enumerate(series_ids)
    for step in range(n_output)
]

# Collect the predictions into a table and export it as CSV.
df_predictions = pd.DataFrame(predictions, columns=['id', 'dt', 'target'])
df_predictions.to_csv('predictions.csv', index=False, encoding='utf-8')
print("预测结果已保存至 predictions.csv 文件")
2、代码解析
(1)导入库和加载数据
!pip install keras
!pip install tensorflow
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
# Load the training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
- 导入库: 导入了必要的库,包括 `numpy`、`pandas`、`sklearn` 中的 `MinMaxScaler`,以及从 `keras` 中导入的多个组件(`Sequential`、`LSTM`、`Dense`、`RepeatVector`、`TimeDistributed`、`Adam` 优化器)。
- 加载数据: 使用 `pandas` 加载训练数据(`train.csv`)和测试数据(`test.csv`)。
(2)数据预处理函数 (`preprocess_data`)
def preprocess_data(df, look_back=100, target_col=3, horizon=10, n_windows=5):
    """Turn a long table of per-id series into fixed-length LSTM arrays.

    For every id, ``n_windows`` training samples are cut from the series: the
    sample starting at row ``s`` (``s = horizon .. horizon + n_windows - 1``)
    uses rows ``s .. s + look_back - 1`` as input and the ``horizon`` rows
    right before ``s`` as target.  One extra inference input per id is built
    from the first ``look_back`` rows.  Inputs shorter than ``look_back`` are
    zero-padded, and every window is reversed row-wise.

    Args:
        df: Table with an ``'id'`` column; the value column is selected by
            position via ``target_col``.
        look_back: Length of every input sequence.
        target_col: Positional index of the value column (default 3).
            NOTE(review): assumes the 4th column holds the target; confirm
            against the CSV layout.
        horizon: Number of rows in each target sequence.
        n_windows: Number of training windows built per id.

    Returns:
        ``(X, Y, OOT)`` float64 arrays of shape ``(samples, look_back)``,
        ``(samples, horizon)`` and ``(n_ids, look_back)``.  Windows whose
        series is too short to supply a full target are skipped.
    """
    def pad_and_reverse(window):
        # Zero-fill up to look_back, then flip; the zeros end up at the front.
        padding = np.zeros(look_back - len(window))
        return np.concatenate([window, padding])[::-1]

    X, Y, OOT = [], [], []
    for _, group in df.groupby('id'):
        series = group.iloc[:, target_col].to_numpy(dtype=np.float64)

        for start in range(horizon, horizon + n_windows):
            targets = series[start - horizon:start]
            if len(targets) < horizon:
                # An incomplete target window would make Y ragged.
                continue
            X.append(pad_and_reverse(series[start:start + look_back]))
            Y.append(targets[::-1])

        # Inference input: the first look_back rows of this id.
        OOT.append(pad_and_reverse(series[:look_back]))

    return (
        np.array(X, dtype=np.float64),
        np.array(Y, dtype=np.float64),
        np.array(OOT, dtype=np.float64),
    )
- 数据分组: 根据 `'id'` 列对数据进行分组,处理多个时间序列数据。
- 训练数据准备 (`X` 和 `Y`): 为训练准备序列:
  - `X`: 每个序列长度为 `look_back`,如果不足则用零填充。
  - `Y`: 对应于每个 `X` 的目标序列,取紧邻该输入窗口之前的 `10` 行数据(即 `data[i-10:i]`)。
- 测试数据准备 (`OOT`): 准备用于预测的输入数据,构造方式类似于 `X`,但取的是每个时间序列最前面的 `look_back` 行。
(3)模型构建函数 (`build_model`)
def build_model(look_back, n_features, n_output):
    """Assemble and compile the LSTM encoder-decoder used for forecasting.

    Args:
        look_back: Number of time steps in each input sequence.
        n_features: Number of features per time step.
        n_output: Number of time steps to predict.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001,
        mean-squared-error loss).
    """
    network = Sequential()
    stack = (
        # Encoder: reads the input sequence and emits one vector.
        LSTM(100, input_shape=(look_back, n_features)),
        Dropout(0.2),
        # Copy that vector once per output step for the decoder.
        RepeatVector(n_output),
        # Decoder: return_sequences=True keeps one output per step.
        LSTM(100, return_sequences=True),
        # One scalar prediction per output step.
        TimeDistributed(Dense(1)),
    )
    for layer in stack:
        network.add(layer)

    network.compile(optimizer=Adam(0.001), loss='mean_squared_error')
    return network
- LSTM 模型配置:
  - 使用 `Sequential` 模型。
  - 第一个 LSTM 层有 `100` 个单元,输入形状由 `look_back` 和 `n_features` 确定。
  - `RepeatVector` 层将第一个 LSTM 层输出的向量重复 `n_output` 次,作为解码部分的输入序列。
  - 第二个 LSTM 层也有 `100` 个单元,并且设置 `return_sequences=True` 以输出序列给 `TimeDistributed` 层。
  - `TimeDistributed` 层使用 `Dense(1)`,在每个时间步输出单一的预测值。
  - 优化器设置为 Adam,损失函数为均方误差 (`'mean_squared_error'`)。
(4)模型训练和预测
# Hyper-parameters: input sequence length, features per time step, and the
# number of time steps to predict.
look_back, n_features, n_output = 100, 1, 10

# Turn the training table into model arrays.
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Build the network; stop once val_loss has not improved for 5 epochs and
# restore the best weights.
model = build_model(look_back, n_features, n_output)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train with the last 20% of the samples held out for validation.
model.fit(
    X,
    Y,
    epochs=50,
    batch_size=64,
    verbose=1,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Predict one output sequence per id.
predicted_values = model.predict(OOT)
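- 模型训练:
  - 使用 `preprocess_data` 函数处理训练数据 (`train`),生成 `X` 和 `Y`。
  - 调用 `build_model` 函数构建模型。
  - 使用 `fit` 方法训练模型,采用 `50` 个周期 (`epochs`),每个批次 (`batch_size`) 大小为 `64`;其中 20% 的样本用作验证集 (`validation_split=0.2`),并配合 `EarlyStopping`(`patience=5`)提前停止。
- 预测:
  - 预测输入 `OOT` 同样由 `preprocess_data` 基于训练数据 (`train`) 生成(取每个 id 最前面的 `look_back` 条记录),代码中并未使用 `test`。
  - 使用 `model.predict(OOT)` 进行预测。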
3、运行后结果
Epoch 1/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 60ms/step - loss: 3071.0012 - val_loss: 2511.0078
Epoch 2/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 64ms/step - loss: 2268.7061 - val_loss: 2194.5291
Epoch 3/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 1983.3658 - val_loss: 1960.4478
Epoch 4/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 59ms/step - loss: 1881.2417 - val_loss: 1771.8282
Epoch 5/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 1306.5902 - val_loss: 1613.6753
Epoch 6/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 1481.9780 - val_loss: 1476.9568
Epoch 7/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 1259.1500 - val_loss: 1369.5964
Epoch 8/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 1410.8124 - val_loss: 1262.5564
Epoch 9/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 864.4413 - val_loss: 1173.4703
Epoch 10/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 954.0659 - val_loss: 1086.0402
Epoch 11/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 901.7515 - val_loss: 1015.5499
Epoch 12/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 783.8499 - val_loss: 945.5701
Epoch 13/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 819.7301 - val_loss: 885.4941
Epoch 14/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 672.4242 - val_loss: 855.5969
Epoch 15/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 59ms/step - loss: 635.2537 - val_loss: 797.3814
Epoch 16/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 641.1038 - val_loss: 760.0760
Epoch 17/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 649.9288 - val_loss: 719.8991
Epoch 18/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 698.9720 - val_loss: 704.7115
Epoch 19/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 596.2827 - val_loss: 654.0480
Epoch 20/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 535.0333 - val_loss: 636.0656
Epoch 21/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 479.3533 - val_loss: 602.2112
Epoch 22/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 478.9056 - val_loss: 590.2931
Epoch 23/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 517.2261 - val_loss: 561.5989
Epoch 24/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 496.8016 - val_loss: 547.9958
Epoch 25/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 64ms/step - loss: 399.3697 - val_loss: 528.7585
Epoch 26/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 422.9390 - val_loss: 511.4002
Epoch 27/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 447.9508 - val_loss: 495.9063
Epoch 28/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 410.4958 - val_loss: 484.4503
Epoch 29/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 431.5000 - val_loss: 469.6380
Epoch 30/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 458.8902 - val_loss: 453.5676
Epoch 31/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 406.6499 - val_loss: 453.7151
Epoch 32/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 408.7878 - val_loss: 428.7639
Epoch 33/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 59ms/step - loss: 433.1059 - val_loss: 437.8275
Epoch 34/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 64ms/step - loss: 372.2057 - val_loss: 411.6165
Epoch 35/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 65ms/step - loss: 425.1301 - val_loss: 404.3243
Epoch 36/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 63ms/step - loss: 398.0203 - val_loss: 428.1054
Epoch 37/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 332.0200 - val_loss: 386.3285
Epoch 38/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 424.4984 - val_loss: 381.9043
Epoch 39/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 387.1086 - val_loss: 371.9949
Epoch 40/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 63ms/step - loss: 361.7874 - val_loss: 358.6019
Epoch 41/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 44s 120ms/step - loss: 391.1438 - val_loss: 355.7131
Epoch 42/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 26s 72ms/step - loss: 408.8496 - val_loss: 356.3145
Epoch 43/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 316.3817 - val_loss: 335.3656
Epoch 44/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 390.0861 - val_loss: 371.6464
Epoch 45/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 365.2065 - val_loss: 350.7266
Epoch 46/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 356.5405 - val_loss: 333.1058
Epoch 47/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 348.1533 - val_loss: 326.3665
Epoch 48/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 61s 167ms/step - loss: 377.0595 - val_loss: 317.8427
Epoch 49/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 20s 56ms/step - loss: 300.5044 - val_loss: 317.0854
Epoch 50/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 321.8989 - val_loss: 316.6048
183/183 ━━━━━━━━━━━━━━━━━━━━ 3s 13ms/step
预测结果已保存至 predictions.csv 文件
改进方向
1、序列到序列模型
- 可以使用编码器-解码器架构,如Seq2Seq模型,用于处理输入序列和输出序列之间的复杂关系。这种方法在处理多步时间序列预测时特别有效。
2、注意力机制
- 将注意力机制引入到模型中,能够使模型在处理长序列时更加有效,专注于关键时间点的信息。
3、变分自编码器 (VAE) 和 生成对抗网络 (GAN)
- 这些模型通常用于生成数据,但也可以应用于时间序列预测问题,例如通过生成更真实的未来数据分布来提高预测准确性。
4、多尺度和多层次的模型
- 结合不同时间尺度的信息,例如同时考虑小时、日和周的模式,可以利用多层次的结构来捕捉数据中的不同变化模式。
5、概率预测模型
- 通过建立概率模型,如贝叶斯神经网络或深度概率模型,可以生成预测的置信区间,提供更全面的预测结果评估。
hahaha都看到这里了,要是觉得有用的话就辛苦动动小手点个赞吧!