Datawhale AI Summer Camp, Machine Learning - Task 3 Study Notes

        The main goal of Task 3 is to develop a deeper understanding of this problem, and to experiment with it, at the level of both the overall framework and deep learning models.

Task 2 Results

        We first present the results of running the advanced Task 2 code, as a baseline against which the Task 3 improvements can be compared.

Feature Optimization

        Task 2 introduced feature engineering, including difference features. Feature optimization therefore fuses lag (historical shift) features, rolling-window statistics, and difference features, so that the characteristics of the data are captured and extracted more accurately.

# Merge the training and test data so features are built consistently across both
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# dt counts backwards in this competition (larger dt = earlier day), so sorting
# descending puts each id's rows in chronological order
data = data.sort_values(['id','dt'], ascending=False).reset_index(drop=True)

# Lag (historical shift) features
for i in range(10,36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features
for i in range(1,4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)
    
# Rolling-window statistics (closed='left' excludes the current row)
for win in [15,30,50,70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').std().values

# Lag + rolling-window statistics
for win in [7,14,28,35,50,70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').sum().values
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').std().values
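
        As a quick sanity check on how shift and diff compose within an id group, here is a toy example (illustrative only, not part of the pipeline):

import pandas as pd

# One id, four observations in chronological order
toy = pd.DataFrame({'id': [1, 1, 1, 1], 'target': [10.0, 12.0, 15.0, 11.0]})
# shift(1) moves each value down one row within its group
toy['shift1'] = toy.groupby('id')['target'].shift(1)  # NaN, 10, 12, 15
# diff(1) equals target - target.shift(1)
toy['diff1'] = toy.groupby('id')['target'].diff(1)    # NaN, 2, 3, -4
print(toy)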

        Results after optimization: 

Model Fusion

        Model fusion presupposes outputs from several models: for example, catboost, xgboost and lightgbm can each produce their own predictions, which can then be fused. The most common approach is a direct weighted average of the results (a score-weighted variant is sketched after the training logs below).

        1. Weighted averaging

!pip install xgboost
!pip install --user catboost
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2024):
    '''
    clf: the model module/class to call
    train_x: training data
    train_y: labels for the training data
    test_x: test data
    clf_name: name of the model to use
    seed: random seed
    '''
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread' : 16,
                'verbose' : -1,
                'device': 'gpu'
            }
            callbacks1 = [log_evaluation(period=200), early_stopping(stopping_rounds=100)]
            model = clf.train(params, train_matrix, 1000, valid_sets=[train_matrix, valid_matrix],categorical_feature=[],callbacks=callbacks1)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        
        if clf_name == "xgb":
            xgb_params = {
              'booster': 'gbtree', 
              'objective': 'reg:squarederror',
              'eval_metric': 'mae',
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.1,
              'tree_method': 'gpu_hist',
              'seed': 520,
              'nthread': 16
              }
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist, verbose_eval=200, early_stopping_rounds=100)
            val_pred  = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
            
        if clf_name == "cat":
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type':'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True, 
                      cat_features=[],
                      verbose=1)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
        
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
        
    return oof, test_predict

# Run the lightgbm model
lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
# Run the xgboost model
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
# Run the catboost model
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

# Fuse by taking the simple average
final_test = (lgb_test + xgb_test + cat_test) / 3

# Save the submission file locally
test['target'] = final_test
test[['id','dt','target']].to_csv('submit.csv', index=None)

        Results after optimization: 

************************************ 1 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.78632	valid_1's l1: 6.95248
[400]	training's l1: 6.62347	valid_1's l1: 6.87885
[600]	training's l1: 6.50065	valid_1's l1: 6.8272
[800]	training's l1: 6.39807	valid_1's l1: 6.7882
[1000]	training's l1: 6.31308	valid_1's l1: 6.76125
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 6.31308	valid_1's l1: 6.76125
[6.761251494361233]
************************************ 2 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.79715	valid_1's l1: 6.92185
[400]	training's l1: 6.6326	valid_1's l1: 6.8479
[600]	training's l1: 6.50564	valid_1's l1: 6.79556
[800]	training's l1: 6.40581	valid_1's l1: 6.7603
[1000]	training's l1: 6.32145	valid_1's l1: 6.73159
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 6.32145	valid_1's l1: 6.73159
[6.761251494361233, 6.7315913230872315]
************************************ 3 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.7964	valid_1's l1: 6.91835
[400]	training's l1: 6.63158	valid_1's l1: 6.8433
[600]	training's l1: 6.50699	valid_1's l1: 6.79131
[800]	training's l1: 6.40766	valid_1's l1: 6.75521
[1000]	training's l1: 6.32223	valid_1's l1: 6.72883
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 6.32223	valid_1's l1: 6.72883
[6.761251494361233, 6.7315913230872315, 6.728832001672867]
************************************ 4 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.7943	valid_1's l1: 6.92993
[400]	training's l1: 6.62444	valid_1's l1: 6.84888
[600]	training's l1: 6.50276	valid_1's l1: 6.79908
[800]	training's l1: 6.4026	valid_1's l1: 6.76296
[1000]	training's l1: 6.31771	valid_1's l1: 6.73371
Did not meet early stopping. Best iteration is:
[997]	training's l1: 6.31833	valid_1's l1: 6.73367
[6.761251494361233, 6.7315913230872315, 6.728832001672867, 6.733672011909271]
************************************ 5 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.80152	valid_1's l1: 6.9033
[400]	training's l1: 6.63215	valid_1's l1: 6.82452
[600]	training's l1: 6.51247	valid_1's l1: 6.778
[800]	training's l1: 6.41195	valid_1's l1: 6.74017
[1000]	training's l1: 6.32629	valid_1's l1: 6.71202
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 6.32629	valid_1's l1: 6.71202
[6.761251494361233, 6.7315913230872315, 6.728832001672867, 6.733672011909271, 6.712020485981799]
************************************ 1 ************************************
[0]	train-mae:20.06720	eval-mae:20.09302
[200]	train-mae:7.04024	eval-mae:7.13910
[400]	train-mae:6.92315	eval-mae:7.06463
[600]	train-mae:6.84113	eval-mae:7.02153
[800]	train-mae:6.77516	eval-mae:6.98873
[999]	train-mae:6.71672	eval-mae:6.96031
[6.960308911029275]
************************************ 2 ************************************
[0]	train-mae:20.09680	eval-mae:20.03719
[200]	train-mae:7.05168	eval-mae:7.09695
[400]	train-mae:6.93349	eval-mae:7.01993
[600]	train-mae:6.84741	eval-mae:6.97052
[800]	train-mae:6.78573	eval-mae:6.94216
[999]	train-mae:6.73247	eval-mae:6.92021
[6.960308911029275, 6.920208228240343]
************************************ 3 ************************************
[0]	train-mae:20.08618	eval-mae:20.06633
[200]	train-mae:7.05566	eval-mae:7.10080
[400]	train-mae:6.92888	eval-mae:7.01809
[600]	train-mae:6.84402	eval-mae:6.96830
[800]	train-mae:6.77794	eval-mae:6.93574
[999]	train-mae:6.72353	eval-mae:6.91141
[6.960308911029275, 6.920208228240343, 6.911406622339947]
************************************ 4 ************************************
[0]	train-mae:20.06614	eval-mae:20.10679
[200]	train-mae:7.04413	eval-mae:7.09352
[400]	train-mae:6.92015	eval-mae:7.01450
[600]	train-mae:6.84239	eval-mae:6.97546
[800]	train-mae:6.77458	eval-mae:6.94225
[999]	train-mae:6.72141	eval-mae:6.91981
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677]
************************************ 5 ************************************
[0]	train-mae:20.08397	eval-mae:20.10031
[200]	train-mae:7.05085	eval-mae:7.07912
[400]	train-mae:6.93037	eval-mae:7.00484
[600]	train-mae:6.84745	eval-mae:6.96061
[800]	train-mae:6.78121	eval-mae:6.92794
[999]	train-mae:6.72759	eval-mae:6.90559
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677, 6.905594216687921]
************************************ 1 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.3205033	test: 46.3576785	best: 46.3576785 (0)	total: 266ms	remaining: 4m 25s
200:	learn: 13.9126247	test: 14.3410950	best: 14.3410950 (200)	total: 22.3s	remaining: 1m 28s
400:	learn: 13.3661375	test: 13.8737376	best: 13.8737376 (400)	total: 46s	remaining: 1m 8s
600:	learn: 13.0598841	test: 13.6529329	best: 13.6529329 (600)	total: 1m 8s	remaining: 45.7s
800:	learn: 12.8496899	test: 13.5192282	best: 13.5192282 (800)	total: 1m 31s	remaining: 22.7s
999:	learn: 12.6834443	test: 13.4166585	best: 13.4165960 (998)	total: 1m 54s	remaining: 0us

bestTest = 13.41659598
bestIteration = 998

Shrink model to first 999 iterations.
[7.099260258793829]
************************************ 2 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.4033873	test: 46.0217607	best: 46.0217607 (0)	total: 99.3ms	remaining: 1m 39s
200:	learn: 13.9994523	test: 14.0023774	best: 14.0023774 (200)	total: 22.7s	remaining: 1m 30s
400:	learn: 13.4470921	test: 13.5583153	best: 13.5583153 (400)	total: 45s	remaining: 1m 7s
600:	learn: 13.1430679	test: 13.3498180	best: 13.3498180 (600)	total: 1m 8s	remaining: 45.3s
800:	learn: 12.9218966	test: 13.2060609	best: 13.2060609 (800)	total: 1m 31s	remaining: 22.6s
999:	learn: 12.7554102	test: 13.1069390	best: 13.1069390 (999)	total: 1m 53s	remaining: 0us

bestTest = 13.10693899
bestIteration = 999

[7.099260258793829, 7.060875208529937]
************************************ 3 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.3737008	test: 46.1273708	best: 46.1273708 (0)	total: 178ms	remaining: 2m 57s
200:	learn: 13.9623718	test: 14.1185952	best: 14.1185952 (200)	total: 23.3s	remaining: 1m 32s
400:	learn: 13.4091167	test: 13.6283616	best: 13.6283616 (400)	total: 46.1s	remaining: 1m 8s
600:	learn: 13.1055305	test: 13.4185102	best: 13.4185102 (600)	total: 1m 8s	remaining: 45.7s
800:	learn: 12.8884213	test: 13.2864531	best: 13.2864531 (800)	total: 1m 31s	remaining: 22.7s
999:	learn: 12.7186680	test: 13.1846856	best: 13.1846856 (999)	total: 1m 54s	remaining: 0us

bestTest = 13.1846856
bestIteration = 999

[7.099260258793829, 7.060875208529937, 7.059477679678101]
************************************ 4 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.2527443	test: 46.6700736	best: 46.6700736 (0)	total: 83.2ms	remaining: 1m 23s
200:	learn: 14.0091015	test: 13.9772248	best: 13.9772248 (200)	total: 22.7s	remaining: 1m 30s
400:	learn: 13.4316130	test: 13.5311924	best: 13.5311924 (400)	total: 45.3s	remaining: 1m 7s
600:	learn: 13.1216918	test: 13.3256936	best: 13.3256936 (600)	total: 1m 8s	remaining: 45.7s
800:	learn: 12.9078973	test: 13.2027070	best: 13.2027070 (800)	total: 1m 32s	remaining: 22.9s
999:	learn: 12.7307205	test: 13.1134695	best: 13.1134695 (999)	total: 1m 55s	remaining: 0us

bestTest = 13.1134695
bestIteration = 999

[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056]
************************************ 5 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.3127061	test: 46.4800237	best: 46.4800237 (0)	total: 92.2ms	remaining: 1m 32s
200:	learn: 13.9496626	test: 13.9743784	best: 13.9743784 (200)	total: 31.5s	remaining: 2m 5s
400:	learn: 13.4066428	test: 13.5415785	best: 13.5415785 (400)	total: 55.5s	remaining: 1m 22s
600:	learn: 13.1092822	test: 13.3363320	best: 13.3363320 (600)	total: 1m 18s	remaining: 52s
800:	learn: 12.8992729	test: 13.2192934	best: 13.2192934 (800)	total: 1m 40s	remaining: 25s
999:	learn: 12.7321157	test: 13.1257847	best: 13.1257847 (999)	total: 2m 3s	remaining: 0us

bestTest = 13.12578465
bestIteration = 999

[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056, 7.044158168142322]
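
        Beyond the simple mean, the fusion weights can reflect each model's cross-validation quality, for example by making them inversely proportional to each model's mean CV MAE. A minimal sketch, assuming lgb_test, xgb_test and cat_test are still in scope (the MAE values are approximate means of the per-fold scores printed above):

import numpy as np

# Approximate mean CV MAE per model, read off the logs above
cv_mae = {'lgb': 6.73, 'xgb': 6.92, 'cat': 7.07}

# Inverse-error weights, normalized to sum to 1: better models get larger weights
inv_err = {k: 1.0 / v for k, v in cv_mae.items()}
total = sum(inv_err.values())
weights = {k: v / total for k, v in inv_err.items()}

weighted_test = (weights['lgb'] * lgb_test
                 + weights['xgb'] * xgb_test
                 + weights['cat'] * cat_test)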

        2. Stacking

        Stacking is a layered model-ensembling framework. Taking two layers as an example: the first layer consists of several base learners trained on the original training set, while the second-layer model takes the first layer's outputs as features added to the training set and retrains on them, yielding the complete stacking model.

Layer 1 (analogous to the cv_model function):

  1. Split the training data into K folds (take 5 folds as an example: each round, four parts are used for training and one as the validation set);

  2. For each model (RF, ET, GBDT, XGB, for example), run 5 training rounds, each time holding out one fold as validation. After each round, predict on both the validation set and the test set. For the test set, each model therefore produces 5 sets of predictions, which are averaged into one. For the validation sets, after the 5 rounds of cross-validation every training sample has received exactly one out-of-fold prediction. At the end of this step, every row of the training set carries 4 prediction labels (one per model), and every row of the test set likewise carries 4.

Layer 2 (analogous to the stack_model function):

        Use the four out-of-fold prediction columns of the training set as the features of a new training set, with the true label as its target. Choose a second-level model, train it on this new training set, then predict on the test set formed from the four test-prediction columns; that prediction is the final result.

from sklearn.linear_model import Ridge 
def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y):
    '''
    oof_1, oof_2, oof_3 correspond to lgb_oof, xgb_oof, cat_oof
    predictions_1, predictions_2, predictions_3 correspond to lgb_test, xgb_test, cat_test
    '''
    train_stack = pd.concat([oof_1, oof_2, oof_3], axis=1)
    test_stack = pd.concat([predictions_1, predictions_2, predictions_3], axis=1)
    
    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0],))
    scores = []
    
    from sklearn.model_selection import RepeatedKFold
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2021)
    
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, train_stack)): 
        print("fold n°{}".format(fold_+1))
        trn_data, trn_y = train_stack.loc[trn_idx], y[trn_idx]
        val_data, val_y = train_stack.loc[val_idx], y[val_idx]
        
        clf = Ridge(random_state=2021)
        clf.fit(trn_data, trn_y)

        oof[val_idx] = clf.predict(val_data)
        predictions += clf.predict(test_stack) / (5 * 2)
        
        score_single = mean_absolute_error(val_y, oof[val_idx])
        scores.append(score_single)
        print(f'{fold_+1}/{5}', score_single)
    print('mean: ',np.mean(scores))
   
    return oof, predictions
    
stack_oof, stack_pred = stack_model(pd.DataFrame(lgb_oof), pd.DataFrame(xgb_oof), pd.DataFrame(cat_oof), 
                                    pd.DataFrame(lgb_test), pd.DataFrame(xgb_test), pd.DataFrame(cat_test), train['target'])
 
# Save the submission file locally
test['target'] = stack_pred
test[['id','dt','target']].to_csv('submit.csv', index=None)
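
        Before submitting, it is worth checking whether stacking actually improves on the individual models' out-of-fold MAE; a small sketch using the arrays already in scope:

# Compare each model's out-of-fold MAE with the stacked OOF MAE
for name, oof_arr in [('lgb', lgb_oof), ('xgb', xgb_oof),
                      ('cat', cat_oof), ('stack', stack_oof)]:
    print(name, mean_absolute_error(train['target'], oof_arr))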

        Results after optimization: 

************************************ 1 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.78631	valid_1's l1: 6.95243
[400]	training's l1: 6.61549	valid_1's l1: 6.8724
[600]	training's l1: 6.49529	valid_1's l1: 6.82365
[800]	training's l1: 6.3934	valid_1's l1: 6.78396
[1000]	training's l1: 6.30565	valid_1's l1: 6.75324
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 6.30565	valid_1's l1: 6.75324
[6.753240414713993]
************************************ 2 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.79749	valid_1's l1: 6.92171
[400]	training's l1: 6.62836	valid_1's l1: 6.84238
[600]	training's l1: 6.5102	valid_1's l1: 6.79913
[800]	training's l1: 6.40863	valid_1's l1: 6.76193
[1000]	training's l1: 6.32057	valid_1's l1: 6.72975
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 6.32057	valid_1's l1: 6.72975
[6.753240414713993, 6.729754334868036]
************************************ 3 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.80203	valid_1's l1: 6.92696
[400]	training's l1: 6.63223	valid_1's l1: 6.84552
[600]	training's l1: 6.51031	valid_1's l1: 6.79671
[800]	training's l1: 6.40858	valid_1's l1: 6.75748
[1000]	training's l1: 6.32021	valid_1's l1: 6.72623
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 6.32021	valid_1's l1: 6.72623
[6.753240414713993, 6.729754334868036, 6.726230013028263]
************************************ 4 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.79139	valid_1's l1: 6.92244
[400]	training's l1: 6.6234	valid_1's l1: 6.84798
[600]	training's l1: 6.50205	valid_1's l1: 6.79755
[800]	training's l1: 6.40186	valid_1's l1: 6.76061
[1000]	training's l1: 6.31948	valid_1's l1: 6.7333
Did not meet early stopping. Best iteration is:
[996]	training's l1: 6.32045	valid_1's l1: 6.73313
[6.753240414713993, 6.729754334868036, 6.726230013028263, 6.733130077495729]
************************************ 5 ************************************
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 6.80002	valid_1's l1: 6.90037
[400]	training's l1: 6.63216	valid_1's l1: 6.82357
[600]	training's l1: 6.51069	valid_1's l1: 6.77679
[800]	training's l1: 6.41138	valid_1's l1: 6.74125
[1000]	training's l1: 6.32466	valid_1's l1: 6.71099
Did not meet early stopping. Best iteration is:
[999]	training's l1: 6.32494	valid_1's l1: 6.71098
[6.753240414713993, 6.729754334868036, 6.726230013028263, 6.733130077495729, 6.710976025778095]
************************************ 1 ************************************
[0]	train-mae:20.06720	eval-mae:20.09302
[200]	train-mae:7.04024	eval-mae:7.13910
[400]	train-mae:6.92315	eval-mae:7.06463
[600]	train-mae:6.84113	eval-mae:7.02153
[800]	train-mae:6.77516	eval-mae:6.98873
[999]	train-mae:6.71672	eval-mae:6.96031
[6.960308911029275]
************************************ 2 ************************************
[0]	train-mae:20.09680	eval-mae:20.03719
[200]	train-mae:7.05168	eval-mae:7.09695
[400]	train-mae:6.93349	eval-mae:7.01993
[600]	train-mae:6.84741	eval-mae:6.97052
[800]	train-mae:6.78573	eval-mae:6.94216
[999]	train-mae:6.73247	eval-mae:6.92021
[6.960308911029275, 6.920208228240343]
************************************ 3 ************************************
[0]	train-mae:20.08618	eval-mae:20.06633
[200]	train-mae:7.05566	eval-mae:7.10080
[400]	train-mae:6.92888	eval-mae:7.01809
[600]	train-mae:6.84402	eval-mae:6.96830
[800]	train-mae:6.77794	eval-mae:6.93574
[999]	train-mae:6.72353	eval-mae:6.91141
[6.960308911029275, 6.920208228240343, 6.911406622339947]
************************************ 4 ************************************
[0]	train-mae:20.06614	eval-mae:20.10679
[200]	train-mae:7.04413	eval-mae:7.09352
[400]	train-mae:6.92015	eval-mae:7.01450
[600]	train-mae:6.84239	eval-mae:6.97546
[800]	train-mae:6.77458	eval-mae:6.94225
[999]	train-mae:6.72141	eval-mae:6.91981
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677]
************************************ 5 ************************************
[0]	train-mae:20.08397	eval-mae:20.10031
[200]	train-mae:7.05085	eval-mae:7.07912
[400]	train-mae:6.93037	eval-mae:7.00484
[600]	train-mae:6.84745	eval-mae:6.96061
[800]	train-mae:6.78121	eval-mae:6.92794
[999]	train-mae:6.72759	eval-mae:6.90559
[6.960308911029275, 6.920208228240343, 6.911406622339947, 6.919812966623677, 6.905594216687921]
************************************ 1 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.3205033	test: 46.3576785	best: 46.3576785 (0)	total: 422ms	remaining: 7m 1s
200:	learn: 13.9126247	test: 14.3410950	best: 14.3410950 (200)	total: 39.8s	remaining: 2m 38s
400:	learn: 13.3661375	test: 13.8737376	best: 13.8737376 (400)	total: 1m 20s	remaining: 2m
600:	learn: 13.0598841	test: 13.6529329	best: 13.6529329 (600)	total: 1m 45s	remaining: 1m 9s
800:	learn: 12.8496899	test: 13.5192282	best: 13.5192282 (800)	total: 2m 13s	remaining: 33.2s
999:	learn: 12.6834443	test: 13.4166585	best: 13.4165960 (998)	total: 2m 54s	remaining: 0us

bestTest = 13.41659598
bestIteration = 998

Shrink model to first 999 iterations.
[7.099260258793829]
************************************ 2 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.4033873	test: 46.0217607	best: 46.0217607 (0)	total: 267ms	remaining: 4m 27s
200:	learn: 13.9994523	test: 14.0023774	best: 14.0023774 (200)	total: 40.3s	remaining: 2m 40s
400:	learn: 13.4470921	test: 13.5583153	best: 13.5583153 (400)	total: 1m 20s	remaining: 2m
600:	learn: 13.1430679	test: 13.3498180	best: 13.3498180 (600)	total: 2m 1s	remaining: 1m 20s
800:	learn: 12.9218966	test: 13.2060609	best: 13.2060609 (800)	total: 2m 41s	remaining: 40.2s
999:	learn: 12.7554102	test: 13.1069390	best: 13.1069390 (999)	total: 3m 18s	remaining: 0us

bestTest = 13.10693899
bestIteration = 999

[7.099260258793829, 7.060875208529937]
************************************ 3 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.3737008	test: 46.1273708	best: 46.1273708 (0)	total: 252ms	remaining: 4m 11s
200:	learn: 13.9623718	test: 14.1185952	best: 14.1185952 (200)	total: 40.9s	remaining: 2m 42s
400:	learn: 13.4091167	test: 13.6283616	best: 13.6283616 (400)	total: 1m 22s	remaining: 2m 2s
600:	learn: 13.1055305	test: 13.4185102	best: 13.4185102 (600)	total: 2m 3s	remaining: 1m 21s
800:	learn: 12.8884213	test: 13.2864531	best: 13.2864531 (800)	total: 2m 42s	remaining: 40.4s
999:	learn: 12.7186680	test: 13.1846856	best: 13.1846856 (999)	total: 3m 22s	remaining: 0us

bestTest = 13.1846856
bestIteration = 999

[7.099260258793829, 7.060875208529937, 7.059477679678101]
************************************ 4 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.2527443	test: 46.6700736	best: 46.6700736 (0)	total: 239ms	remaining: 3m 59s
200:	learn: 14.0091015	test: 13.9772248	best: 13.9772248 (200)	total: 37.7s	remaining: 2m 29s
400:	learn: 13.4316130	test: 13.5311924	best: 13.5311924 (400)	total: 1m 1s	remaining: 1m 31s
600:	learn: 13.1216918	test: 13.3256936	best: 13.3256936 (600)	total: 1m 24s	remaining: 56s
800:	learn: 12.9078973	test: 13.2027070	best: 13.2027070 (800)	total: 1m 47s	remaining: 26.7s
999:	learn: 12.7307205	test: 13.1134695	best: 13.1134695 (999)	total: 2m 10s	remaining: 0us

bestTest = 13.1134695
bestIteration = 999

[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056]
************************************ 5 ************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 46.3127061	test: 46.4800237	best: 46.4800237 (0)	total: 88ms	remaining: 1m 27s
200:	learn: 13.9496626	test: 13.9743784	best: 13.9743784 (200)	total: 22.7s	remaining: 1m 30s
400:	learn: 13.4066428	test: 13.5415785	best: 13.5415785 (400)	total: 51.4s	remaining: 1m 16s
600:	learn: 13.1092822	test: 13.3363320	best: 13.3363320 (600)	total: 1m 32s	remaining: 1m 1s
800:	learn: 12.8992729	test: 13.2192934	best: 13.2192934 (800)	total: 2m 12s	remaining: 32.9s
999:	learn: 12.7321157	test: 13.1257847	best: 13.1257847 (999)	total: 2m 52s	remaining: 0us

bestTest = 13.12578465
bestIteration = 999

[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056, 7.044158168142322]
fold n°1
1/5 6.719509406634522
fold n°2
2/5 6.72299326754129
fold n°3
3/5 6.746608252531554
fold n°4
4/5 6.725625750031283
fold n°5
5/5 6.720224785722014
fold n°6
6/5 6.7103004355113995
fold n°7
7/5 6.732119822600345
fold n°8
8/5 6.715782457305116
fold n°9
9/5 6.731914243360591
fold n°10
10/5 6.744683998081653
mean:  6.726976241931976

Deep Learning 

        1. Code Solution

!pip install keras
!pip install tensorflow
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data preprocessing
def preprocess_data(df, look_back=100):
    # Group the data by id
    grouped = df.groupby('id')
    datasets = {}
    for id, group in grouped:
        datasets[id] = group.values
        
    # Build the training sequences
    X, Y = [], []
    for id, data in datasets.items():
        for i in range(10, 15): # build 5 sequences per id
            a = data[i:(i + look_back), 3]
            a = np.append(a, np.array([0]*(look_back-len(a))))  # zero-pad up to look_back
            X.append(a[::-1])
            Y.append(data[i-10:i, 3][::-1])
    
    # Build the inference (OOT) sequences from the head of each series
    OOT = []
    for id, data in datasets.items():
        a = data[:look_back, 3]
        a = np.append(a, np.array([0]*(look_back-len(a))))
        OOT.append(a[::-1])
    
    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)

# Define the model
def build_model(look_back, n_features, n_output):
    model = Sequential()
    model.add(LSTM(100, input_shape=(look_back, n_features)))
    model.add(Dropout(0.2))
    model.add(RepeatVector(n_output))
    model.add(LSTM(100, return_sequences=True))
    model.add(TimeDistributed(Dense(1)))
    model.compile(loss='mean_squared_error', optimizer=Adam(0.001))
    return model

# Build and train the model
look_back = 100  # input sequence length
n_features = 1  # assume a single feature per time step
n_output = 10  # predict the next 10 time steps

# Preprocess the data
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Build the model
model = build_model(look_back, n_features, n_output)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train the model
model.fit(X, Y, epochs=50, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stopping])

# Predict
predicted_values = model.predict(OOT)

# Assemble the predictions in the required submission format
predictions = []
for i, data in enumerate(OOT):
    # assumes train['id'].unique() returns ids in the same order as the
    # groupby in preprocess_data (holds when train is sorted by id)
    id = train['id'].unique()[i]
    for j in range(n_output):
        predictions.append([id, j+1, predicted_values[i, j, 0]])

# Convert the predictions to a DataFrame
df_predictions = pd.DataFrame(predictions, columns=['id', 'dt', 'target'])

# Export the predictions to a CSV file
df_predictions.to_csv('predictions.csv', index=False, encoding='utf-8')

print("预测结果已保存至 predictions.csv 文件")

        2. Code Walkthrough

(1) Imports and data loading

!pip install keras
!pip install tensorflow
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
  • Imports: the necessary libraries are imported, including numpy, pandas, MinMaxScaler from sklearn, and several Keras components (Sequential, LSTM, Dense, RepeatVector, TimeDistributed, Dropout, the Adam optimizer, and EarlyStopping).
  • Data loading: pandas loads the training data (train.csv) and the test data (test.csv).

(2) Data preprocessing function (preprocess_data)

def preprocess_data(df, look_back=100):
    grouped = df.groupby('id')
    datasets = {}
    for id, group in grouped:
        datasets[id] = group.values
        
    X, Y = [], []
    for id, data in datasets.items():
        for i in range(10, 15): # build 5 sequences per id
            a = data[i:(i + look_back), 3]
            a = np.append(a, np.array([0]*(look_back-len(a))))
            X.append(a[::-1])
            Y.append(data[i-10:i, 3][::-1])
    
    OOT = []
    for id, data in datasets.items():
        a = data[:look_back, 3]
        a = np.append(a, np.array([0]*(look_back-len(a))))
        OOT.append(a[::-1])
    
    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)
  • Grouping: the data is grouped by the 'id' column, so that each of the multiple time series is handled separately.
  • Training data (X and Y): sequences are prepared for training:
    • X: each input sequence has length look_back, zero-padded when shorter.
    • Y: the target sequence paired with each X, taken from the 10 time steps preceding the input window.
  • Inference data (OOT): prepared like X, but from the initial rows of each time series.
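
        One detail worth verifying: preprocess_data returns X with shape (n_samples, look_back), while the LSTM below is declared with input_shape=(look_back, n_features), i.e. 3-D input. The logs suggest the author's environment accepted the 2-D array, but if your Keras version raises a shape error, add the feature axis explicitly (a hedged one-liner):

# Add the trailing feature axis expected by LSTM(input_shape=(look_back, 1))
X = X.reshape((X.shape[0], X.shape[1], 1))
OOT = OOT.reshape((OOT.shape[0], OOT.shape[1], 1))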

 (3) Model-building function (build_model)

def build_model(look_back, n_features, n_output):
    model = Sequential()
    model.add(LSTM(100, input_shape=(look_back, n_features)))
    model.add(Dropout(0.2))
    model.add(RepeatVector(n_output))
    model.add(LSTM(100, return_sequences=True))
    model.add(TimeDistributed(Dense(1)))
    model.compile(loss='mean_squared_error', optimizer=Adam(0.001))
    return model
  • LSTM model configuration:
    • A Sequential model is used.
    • The first LSTM layer has 100 units; its input shape is determined by look_back and n_features.
    • The RepeatVector layer repeats the encoded vector n_output times.
    • The second LSTM layer also has 100 units, with return_sequences=True so that it passes a sequence on to the TimeDistributed layer.
    • The TimeDistributed layer wraps Dense(1), emitting a single prediction at every time step.
    • The optimizer is Adam and the loss is mean squared error ('mean_squared_error').
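
        To confirm that the architecture really emits a 10-step forecast, printing the model summary is a quick check (sketch; shapes assume look_back=100, n_output=10):

m = build_model(100, 1, 10)
m.summary()
# The final TimeDistributed(Dense(1)) layer should report output shape
# (None, 10, 1): ten future time steps, one value per step.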

 (4) Training and prediction

look_back = 100
n_features = 1
n_output = 10

# Preprocess the data
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Build the model
model = build_model(look_back, n_features, n_output)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train the model
model.fit(X, Y, epochs=50, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stopping])

# Predict
predicted_values = model.predict(OOT)
  • Training:
    • preprocess_data processes the training data (train) to produce X and Y.
    • build_model constructs the model.
    • fit trains for 50 epochs with a batch_size of 64.
  • Prediction:
    • OOT also comes from preprocess_data(train): it is built from each id's rows in the training data, since that is where the historical target values live.
    • model.predict(OOT) produces the forecasts for the test horizon.

        3. Run Results

Epoch 1/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 60ms/step - loss: 3071.0012 - val_loss: 2511.0078
Epoch 2/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 64ms/step - loss: 2268.7061 - val_loss: 2194.5291
Epoch 3/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 1983.3658 - val_loss: 1960.4478
Epoch 4/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 59ms/step - loss: 1881.2417 - val_loss: 1771.8282
Epoch 5/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 1306.5902 - val_loss: 1613.6753
Epoch 6/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 1481.9780 - val_loss: 1476.9568
Epoch 7/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 1259.1500 - val_loss: 1369.5964
Epoch 8/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 1410.8124 - val_loss: 1262.5564
Epoch 9/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 864.4413 - val_loss: 1173.4703
Epoch 10/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 954.0659 - val_loss: 1086.0402
Epoch 11/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 901.7515 - val_loss: 1015.5499
Epoch 12/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 783.8499 - val_loss: 945.5701
Epoch 13/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 819.7301 - val_loss: 885.4941
Epoch 14/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 672.4242 - val_loss: 855.5969
Epoch 15/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 59ms/step - loss: 635.2537 - val_loss: 797.3814
Epoch 16/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 641.1038 - val_loss: 760.0760
Epoch 17/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 649.9288 - val_loss: 719.8991
Epoch 18/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 698.9720 - val_loss: 704.7115
Epoch 19/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 596.2827 - val_loss: 654.0480
Epoch 20/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 535.0333 - val_loss: 636.0656
Epoch 21/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 479.3533 - val_loss: 602.2112
Epoch 22/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 478.9056 - val_loss: 590.2931
Epoch 23/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 517.2261 - val_loss: 561.5989
Epoch 24/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 57ms/step - loss: 496.8016 - val_loss: 547.9958
Epoch 25/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 64ms/step - loss: 399.3697 - val_loss: 528.7585
Epoch 26/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 422.9390 - val_loss: 511.4002
Epoch 27/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 447.9508 - val_loss: 495.9063
Epoch 28/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 410.4958 - val_loss: 484.4503
Epoch 29/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 431.5000 - val_loss: 469.6380
Epoch 30/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 458.8902 - val_loss: 453.5676
Epoch 31/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 406.6499 - val_loss: 453.7151
Epoch 32/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 56ms/step - loss: 408.7878 - val_loss: 428.7639
Epoch 33/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 59ms/step - loss: 433.1059 - val_loss: 437.8275
Epoch 34/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 64ms/step - loss: 372.2057 - val_loss: 411.6165
Epoch 35/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 24s 65ms/step - loss: 425.1301 - val_loss: 404.3243
Epoch 36/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 63ms/step - loss: 398.0203 - val_loss: 428.1054
Epoch 37/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 332.0200 - val_loss: 386.3285
Epoch 38/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 424.4984 - val_loss: 381.9043
Epoch 39/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 387.1086 - val_loss: 371.9949
Epoch 40/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 63ms/step - loss: 361.7874 - val_loss: 358.6019
Epoch 41/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 44s 120ms/step - loss: 391.1438 - val_loss: 355.7131
Epoch 42/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 26s 72ms/step - loss: 408.8496 - val_loss: 356.3145
Epoch 43/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 316.3817 - val_loss: 335.3656
Epoch 44/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 390.0861 - val_loss: 371.6464
Epoch 45/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 60ms/step - loss: 365.2065 - val_loss: 350.7266
Epoch 46/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 22s 61ms/step - loss: 356.5405 - val_loss: 333.1058
Epoch 47/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 23s 62ms/step - loss: 348.1533 - val_loss: 326.3665
Epoch 48/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 61s 167ms/step - loss: 377.0595 - val_loss: 317.8427
Epoch 49/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 20s 56ms/step - loss: 300.5044 - val_loss: 317.0854
Epoch 50/50
365/365 ━━━━━━━━━━━━━━━━━━━━ 21s 58ms/step - loss: 321.8989 - val_loss: 316.6048
183/183 ━━━━━━━━━━━━━━━━━━━━ 3s 13ms/step
Predictions saved to predictions.csv

Directions for Improvement

        1. Sequence-to-sequence models

  • An encoder-decoder architecture, such as a Seq2Seq model, can be used to handle the complex relationship between input and output sequences. This approach is particularly effective for multi-step time-series forecasting.

        2. Attention mechanisms

  • Introducing an attention mechanism lets the model handle long sequences more effectively by focusing on the information at the key time steps (see the sketch after this list).

        3. Variational autoencoders (VAE) and generative adversarial networks (GAN)

  • These models are usually used for data generation, but they can also be applied to time-series forecasting, for example by generating more realistic distributions of future data to improve forecast accuracy.

        4. Multi-scale and multi-level models

  • Combining information at different time scales, for example hourly, daily and weekly patterns at once, within a multi-level structure can capture the different modes of variation in the data.

        5. Probabilistic forecasting models

  • Probabilistic models, such as Bayesian neural networks or deep probabilistic models, can produce confidence intervals around the forecasts, giving a more complete assessment of the predictions.
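
        As a concrete starting point for directions 1 and 2, here is a minimal functional-API sketch (an illustration under the same look_back/n_output assumptions as above, not a tuned solution) that adds dot-product attention between the encoder outputs and the decoder states:

from keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Attention, Concatenate
from keras.models import Model

look_back, n_output = 100, 10

# Encoder: keep per-step outputs (for attention) and the final states
enc_in = Input(shape=(look_back, 1))
enc_seq, state_h, state_c = LSTM(100, return_sequences=True, return_state=True)(enc_in)

# Decoder: repeat the final hidden state as the input for each output step
dec_in = RepeatVector(n_output)(state_h)
dec_seq = LSTM(100, return_sequences=True)(dec_in, initial_state=[state_h, state_c])

# Attention: each decoder step queries the encoder steps for a context vector
context = Attention()([dec_seq, enc_seq])
merged = Concatenate(axis=-1)([dec_seq, context])
out = TimeDistributed(Dense(1))(merged)

model = Model(enc_in, out)
model.compile(loss='mean_squared_error', optimizer='adam')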

Haha, if you've read all the way to here and found this useful, please take a second to give it a like!
