手动实现Stacking模型融合(模型交叉训练+Bagging优化)

手动实现Stacking模型融合(模型交叉训练+Bagging优化)

# 基础数据科学运算库
import numpy as np
import sklearn
import pandas as pd
from sklearn.model_selection import KFold, cross_validate, train_test_split
from joblib import dump, load
# 时间模块
import time

import warnings
warnings.filterwarnings('ignore')

# Boston housing dataset: sklearn removed load_boston, so the data is rebuilt
# from the original CMU StatLib archive. Each record spans two physical lines
# in the raw file.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# FIX: raw string for the regex separator — "\s+" is an invalid escape
# sequence and emits a SyntaxWarning on modern Python.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Even rows hold the first 11 features; odd rows hold 2 more features plus the
# target (column 2 of the odd rows).
data = pd.DataFrame(np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]))
target = pd.DataFrame(raw_df.values[1::2, 2])
# Hold out a test set; a fixed random_state keeps the split reproducible.
X_tr, X_ts, y_tr, y_ts = train_test_split(data, target, random_state=22)
def cv_index(X_tr, y_tr, n_splite=5):
    """Split the training data into K cross-validation folds.

    Returns two parallel lists of (features, labels) tuples: ``train_set``
    holds the training portion of each fold, ``eval_set`` the held-out
    portion, so fold i is (train_set[i], eval_set[i]).
    """
    # Re-index so KFold's positional indices line up with .loc lookups.
    X_tr = X_tr.reset_index(drop=True)
    y_tr = y_tr.reset_index(drop=True)

    kf = KFold(n_splits=n_splite, random_state=12, shuffle=True)

    train_set = []
    eval_set = []
    for fit_idx, hold_idx in kf.split(X_tr, y_tr):
        train_set.append((X_tr.loc[fit_idx], y_tr.loc[fit_idx]))
        eval_set.append((X_tr.loc[hold_idx], y_tr.loc[hold_idx]))

    return train_set, eval_set
# Materialize the 5 train/eval folds once; every base learner below reuses
# these same folds (index i = fold i).
train_set,eval_set=cv_index(X_tr,y_tr,n_splite=5)

随机森林模型交叉训练

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor as RFR

start = time.time()

# Hyperparameter search space shared by all five random-forest searches.
parameter_space = {
    'n_estimators': list(range(20, 100, 5)),
    'max_depth': list(range(10, 25)),
    "min_impurity_decrease": list(range(0, 5)),
}

# Fold 1: tune a random forest with successive-halving grid search.
RF_1 = RFR(random_state=12)
grid_RF_1 = HalvingGridSearchCV(
    estimator=RF_1,
    param_grid=parameter_space,
    factor=3,
    min_resources="exhaust",
    scoring="neg_mean_absolute_percentage_error",
    verbose=True,
    random_state=1412,
    cv=5,
    n_jobs=-1,
)

# Fit on fold 1's training part, persist the fitted search, report timing
# and the (sign-flipped) cross-validated MAPE.
grid_RF_1.fit(train_set[0][0], train_set[0][1])
dump(grid_RF_1, 'grid_RF_1.joblib')
print('所用时{0}       交叉验证得分{1}'.format(time.time() - start, grid_RF_1.best_score_ * (-1)))
print('参数的最佳取值:{0}'.format(grid_RF_1.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时133.5072295665741       交叉验证得分0.12187522595857585
参数的最佳取值:{'max_depth': 16, 'min_impurity_decrease': 0, 'n_estimators': 50}
start = time.time()

# Fold 2: same halving search, fresh forest, shared parameter_space.
RF_2 = RFR(random_state=12)
grid_RF_2 = HalvingGridSearchCV(
    estimator=RF_2,
    param_grid=parameter_space,
    factor=3,
    min_resources="exhaust",
    scoring="neg_mean_absolute_percentage_error",
    verbose=True,
    random_state=1412,
    cv=5,
    n_jobs=-1,
)

grid_RF_2.fit(train_set[1][0], train_set[1][1])
dump(grid_RF_2, 'grid_RF_2.joblib')
print('所用时{0}       交叉验证得分{1}'.format(time.time() - start, grid_RF_2.best_score_ * (-1)))
print('参数的最佳取值:{0}'.format(grid_RF_2.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时128.56333446502686       交叉验证得分0.15842174037672266
参数的最佳取值:{'max_depth': 19, 'min_impurity_decrease': 2, 'n_estimators': 50}
start = time.time()

# Fold 3: same halving search, fresh forest, shared parameter_space.
RF_3 = RFR(random_state=12)
grid_RF_3 = HalvingGridSearchCV(
    estimator=RF_3,
    param_grid=parameter_space,
    factor=3,
    min_resources="exhaust",
    scoring="neg_mean_absolute_percentage_error",
    verbose=True,
    random_state=1412,
    cv=5,
    n_jobs=-1,
)

grid_RF_3.fit(train_set[2][0], train_set[2][1])
dump(grid_RF_3, 'grid_RF_3.joblib')
print('所用时{0}       交叉验证得分{1}'.format(time.time() - start, grid_RF_3.best_score_ * (-1)))
print('参数的最佳取值:{0}'.format(grid_RF_3.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时121.87363767623901       交叉验证得分0.11385248096051595
参数的最佳取值:{'max_depth': 13, 'min_impurity_decrease': 0, 'n_estimators': 30}
start = time.time()

# Fold 4: same halving search, fresh forest, shared parameter_space.
RF_4 = RFR(random_state=12)
grid_RF_4 = HalvingGridSearchCV(
    estimator=RF_4,
    param_grid=parameter_space,
    factor=3,
    min_resources="exhaust",
    scoring="neg_mean_absolute_percentage_error",
    verbose=True,
    random_state=1412,
    cv=5,
    n_jobs=-1,
)

grid_RF_4.fit(train_set[3][0], train_set[3][1])
dump(grid_RF_4, 'grid_RF_4.joblib')
print('所用时{0}       交叉验证得分{1}'.format(time.time() - start, grid_RF_4.best_score_ * (-1)))
print('参数的最佳取值:{0}'.format(grid_RF_4.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时117.33430767059326       交叉验证得分0.12372213173338457
参数的最佳取值:{'max_depth': 17, 'min_impurity_decrease': 0, 'n_estimators': 20}
start = time.time()

# Fold 5: same halving search, fresh forest, shared parameter_space.
RF_5 = RFR(random_state=12)
grid_RF_5 = HalvingGridSearchCV(
    estimator=RF_5,
    param_grid=parameter_space,
    factor=3,
    min_resources="exhaust",
    scoring="neg_mean_absolute_percentage_error",
    verbose=True,
    random_state=1412,
    cv=5,
    n_jobs=-1,
)

grid_RF_5.fit(train_set[4][0], train_set[4][1])
dump(grid_RF_5, 'grid_RF_5.joblib')
print('所用时{0}       交叉验证得分{1}'.format(time.time() - start, grid_RF_5.best_score_ * (-1)))
print('参数的最佳取值:{0}'.format(grid_RF_5.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 304
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时109.67498230934143       交叉验证得分0.12366574106691293
参数的最佳取值:{'max_depth': 12, 'min_impurity_decrease': 0, 'n_estimators': 30}
# Unwrap the tuned estimator from each fold's search and collect them.
RF_1, RF_2, RF_3, RF_4, RF_5 = (
    g.best_estimator_
    for g in (grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5)
)
RF_l = [RF_1, RF_2, RF_3, RF_4, RF_5]

交叉验证中每个验证集的预测结果

# Out-of-fold predictions: each fold's model scores only its own held-out
# split, keeping the original row index so the pieces can be re-assembled.
(eval1_predict_RF, eval2_predict_RF, eval3_predict_RF,
 eval4_predict_RF, eval5_predict_RF) = (
    pd.Series(model.predict(X_ev), index=X_ev.index)
    for model, (X_ev, _) in zip(RF_l, eval_set)
)

然后拼接为一个完整的Series,并对index进行顺序排序:

from sklearn.metrics import mean_absolute_percentage_error as mape

# Stitch the five out-of-fold prediction pieces back into one Series aligned
# with the full training set (sorted by the original row index).
eval_predict_RF = pd.concat(
    [eval1_predict_RF, eval2_predict_RF, eval3_predict_RF,
     eval4_predict_RF, eval5_predict_RF]
).sort_index()

eval_predict_RF
0      20.088067
1      25.700000
2      21.045000
3      13.250215
4      48.310000
         ...    
374    13.724333
375    15.030000
376    28.643333
377    19.805000
378    11.142000
Length: 379, dtype: float64
mape(eval_predict_RF,y_tr)   #5个验证集拼凑成一个完整的“训练集”,然后这个“训练集”上的mape
0.12240756077149015

测试集上mape

# Average the five fold models' MAPE on the held-out test set.
test_score = sum(mape(model.predict(X_ts), y_ts) for model in RF_l)

test_score / 5
0.12120745186247324

lgb模型交叉训练

from lightgbm import LGBMRegressor
from tqdm import tqdm

# Search space for the LightGBM base learners.
lgb_param_grid = {
    'max_depth': range(3, 9, 2),
    'n_estimators': range(50, 200, 20),
}

# One halving grid search per fold; collect the fitted searches.
lgb_search_l = []
for X, y in tqdm(train_set):
    lgb_model = LGBMRegressor(random_state=12)
    lgb_search = HalvingGridSearchCV(
        estimator=lgb_model,
        param_grid=lgb_param_grid,
        factor=3,
        min_resources="exhaust",
        scoring="neg_mean_absolute_percentage_error",
        verbose=True,
        random_state=1412,
        cv=5,
        n_jobs=-1,
    ).fit(X, y)
    lgb_search_l.append(lgb_search)
  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits


 20%|████████████████▊                                                                   | 1/5 [00:02<00:11,  2.90s/it]

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits


 40%|█████████████████████████████████▌                                                  | 2/5 [00:03<00:04,  1.48s/it]

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:03<00:02,  1.02s/it]

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:04<00:00,  1.24it/s]

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 304
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.03it/s]
# Persist each fold's fitted search. Note the first file keeps the legacy
# '_l' suffix — the reload code later expects these exact names.
for search, path in zip(lgb_search_l,
                        ['lgb_search_l.joblib', 'lgb_search_2.joblib',
                         'lgb_search_3.joblib', 'lgb_search_4.joblib',
                         'lgb_search_5.joblib']):
    dump(search, path)
['lgb_search_5.joblib']
# Tuned estimator per fold, plus its out-of-fold predictions.
lgb_1, lgb_2, lgb_3, lgb_4, lgb_5 = (s.best_estimator_ for s in lgb_search_l)

lgb_l = [lgb_1, lgb_2, lgb_3, lgb_4, lgb_5]

(eval1_predict_lgb, eval2_predict_lgb, eval3_predict_lgb,
 eval4_predict_lgb, eval5_predict_lgb) = (
    pd.Series(model.predict(X_ev), index=X_ev.index)
    for model, (X_ev, _) in zip(lgb_l, eval_set)
)

# Reassemble the five pieces into one Series aligned with the training set.
eval_predict_lgb = pd.concat(
    [eval1_predict_lgb, eval2_predict_lgb, eval3_predict_lgb,
     eval4_predict_lgb, eval5_predict_lgb]
).sort_index()

eval_predict_lgb
0      19.987837
1      29.141741
2      22.295913
3      10.888321
4      45.808818
         ...    
374    13.905662
375    14.467702
376    27.452163
377    19.700542
378    16.788349
Length: 379, dtype: float64
mape(eval_predict_lgb,y_tr)  
0.11769809555747357
# Average the five LightGBM fold models' MAPE on the test set.
test_score = sum(mape(model.predict(X_ts), y_ts) for model in lgb_l)

test_score / 5
0.11979140165199782

GBDT模型交叉训练

from sklearn.ensemble import GradientBoostingRegressor as GBR

# Search space for the GBDT base learners.
gbr_param_grid = {
    'n_estimators': list(range(20, 100, 5)),
    "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
    'max_depth': list(range(3, 9, 2)),
}

# One halving grid search per fold; collect the fitted searches.
gbr_search_l = []
for X, y in tqdm(train_set):
    gbr_model = GBR(random_state=12)
    gbr_search = HalvingGridSearchCV(
        estimator=gbr_model,
        param_grid=gbr_param_grid,
        factor=3,
        min_resources="exhaust",
        scoring="neg_mean_absolute_percentage_error",
        verbose=True,
        random_state=1412,
        cv=5,
        n_jobs=-1,
    ).fit(X, y)
    gbr_search_l.append(gbr_search)
  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits


 20%|████████████████▊                                                                   | 1/5 [00:11<00:46, 11.72s/it]

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits


 40%|█████████████████████████████████▌                                                  | 2/5 [00:25<00:38, 12.80s/it]

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:37<00:24, 12.37s/it]

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:50<00:12, 12.86s/it]

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 304
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:04<00:00, 12.85s/it]
# Persist each fold's fitted search (first file keeps the legacy '_l'
# suffix, matched by the reload code below).
for search, path in zip(gbr_search_l,
                        ['gbr_search_l.joblib', 'gbr_search_2.joblib',
                         'gbr_search_3.joblib', 'gbr_search_4.joblib',
                         'gbr_search_5.joblib']):
    dump(search, path)
['gbr_search_5.joblib']
# Tuned estimator per fold.
gbr_1, gbr_2, gbr_3, gbr_4, gbr_5 = (s.best_estimator_ for s in gbr_search_l)

# NOTE(review): the list intentionally reuses the name gbr_1 (shadowing the
# first estimator) to match the original code — later cells index gbr_1[i].
# A distinct name such as gbr_l would be clearer.
gbr_1 = [gbr_1, gbr_2, gbr_3, gbr_4, gbr_5]

(eval1_predict_gbr, eval2_predict_gbr, eval3_predict_gbr,
 eval4_predict_gbr, eval5_predict_gbr) = (
    pd.Series(model.predict(X_ev), index=X_ev.index)
    for model, (X_ev, _) in zip(gbr_1, eval_set)
)

# Reassemble the five out-of-fold pieces into one aligned Series.
eval_predict_gbr = pd.concat(
    [eval1_predict_gbr, eval2_predict_gbr, eval3_predict_gbr,
     eval4_predict_gbr, eval5_predict_gbr]
).sort_index()

mape(eval_predict_gbr,y_tr)
0.12452838106854108
# Average the five GBDT fold models' MAPE on the test set
# (gbr_1 is the list of estimators here — see the note where it is built).
test_score = sum(mape(model.predict(X_ts), y_ts) for model in gbr_1)

test_score / 5
0.11625029172882025

将训练好的15个模型进行融合

# Reload the fifteen persisted searches and unwrap the tuned estimators.
gbr_1, gbr_2, gbr_3, gbr_4, gbr_5 = (
    load(f).best_estimator_
    for f in ('gbr_search_l.joblib', 'gbr_search_2.joblib',
              'gbr_search_3.joblib', 'gbr_search_4.joblib',
              'gbr_search_5.joblib')
)

rf_1, rf_2, rf_3, rf_4, rf_5 = (
    load(f).best_estimator_
    for f in ('grid_RF_1.joblib', 'grid_RF_2.joblib', 'grid_RF_3.joblib',
              'grid_RF_4.joblib', 'grid_RF_5.joblib')
)

lgb_1, lgb_2, lgb_3, lgb_4, lgb_5 = (
    load(f).best_estimator_
    for f in ('lgb_search_l.joblib', 'lgb_search_2.joblib',
              'lgb_search_3.joblib', 'lgb_search_4.joblib',
              'lgb_search_5.joblib')
)
def train_cross(X_train, y_train, X_test, estimators, n_splits=5, random_state=12):
    """
    Cross-training of the level-one learners for Stacking fusion.

    :param X_train: training-set features
    :param y_train: training-set labels
    :param X_test: test-set features
    :param estimators: level-one learners, a list of (name, estimator) tuples
    :param n_splits: number of cross-training folds
    :param random_state: random seed

    :return: (train_oof, test_predict) — the out-of-fold training predictions
             and the fold-averaged test-set predictions, one column per
             estimator
    """
    # Reset indices so KFold's positional indices line up with .loc lookups;
    # this step could be skipped if the index is already 0..n-1.
    X = X_train.reset_index(drop=True)
    y = y_train.reset_index(drop=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    m = X.shape[0]
    n = len(estimators)
    m_test = X_test.shape[0]

    # Zero-filled output frames, one column per estimator.
    train_oof = pd.DataFrame(
        np.zeros((m, n)),
        columns=[name + '_oof' for name, _ in estimators])
    test_predict = pd.DataFrame(
        np.zeros((m_test, n)),
        columns=[name + '_predict' for name, _ in estimators])

    # Cross-train every estimator.
    for name, model in estimators:
        oof_colName = name + '_oof'
        predict_colName = name + '_predict'

        for train_part_index, eval_index in kf.split(X, y):
            # Fit on the training part of the fold.
            model.fit(X.loc[train_part_index], y.loc[train_part_index])
            # FIX: the original used chained indexing
            # (train_oof[col].loc[idx] = ...), which assigns through a
            # temporary and silently does nothing under pandas
            # copy-on-write; a single .loc call writes in place.
            train_oof.loc[eval_index, oof_colName] = model.predict(X.loc[eval_index])
            # Accumulate the fold-averaged test-set prediction.
            test_predict[predict_colName] += model.predict(X_test) / n_splits

    return train_oof, test_predict
# (name, estimator) pairs for the 15 tuned level-one learners: 5 GBDT,
# 5 LightGBM and 5 random-forest models, one per cross-validation fold.
estimators = [('gbr_1', gbr_1), ('gbr_2', gbr_2), ('gbr_3', gbr_3), ('gbr_4', gbr_4), ('gbr_5', gbr_5),
              ('lgb_1', lgb_1), ('lgb_2', lgb_2), ('lgb_3', lgb_3), ('lgb_4', lgb_4), ('lgb_5', lgb_5),
              ('rf_1', rf_1), ('rf_2', rf_2), ('rf_3', rf_3), ('rf_4', rf_4), ('rf_5', rf_5)]

# Build the stacked training features (out-of-fold predictions) and the
# fold-averaged test-set predictions for the meta-learner.
train_oof, test_predict = train_cross(X_tr, y_tr, X_ts, estimators=estimators)
train_oof #每个模型在训练集上输出结果
gbr_1_oofgbr_2_oofgbr_3_oofgbr_4_oofgbr_5_ooflgb_1_ooflgb_2_ooflgb_3_ooflgb_4_ooflgb_5_oofrf_1_oofrf_2_oofrf_3_oofrf_4_oofrf_5_oof
021.58496720.32817620.69978420.75641720.68973619.98783719.98783719.71892619.98783719.98783720.08806720.78703120.10336219.89020.188280
124.00933924.99845125.81618425.90516724.57865829.14174129.14174129.03368129.14174129.14174125.70000025.11266825.52750026.00025.522500
221.78892421.91692521.92071321.66462222.00740722.29591322.29591322.16558222.29591322.29591320.54520020.72242220.75155321.04520.931364
316.71609214.42302710.99681810.49068915.45328810.88832110.8883219.33926310.88832110.88832112.30800013.25021512.35333312.35012.346667
443.48120249.57119346.52742947.59629547.46954045.33056445.33056445.80881845.33056445.33056448.60000045.85371448.31000048.65048.530000
................................................
37416.10709614.67682814.45465514.81613514.65337413.90566213.90566214.02023013.90566213.90566214.11200013.32705213.71125013.78013.724333
37516.76358914.90679415.35982114.99530415.62997214.46770214.46770214.94818914.46770214.46770215.03000014.65975715.26333315.35015.151111
37627.44555527.90862929.49229729.21687227.70777028.44997528.44997527.45216328.44997528.44997528.66800029.92354728.64333329.51528.769167
37720.94553120.21643019.79749919.20886820.15616419.70054219.70054219.47144019.70054219.70054220.34600021.24505720.05266719.80520.364444
37815.09522613.01200413.40302313.60944813.64680416.78834916.78834916.96951416.78834916.78834911.14200011.97591811.15333310.88511.866667

379 rows × 15 columns

test_predict #每个模型在测试集上输出结果
gbr_1_predictgbr_2_predictgbr_3_predictgbr_4_predictgbr_5_predictlgb_1_predictlgb_2_predictlgb_3_predictlgb_4_predictlgb_5_predictrf_1_predictrf_2_predictrf_3_predictrf_4_predictrf_5_predict
028.14652427.79425829.51216728.14408927.86237028.25356428.25356426.96825928.25356428.25356428.61680031.54492828.66000029.10400028.654667
127.32165928.88623428.93451929.42860828.17425529.09561829.09561828.67074929.09561829.09561828.30020029.16265928.53653328.88400028.628798
220.93003020.55409120.31393820.58180020.54347020.55225020.55225020.64236920.55225020.55225020.33197821.12149520.26285320.21200020.254775
325.05774027.06176626.29498826.51021226.28976125.60991825.60991825.58535125.60991825.60991826.78360026.84542727.17224427.13100027.151316
421.52775820.21376420.53541320.10358420.74323919.29435619.29435618.87227519.29435619.29435621.03284322.03899820.99687020.75375021.067122
................................................
12224.02169824.73780824.00526423.48656824.49400223.62070423.62070423.12765523.62070423.62070423.96613323.60353623.76996923.79200023.787126
12319.35233518.50572818.38557818.25871718.74380718.49517818.49517818.61976218.49517818.49517818.74213719.06300518.55232618.55600018.576157
12421.23390820.76219620.96658120.48106320.97276020.37982820.37982819.97559520.37982820.37982820.98889321.15491921.01528921.00753321.029847
12526.19383427.05736426.83791826.55634026.73239626.30004326.30004325.82994726.30004326.30004326.72980025.88841327.05158126.89300027.052874
12623.16641723.98476124.31581124.64592923.64059928.14273928.14273928.48592328.14273928.14273925.04640024.25780024.89900024.66200024.926019

127 rows × 15 columns

由于参与融合的模型都是树模型,易于过拟合,所以最后的元学习器选择Ridge回归

from sklearn.linear_model import Ridge

# Ridge as the meta-learner: the tree-based level-one models overfit easily,
# so a simple linear model on top is the safer choice.
ridge = Ridge(random_state=14)
ridge.fit(np.array(train_oof), y_tr)
# (test-set score, training-set score)
mape(ridge.predict(np.array(test_predict)), y_ts), mape(ridge.predict(np.array(train_oof)), y_tr)
(0.11265052301513544, 0.11028453138512788)

继续对元学习器进行优化,当然也可以试试其他的回归器

# Tune the Ridge meta-learner itself.
params_Ridge = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0],
    "fit_intercept": [True, False],
    "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}
ridge_search = GridSearchCV(
    ridge,
    param_grid=params_Ridge,
    n_jobs=-1,
    scoring="neg_mean_absolute_percentage_error",
).fit(np.array(train_oof), y_tr)
# Cross-validated training-set score (flip the sign of neg-MAPE).
ridge_search.best_score_ * (-1)
0.11221630224382506
mape(ridge_search.best_estimator_.predict(np.array(test_predict)),y_ts)  #测试集得分
0.1129211924561571

  接下来是Bagging作为较为基础的集成算法

from sklearn.ensemble import BaggingRegressor

# Bag the tuned Ridge meta-learner for a little extra variance reduction.
# Note: this rebinds the module-level name parameter_space.
parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
}
bagging_final = BaggingRegressor(ridge_search.best_estimator_)
BG = GridSearchCV(
    bagging_final,
    parameter_space,
    n_jobs=-1,
    scoring="neg_mean_absolute_percentage_error",
).fit(np.array(train_oof), y_tr)
BG.best_score_ * (-1)
0.10923933635660936
mape(BG.best_estimator_.predict(np.array(test_predict)),y_ts)
0.11169070871907398

能够发现,带入超参数优化后的ridge模型后,Bagging作为元学习器的效果有了更进一步的提升:

基础评估器训练集交叉验证测试集
随机森林0.12240.1212
lightgbm0.11760.1197
GBDT0.12450.1162
stacking融合未调优ridge0.11260.1102
stacking融合调优ridge0.11220.1129
stacking融合bagging调优ridge0.10920.1116

继续优化的可能

1.增加算法的种类,参与算法融合的三组算法全是树模型,增加算法多样性
2.将对半网格搜索改为随机网格搜索
3.更换最后的元学习器,或许能得到更好的结果


  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值