手动实现Stacking模型融合(模型交叉训练+Bagging优化)
# Core data-science libraries
import numpy as np
import sklearn
import pandas as pd
from sklearn.model_selection import KFold, cross_validate, train_test_split
from joblib import dump, load
# Timing utilities
import time
import warnings
warnings.filterwarnings('ignore')
# Boston housing dataset. The raw file stores each record across TWO physical
# rows: 13 features split over the row pair, with the target (MEDV) in
# column 2 of the second row — hence the [::2]/[1::2] interleaved slicing.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = pd.DataFrame(np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]))
target = pd.DataFrame(raw_df.values[1::2, 2])
X_tr, X_ts, y_tr, y_ts = train_test_split(data, target, random_state=22)
def cv_index(X_tr, y_tr, n_splite=5):
    """Split the training data into K folds for manual cross-training.

    :param X_tr: training-set features (DataFrame)
    :param y_tr: training-set labels (DataFrame/Series)
    :param n_splite: number of folds (parameter name kept, although
        misspelled, for backward compatibility with existing callers)
    :return: (train_set, eval_set) — two lists of n_splite (X, y) tuples;
        train_set[i] is fold i's training portion, eval_set[i] its
        held-out portion.
    """
    # Reset to a positional RangeIndex so the integer indices produced by
    # KFold can be used directly with .loc below.
    X_tr = X_tr.reset_index(drop=True)
    y_tr = y_tr.reset_index(drop=True)
    kf = KFold(n_splits=n_splite, random_state=12, shuffle=True)
    train_set = []
    eval_set = []
    # Single pass: collect each fold's (X, y) pair directly instead of
    # staging indices and frames in six intermediate lists.
    for train_part_index, eval_index in kf.split(X_tr, y_tr):
        train_set.append((X_tr.loc[train_part_index], y_tr.loc[train_part_index]))
        eval_set.append((X_tr.loc[eval_index], y_tr.loc[eval_index]))
    return train_set, eval_set
train_set, eval_set = cv_index(X_tr, y_tr, n_splite=5)
随机森林模型交叉训练
# Random-forest cross-training (fold 1): tune an RF on this fold's training
# portion with successive-halving grid search.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor as RFR
start = time.time()
# Hyperparameter search space (shared by all five per-fold RF searches)
parameter_space = {'n_estimators': [*range(20, 100, 5)]
                   , 'max_depth': [*range(10, 25, 1)]
                   , "min_impurity_decrease": [*range(0, 5, 1)]
                   }
# Instantiate the model and the halving grid-search evaluator
RF_1 = RFR(random_state=12)
grid_RF_1 = HalvingGridSearchCV(estimator=RF_1
                                , param_grid=parameter_space
                                , factor=3
                                , min_resources="exhaust"
                                , scoring="neg_mean_absolute_percentage_error"
                                , verbose=True
                                , random_state=1412
                                , cv=5
                                , n_jobs=-1)
# Fit on fold 1's training portion and persist the fitted search
grid_RF_1.fit(train_set[0][0], train_set[0][1])
dump(grid_RF_1, 'grid_RF_1.joblib')
print('所用时{0} 交叉验证得分{1}'.format(time.time()-start, grid_RF_1.best_score_*(-1)))
print('参数的最佳取值:{0}'.format(grid_RF_1.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时133.5072295665741 交叉验证得分0.12187522595857585
参数的最佳取值:{'max_depth': 16, 'min_impurity_decrease': 0, 'n_estimators': 50}
# Random-forest cross-training (fold 2); same search space as fold 1.
start = time.time()
# Instantiate the model and the halving grid-search evaluator
RF_2 = RFR(random_state=12)
grid_RF_2 = HalvingGridSearchCV(estimator=RF_2
                                , param_grid=parameter_space
                                , factor=3
                                , min_resources="exhaust"
                                , scoring="neg_mean_absolute_percentage_error"
                                , verbose=True
                                , random_state=1412
                                , cv=5
                                , n_jobs=-1)
# Fit on fold 2's training portion and persist the fitted search
grid_RF_2.fit(train_set[1][0], train_set[1][1])
dump(grid_RF_2, 'grid_RF_2.joblib')
print('所用时{0} 交叉验证得分{1}'.format(time.time()-start, grid_RF_2.best_score_*(-1)))
print('参数的最佳取值:{0}'.format(grid_RF_2.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时128.56333446502686 交叉验证得分0.15842174037672266
参数的最佳取值:{'max_depth': 19, 'min_impurity_decrease': 2, 'n_estimators': 50}
# Random-forest cross-training (fold 3); same search space as fold 1.
start = time.time()
# Instantiate the model and the halving grid-search evaluator
RF_3 = RFR(random_state=12)
grid_RF_3 = HalvingGridSearchCV(estimator=RF_3
                                , param_grid=parameter_space
                                , factor=3
                                , min_resources="exhaust"
                                , scoring="neg_mean_absolute_percentage_error"
                                , verbose=True
                                , random_state=1412
                                , cv=5
                                , n_jobs=-1)
# Fit on fold 3's training portion and persist the fitted search
grid_RF_3.fit(train_set[2][0], train_set[2][1])
dump(grid_RF_3, 'grid_RF_3.joblib')
print('所用时{0} 交叉验证得分{1}'.format(time.time()-start, grid_RF_3.best_score_*(-1)))
print('参数的最佳取值:{0}'.format(grid_RF_3.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时121.87363767623901 交叉验证得分0.11385248096051595
参数的最佳取值:{'max_depth': 13, 'min_impurity_decrease': 0, 'n_estimators': 30}
# Random-forest cross-training (fold 4); same search space as fold 1.
start = time.time()
# Instantiate the model and the halving grid-search evaluator
RF_4 = RFR(random_state=12)
grid_RF_4 = HalvingGridSearchCV(estimator=RF_4
                                , param_grid=parameter_space
                                , factor=3
                                , min_resources="exhaust"
                                , scoring="neg_mean_absolute_percentage_error"
                                , verbose=True
                                , random_state=1412
                                , cv=5
                                , n_jobs=-1)
# Fit on fold 4's training portion and persist the fitted search
grid_RF_4.fit(train_set[3][0], train_set[3][1])
dump(grid_RF_4, 'grid_RF_4.joblib')
print('所用时{0} 交叉验证得分{1}'.format(time.time()-start, grid_RF_4.best_score_*(-1)))
print('参数的最佳取值:{0}'.format(grid_RF_4.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时117.33430767059326 交叉验证得分0.12372213173338457
参数的最佳取值:{'max_depth': 17, 'min_impurity_decrease': 0, 'n_estimators': 20}
# Random-forest cross-training (fold 5); same search space as fold 1.
start = time.time()
# Instantiate the model and the halving grid-search evaluator
RF_5 = RFR(random_state=12)
grid_RF_5 = HalvingGridSearchCV(estimator=RF_5
                                , param_grid=parameter_space
                                , factor=3
                                , min_resources="exhaust"
                                , scoring="neg_mean_absolute_percentage_error"
                                , verbose=True
                                , random_state=1412
                                , cv=5
                                , n_jobs=-1)
# Fit on fold 5's training portion and persist the fitted search
grid_RF_5.fit(train_set[4][0], train_set[4][1])
dump(grid_RF_5, 'grid_RF_5.joblib')
print('所用时{0} 交叉验证得分{1}'.format(time.time()-start, grid_RF_5.best_score_*(-1)))
print('参数的最佳取值:{0}'.format(grid_RF_5.best_params_))
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 304
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 10
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 400
n_resources: 30
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 134
n_resources: 90
Fitting 5 folds for each of 134 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 45
n_resources: 270
Fitting 5 folds for each of 45 candidates, totalling 225 fits
所用时109.67498230934143 交叉验证得分0.12366574106691293
参数的最佳取值:{'max_depth': 12, 'min_impurity_decrease': 0, 'n_estimators': 30}
# Keep only the best estimator found by each per-fold halving search.
RF_l = [search.best_estimator_
        for search in (grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5)]
RF_1, RF_2, RF_3, RF_4, RF_5 = RF_l
交叉验证中每个验证集的预测结果
# Out-of-fold predictions: each fold's RF predicts its own held-out set,
# keeping the original row index so the pieces can be re-assembled later.
(eval1_predict_RF, eval2_predict_RF, eval3_predict_RF,
 eval4_predict_RF, eval5_predict_RF) = [
    pd.Series(model.predict(X_eval), index=X_eval.index)
    for model, (X_eval, _) in zip(RF_l, eval_set)
]
然后拼接为一个完整的Series,并对index进行顺序排序:
from sklearn.metrics import mean_absolute_percentage_error as mape
# Stitch the five out-of-fold prediction pieces back into a single Series
# covering every training row, ordered by the original index.
pieces = [eval1_predict_RF, eval2_predict_RF, eval3_predict_RF,
          eval4_predict_RF, eval5_predict_RF]
eval_predict_RF = pd.concat(pieces).sort_index()
eval_predict_RF
0 20.088067
1 25.700000
2 21.045000
3 13.250215
4 48.310000
...
374 13.724333
375 15.030000
376 28.643333
377 19.805000
378 11.142000
Length: 379, dtype: float64
mape(eval_predict_RF,y_tr) #5个验证集拼凑成一个完整的“训练集”,然后这个“训练集”上的mape
0.12240756077149015
测试集上mape
# Average test-set MAPE of the five fold RFs (y_true must come first;
# the original calls had the mape arguments swapped).
test_score = 0
for model in RF_l:
    test_score += mape(y_ts, model.predict(X_ts))
test_score / 5
0.12120745186247324
lgb模型交叉训练
from lightgbm import LGBMRegressor
# LightGBM hyperparameter search space
lgb_param_grid = {
    'max_depth': range(3, 9, 2),
    'n_estimators': range(50, 200, 20),
}
from tqdm import tqdm
lgb_search_l = []
# One halving grid search per fold, fitted on that fold's training portion
for X, y in tqdm(train_set):
    lgb_model = LGBMRegressor(random_state=12)
    lgb_search = HalvingGridSearchCV(estimator=lgb_model
                                     , param_grid=lgb_param_grid
                                     , factor=3
                                     , min_resources="exhaust"
                                     , scoring="neg_mean_absolute_percentage_error"
                                     , verbose=True
                                     , random_state=1412
                                     , cv=5
                                     , n_jobs=-1).fit(X, y)
    lgb_search_l.append(lgb_search)
0%| | 0/5 [00:00<?, ?it/s]
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits
20%|████████████████▊ | 1/5 [00:02<00:11, 2.90s/it]
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits
40%|█████████████████████████████████▌ | 2/5 [00:03<00:04, 1.48s/it]
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits
60%|██████████████████████████████████████████████████▍ | 3/5 [00:03<00:02, 1.02s/it]
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits
80%|███████████████████████████████████████████████████████████████████▏ | 4/5 [00:04<00:00, 1.24it/s]
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 304
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 33
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 8
n_resources: 99
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 3
n_resources: 297
Fitting 5 folds for each of 3 candidates, totalling 15 fits
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00, 1.03it/s]
# Persist the five fitted LightGBM searches.
# NOTE(review): the first file name ends in '_l' (letter ell), not '_1' —
# looks like a typo, but the later load() uses the same name, so it is kept.
dump(lgb_search_l[0], 'lgb_search_l.joblib')
dump(lgb_search_l[1], 'lgb_search_2.joblib')
dump(lgb_search_l[2], 'lgb_search_3.joblib')
dump(lgb_search_l[3], 'lgb_search_4.joblib')
dump(lgb_search_l[4], 'lgb_search_5.joblib')
['lgb_search_5.joblib']
# Keep only the best LightGBM estimator from each fold's search.
lgb_l = [search.best_estimator_ for search in lgb_search_l]
lgb_1, lgb_2, lgb_3, lgb_4, lgb_5 = lgb_l
# Out-of-fold LightGBM predictions, indexed like the original rows, then
# stitched into a single Series covering the whole training set.
(eval1_predict_lgb, eval2_predict_lgb, eval3_predict_lgb,
 eval4_predict_lgb, eval5_predict_lgb) = [
    pd.Series(model.predict(X_eval), index=X_eval.index)
    for model, (X_eval, _) in zip(lgb_l, eval_set)
]
eval_predict_lgb = pd.concat([eval1_predict_lgb, eval2_predict_lgb,
                              eval3_predict_lgb, eval4_predict_lgb,
                              eval5_predict_lgb]).sort_index()
eval_predict_lgb
0 19.987837
1 29.141741
2 22.295913
3 10.888321
4 45.808818
...
374 13.905662
375 14.467702
376 27.452163
377 19.700542
378 16.788349
Length: 379, dtype: float64
mape(eval_predict_lgb,y_tr)
0.11769809555747357
# Average test-set MAPE of the five fold LightGBM models (y_true first;
# the original calls had the mape arguments swapped).
test_score = 0
for model in lgb_l:
    test_score += mape(y_ts, model.predict(X_ts))
test_score / 5
0.11979140165199782
GBDT模型交叉训练
from sklearn.ensemble import GradientBoostingRegressor as GBR
# GBDT hyperparameter search space
gbr_param_grid = {
    'n_estimators': [*range(20, 100, 5)],
    "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
    'max_depth': [*range(3, 9, 2)]
}
gbr_search_l = []
# One halving grid search per fold, fitted on that fold's training portion
for X, y in tqdm(train_set):
    gbr_model = GBR(random_state=12)
    gbr_search = HalvingGridSearchCV(estimator=gbr_model
                                     , param_grid=gbr_param_grid
                                     , factor=3
                                     , min_resources="exhaust"
                                     , scoring="neg_mean_absolute_percentage_error"
                                     , verbose=True
                                     , random_state=1412
                                     , cv=5
                                     , n_jobs=-1).fit(X, y)
    gbr_search_l.append(gbr_search)
0%| | 0/5 [00:00<?, ?it/s]
n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits
20%|████████████████▊ | 1/5 [00:11<00:46, 11.72s/it]
n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits
40%|█████████████████████████████████▌ | 2/5 [00:25<00:38, 12.80s/it]
n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits
60%|██████████████████████████████████████████████████▍ | 3/5 [00:37<00:24, 12.37s/it]
n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 303
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits
80%|███████████████████████████████████████████████████████████████████▏ | 4/5 [00:50<00:12, 12.86s/it]
n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 304
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 10
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 30
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 90
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 270
Fitting 5 folds for each of 9 candidates, totalling 45 fits
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:04<00:00, 12.85s/it]
# Persist the five fitted GBDT searches.
# NOTE(review): the first file name ends in '_l' (letter ell), not '_1' —
# looks like a typo, but the later load() uses the same name, so it is kept.
dump(gbr_search_l[0], 'gbr_search_l.joblib')
dump(gbr_search_l[1], 'gbr_search_2.joblib')
dump(gbr_search_l[2], 'gbr_search_3.joblib')
dump(gbr_search_l[3], 'gbr_search_4.joblib')
dump(gbr_search_l[4], 'gbr_search_5.joblib')
['gbr_search_5.joblib']
# Keep only the best GBDT estimator from each fold's search.
gbr_1 = gbr_search_l[0].best_estimator_
gbr_2 = gbr_search_l[1].best_estimator_
gbr_3 = gbr_search_l[2].best_estimator_
gbr_4 = gbr_search_l[3].best_estimator_
gbr_5 = gbr_search_l[4].best_estimator_
# NOTE(review): the original rebinds gbr_1 to the *list* of models — almost
# certainly meant to be `gbr_l`, matching RF_l/lgb_l. The rebinding is kept
# because a later cell indexes gbr_1 as a list, but a properly named alias
# is provided as well.
gbr_1 = [gbr_1, gbr_2, gbr_3, gbr_4, gbr_5]
gbr_l = gbr_1
# Out-of-fold GBDT predictions stitched over the full training index.
eval1_predict_gbr = pd.Series(gbr_l[0].predict(eval_set[0][0]), index=eval_set[0][0].index)
eval2_predict_gbr = pd.Series(gbr_l[1].predict(eval_set[1][0]), index=eval_set[1][0].index)
eval3_predict_gbr = pd.Series(gbr_l[2].predict(eval_set[2][0]), index=eval_set[2][0].index)
eval4_predict_gbr = pd.Series(gbr_l[3].predict(eval_set[3][0]), index=eval_set[3][0].index)
eval5_predict_gbr = pd.Series(gbr_l[4].predict(eval_set[4][0]), index=eval_set[4][0].index)
eval_predict_gbr = pd.concat([eval1_predict_gbr,
                              eval2_predict_gbr,
                              eval3_predict_gbr,
                              eval4_predict_gbr,
                              eval5_predict_gbr]).sort_index()
# Out-of-fold MAPE (y_true first — the original call had the arguments
# swapped; MAPE is asymmetric).
mape(y_tr, eval_predict_gbr)
0.12452838106854108
# Average test-set MAPE of the five fold GBDT models (y_true first;
# the original calls had the mape arguments swapped). gbr_1 is the list
# of fold models at this point.
test_score = 0
for model in gbr_1:
    test_score += mape(y_ts, model.predict(X_ts))
test_score / 5
0.11625029172882025
将训练好的15个模型进行融合
# Reload the 15 fitted searches from disk and keep only the best estimator
# from each: 5 GBDT + 5 RF + 5 LightGBM level-one learners for the fusion.
# (File names '*_l.joblib' intentionally match the earlier dump() calls.)
gbr_1 = load('gbr_search_l.joblib').best_estimator_
gbr_2 = load('gbr_search_2.joblib').best_estimator_
gbr_3 = load('gbr_search_3.joblib').best_estimator_
gbr_4 = load('gbr_search_4.joblib').best_estimator_
gbr_5 = load('gbr_search_5.joblib').best_estimator_
rf_1 = load('grid_RF_1.joblib').best_estimator_
rf_2 = load('grid_RF_2.joblib').best_estimator_
rf_3 = load('grid_RF_3.joblib').best_estimator_
rf_4 = load('grid_RF_4.joblib').best_estimator_
rf_5 = load('grid_RF_5.joblib').best_estimator_
lgb_1 = load('lgb_search_l.joblib').best_estimator_
lgb_2 = load('lgb_search_2.joblib').best_estimator_
lgb_3 = load('lgb_search_3.joblib').best_estimator_
lgb_4 = load('lgb_search_4.joblib').best_estimator_
lgb_5 = load('lgb_search_5.joblib').best_estimator_
def train_cross(X_train, y_train, X_test, estimators, n_splits=5, random_state=12):
    """Cross-train the level-one learners of a Stacking ensemble.

    Each estimator is fitted n_splits times. Its out-of-fold predictions
    form one column of the level-two training matrix; its test-set
    predictions, averaged over the folds, form one column of the level-two
    test matrix.

    :param X_train: training-set features (DataFrame)
    :param y_train: training-set labels
    :param X_test: test-set features (DataFrame)
    :param estimators: level-one learners as a list of (name, estimator)
        tuples. NOTE: each estimator is refitted in place.
    :param n_splits: number of cross-training folds
    :param random_state: random seed for the fold split
    :return: (train_oof, test_predict) DataFrames with one column per
        estimator, named '<name>_oof' / '<name>_predict'.
    """
    # Reset to a positional RangeIndex so KFold's integer indices can be
    # used with .loc (no-op if the index is already 0..m-1).
    X = X_train.reset_index(drop=True)
    y = y_train.reset_index(drop=True)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Zero-filled output frames, one column per level-one learner.
    m = X.shape[0]
    n = len(estimators)
    m_test = X_test.shape[0]
    train_oof = pd.DataFrame(np.zeros((m, n)),
                             columns=[name + '_oof' for name, _ in estimators])
    test_predict = pd.DataFrame(np.zeros((m_test, n)),
                                columns=[name + '_predict' for name, _ in estimators])
    # Cross-training loop.
    for name, model in estimators:
        oof_colName = name + '_oof'
        predict_colName = name + '_predict'
        for train_part_index, eval_index in kf.split(X, y):
            # Fit on this fold's training portion.
            model.fit(X.loc[train_part_index], y.loc[train_part_index])
            # Write the held-out predictions into the oof matrix.
            # Uses .loc[rows, col] rather than the original chained
            # df[col].loc[rows] = ..., which assigns through a potential
            # intermediate copy and may silently not update the frame.
            train_oof.loc[eval_index, oof_colName] = model.predict(X.loc[eval_index])
            # Accumulate this fold's test prediction into the average.
            test_predict[predict_colName] += model.predict(X_test) / n_splits
    return train_oof, test_predict
# Fifteen (name, model) level-one learners: 5 GBDT + 5 LightGBM + 5 RF.
estimators = [('gbr_1', gbr_1), ('gbr_2', gbr_2), ('gbr_3', gbr_3), ('gbr_4', gbr_4), ('gbr_5', gbr_5),
              ('lgb_1', lgb_1), ('lgb_2', lgb_2), ('lgb_3', lgb_3), ('lgb_4', lgb_4), ('lgb_5', lgb_5),
              ('rf_1', rf_1), ('rf_2', rf_2), ('rf_3', rf_3), ('rf_4', rf_4), ('rf_5', rf_5)]
train_oof, test_predict = train_cross(X_tr, y_tr, X_ts, estimators=estimators)
train_oof  # each model's out-of-fold predictions on the training set
gbr_1_oof | gbr_2_oof | gbr_3_oof | gbr_4_oof | gbr_5_oof | lgb_1_oof | lgb_2_oof | lgb_3_oof | lgb_4_oof | lgb_5_oof | rf_1_oof | rf_2_oof | rf_3_oof | rf_4_oof | rf_5_oof | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21.584967 | 20.328176 | 20.699784 | 20.756417 | 20.689736 | 19.987837 | 19.987837 | 19.718926 | 19.987837 | 19.987837 | 20.088067 | 20.787031 | 20.103362 | 19.890 | 20.188280 |
1 | 24.009339 | 24.998451 | 25.816184 | 25.905167 | 24.578658 | 29.141741 | 29.141741 | 29.033681 | 29.141741 | 29.141741 | 25.700000 | 25.112668 | 25.527500 | 26.000 | 25.522500 |
2 | 21.788924 | 21.916925 | 21.920713 | 21.664622 | 22.007407 | 22.295913 | 22.295913 | 22.165582 | 22.295913 | 22.295913 | 20.545200 | 20.722422 | 20.751553 | 21.045 | 20.931364 |
3 | 16.716092 | 14.423027 | 10.996818 | 10.490689 | 15.453288 | 10.888321 | 10.888321 | 9.339263 | 10.888321 | 10.888321 | 12.308000 | 13.250215 | 12.353333 | 12.350 | 12.346667 |
4 | 43.481202 | 49.571193 | 46.527429 | 47.596295 | 47.469540 | 45.330564 | 45.330564 | 45.808818 | 45.330564 | 45.330564 | 48.600000 | 45.853714 | 48.310000 | 48.650 | 48.530000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
374 | 16.107096 | 14.676828 | 14.454655 | 14.816135 | 14.653374 | 13.905662 | 13.905662 | 14.020230 | 13.905662 | 13.905662 | 14.112000 | 13.327052 | 13.711250 | 13.780 | 13.724333 |
375 | 16.763589 | 14.906794 | 15.359821 | 14.995304 | 15.629972 | 14.467702 | 14.467702 | 14.948189 | 14.467702 | 14.467702 | 15.030000 | 14.659757 | 15.263333 | 15.350 | 15.151111 |
376 | 27.445555 | 27.908629 | 29.492297 | 29.216872 | 27.707770 | 28.449975 | 28.449975 | 27.452163 | 28.449975 | 28.449975 | 28.668000 | 29.923547 | 28.643333 | 29.515 | 28.769167 |
377 | 20.945531 | 20.216430 | 19.797499 | 19.208868 | 20.156164 | 19.700542 | 19.700542 | 19.471440 | 19.700542 | 19.700542 | 20.346000 | 21.245057 | 20.052667 | 19.805 | 20.364444 |
378 | 15.095226 | 13.012004 | 13.403023 | 13.609448 | 13.646804 | 16.788349 | 16.788349 | 16.969514 | 16.788349 | 16.788349 | 11.142000 | 11.975918 | 11.153333 | 10.885 | 11.866667 |
379 rows × 15 columns
test_predict #每个模型在测试集上输出结果
gbr_1_predict | gbr_2_predict | gbr_3_predict | gbr_4_predict | gbr_5_predict | lgb_1_predict | lgb_2_predict | lgb_3_predict | lgb_4_predict | lgb_5_predict | rf_1_predict | rf_2_predict | rf_3_predict | rf_4_predict | rf_5_predict | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 28.146524 | 27.794258 | 29.512167 | 28.144089 | 27.862370 | 28.253564 | 28.253564 | 26.968259 | 28.253564 | 28.253564 | 28.616800 | 31.544928 | 28.660000 | 29.104000 | 28.654667 |
1 | 27.321659 | 28.886234 | 28.934519 | 29.428608 | 28.174255 | 29.095618 | 29.095618 | 28.670749 | 29.095618 | 29.095618 | 28.300200 | 29.162659 | 28.536533 | 28.884000 | 28.628798 |
2 | 20.930030 | 20.554091 | 20.313938 | 20.581800 | 20.543470 | 20.552250 | 20.552250 | 20.642369 | 20.552250 | 20.552250 | 20.331978 | 21.121495 | 20.262853 | 20.212000 | 20.254775 |
3 | 25.057740 | 27.061766 | 26.294988 | 26.510212 | 26.289761 | 25.609918 | 25.609918 | 25.585351 | 25.609918 | 25.609918 | 26.783600 | 26.845427 | 27.172244 | 27.131000 | 27.151316 |
4 | 21.527758 | 20.213764 | 20.535413 | 20.103584 | 20.743239 | 19.294356 | 19.294356 | 18.872275 | 19.294356 | 19.294356 | 21.032843 | 22.038998 | 20.996870 | 20.753750 | 21.067122 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
122 | 24.021698 | 24.737808 | 24.005264 | 23.486568 | 24.494002 | 23.620704 | 23.620704 | 23.127655 | 23.620704 | 23.620704 | 23.966133 | 23.603536 | 23.769969 | 23.792000 | 23.787126 |
123 | 19.352335 | 18.505728 | 18.385578 | 18.258717 | 18.743807 | 18.495178 | 18.495178 | 18.619762 | 18.495178 | 18.495178 | 18.742137 | 19.063005 | 18.552326 | 18.556000 | 18.576157 |
124 | 21.233908 | 20.762196 | 20.966581 | 20.481063 | 20.972760 | 20.379828 | 20.379828 | 19.975595 | 20.379828 | 20.379828 | 20.988893 | 21.154919 | 21.015289 | 21.007533 | 21.029847 |
125 | 26.193834 | 27.057364 | 26.837918 | 26.556340 | 26.732396 | 26.300043 | 26.300043 | 25.829947 | 26.300043 | 26.300043 | 26.729800 | 25.888413 | 27.051581 | 26.893000 | 27.052874 |
126 | 23.166417 | 23.984761 | 24.315811 | 24.645929 | 23.640599 | 28.142739 | 28.142739 | 28.485923 | 28.142739 | 28.142739 | 25.046400 | 24.257800 | 24.899000 | 24.662000 | 24.926019 |
127 rows × 15 columns
由于参与融合的模型都是树模型,易于过拟合,所以最后的元学习器选择Ridge回归
from sklearn.linear_model import Ridge
# Level-two (meta) learner. All level-one models are trees and prone to
# overfitting, so a simple Ridge regression is used on top.
ridge = Ridge(random_state=14)
ridge.fit(np.array(train_oof), y_tr)
# (test-set score, training-set score). mape takes y_true first — the
# original calls passed the predictions as y_true (MAPE is asymmetric).
mape(y_ts, ridge.predict(np.array(test_predict))), mape(y_tr, ridge.predict(np.array(train_oof)))
(0.11265052301513544, 0.11028453138512788)
继续对元学习器进行优化,当然也可以试试其他的回归器
# Tune the meta-learner's hyperparameters (other regressors could be tried too)
params_Ridge = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0],
                "fit_intercept": [True, False],
                "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
ridge_search = GridSearchCV(ridge, param_grid=params_Ridge, n_jobs=-1,
                            scoring="neg_mean_absolute_percentage_error").fit(np.array(train_oof), y_tr)
ridge_search.best_score_*(-1)  # cross-validated score on the training set
0.11221630224382506
mape(ridge_search.best_estimator_.predict(np.array(test_predict)),y_ts) #测试集得分
0.1129211924561571
接下来是Bagging作为较为基础的集成算法
# Bagging, a basic ensemble, wrapped around the tuned Ridge meta-learner
from sklearn.ensemble import BaggingRegressor
parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist()}
bagging_final = BaggingRegressor(ridge_search.best_estimator_)
BG = GridSearchCV(bagging_final, parameter_space, n_jobs=-1,
                  scoring="neg_mean_absolute_percentage_error").fit(np.array(train_oof), y_tr)
BG.best_score_*(-1)
0.10923933635660936
mape(BG.best_estimator_.predict(np.array(test_predict)),y_ts)
0.11169070871907398
能够发现,带入超参数优化后的ridge模型后,Bagging作为元学习器的效果有了更进一步的提升:
基础评估器 | 训练集交叉验证 | 测试集 |
---|---|---|
随机森林 | 0.1224 | 0.1212 |
lightgbm | 0.1176 | 0.1197 |
GBDT | 0.1245 | 0.1162 |
stacking融合未调优ridge | 0.1126 | 0.1102 |
stacking融合调优ridge | 0.1122 | 0.1129 |
stacking融合bagging调优ridge | 0.1092 | 0.1116 |
继续优化的可能
1.增加算法的种类,参与算法融合的三组算法全是树模型,增加算法多样性
2.将对半网格搜索改为随机网格搜索
3.更换最后的元学习器,或许能得到更好的结果