前言:
目前传统机器学习的比赛中,基本是树模型打天下了,xgboost、lightgbm和catboost各有优劣,如果把三者结合起来,即使使用投票效果也会很好。另外如果再使用5折交叉验证的方法(KFold),把每折的oof(out of fold,折外预测值)拼起来作为第二层模型的训练特征,把xgb、lgb和cat对test的预测(各折平均)作为第二层模型的预测输入,再使用stacking的方法,那么模型成功率和稳定率将会大大提升。本文以代码为主,主要讲述这一系列工作该如何做。
注:数据使用的是清洗干净的纯数字、无缺失、无杂音,而且树模型还不需要数据标准化,所以数据清洗和特征工程部分就略过了,数据因保密性也无法提供。
一、基础导入
# Shared imports for the whole article: the three tree-boosting libraries
# plus scikit-learn utilities for splitting, scoring and cross-validation.
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# Show up to 200 columns when displaying DataFrames in a notebook.
pd.options.display.max_columns=200
# League of Legends "high diamond ranked 10 min" dataset; features are the
# per-game stats, the target is whether the blue team won.
train_data = pd.read_csv('high_diamond_ranked_10min.csv')
# Drop the label and the identifier column to form the feature matrix.
# NOTE(review): later cells use a `test_data` DataFrame that is never loaded
# anywhere in this file — it must be read the same way before they can run.
train_data_X = train_data.drop(['blueWins','gameId'],axis=1)
train_data_y = train_data['blueWins']
# 80/20 holdout split, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(train_data_X,train_data_y,test_size=0.2, random_state=42)
二、单模型测试
1、LightGBM
# GridSearchCV parameter selection with LGBMClassifier (LGBMRegressor works
# the same way). Shown for reference only — the xgboost and catboost
# sections below skip the grid search.
lgb_cv_params = {
    'n_estimators': range(10, 20),  # search space: number of boosting rounds
}
lgb_params = {
    # 'n_estimators':300,
    'max_depth': 8,            # 7-10 is usually a reasonable range
    'num_leaves': 30,          # keep below 2**max_depth (e.g. 2**7 = 128)
    'learning_rate': 0.1,
    'subsample': 0.8,
    # BUG FIX: 'feature_frequency' is not a LightGBM parameter and was
    # silently ignored / rejected; the per-tree feature-subsampling
    # parameter in the sklearn API is 'colsample_bytree'
    # (alias of 'feature_fraction').
    'colsample_bytree': 0.8,
    'random_state': 32,
    'class_weight': 'balanced',
    'n_jobs': 4,
    'subsample_freq': 2,
}
lgb_cv = GridSearchCV(lgb.LGBMClassifier(**lgb_params),
                      lgb_cv_params,
                      scoring='accuracy',
                      cv=5,
                      n_jobs=-1)
lgb_cv.fit(X_train, y_train)
print(lgb_cv.best_params_)
print(lgb_cv.best_score_)
# Retrain a regressor with hand-picked parameters (early stopping below
# effectively tunes n_estimators; the grid-search result above is
# informational).
lgb_reg = lgb.LGBMRegressor(max_depth=8,
                            n_estimators=300,
                            subsample=0.8,
                            colsample_bytree=0.8)
lgb_reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=30, verbose=5, eval_metric='auc')
# NOTE(review): `test_data` is never defined in this file — load it like
# train_data before running this line.
lgb_predict = lgb_reg.predict(test_data.drop('gameId', axis=1))
2、catboost
# CatBoost regressor: same depth and a fixed seed; early stopping on the
# holdout split, progress printed every 5 iterations.
cat_params = dict(learning_rate=0.1, depth=8, random_seed=32)
cat_reg = ctb.CatBoostRegressor(**cat_params)
cat_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=30,
    verbose=5,
)
# Predict on the test features (identifier column removed).
cat_predict = cat_reg.predict(test_data.drop('gameId', axis=1))
3、xgboost
# XGBoost regressor mirroring the LightGBM settings; AUC as the early
# stopping metric on the holdout split.
xgb_params = dict(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=300,
    n_jobs=4,
    colsample_bytree=0.8,
    subsample=0.8,
    random_state=32,
)
xgb_reg = xgb.XGBRegressor(**xgb_params)
xgb_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=30,
    verbose=5,
    eval_metric='auc',
)
# Predict on the test features (identifier column removed).
xgb_predict = xgb_reg.predict(test_data.drop('gameId', axis=1))
4、三个单模型投票融合
# Soft-voting ensemble: average the three regressors' scores, then
# threshold at 0.5 to get the final 0/1 prediction.
output = pd.DataFrame({'lgb': lgb_predict, 'xgb': xgb_predict, 'cat': cat_predict})
# Row-wise mean: DataFrame.mean(axis=1) is vectorized — no need for
# apply() with a Python-level lambda per row.
output['pred'] = output.mean(axis=1)
output['pred'] = (output['pred'] >= 0.5).astype(int)
# Assemble and export the submission file.
output['gameId'] = test_data['gameId']
output = output.rename(columns={'pred': 'blueWins'})
output[['gameId', 'blueWins']].to_csv('zhanglei_output.csv', index=False)
三、5折交叉+stacking
1、5折交叉(lgb+xgb+cat,含oof——折外预测计算)
1)、先预备好训练集的oof(折外预测)和测试集预测的df
import numpy as np
# Out-of-fold prediction holders — one slot per training row; each row is
# filled exactly once, by the fold in which it sits in the validation part.
oof_lgb = np.zeros(train_data_X.shape[0])
oof_xgb = np.zeros(train_data_X.shape[0])
oof_cat = np.zeros(train_data_X.shape[0])
# Accumulator for each base model's test-set predictions (summed across
# folds, averaged after the loop).
test_output_df = pd.DataFrame(columns=['lgb','xgb','cat'],index=range(test_data.shape[0]))
test_output_df = test_output_df.fillna(0)
# BUG FIX: KFold raises "ValueError: Setting a random_state has no effect
# since shuffle is False" in current scikit-learn; shuffle must be enabled
# for random_state to apply. n_splits=5 made explicit (the code below
# averages over 5 folds).
kfold = KFold(n_splits=5, shuffle=True, random_state=2021)
2)、三个基模型
# The three base regressors, re-created with fixed seeds so every fold of
# the CV loop below starts from the same configuration.
lgb_base_params = dict(
    max_depth=8,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=32,
)
cat_base_params = dict(
    learning_rate=0.1,
    depth=8,
    random_seed=32,
)
xgb_base_params = dict(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=300,
    n_jobs=4,
    colsample_bytree=0.8,
    subsample=0.8,
    random_state=32,
)
lgb_reg = lgb.LGBMRegressor(**lgb_base_params)
cat_reg = ctb.CatBoostRegressor(**cat_base_params)
xgb_reg = xgb.XGBRegressor(**xgb_base_params)
3)、5折交叉
# 5-fold CV: fit each base model on the fold's training part, record its
# out-of-fold predictions, and accumulate its test-set predictions.
for train_idx, valid_idx in kfold.split(train_data_X):
    # KFold yields POSITIONAL indices — use iloc so this also works when
    # the DataFrame index is not a default RangeIndex (loc would silently
    # select wrong rows or raise).
    train_x = train_data_X.iloc[train_idx]
    train_y = train_data_y.iloc[train_idx]
    valid_x = train_data_X.iloc[valid_idx]
    valid_y = train_data_y.iloc[valid_idx]
    lgb_reg.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], early_stopping_rounds=30)
    xgb_reg.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], early_stopping_rounds=30)
    cat_reg.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], early_stopping_rounds=30)
    # Out-of-fold predictions become the meta-model's training features.
    oof_lgb[valid_idx] = lgb_reg.predict(valid_x)
    oof_xgb[valid_idx] = xgb_reg.predict(valid_x)
    oof_cat[valid_idx] = cat_reg.predict(valid_x)
    # Sum each fold's test predictions; averaged after the loop.
    test_output_df['lgb'] += lgb_reg.predict(test_data.drop('gameId', axis=1))
    test_output_df['xgb'] += xgb_reg.predict(test_data.drop('gameId', axis=1))
    test_output_df['cat'] += cat_reg.predict(test_data.drop('gameId', axis=1))
# Average over the actual number of folds rather than a hard-coded 5, so a
# change to KFold(n_splits=...) cannot silently skew the meta features.
n_splits = kfold.get_n_splits()
test_output_df['lgb'] = test_output_df['lgb'] / n_splits
test_output_df['xgb'] = test_output_df['xgb'] / n_splits
test_output_df['cat'] = test_output_df['cat'] / n_splits
4)、完成oof的df
oof_df = pd.DataFrame({'lgb':oof_lgb,'xgb':oof_xgb,'cat':oof_cat})
2、stacking
不论是直接用LogisticRegression,抑或是用LinearRegression再以0.5为阈值把结果归为0或1,都是可以的,效果差不多
1)、LogisticRegression
# Meta-model option 1: logistic regression trained on the out-of-fold
# features, predicting directly on the averaged test features.
from sklearn.linear_model import LogisticRegression

log_r = LogisticRegression()
log_r.fit(oof_df, train_data_y)
final_pre1 = log_r.predict(test_output_df)
2)、LinearRegression
# Meta-model option 2: plain linear regression, then threshold the
# continuous output at 0.5 to obtain 0/1 labels.
from sklearn.linear_model import LinearRegression

lrg = LinearRegression()
lrg.fit(oof_df, train_data_y)
final_pre2 = lrg.predict(test_output_df)
final_pre2 = (final_pre2 >= 0.5).astype(int)
3)、效果比对
# classification_report is already imported at the top of the file, so the
# duplicate import was dropped.
# NOTE(review): both arguments here are predictions, so this report
# measures the AGREEMENT between the two meta-models, not accuracy against
# ground truth (the test set's true labels are not available here).
print(classification_report(final_pre1, final_pre2, digits=4))