本案例使用 LightGBM 算法,演示如何通过网格搜索(GridSearchCV)进行参数调优。
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Load the training features, the training labels and the unlabeled test
# features. The '...' paths are placeholders to be replaced with real CSV paths.
train_data = pd.read_csv('...')
train_label = pd.read_csv('...')
test_data = pd.read_csv('...')

# Hold out 30% of the labelled data to evaluate the tuned model.
train_x, test_x, train_y, test_y = train_test_split(
    train_data, train_label, test_size=0.3, random_state=2019
)

# Candidate values explored by the exhaustive grid search.
# NOTE: the full Cartesian product of this grid is 179,200,000 combinations
# (537,600,000 fits with cv=3), which is not practical to run in one go.
# Tune a few related parameters at a time and keep the others fixed.
cv_params = {
    'n_estimators': [100, 150, 200, 250, 300],
    'num_leaves': [15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'min_data_in_leaf': [18, 19, 20, 21, 22],
    'min_sum_hessian_in_leaf': [0.001, 0.002],
    'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_freq': [2, 4, 6, 8, 10],
    'lambda_l1': [1e-3, 1e-2, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'lambda_l2': [1e-3, 1e-2, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'learning_rate': [0.01, 0.02, 0.05, 0.07, 0.09, 0.1, 0.15, 0.2],
}


def _f1_eval(y_true, y_pred):
    """Custom LightGBM evaluation metric returning the binary F1 score.

    'f1' is not one of LightGBM's built-in metric names, so it has to be
    supplied as a callable. The scikit-learn wrapper expects a tuple of
    (metric name, value, is_higher_better).
    """
    # Assumes y_pred holds positive-class probabilities, as LightGBM provides
    # for the built-in binary objective; threshold them at 0.5.
    return 'f1', f1_score(y_true, (y_pred > 0.5).astype(int)), True


# Base estimator; every value listed in cv_params overrides the one set here.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    # 'binary' for binary classification, 'multiclass' for multi-class,
    # 'regression' for regression.
    objective='binary',
    # Register no built-in metric: F1 is supplied as a custom eval function
    # when fitting with an eval_set, so it alone drives early stopping.
    metric='None',
    n_estimators=100,
    # Use together with max_depth: keep num_leaves <= 2^max_depth, otherwise
    # the model tends to overfit. When tuning num_leaves on its own, set
    # max_depth=-1 (no limit on tree depth).
    num_leaves=30,
    max_depth=5,
    min_data_in_leaf=15,
    min_sum_hessian_in_leaf=0.005,  # was misspelled "hession" and never applied
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0.1,
    lambda_l2=0.1,
    learning_rate=0.1,
)

# Exhaustive search over cv_params, scored by F1 with 3-fold cross-validation.
optimized_lgb = GridSearchCV(
    estimator=model,
    param_grid=cv_params,
    scoring='f1',
    cv=3,
    verbose=20,
    n_jobs=4,
)
optimized_lgb.fit(train_x, train_y)

# Predict on the held-out split with the refitted best estimator.
test_pred = optimized_lgb.predict(test_x)

# Refit the best estimator while monitoring F1 on the held-out split and stop
# once it has not improved for 100 rounds. Early stopping is requested through
# the callback API rather than the `early_stopping_rounds` fit argument, which
# newer LightGBM releases no longer accept.
best_model = optimized_lgb.best_estimator_
best_model.fit(
    train_x,
    train_y,
    eval_set=[(test_x, test_y)],
    eval_metric=_f1_eval,
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)
print(best_model.feature_importances_)

# Best parameter combination and its mean cross-validated F1 score.
best_params = optimized_lgb.best_params_
best_score = optimized_lgb.best_score_
可与模型交叉验证搭配使用:先用 GridSearchCV 搜索模型的最佳参数,再用交叉验证训练并评估模型。
GridSearch 参数网格搜索费时费力,但能得到所给网格范围内的最佳参数(并不保证是全局最优);也可以使用贝叶斯优化自动搜索参数,省时省力,但有时容易陷入局部最优。可以先用贝叶斯自动搜索确定大致范围,再用 GridSearch 在该范围内细致搜索,确定最佳参数。