# LightGBM's native API can visualize the training process and the trees themselves.
# First estimate rough hyper-parameters from the data size and feature count — mainly
# max_depth, num_leaves, max_bin and min_data_in_leaf; many tuning guides are available online.
import lightgbm as lgb
import numpy as np
from matplotlib import pyplot as plt
from sklearn import joblib  # NOTE(review): removed from sklearn >= 0.23; prefer `import joblib` — confirm sklearn version
from sklearn import metrics
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import GridSearchCV, train_test_split
def get_data(path):
    """Load a svmlight/libsvm formatted file and return (features, labels)."""
    features, labels = load_svmlight_file(path, dtype=np.float32)
    return features, labels
def gsearch_train(path):
    """Grid-search LightGBM hyper-parameters on the data loaded from *path*.

    Returns a tuple (best_params, best_score, cv_results, best_estimator)
    taken from the fitted GridSearchCV object.
    """
    x, y = get_data(path)
    # train_test_split must receive the arrays and returns
    # (x_train, x_test, y_train, y_test) in that order — the original
    # called it with only a float and unpacked in the wrong order.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )
    # Use scikit-learn API parameter names: LGBMClassifier rejects native-API
    # names such as num_boost_round / verbose_eval / min_data_in_leaf, which
    # would make GridSearchCV raise on fit.
    param_test = {
        'n_estimators': [1000],          # native: num_boost_round
        'learning_rate': [0.1],
        'max_depth': list(range(10, 15)),
        'max_bin': [255],
        'min_child_samples': [50],       # native: min_data_in_leaf
        'colsample_bytree': [0.5],       # native: feature_fraction
    }
    gsearch = GridSearchCV(
        estimator=lgb.LGBMClassifier(
            boosting_type='gbdt', objective='binary', n_jobs=-1
        ),
        param_grid=param_test,
        scoring='roc_auc',
    )
    gsearch.fit(x_train, y_train)
    return (
        gsearch.best_params_,
        gsearch.best_score_,
        gsearch.cv_results_,
        gsearch.best_estimator_,
    )
def train(path, param):
    """Train a LightGBM booster with the native API.

    ``param`` (e.g. the best params from a grid search) overrides the
    defaults below. Returns (model, evals_result) where evals_result holds
    the per-iteration metric history on the validation set.
    """
    evals_result = {}
    x, y = get_data(path)
    # Correct call order: pass the data and unpack as (x_train, x_test, y_train, y_test).
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )
    # Sensible defaults; the caller-supplied params take precedence.
    # A metric must be set, otherwise evals_result stays empty and the
    # downstream plot_metric calls have nothing to plot.
    params = {
        'boosting': 'gbdt',
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss'],
    }
    params.update(param)
    train_set = lgb.Dataset(x_train, y_train)
    test_set = lgb.Dataset(x_test, y_test, reference=train_set)
    # `early_stopping` is not a valid kwarg of lgb.train — the correct name
    # is `early_stopping_rounds` (pre-4.0 API; confirm installed version,
    # lightgbm >= 4 uses callbacks instead).
    model = lgb.train(
        params,
        train_set,
        valid_sets=[test_set],
        evals_result=evals_result,
        early_stopping_rounds=10,
    )
    return model, evals_result
if __name__ == '__main__':  # was `__name`, which raises NameError
    path = ''  # TODO: point at a svmlight/libsvm-format data file
    params, _, _, best_estimator = gsearch_train(path)
    model, evals_result = train(path, params)
    # Visualize one tree of the grid-search winner.
    ax = lgb.plot_tree(
        best_estimator, tree_index=3, figsize=(20, 8), show_info=['split_gain']
    )
    plt.show()
    # Plot the recorded metric curves; the keyword is `metric`
    # (original had the typo `metirc`) and the metric name is
    # `binary_logloss` (original had `bianry_logloss`).
    ax = lgb.plot_metric(evals_result, metric='auc', title='AUC')
    plt.show()
    ax = lgb.plot_metric(evals_result, metric='binary_logloss', title='Logloss')
    plt.show()