1. Preparation
import lightgbm as lgbm
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
train = pd.read_csv('Otto_train.csv')
# drop ids and get labels
# labels come as strings 'Class_1' ... 'Class_9'; strip the 'Class_' prefix
# and shift to 0-8, the integer label range LightGBM expects for multiclass
y_train = train['target'].map(lambda s: int(s[6:]) - 1)
train = train.drop(["id", "target"], axis=1)
X_train = np.array(train)
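As an aside, the same 0-8 encoding can be obtained with sklearn's LabelEncoder instead of string slicing; a minimal sketch (it must run before the 'target' column is dropped, and relies on 'Class_1'..'Class_9' sorting in numeric order):
from sklearn.preprocessing import LabelEncoder
# fit_transform maps the 9 sorted class strings to integers 0-8,
# matching the manual int(s[6:]) - 1 mapping above
y_train = LabelEncoder().fit_transform(train['target'])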
2. Usage: choosing n_estimators with cross-validation
params = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'verbosity': -1,  # 'silent' is not a valid booster parameter; use verbosity instead
          'learning_rate': 0.1,
          'num_leaves': 50,
          'max_depth': 6,
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 1,
          'reg_lambda': 0,
          'min_split_gain': 0.0,
          'min_child_weight': 1,
          'min_child_samples': 20,
          'scale_pos_weight': 1}
# num_class is inferred from y by the sklearn wrapper, so it is not passed here
lgbm1 = lgbm.LGBMClassifier(n_estimators=1000, random_state=0, **params)
params['num_class'] = 9
lgbmtrain = lgbm.Dataset(X_train, y_train)  # Dataset's silent argument was removed in LightGBM 4.0
cv_result = lgbm.cv(params, lgbmtrain, num_boost_round=10000, nfold=5, stratified=False, shuffle=True,
                    metrics='multi_logloss', early_stopping_rounds=10, show_stdv=True, seed=0)
# early_stopping_rounds/show_stdv follow the pre-4.0 lgbm.cv signature;
# LightGBM >= 4.0 expects callbacks=[lgbm.early_stopping(10)] instead
# note: cv_result will look like: {"multi_logloss-mean": <list of per-round means>,
#       "multi_logloss-stdv": <list of per-round standard deviations>}
print('best n_estimators:', len(cv_result['multi_logloss-mean']))
print('best cv score:', cv_result['multi_logloss-mean'][-1])
#json.dump(cv_result, open('lgbm_1.json', 'w'))
# train the final model with the best n_estimators found by cross-validation
lgbm1.set_params(n_estimators=len(cv_result['multi_logloss-mean']))
lgbm1.fit(X_train, y_train)
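The log_loss import at the top is otherwise unused; a quick sanity check on the fitted model (training loss only, so it will be optimistic):
# predict_proba returns the per-class probability matrix sklearn's log_loss expects
train_proba = lgbm1.predict_proba(X_train)
print('train multi_logloss:', log_loss(y_train, train_proba))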
3. Plotting
test_means = cv_result['multi_logloss-mean']
test_stds = cv_result['multi_logloss-stdv']  # the key is '-stdv', not '-std'
# cv_result is a plain dict of lists, so it has no .shape; use len() instead
x_axis = range(0, len(test_means))
pyplot.plot(x_axis, test_means)
pyplot.title("LightGBM n_estimators vs Log Loss")
pyplot.xlabel( 'n_estimators' )
pyplot.ylabel( 'Log Loss' )
pyplot.show()
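Since lgbm.cv also returns the per-round standard deviation, it can be drawn as a band around the mean curve to show fold-to-fold variance; a small optional extension of the plot above, using the same variables:
pyplot.plot(x_axis, test_means, label='CV mean multi_logloss')
# shade the +/- 1 stdv band around the mean curve
pyplot.fill_between(x_axis, np.array(test_means) - np.array(test_stds),
                    np.array(test_means) + np.array(test_stds), alpha=0.2, label='+/- 1 stdv')
pyplot.title("LightGBM n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.legend()
pyplot.show()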
Appendix: Tuning with GridSearchCV
# same params as above, but without num_leaves, which is the parameter being searched
params = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'verbosity': -1,  # 'silent' is not a valid booster parameter; use verbosity instead
          'learning_rate': 0.1,
          'max_depth': 6,
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 1,
          'reg_lambda': 0,
          'min_split_gain': 0.0,
          'min_child_weight': 1,
          'min_child_samples': 20,
          'scale_pos_weight': 1}
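The snippet below references param_test2_1 and kfold, which are never defined in the original. A plausible reconstruction, with a hypothetical num_leaves grid inferred from the CSV file name and axis label further down:
from sklearn.model_selection import StratifiedKFold
# hypothetical grid: the exact range searched is not shown in the original
param_test2_1 = {'num_leaves': range(10, 101, 10)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)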
lgbm2_1 = lgbm.LGBMClassifier(n_estimators=539, random_state=0, **params)
gsearch2_1 = GridSearchCV(lgbm2_1, param_grid=param_test2_1, scoring='neg_log_loss', n_jobs=-1,
                          cv=kfold, return_train_score=True)  # train scores are read below
gsearch2_1.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (gsearch2_1.best_score_, gsearch2_1.best_params_))
test_means = gsearch2_1.cv_results_['mean_test_score']
test_stds = gsearch2_1.cv_results_['std_test_score']
train_means = gsearch2_1.cv_results_['mean_train_score']
train_stds = gsearch2_1.cv_results_['std_train_score']
pd.DataFrame(gsearch2_1.cv_results_).to_csv('my_preds_num_leaves_1.csv')
# plot results
num_leaves = list(param_test2_1['num_leaves'])  # x values: the grid actually searched
pyplot.plot(num_leaves, -test_means, label='CV test score')  # negate neg_log_loss back to log loss
pyplot.legend()
pyplot.xlabel('num_leaves')
pyplot.ylabel('Log Loss')
pyplot.show()
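The std series extracted above otherwise go unused; they can be overlaid as error bars to judge how stable each num_leaves setting is across folds, a sketch using the same variables:
pyplot.errorbar(num_leaves, -test_means, yerr=test_stds, label='test')
pyplot.errorbar(num_leaves, -train_means, yerr=train_stds, label='train')
pyplot.legend()
pyplot.xlabel('num_leaves')
pyplot.ylabel('Log Loss')
pyplot.show()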