Tuning steps (tune in this order, freezing each stage's best values before moving to the next; see the sketch after this list):
- n_estimators
- max_depth, min_child_weight
- gamma
- subsample, colsample_bytree
- reg_alpha, reg_lambda
- learning_rate
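Each later stage reuses the same GridSearchCV pattern shown in the code below, freezing the best values found so far. A minimal sketch of the gamma stage, reusing the imports and the X_train/y_train split from the code below; max_depth=7 and min_child_weight=5 are placeholders, substitute whatever the previous stage found:

param_grid_gamma = {'gamma': [i / 10.0 for i in range(0, 5)]}
gsearch_gamma = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=7, min_child_weight=5,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_grid_gamma, scoring='roc_auc', cv=5)
gsearch_gamma.fit(X_train, y_train)
print(gsearch_gamma.best_params_, gsearch_gamma.best_score_)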
The snippet below was adapted from the web; test-set prediction had to be added.
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import pandas as pd
import return_data  # local helper module that loads the dataset (used below)
from xgboost import plot_importance
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# alg: the XGBoost classifier; X_train/Y_train: training features/labels;
# X_test/Y_test: held-out test data; X_predictions: data to predict at the end
def XGBmodelfit(alg, X_train, Y_train, X_test=None, Y_test=None, X_predictions=None,
                useTrainCV=True, cv_folds=5, early_stopping_rounds=200):
    if useTrainCV:
        # use xgb.cv with early stopping to pick the best number of rounds, then write it back
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=Y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, show_stdv=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # train the model
    alg.fit(X_train, Y_train, eval_metric='auc')
    # predict on the test set
    dtest_predictions = alg.predict(X_test)            # outputs 0 or 1
    dtest_predprob = alg.predict_proba(X_test)[:, 1]   # outputs probabilities
    # print the report (these scores are computed on the held-out test set, so label them as such)
    print("\nModel Report")
    print("Accuracy (Test) : %.4g" % metrics.accuracy_score(Y_test, dtest_predictions))
    print("AUC Score (Test): %f" % metrics.roc_auc_score(Y_test, dtest_predprob))
    print(alg)
    if useTrainCV:
        print("best n_estimators:")
        print(cvresult.shape[0])
    plot_importance(alg)
    plt.show()
    # alternative importance plot (get_booster() replaces the old booster() accessor):
    # feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # plt.ylabel('Feature Importance Score')
dataset_X, dataset_Y = return_data.return_tarin_data()  # load the dataset from the local helper module
X_train, X_test, y_train, y_test = train_test_split(dataset_X, dataset_Y,
                                                    test_size=0.2,
                                                    random_state=45)
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
# XGBmodelfit(xgb1,X_train,y_train,X_test,y_test)
param_grid = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
# the grid can be narrowed further based on the output, e.g.:
# param_grid = {
#     'max_depth': [7, 8],
#     'min_child_weight': [4, 5]
# }
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=9,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_grid, cv=5)  # consider scoring='roc_auc' to match the AUC metric used above
gsearch1.fit(X_train, y_train)
print(gsearch1.best_params_, gsearch1.best_score_)
model_best = gsearch1.best_estimator_
model_best.fit(X_train, y_train)  # refit the best model (GridSearchCV already refits when refit=True, so this is optional)
y_posb = model_best.predict_proba(X_test)[:, 1]
# ypred = model_best.predict(X_test)
# confusion matrix, AUC, and similar evaluation would follow; see the sketch below
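A minimal evaluation sketch using the y_posb probabilities above (the 0.5 threshold is an assumption; pick one suited to the problem):

from sklearn.metrics import confusion_matrix, roc_auc_score

ypred = (y_posb >= 0.5).astype(int)      # threshold probabilities into hard 0/1 labels
print(confusion_matrix(y_test, ypred))   # rows = true class, columns = predicted class
print("AUC Score (Test): %f" % roc_auc_score(y_test, y_posb))  # AUC uses the raw probabilities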
Two ways to use XGBoost (a minimal side-by-side sketch follows):
- xgb: import xgboost directly; this exposes the native API, including the cv function used above.
- XGBClassifier: xgboost's sklearn wrapper, which lets us use grid search and parallel processing just like with GBM.
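A minimal side-by-side sketch (parameter values are illustrative only):

# native interface: wrap the data in a DMatrix, then call xgb.cv / xgb.train
dtrain = xgb.DMatrix(X_train, label=y_train)
cvres = xgb.cv({'objective': 'binary:logistic', 'max_depth': 5}, dtrain,
               num_boost_round=100, nfold=5, metrics='auc')

# sklearn wrapper: plain fit/predict, so GridSearchCV and parallel search work out of the box
clf = XGBClassifier(max_depth=5, n_estimators=100)
clf.fit(X_train, y_train)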
Tweak the function at the top, then follow 寒小阳's guide and you're mostly there. Tuning alone brings only modest gains, though, so feature engineering, stacking, and model ensembling are still worth trying (a stacking sketch appears after the code below).
test_results = pd.read_csv('test_results.csv')
target = 'Disbursed'  # label column name; defined globally in the source article
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    '''Train, evaluate, print AUC, and plot feature importances.
    alg: the classifier; dtrain: training set (including the label); dtest: test set (without the label);
    predictors: feature columns used for training (excluding the label);
    useTrainCV: whether to cross-validate; cv_folds: number of CV folds;
    early_stopping_rounds: stop once there is no improvement for this many rounds.'''
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        xgtest = xgb.DMatrix(dtest[predictors].values)  # unused below; kept from the source
        # verbose_eval replaces the old show_progress argument in recent xgboost versions
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # train
    alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')
    # predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # print training accuracy and AUC
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))
    # predict on the test set and print its AUC
    dtest['predprob'] = alg.predict_proba(dtest[predictors])[:, 1]
    results = test_results.merge(dtest[['ID', 'predprob']], on='ID')
    print('AUC Score (Test): %f' % metrics.roc_auc_score(results['Disbursed'], results['predprob']))
    # get_booster() replaces the old booster() accessor in recent xgboost versions
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
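A hedged usage sketch for modelfit, assuming train/test DataFrames with an 'ID' column and a 'Disbursed' label as in the source article (the file names are hypothetical):

train = pd.read_csv('train_modified.csv')   # hypothetical file name
test = pd.read_csv('test_modified.csv')     # hypothetical file name
predictors = [x for x in train.columns if x not in ['Disbursed', 'ID']]
modelfit(xgb1, train, test, predictors)

And since tuning alone plateaus, a minimal stacking sketch with sklearn's StackingClassifier (the base/meta estimator choices here are assumptions, not a recipe from the source):

from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

stack = StackingClassifier(
    estimators=[('xgb', XGBClassifier(n_estimators=140, max_depth=7)),
                ('rf', RandomForestClassifier(n_estimators=200))],
    final_estimator=LogisticRegression(),  # meta-learner fit on out-of-fold base predictions
    cv=5)
stack.fit(X_train, y_train)
print(stack.score(X_test, y_test))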
References:
- 模型融合---Xgboost调参总结: https://www.cnblogs.com/nxf-rabbit75/p/10595551.html
- 寒小阳, 机器学习系列(12)_XGBoost参数调优完全指南(附Python代码): https://blog.csdn.net/han_xiaoyang/article/details/52665396
- XGboost-网格调参法: https://blog.csdn.net/zllnau66/article/details/81980876