# Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb #XGBoost
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier #GBDT
# Libraries for cross-validation and grid search over hyper-parameters.
# NOTE(review): sklearn.cross_validation and sklearn.grid_search were removed
# in scikit-learn 0.20 — modern code imports cross_val_score / GridSearchCV
# from sklearn.model_selection instead. Confirm the installed sklearn version.
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
# Default figure size (width, height in inches) for all plots in this script.
rcParams['figure.figsize'] = 12, 4
# Read the training data (expects 'data.csv' in the working directory).
train = pd.read_csv('data.csv')
# Name of the binary target column and of the row-identifier column;
# both are excluded from the predictor set below.
target = 'Disbursed'
IDcol = 'ID'
def modelfitGBDT(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Fit a boosting classifier and report training metrics.

    Trains ``alg`` on ``dtrain[predictors]`` against the 'Disbursed' column,
    prints train accuracy and ROC-AUC, optionally a k-fold cross-validated
    ROC-AUC, and optionally plots feature importances.

    Parameters
    ----------
    alg : estimator
        Classifier with fit/predict/predict_proba and (for the importance
        plot) a ``feature_importances_`` attribute, e.g.
        GradientBoostingClassifier or XGBClassifier.
    dtrain : pandas.DataFrame
        Training data containing the predictor columns and 'Disbursed'.
    predictors : list of str
        Feature column names in ``dtrain``.
    performCV : bool
        Run cross-validation and report its ROC-AUC. Set True on the first
        run (to pick the best number of trees); False afterwards, once the
        tree count is fixed in the model itself.
    printFeatureImportance : bool
        Plot a bar chart of feature importances.
    cv_folds : int
        Number of cross-validation folds.
    """
    # Function-scope import keeps this block runnable on modern scikit-learn:
    # the legacy sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import cross_val_score
    from sklearn import metrics

    y = dtrain['Disbursed']

    # Fit the algorithm on the full training data.
    alg.fit(dtrain[predictors], y)

    # Training-set predictions: hard labels and positive-class probabilities.
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Optional k-fold cross-validated ROC-AUC.
    if performCV:
        cv_score = cross_val_score(alg, dtrain[predictors], y,
                                   cv=cv_folds, scoring='roc_auc')

    # Model report (Python 3 print calls; the original used Python 2 syntax).
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))
    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (
            np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Optional feature-importance bar chart (most important first).
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
# Quick smoke run: train a baseline GBDT with default hyper-parameters.
# Predictors are every column except the target and the ID column.
predictors = [x for x in train.columns if x not in [target, IDcol]]
# NOTE(review): the original source line was truncated mid-token
# ("GradientBoostingClassifier(random_sta"); reconstructed below as the
# usual baseline with a fixed seed for reproducibility — confirm against
# the original article.
gbm0 = GradientBoostingClassifier(random_state=10)
modelfitGBDT(gbm0, train, predictors)
# --- Scraped blog metadata (not executable code) ---
# Title: Parameter-tuning templates for GBDT and XGBoost
# Latest recommended article published 2024-07-09 19:33:20
# Summary: This post details how to tune GBDT (Gradient Boosting Decision
# Tree) and XGBoost models in Python, using GridSearchCV with cross-validation
# to find optimal hyper-parameters, including the number of trees
# (n_estimators), max_depth, min_samples_split, min_samples_leaf,
# max_features, and the subsampling rate (subsample).
# (Abstract auto-generated by CSDN.)