xgboost调参

最新推荐文章于 2022-04-20 12:01:58 发布

HawardScut

最新推荐文章于 2022-04-20 12:01:58 发布

阅读量1.2k

点赞数

分类专栏：机器学习基础

本文链接：https://blog.csdn.net/hao5335156/article/details/81531530

版权

机器学习基础专栏收录该内容

15 篇文章 2 订阅

订阅专栏

import pickle
import xgboost as xgb

import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
from xgboost.sklearn import XGBClassifier,XGBRegressor

# Load the training features, training labels and test features from .npy files.
# Alternative kept by the author (unused): load prebuilt DMatrix buffers instead.
# dtrain = xgb.DMatrix('../0725/dtrain.buffer')  # features + label
# dtest = xgb.DMatrix('../0725/dtest.buffer')  # features only
X = np.load("../0725/train_feature.npy")
y = np.load("../0725/train_lable.npy")
testX = np.load("../0725/test_feature.npy")

def modelfit(alg, dtrain, dlable, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune ``n_estimators`` by CV, fit ``alg`` and report training RMSE.

    Args:
        alg: Estimator exposing ``fit`` and ``predict``; when ``useTrainCV``
            is true it must also expose ``get_xgb_params``, ``get_params``
            and ``set_params``.
        dtrain: Training feature matrix.
        dlable: Training targets aligned with ``dtrain``.
        useTrainCV: When true, run ``xgb.cv`` first and set ``n_estimators``
            on ``alg`` to the number of rows in the CV result.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        The RMSE of ``alg``'s predictions on the training data (also printed).
    """
    def rmse(predictions, targets):
        return np.sqrt(((predictions - targets) ** 2).mean())

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=dlable)
        # Progress printing and early stopping are requested through the
        # keyword arguments only; registering an extra early-stop callback
        # on top of early_stopping_rounds would set it up twice.
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=True,
            show_stdv=True,
        )
        print("n_estimators:", cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit and report regardless of useTrainCV, so that disabling the CV step
    # still trains the model instead of silently doing nothing.
    alg.fit(dtrain, dlable)
    preds = alg.predict(dtrain)
    # Compare against the labels that were passed in: dtrain is the raw
    # feature matrix here, not a DMatrix, so it carries no labels itself.
    train_rmse = rmse(preds, np.asarray(dlable))
    print("\nModel Report:", train_rmse)
    return train_rmse

第一步：确定学习速率和tree_based 参数调优的估计器数目n_estimators

# Step 1 baseline: fixed learning rate with a generous n_estimators cap;
# modelfit's cross-validation decides how many rounds to keep.
# NOTE(review): 'gpu:reg:linear' is only accepted by older xgboost releases —
# confirm the pinned version.
_xgb1_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'gpu:reg:linear',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb1 = XGBRegressor(**_xgb1_params)
modelfit(xgb1, X, y)
# Result recorded by the author: n_estimators=33

第二步：大致探索参数max_depth，min_child_weight

# Step 2: coarse grid search over max_depth and min_child_weight.
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}

# Parameter notes:
#   n_estimators: number of base learners (taken from step 1).
#   scoring: ranking criterion; higher is better, so the mean squared error
#       is negated.
# NOTE(review): `iid` and 'gpu:reg:linear' are only accepted by older
# scikit-learn / xgboost releases — confirm the pinned versions.
_gsearch1_base = XGBRegressor(
    learning_rate=0.1,
    n_estimators=33,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='gpu:reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch1 = GridSearchCV(
    estimator=_gsearch1_base,
    param_grid=param_test1,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch1.fit(X, y)
print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)
# Best recorded by the author: {'max_depth': 5, 'min_child_weight': 1} -3782.6882609903814

第三步：进一步探索参数max_depth，min_child_weight（前面得到的最优参数是{'max_depth': 5, 'min_child_weight': 1}）

# Step 3: finer grid search over max_depth and min_child_weight around the
# step-2 optimum.
param_test2 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [0, 1, 2],  # valid range: [0, inf)
}
# NOTE(review): `iid` and 'gpu:reg:linear' are only accepted by older
# scikit-learn / xgboost releases — confirm the pinned versions.
_gsearch2_base = XGBRegressor(
    learning_rate=0.1,
    n_estimators=33,
    max_depth=5,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='gpu:reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch2 = GridSearchCV(
    estimator=_gsearch2_base,
    param_grid=param_test2,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch2.fit(X, y)
print(gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_)

继续搜索参数min_child_weight（上一步得到的最优参数是{'max_depth': 5, 'min_child_weight': 2}）

# Follow-up search over larger min_child_weight values only.
# Score recorded by the author: -3775.018211540544
param_test2b = {
    'min_child_weight': [2, 4, 6, 8],
}
# NOTE(review): `iid` and 'gpu:reg:linear' are only accepted by older
# scikit-learn / xgboost releases — confirm the pinned versions.
_gsearch2b_base = XGBRegressor(
    learning_rate=0.1,
    n_estimators=33,
    max_depth=5,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='gpu:reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch2b = GridSearchCV(
    estimator=_gsearch2b_base,
    param_grid=param_test2b,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch2b.fit(X, y)
print(gsearch2b.cv_results_, gsearch2b.best_params_, gsearch2b.best_score_)

第四步：调整参数gamma，range: [0,∞]，default=0

# Step 4: grid search over gamma.
# gamma is the minimum loss reduction required to split a leaf node further;
# the larger it is, the more conservative the algorithm.
param_test3 = {
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}
# NOTE(review): `iid` and 'gpu:reg:linear' are only accepted by older
# scikit-learn / xgboost releases — confirm the pinned versions.
_gsearch3_base = XGBRegressor(
    learning_rate=0.1,
    n_estimators=33,
    max_depth=5,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='gpu:reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch3 = GridSearchCV(
    estimator=_gsearch3_base,
    param_grid=param_test3,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch3.fit(X, y)
print(gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_)

第五步：调整参数subsample，colsample_bytree

# Step 5: grid search over subsample and colsample_bytree.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
# NOTE(review): n_estimators is 53 here while every other step uses 33 —
# confirm whether this is intentional.
# NOTE(review): `iid` and 'gpu:reg:linear' are only accepted by older
# scikit-learn / xgboost releases — confirm the pinned versions.
gsearch4 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=53,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='gpu:reg:linear',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
# Fit once on the arrays loaded in this script. The earlier extra fit on
# train[predictors] / train[target] referenced names that are never defined
# here and raised NameError before the search could run.
gsearch4.fit(X, y)
print(gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_)

最后

# Final model: train with the tuned parameters, persist it, and predict on
# the test features.
# Step-5 result recorded by the author:
# {'colsample_bytree': 0.8, 'subsample': 0.8} -3805.70983108
# NOTE(review): 'gpu:reg:linear' is only accepted by older xgboost releases —
# confirm the pinned version.
clf = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=33,
    max_depth=5,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='gpu:reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
print("train.....")
model = clf.fit(X, y)
# Use context managers so both file handles are closed deterministically.
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("best_model.pkl", "rb") as model_file:
    best_model = pickle.load(model_file)
# Predict with the reloaded model so the saved file is actually exercised.
y_pred = best_model.predict(testX)
np.save("y_pred.npy", y_pred)

HawardScut

关注

0
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
xgboost调参

import pickleimport xgboost as xgbimport numpy as npfrom sklearn.model_selection import KFold, train_test_split, GridSearchCVfrom sklearn.metrics import confusion_matrix, mean_squared_errorfrom ...
复制链接

扫一扫

专栏目录