XGBoost Parameter Tuning

import pickle
import xgboost as xgb

import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
from xgboost.sklearn import XGBClassifier,XGBRegressor
# Load the data
#dtrain = xgb.DMatrix('../0725/dtrain.buffer')  # features + label
#dtest = xgb.DMatrix('../0725/dtest.buffer')    # features
y = np.load("../0725/train_lable.npy")
X = np.load("../0725/train_feature.npy")
testX = np.load("../0725/test_feature.npy")
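Note that train_test_split and mean_squared_error are imported above but never used. As an optional sanity check before tuning (not part of the original flow; the split and model here are an assumption), you can hold out a validation set and train a default regressor end to end:

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=27)
quick = XGBRegressor(n_estimators=100, seed=27)          # default settings, just a smoke test
quick.fit(X_tr, y_tr)
print("holdout RMSE:", np.sqrt(mean_squared_error(y_val, quick.predict(X_val))))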
def modelfit(alg, dtrain, dlabel, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    def rmse(predictions, targets):
        return np.sqrt(((predictions - targets) ** 2).mean())
    if useTrainCV:
        # Use xgb.cv with early stopping to pick the number of boosting rounds
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=dlabel)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        print("n_estimators:", cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data (outside the if, so useTrainCV=False still trains)
    alg.fit(dtrain, dlabel)
    # Predict on the training set and report its RMSE
    preds = alg.predict(dtrain)
    print("\nModel Report: train RMSE =", rmse(preds, dlabel))

Step 1: Fix the learning rate and determine the number of estimators (n_estimators) before tuning the tree-based parameters

xgb1 = XGBRegressor(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',  # the original 'gpu:reg:linear' is not a valid mainline objective; for GPU training set tree_method='gpu_hist'
        nthread=4,
        scale_pos_weight=1,
        seed=27)
modelfit(xgb1, X, y)
#n_estimators=33
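modelfit only prints the stopping point; to inspect the full CV curve, you can call xgb.cv directly with the same settings (a minimal sketch):

xgtrain = xgb.DMatrix(X, label=y)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain, num_boost_round=1000,
                  nfold=5, metrics='rmse', early_stopping_rounds=50)
print(cvresult[['train-rmse-mean', 'test-rmse-mean']].tail())  # last retained rounds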

Step 2: Coarse search over max_depth and min_child_weight

#Grid search on max_depth and min_child_weight

param_test1 = {
    'max_depth':list(range(3,10,2)),
    'min_child_weight':list(range(1,6,2))  
}
"""
参数说明;
n_estimators:基学习器的个数(第一步得到)
scoring:得分排名依据,因为是越高越好,这是mean_squared_error取负

"""

gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=33, max_depth=5,
                                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
                       param_grid=param_test1, scoring='neg_mean_squared_error', n_jobs=1, cv=5)
gsearch1.fit(X,y)
print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)
# Best: {'max_depth': 5, 'min_child_weight': 1}, score -3782.6882609903814
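Since scoring is the negated MSE, best_score_ converts back to an RMSE for easier comparison with the step-1 CV output:

print("CV RMSE:", np.sqrt(-gsearch1.best_score_))  # sqrt(3782.69) ≈ 61.5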

Step 3: Refine max_depth and min_child_weight (step 2's best was {'max_depth': 5, 'min_child_weight': 1})

#Grid search on a refined range of max_depth and min_child_weight

param_test2 = {
    'max_depth':[4,5,6],
    'min_child_weight':[0,1,2]  # valid range: [0, ∞)
}
gsearch2 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=33, max_depth=5,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
                       param_grid=param_test2, scoring='neg_mean_squared_error', n_jobs=1, cv=5)
gsearch2.fit(X,y)
print(gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_)

Continue the search for min_child_weight (step 3's best was {'max_depth': 5, 'min_child_weight': 2})

#Grid search on min_child_weight only
# previous best score: -3775.018211540544
param_test2b = {
    'min_child_weight':[2,4,6,8]
}
gsearch2b = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=33, max_depth=5,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
                       param_grid=param_test2b, scoring='neg_mean_squared_error', n_jobs=1, cv=5)
gsearch2b.fit(X,y)
print(gsearch2b.cv_results_, gsearch2b.best_params_, gsearch2b.best_score_)
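At this point three searches have run; a quick way to compare them side by side (plain usage of the fitted GridSearchCV objects):

for name, gs in [('grid 1', gsearch1), ('grid 2', gsearch2), ('grid 2b', gsearch2b)]:
    print(name, gs.best_params_, gs.best_score_)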

Step 4: Tune gamma (range [0, ∞), default 0)

#Grid search on gamma
# gamma: the minimum loss reduction required to make a further split on a leaf node; the larger it is, the more conservative the algorithm.
param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=33, max_depth=5,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
                       param_grid=param_test3, scoring='neg_mean_squared_error', n_jobs=1, cv=5)
gsearch3.fit(X,y)
print(gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_)
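After fixing max_depth, min_child_weight, and gamma, it is worth re-running the step-1 calibration so n_estimators matches the new settings; presumably this is where the n_estimators=53 used in step 5 comes from. A sketch (this re-run is not shown in the original):

xgb2 = XGBRegressor(learning_rate=0.1, n_estimators=1000, max_depth=5,
                    min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                    objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27)
modelfit(xgb2, X, y)  # prints the recalibrated n_estimators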

Step 5: Tune subsample and colsample_bytree

#Grid search on subsample and colsample_bytree
param_test4 = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=53, max_depth=5,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
                       param_grid=param_test4, scoring='neg_mean_squared_error', n_jobs=1, cv=5)
gsearch4.fit(X,y)
print(gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_)
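Printing the raw cv_results_ dict is hard to read. A small helper view ranks the grid candidates (pandas is assumed available; xgb.cv already returns a DataFrame, so it is):

import pandas as pd
res = pd.DataFrame(gsearch4.cv_results_)
print(res[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False).head())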

Finally, train with the tuned parameters and predict on the test set

# Best: {'colsample_bytree': 0.8, 'subsample': 0.8}, score -3805.70983108
clf = xgb.XGBRegressor(learning_rate=0.1, n_estimators=33, max_depth=5,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27)
print("train.....")
model = clf.fit(X, y)
pickle.dump(model, open("best_model.pkl", "wb"))
# Reload the pickled model and use it for prediction
best_model = pickle.load(open("best_model.pkl", "rb"))
y_pred = best_model.predict(testX)
np.save("y_pred.npy", y_pred)
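One refinement this walkthrough stops short of, sketched here as a suggestion rather than part of the original: lower the learning rate and let CV recalibrate n_estimators, trading training time for a usually small accuracy gain.

xgb_final = XGBRegressor(learning_rate=0.01, n_estimators=5000, max_depth=5,
                         min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                         objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27)
modelfit(xgb_final, X, y)  # CV picks the matching n_estimators at the lower rate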