"""XGBoost automated hyperparameter tuning module."""
import pprint

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
# 5-fold CV splitter used by GridSearchCV below; seeded for reproducibility,
# matching the random_state used for the train/test split.
skfold = KFold(n_splits=5, shuffle=True, random_state=2017)
def xgb_cv(clf, train_data, label, cv_folds=5, early_stopping_rounds=50, metric='rmse'):
    """Use xgb.cv with early stopping to find the best number of boosting rounds."""
    # Hold out 10% of the data; xgb.cv and the final fit only see the rest.
    train_X, _, train_y, _ = train_test_split(train_data, label, test_size=0.1,
                                              random_state=2017)
    param = clf.get_xgb_params()
    dtrain = xgb.DMatrix(train_X, train_y)
    cv_res = xgb.cv(param, dtrain, num_boost_round=clf.get_params()['n_estimators'],
                    nfold=cv_folds, metrics=metric,
                    early_stopping_rounds=early_stopping_rounds)
    # xgb.cv truncates its result at the early-stopping point, so the number of
    # rows is the best number of boosting rounds.
    best_rounds = cv_res.shape[0]
    clf.set_params(n_estimators=best_rounds)
    clf.fit(train_X, train_y)
    return best_rounds
def grid_search_para(train_data, label, best_para=None, grid_param=None,
                     is_search_estimator=False, search_lr=0.1,
                     scoring='neg_mean_squared_error', search_estimators=10000, cv=skfold):
    """Grid-search the parameters in grid_param, or (with is_search_estimator=True)
    search the best n_estimators via xgb_cv."""
    if not is_search_estimator:
        for key, value in grid_param.items():
            print('start GridSearchCV {} in range {}'.format(key, value))
        xgb_ = XGBRegressor(**best_para)
        grid_search = GridSearchCV(estimator=xgb_, param_grid=grid_param, scoring=scoring, cv=cv)
        grid_search.fit(train_data, label)
        best_para.update(grid_search.best_params_)
        print('the best parameter is ', grid_search.best_params_)
        print('the best score is %f' % grid_search.best_score_)
    else:
        xgb_ = XGBRegressor(booster='dart')
        if best_para is None:
            best_para = xgb_.get_params()
        best_para['n_estimators'] = search_estimators
        best_para['learning_rate'] = search_lr
        xgb_ = XGBRegressor(**best_para)
        best_estimator = xgb_cv(xgb_, train_data, label)
        best_para['n_estimators'] = best_estimator
    return best_para
def select_parameter(train_data, label):
    """Tune XGBoost parameters step by step, re-searching n_estimators
    whenever the tree parameters change."""
    # Step 1: fix the learning rate and search the best n_estimators.
    best_para = grid_search_para(train_data, label, is_search_estimator=True)
    # Step 2: tune the tree structure.
    grid_param = {'max_depth': list(range(3, 10, 1)), 'min_child_weight': list(range(1, 12, 2))}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)
    # Step 3: tune the minimum split loss.
    grid_param = {'gamma': [i / 10.0 for i in range(0, 5)]}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)
    # Re-search n_estimators for the updated tree parameters.
    best_para = grid_search_para(train_data, label, best_para, is_search_estimator=True)
    # Step 4: tune row and column subsampling.
    grid_param = {'subsample': [i / 10.0 for i in range(6, 10)],
                  'colsample_bytree': [i / 10.0 for i in range(6, 10)]}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)
    # Step 5: tune L1 regularisation.
    grid_param = {'reg_alpha': [0, 1e-5, 0.001, 0.005, 0.01, 0.05, 0.1, 1, 100]}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)
    # Final pass: search n_estimators again at the chosen learning rate.
    best_para = grid_search_para(train_data, label, best_para, is_search_estimator=True,
                                 search_lr=0.1)
    print('The best parameter is:')
    pprint.pprint(best_para)
    return best_para
if __name__ == "__main__":
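    # Usage sketch (not from the original post): tune on a small synthetic
    # regression problem. make_regression and its settings are illustrative
    # assumptions, and the full search below can take a while to run.
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=2017)
    tuned_para = select_parameter(X, y)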