import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
from xgboost.sklearn import XGBClassifier,XGBRegressor
# Load the pre-computed NumPy arrays this script works on.
# NOTE: the label file really is spelled "train_lable.npy" on disk.
X = np.load("../0725/train_feature.npy")      # features passed to fit / grid search
y = np.load("../0725/train_lable.npy")        # targets aligned with X
testX = np.load("../0725/test_feature.npy")   # features used for the final prediction
def modelfit(alg, dtrain, dlable, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune ``n_estimators`` with ``xgb.cv``, fit ``alg``, print training RMSE.

    Args:
        alg: Estimator providing ``fit`` and ``predict``; when ``useTrainCV``
            is true it must also provide ``get_xgb_params``, ``get_params``
            and ``set_params`` (the script passes an ``XGBRegressor``).
        dtrain: Training features, passed unchanged to ``xgb.DMatrix``,
            ``alg.fit`` and ``alg.predict``.
        dlable: Training targets matching ``dtrain``.
        useTrainCV: If true, run ``xgb.cv`` first and overwrite the
            estimator's ``n_estimators`` with the number of rows in the
            returned CV result.
        cv_folds: Number of folds handed to ``xgb.cv`` as ``nfold``.
        early_stopping_rounds: Early-stopping patience handed to ``xgb.cv``.

    Returns:
        None. Results are reported via ``print``; ``alg`` is fitted in place.
    """

    def rmse(predictions, targets):
        """Return the root-mean-square error between two array-likes."""
        diff = np.asarray(predictions) - np.asarray(targets)
        return np.sqrt((diff ** 2).mean())

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=dlable)
        # Early stopping is requested once through ``early_stopping_rounds``
        # (it used to be registered a second time through a callback), and
        # per-round progress is requested through ``verbose_eval``/``show_stdv``
        # instead of the legacy callback helpers.
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=True,
            show_stdv=True,
        )
        print("n_estimators:", cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])

    alg.fit(dtrain, dlable, eval_metric='rmse')
    preds = alg.predict(dtrain)
    # Compare against the labels argument: ``dtrain`` is the raw feature
    # matrix and has no ``get_label()`` method.
    print("\nModel Report:", rmse(preds, dlable))
# Step 1: fix the learning rate and determine the number of estimators (n_estimators) used for tuning the tree-based parameters
# Baseline regressor. n_estimators is set high on purpose: modelfit() runs
# cross-validation and replaces it with the number of rounds CV reports.
xgb1 = XGBRegressor(
    objective='gpu:reg:linear',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
modelfit(xgb1, X, y)
# Step 2: coarse search over max_depth and min_child_weight
# Coarse grid: every combination of these depths and child weights is tried.
param_test1 = dict(
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5],
)
# Parameter notes:
#   n_estimators: number of base learners (taken from step 1).
#   scoring: criterion used to rank candidates; higher must mean better,
#            so the negated mean squared error is used.
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(
        objective='gpu:reg:linear',
        learning_rate=0.1,
        n_estimators=33,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch1.fit(X, y)
print(
    gsearch1.cv_results_,
    gsearch1.best_params_,
    gsearch1.best_score_,
)
# Step 3: refine max_depth and min_child_weight (the best result so far was {'max_depth': 5, 'min_child_weight': 1})
# Fine grid of max_depth and min_child_weight values for the second search.
param_test2 = dict(
    max_depth=list(range(4, 7)),
    min_child_weight=list(range(0, 3)),
)
gsearch2 = GridSearchCV(
    estimator=XGBRegressor(
        objective='gpu:reg:linear',
        learning_rate=0.1,
        n_estimators=33,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test2,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch2.fit(X, y)
print(
    gsearch2.cv_results_,
    gsearch2.best_params_,
    gsearch2.best_score_,
)
# Continue searching the min_child_weight parameter {'max_depth': 5, 'min_child_weight': 2}
# Wider sweep of min_child_weight only; max_depth stays at the estimator's 5.
param_test2b = dict(min_child_weight=list(range(2, 9, 2)))
gsearch2b = GridSearchCV(
    estimator=XGBRegressor(
        objective='gpu:reg:linear',
        learning_rate=0.1,
        n_estimators=33,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test2b,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch2b.fit(X, y)
print(
    gsearch2b.cv_results_,
    gsearch2b.best_params_,
    gsearch2b.best_score_,
)
# Step 4: tune gamma, range: [0, ∞], default=0
# Candidate gamma values 0.0 .. 0.4 in steps of 0.1.
param_test3 = dict(gamma=[0.0, 0.1, 0.2, 0.3, 0.4])
gsearch3 = GridSearchCV(
    estimator=XGBRegressor(
        objective='gpu:reg:linear',
        learning_rate=0.1,
        n_estimators=33,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch3.fit(X, y)
print(
    gsearch3.cv_results_,
    gsearch3.best_params_,
    gsearch3.best_score_,
)
# Step 5: tune subsample and colsample_bytree
# Grid of row-sampling and column-sampling ratios: 0.6, 0.7, 0.8, 0.9 each.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
# NOTE(review): n_estimators is 53 here but 33 in every other search and in
# the final model -- confirm which value is intended.
gsearch4 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=53,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='gpu:reg:linear',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    iid=False,
    cv=5,
)
# Fit once on the arrays loaded by this script. The previous first fit used
# `train`, `predictors` and `target`, none of which are defined here, and the
# search was then fitted a second time on X, y.
a = gsearch4.fit(X, y)
print(a)
print(gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_)
# Finally: fit the model with the selected parameters and predict
# Train the final regressor with the hyper-parameters chosen above.
clf = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=33,
    max_depth=5,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='gpu:reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
print("train.....")
model = clf.fit(X, y)

# Persist the fitted model. The context managers make sure both file handles
# are flushed and closed instead of being left to the garbage collector.
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Reload the pickle we just wrote (a locally produced, trusted file).
# `best_model` is not used below; prediction still goes through `model`.
with open("best_model.pkl", "rb") as model_file:
    best_model = pickle.load(model_file)

# Predict on the held-out features and store the result next to the script.
y_pred = model.predict(testX)
np.save("y_pred.npy", y_pred)