1. Training and grid search with the native API
1.1 Data
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import xgboost as xgb
data = load_breast_cancer()
print(data['data'].shape, data['target'].shape)
x_data = data.data
y_data = data.target
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=666)
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
'''
(569, 30) (569,)
'''
1.2 Setting parameters & training
import xgboost as xgb
params = {'objective': 'binary:logistic',
          'max_depth': 5,
          'min_child_weight': 0.8,
          'verbosity': 0,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'gamma': 0.1,
          'lambda': 0.8,
          'eta': 1}
num_round = 50
watch_list = [(dtrain, 'train'), (dtest, 'test')]
evals_result = {}
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watch_list,
    early_stopping_rounds=10,
    evals_result=evals_result
)
'''
[0] train-logloss:0.19172 test-logloss:0.30370
[1] train-logloss:0.09322 test-logloss:0.22359
[2] train-logloss:0.05241 test-logloss:0.19594
[3] train-logloss:0.03034 test-logloss:0.21439
[4] train-logloss:0.02134 test-logloss:0.20752
[5] train-logloss:0.01631 test-logloss:0.20046
[6] train-logloss:0.01476 test-logloss:0.21509
[7] train-logloss:0.01407 test-logloss:0.20678
[8] train-logloss:0.01231 test-logloss:0.20136
[9] train-logloss:0.01197 test-logloss:0.20550
[10] train-logloss:0.01053 test-logloss:0.19675
[11] train-logloss:0.00992 test-logloss:0.20746
'''
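Note that although num_round is 50, the log stops early: early_stopping_rounds=10 halts training once test-logloss has gone 10 rounds without improving (the best round here is round 2). A minimal sketch of reading the recorded best round and predicting with only those trees (assumes xgboost >= 1.4 for iteration_range):
print(xgb_model.best_iteration, xgb_model.best_score)
preds_best = xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration + 1))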
preds = xgb_model.predict(dtest)
# You can also predict directly from unlabeled feature data
# preds = xgb_model.predict(xgb.DMatrix(x_test))
labels = dtest.get_label()
labels, preds
'''
array([0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0.,
0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0.], dtype=float32)
array([2.3674986e-03, 9.9861741e-01, 9.9975663e-01, 2.1583840e-01,
1.1854486e-03, 6.3544436e-04, 1.2252343e-04, 9.9900925e-01,
6.5811968e-05, 9.9750012e-01, 9.9970168e-01, 9.9982613e-01,
1.4022045e-01, 9.8603803e-01, 9.6359092e-01, 9.9929202e-01,
9.9949872e-01, 9.9832839e-01, 9.9978691e-01, 9.9982613e-01,
6.3544436e-04, 6.2098140e-03, 9.9957401e-01, 9.9969757e-01,
9.9982423e-01, 9.9957675e-01, 9.9975401e-01, 1.4727229e-04,
9.9970168e-01, 1.3158246e-04, 1.0680066e-03, 9.9991202e-01,
9.9991202e-01, 1.0036851e-01, 1.4891458e-04, 6.8655396e-01,
9.9900925e-01, 6.5541302e-04, 8.1419773e-02, 9.9970526e-01,
6.5962277e-02, 9.9671751e-01, 1.5763033e-04, 9.9976224e-01,
1.8256794e-01, 1.4891458e-04, 9.9982613e-01, 9.9935108e-01,
6.6271797e-04, 9.9606222e-01, 9.9767417e-01, 6.6271797e-04,
6.6271797e-04, 9.9754131e-01, 9.9975401e-01, 7.4481402e-05,
9.0036017e-01, 3.5478315e-01, 9.9987698e-01, 1.1854486e-03,
9.9930811e-01, 3.8913928e-02, 9.9082899e-01, 7.0148520e-04,
9.9900925e-01, 9.9870503e-01, 9.9982613e-01, 9.6617037e-01,
1.5763033e-04, 1.8701397e-01, 9.9964190e-01, 9.9982613e-01,
2.0063692e-03, 9.9939549e-01, 8.6724478e-01, 9.9991202e-01,
9.9935108e-01, 9.7720438e-01, 2.3587022e-03, 1.5763033e-04,
5.4037949e-04, 4.3376945e-03, 7.8841185e-05, 9.9982423e-01,
3.1165761e-04, 9.6175414e-01, 9.9949872e-01, 1.4727229e-04,
9.9987698e-01, 9.9982613e-01, 9.9797589e-01, 9.9478018e-01,
9.4439185e-01, 1.3158246e-04, 1.5761836e-01, 9.9597138e-01,
9.6866369e-01, 9.9949872e-01, 7.1160225e-03, 9.9847311e-01,
9.9786395e-01, 5.4037949e-04, 1.1854486e-03, 9.9964190e-01,
1.8933583e-02, 9.9413067e-01, 7.8841185e-05, 1.3096399e-03,
9.9991202e-01, 9.9705601e-01, 8.6442098e-02, 9.3267983e-01,
9.9922729e-01, 1.6699207e-04], dtype=float32)
'''
error = sum([1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]]) / float(len(preds))
print(f'error={error: .2f}')
'''
error= 0.07
'''
'''
For regression:
'objective': 'reg:squarederror'  # squared-error objective
np.sqrt(mean_squared_error(preds, labels))  # RMSE
'''
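To make the regression note concrete, here is a minimal sketch of the same pipeline with a squared-error objective (reusing the 0/1 labels purely for illustration; any continuous target works the same way):
import numpy as np
from sklearn.metrics import mean_squared_error
reg_params = dict(params, objective='reg:squarederror')
reg_model = xgb.train(params=reg_params, dtrain=dtrain, num_boost_round=num_round)
print(np.sqrt(mean_squared_error(dtest.get_label(), reg_model.predict(dtest))))  # RMSE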
1.3 Grid-search tuning
1.3.1 Wrapping a sklearn-style API around xgb.train ⭐⭐⭐
import xgboost as xgb
# Custom class exposing the sklearn estimator interface (fit / predict /
# get_params / set_params) so it can be dropped into GridSearchCV
class MyXGBoost:
    def __init__(self, **params):
        self.params = params
        # Defaults to regression; change objective for classification
        self.params.update({'verbosity': 1,
                            'objective': 'reg:squarederror',
                            'seed': 0})
        self.bst = None
    def _train_params(self):
        # num_boost_round is an argument of xgb.train/xgb.cv, not a booster
        # parameter; strip it from the dict handed to XGBoost to avoid the
        # "Parameters: { num_boost_round } are not used" warning
        params = dict(self.params)
        self.num_boost_round = params.pop('num_boost_round', 10)
        return params
    def fit(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, y_train)
        self.bst = xgb.train(params=self._train_params()
                             ,dtrain=dtrain
                             ,num_boost_round=self.num_boost_round
                             )
        return self
    def predict(self, x_pred):
        dpred = xgb.DMatrix(x_pred)
        return self.bst.predict(dpred)
    def kfold(self, x_train, y_train, nfold=5):
        dtrain = xgb.DMatrix(x_train, y_train)
        cv_round = xgb.cv(params=self._train_params()
                          ,dtrain=dtrain
                          ,num_boost_round=self.num_boost_round
                          ,nfold=nfold
                          ,early_stopping_rounds=10
                          )
        # last row = metrics at the final (or early-stopped) round
        return cv_round.iloc[-1, :]
    def get_params(self, deep=True):
        return self.params
    def set_params(self, **params):
        self.params.update(params)
        return self
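A quick smoke test of the wrapper (my own example, using x_train/x_test from section 1.1):
model = MyXGBoost(num_boost_round=20, max_depth=5)
model.fit(x_train, y_train)
print(model.predict(x_test)[:5])
print(model.kfold(x_train, y_train, nfold=5))  # last-round CV metrics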
1.3.2 Custom scoring function
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import numpy as np
def score_fn(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
# RMSE: the smaller the value, the better the fit
cv_score_fn = make_scorer(score_func=score_fn
                          ,greater_is_better=False)
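For reference (my note, not in the original): recent sklearn versions ship an equivalent built-in scorer, so the hand-rolled one above can be swapped out.
from sklearn.metrics import get_scorer
# 'neg_root_mean_squared_error' (sklearn >= 0.22) also reports negated RMSE
builtin_rmse_scorer = get_scorer('neg_root_mean_squared_error')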
1.3.3 Setting the search grid & first search
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
params_grid = {
    'max_depth': [4, 5, 6, 7]
    ,'min_child_weight': [1, 5, 9]
    ,'subsample': [0.6, 0.8, 1]
    ,'colsample_bytree': [0.6, 0.8, 1]
}
xgb_model = MyXGBoost(num_boost_round=20)
grid = GridSearchCV(estimator=xgb_model
                    ,param_grid=params_grid
                    ,scoring=cv_score_fn
                    ,cv=3
                    ,n_jobs=-1)
grid.fit(x_train, y_train)
print(grid.best_params_)
Note: the warning `Parameters: { "num_boost_round" } are not used` in the original run came from leaving num_boost_round inside the params dict passed to xgb.train; it is an argument of xgb.train itself, not a booster parameter. The wrapper above now pops it out before training, which silences the warning.
'''
[17:04:14] WARNING: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\learner.cc:767:
Parameters: { "num_boost_round" } are not used.
{'colsample_bytree': 1, 'max_depth': 5, 'min_child_weight': 9, 'subsample': 1}
'''
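One detail worth surfacing: because the scorer was built with greater_is_better=False, sklearn stores the negated RMSE, so the sign has to be flipped to read it as a plain RMSE.
# best_score_ holds -RMSE under greater_is_better=False
print(f'best CV RMSE: {-grid.best_score_:.4f}')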
1.3.4 Fixing some parameters & searching further
params = {
    'max_depth': 5
    ,'min_child_weight': 9
    ,'subsample': 1
    ,'colsample_bytree': 1
    ,'num_boost_round': 20
}
xgb_model = MyXGBoost(**params)
params_grid = {
    'gamma': [0, 0.1, 0.5]
    ,'lambda': [1, 1.5]
    ,'eta': [0.3, 0.5, 1]
}
grid = GridSearchCV(estimator=xgb_model
                    ,param_grid=params_grid
                    ,scoring=cv_score_fn
                    ,cv=3
                    ,n_jobs=-1)
grid.fit(x_train, y_train)
print(grid.best_params_)
'''
[17:04:15] WARNING: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\learner.cc:767:
Parameters: { "num_boost_round" } are not used.
{'eta': 0.3, 'gamma': 0, 'lambda': 1}
'''
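RandomizedSearchCV was imported alongside GridSearchCV above but never used; here is a sketch of the same second-stage search with random sampling instead of exhaustive enumeration (n_iter and random_state are my choices, not from the original):
rand = RandomizedSearchCV(estimator=MyXGBoost(**params)
                          ,param_distributions=params_grid
                          ,n_iter=10
                          ,scoring=cv_score_fn
                          ,cv=3
                          ,random_state=0
                          ,n_jobs=-1)
rand.fit(x_train, y_train)
print(rand.best_params_)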
1.4 Tuning with loops and cross-validation
import xgboost as xgb
params = {
    'colsample_bytree': 1,
    'max_depth': 7,
    'min_child_weight': 5,
    'subsample': 0.8
}
etas = [0.3, 0.5, 1]
num_boost_rounds = [10, 15, 20]
nfold = 3
best_eta, best_round = 0, 0
best_score = float('inf')
dtrain = xgb.DMatrix(x_train, y_train)
for eta in etas:
    for num_boost_round in num_boost_rounds:
        params.update({'eta': eta})
        cv_rounds = xgb.cv(params=params
                           ,dtrain=dtrain
                           ,num_boost_round=num_boost_round
                           ,nfold=nfold
                           ,early_stopping_rounds=10
                           )
        # xgb.cv returns per-round train-rmse-mean, train-rmse-std,
        # test-rmse-mean, test-rmse-std; take the last round's test-rmse-mean
        score = cv_rounds.iloc[-1, -2]
        if score < best_score:
            best_score = score
            best_eta = eta
            best_round = num_boost_round
print(f'eta: {best_eta}, num_boost_round: {best_round}')
'''
eta: 0.5, num_boost_round: 15
'''
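As a side note, the nested loops flatten naturally with itertools.product; a behavior-equivalent sketch that prints the CV score per combination:
from itertools import product
for eta, num_boost_round in product(etas, num_boost_rounds):
    params.update({'eta': eta})
    cv_rounds = xgb.cv(params=params, dtrain=dtrain,
                       num_boost_round=num_boost_round, nfold=nfold,
                       early_stopping_rounds=10)
    print(eta, num_boost_round, cv_rounds.iloc[-1, -2])  # test-rmse-mean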
2. Tuning with the sklearn API
2.1 Data
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
data = load_breast_cancer()
X = data.data
y = data.target
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=98)
print(Xtrain.shape, Xtest.shape)
'''
(455, 30) (114, 30)
'''
2.2 Grid search
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import numpy as np
def score_fn(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
# RMSE: the smaller the value, the better the fit
cv_score_fn = make_scorer(score_func=score_fn
                          ,greater_is_better=False)
from sklearn.model_selection import GridSearchCV
param_grid = {
    'learning_rate': [0.1, 0.2, 0.3]
    ,'n_estimators': [50, 60, 70]
    ,'max_depth': [4, 5, 6, 7]
    ,'min_child_weight': [0.6, 0.8, 1]
    ,'subsample': [0.8]
    ,'colsample_bytree': [0.8, 0.9]
}
xgbc = XGBClassifier(objective='binary:logistic')
cv = KFold(n_splits=5, shuffle=True, random_state=98)
grid = GridSearchCV(xgbc,
                    param_grid=param_grid,
                    cv=cv,
                    scoring=cv_score_fn,
                    n_jobs=-1)
grid.fit(Xtrain, ytrain)
grid.best_params_
'''
{'colsample_bytree': 0.8,
'learning_rate': 0.3,
'max_depth': 6,
'min_child_weight': 0.6,
'n_estimators': 70,
'subsample': 0.8}
'''
xgb_model = grid.best_estimator_
ypreds = xgb_model.predict(Xtest)
error = sum([1 for i in range(len(ypreds)) if int(ypreds[i] > 0.5) != ytest[i]]) / float(len(ypreds))
error
'''
0.03508771929824561
'''
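The same error rate can be read off sklearn's metric directly (XGBClassifier.predict already returns 0/1 labels):
from sklearn.metrics import accuracy_score
print(1 - accuracy_score(ytest, ypreds))  # error rate = 1 - accuracy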
2.3 Optuna
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import numpy as np
def score_fn(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
# RMSE: the smaller the value, the better the fit
cv_score_fn = make_scorer(score_func=score_fn
                          ,greater_is_better=False)
def objective(trial):
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [5, 20, 50, 100])
        ,'max_depth': trial.suggest_categorical('max_depth', [3, 5, 10, 20])
        ,'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.1, 0.3])
        ,'subsample': trial.suggest_categorical('subsample', [0.5, 0.8, 1])
        ,'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.8, 1])
    }
    # xgbr is the global regressor defined below, before study.optimize runs
    xgbr.set_params(**params)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = []
    for train_idx, test_idx in kf.split(Xtrain, ytrain):
        X_train_round, y_train_round = Xtrain[train_idx], ytrain[train_idx]
        X_test_round, y_test_round = Xtrain[test_idx], ytrain[test_idx]
        xgbr.fit(X_train_round, y_train_round)
        score = score_fn(y_test_round, xgbr.predict(X_test_round))
        scores.append(score)
    return np.mean(scores)
xgbr = XGBRegressor(random_state=42)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
best_params = study.best_params
# Refit on the full training split with the best parameters
xgbr.set_params(**best_params)
xgbr.fit(Xtrain, ytrain)
ypreds = xgbr.predict(Xtest)
# Regressor outputs are continuous, so threshold at 0.5 for the error rate
error = sum([1 for i in range(len(ypreds)) if int(ypreds[i] > 0.5) != ytest[i]]) / float(len(ypreds))
'''
[I 2023-08-13 00:24:31,687] A new study created in memory with name: no-name-55579fda-c6eb-44dd-af61-e942096f9e76
[I 2023-08-13 00:24:31,779] Trial 0 finished with value: 0.47942750589521343 and parameters: {'n_estimators': 5, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 1}. Best is trial 0 with value: 0.47942750589521343.
[I 2023-08-13 00:24:32,029] Trial 1 finished with value: 0.19397620629291293 and parameters: {'n_estimators': 20, 'max_depth': 20, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 1 with value: 0.19397620629291293.
[I 2023-08-13 00:24:32,101] Trial 2 finished with value: 0.19759044486258567 and parameters: {'n_estimators': 5, 'max_depth': 5, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 1 with value: 0.19397620629291293.
[I 2023-08-13 00:24:32,685] Trial 3 finished with value: 0.25242426092514947 and parameters: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.5, 'colsample_bytree': 0.8}. Best is trial 1 with value: 0.19397620629291293.
[I 2023-08-13 00:24:32,749] Trial 4 finished with value: 0.20931217440968805 and parameters: {'n_estimators': 5, 'max_depth': 20, 'learning_rate': 0.3, 'subsample': 1, 'colsample_bytree': 0.5}. Best is trial 1 with value: 0.19397620629291293.
[I 2023-08-13 00:24:32,897] Trial 5 finished with value: 0.2148347864283057 and parameters: {'n_estimators': 20, 'max_depth': 5, 'learning_rate': 0.3, 'subsample': 1, 'colsample_bytree': 1}. Best is trial 1 with value: 0.19397620629291293.
[I 2023-08-13 00:24:33,150] Trial 6 finished with value: 0.33977384268310445 and parameters: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.01, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 1 with value: 0.19397620629291293.
[I 2023-08-13 00:24:33,770] Trial 7 finished with value: 0.183987741509823 and parameters: {'n_estimators': 100, 'max_depth': 20, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 7 with value: 0.183987741509823.
[I 2023-08-13 00:24:33,834] Trial 8 finished with value: 0.33880373171511113 and parameters: {'n_estimators': 5, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 1, 'colsample_bytree': 1}. Best is trial 7 with value: 0.183987741509823.
[I 2023-08-13 00:24:34,177] Trial 9 finished with value: 0.18608609798364395 and parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.3, 'subsample': 1, 'colsample_bytree': 0.5}. Best is trial 7 with value: 0.183987741509823.
[I 2023-08-13 00:24:34,809] Trial 10 finished with value: 0.183987741509823 and parameters: {'n_estimators': 100, 'max_depth': 20, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 7 with value: 0.183987741509823.
[I 2023-08-13 00:24:35,432] Trial 11 finished with value: 0.183987741509823 and parameters: {'n_estimators': 100, 'max_depth': 20, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 7 with value: 0.183987741509823.
[I 2023-08-13 00:24:36,064] Trial 12 finished with value: 0.183987741509823 and parameters: {'n_estimators': 100, 'max_depth': 20, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 7 with value: 0.183987741509823.
[I 2023-08-13 00:24:36,696] Trial 13 finished with value: 0.183987741509823 and parameters: {'n_estimators': 100, 'max_depth': 20, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 7 with value: 0.183987741509823.
[I 2023-08-13 00:24:37,128] Trial 14 finished with value: 0.18313284638690286 and parameters: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 14 with value: 0.18313284638690286.
[I 2023-08-13 00:24:37,540] Trial 15 finished with value: 0.1817555147315062 and parameters: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 15 with value: 0.1817555147315062.
[I 2023-08-13 00:24:37,916] Trial 16 finished with value: 0.17797282645596724 and parameters: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.8}. Best is trial 16 with value: 0.17797282645596724.
[I 2023-08-13 00:24:38,277] Trial 17 finished with value: 0.17797282645596724 and parameters: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.8}. Best is trial 16 with value: 0.17797282645596724.
[I 2023-08-13 00:24:38,654] Trial 18 finished with value: 0.1796387060606775 and parameters: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 1}. Best is trial 16 with value: 0.17797282645596724.
[I 2023-08-13 00:24:38,994] Trial 19 finished with value: 0.1726048235818686 and parameters: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
[I 2023-08-13 00:24:39,194] Trial 20 finished with value: 0.17321528816321324 and parameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
[I 2023-08-13 00:24:39,390] Trial 21 finished with value: 0.17321528816321324 and parameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
[I 2023-08-13 00:24:39,590] Trial 22 finished with value: 0.17321528816321324 and parameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
[I 2023-08-13 00:24:39,793] Trial 23 finished with value: 0.17321528816321324 and parameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
...
[I 2023-08-13 00:24:44,512] Trial 46 finished with value: 0.17738508610056414 and parameters: {'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 1, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
[I 2023-08-13 00:24:44,671] Trial 47 finished with value: 0.18466580983642228 and parameters: {'n_estimators': 20, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
[I 2023-08-13 00:24:45,027] Trial 48 finished with value: 0.17881540967668666 and parameters: {'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 1}. Best is trial 19 with value: 0.1726048235818686.
[I 2023-08-13 00:24:45,607] Trial 49 finished with value: 0.17358566430619748 and parameters: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}. Best is trial 19 with value: 0.1726048235818686.
'''
print(best_params)
print(error)
print(study.best_trial)
'''
{'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}
0.07017543859649122
FrozenTrial(number=19, state=TrialState.COMPLETE, values=[0.1726048235818686], datetime_start=datetime.datetime(2023, 8, 13, 0, 24, 38, 654382), datetime_complete=datetime.datetime(2023, 8, 13, 0, 24, 38, 994747), params={'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': CategoricalDistribution(choices=(5, 20, 50, 100)), 'max_depth': CategoricalDistribution(choices=(3, 5, 10, 20)), 'learning_rate': CategoricalDistribution(choices=(0.01, 0.1, 0.3)), 'subsample': CategoricalDistribution(choices=(0.5, 0.8, 1)), 'colsample_bytree': CategoricalDistribution(choices=(0.5, 0.8, 1))}, trial_id=19, value=None)
'''
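The search space above is purely categorical; as a closing sketch (ranges are illustrative, my own choices, not from the original), optuna can also sample integer and continuous ranges, which typically explores the space more efficiently than a fixed grid:
def objective_v2(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 5, 100)
        ,'max_depth': trial.suggest_int('max_depth', 3, 20)
        ,'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        ,'subsample': trial.suggest_float('subsample', 0.5, 1.0)
        ,'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    xgbr.set_params(**params)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = []
    for train_idx, test_idx in kf.split(Xtrain, ytrain):
        xgbr.fit(Xtrain[train_idx], ytrain[train_idx])
        scores.append(score_fn(ytrain[test_idx], xgbr.predict(Xtrain[test_idx])))
    return np.mean(scores)
study2 = optuna.create_study(direction='minimize')
study2.optimize(objective_v2, n_trials=50)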