# First XGBoost model for Pima Indians dataset
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 77.95%
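A single train/test split gives only one estimate of accuracy. For a more stable estimate you could evaluate the same default model with stratified k-fold cross-validation; here is a minimal sketch (the 10-fold setup is my own addition, not part of the run above):

# evaluate the default model with stratified 10-fold cross-validation
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
X = dataset[:,0:8]
Y = dataset[:,8]
model = XGBClassifier()
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
# report mean and standard deviation of accuracy across the 10 folds
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100.0, results.std() * 100.0))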
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# fit model on training data, stopping early if the test-set log loss
# has not improved for 10 consecutive rounds
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
[0] validation_0-logloss:0.660186
Will train until validation_0-logloss hasn't improved in 10 rounds.
[1] validation_0-logloss:0.634854
[2] validation_0-logloss:0.612239
[3] validation_0-logloss:0.593118
[4] validation_0-logloss:0.578303
[5] validation_0-logloss:0.564942
[6] validation_0-logloss:0.555113
[7] validation_0-logloss:0.54499
[8] validation_0-logloss:0.539151
[9] validation_0-logloss:0.531819
[10] validation_0-logloss:0.526065
[11] validation_0-logloss:0.51977
[12] validation_0-logloss:0.514979
[13] validation_0-logloss:0.50927
[14] validation_0-logloss:0.506086
[15] validation_0-logloss:0.503565
[16] validation_0-logloss:0.503591
[17] validation_0-logloss:0.500805
[18] validation_0-logloss:0.497605
[19] validation_0-logloss:0.495328
[20] validation_0-logloss:0.494777
[21] validation_0-logloss:0.494274
[22] validation_0-logloss:0.493333
[23] validation_0-logloss:0.492211
[24] validation_0-logloss:0.491936
[25] validation_0-logloss:0.490578
[26] validation_0-logloss:0.490895
[27] validation_0-logloss:0.490646
[28] validation_0-logloss:0.491911
[29] validation_0-logloss:0.491407
[30] validation_0-logloss:0.488828
[31] validation_0-logloss:0.487867
[32] validation_0-logloss:0.487297
[33] validation_0-logloss:0.487562
[34] validation_0-logloss:0.487788
[35] validation_0-logloss:0.487962
[36] validation_0-logloss:0.488218
[37] validation_0-logloss:0.489582
[38] validation_0-logloss:0.489334
[39] validation_0-logloss:0.490969
[40] validation_0-logloss:0.48978
[41] validation_0-logloss:0.490704
[42] validation_0-logloss:0.492369
Stopping. Best iteration:
[32] validation_0-logloss:0.487297
Accuracy: 78.35%
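Note that recent XGBoost releases moved these arguments: early_stopping_rounds and eval_metric were deprecated as fit() keywords in 1.6 and removed in 2.0, where they belong to the constructor instead. A sketch of the same run under that newer API, reusing the split above (version-dependent; check your installed release):

# same early-stopping run for XGBoost >= 2.0, where early_stopping_rounds
# and eval_metric are constructor arguments rather than fit() arguments
model = XGBClassifier(early_stopping_rounds=10, eval_metric="logloss")
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)
y_pred = model.predict(X_test)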
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit model on all of the data
model = XGBClassifier()
model.fit(X, y)
# plot feature importance
plot_importance(model)
pyplot.show()
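plot_importance labels the features f0 through f7. To read the scores against the dataset's columns, you can print feature_importances_ directly; a small sketch continuing from the fitted model above (the column names are my own mapping of the usual Pima feature order, not read from the CSV):

# map raw importance scores to the (assumed) Pima column names
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
for name, score in sorted(zip(names, model.feature_importances_), key=lambda t: -t[1]):
    print("%s: %.4f" % (name, score))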
# Tune learning_rate
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# grid search over candidate learning rates
model = XGBClassifier()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, Y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))
Best: -0.483013 using {'learning_rate': 0.1}
-0.689650 with: {'learning_rate': 0.0001}
-0.661274 with: {'learning_rate': 0.001}
-0.530747 with: {'learning_rate': 0.01}
-0.483013 with: {'learning_rate': 0.1}
-0.515440 with: {'learning_rate': 0.2}
-0.557315 with: {'learning_rate': 0.3}
The main XGBoost parameters are usually tuned in this order:
1. learning rate
2. tree parameters (see the grid-search sketch after this list):
max_depth
min_child_weight
subsample, colsample_bytree
gamma
3. regularization parameters:
lambda
alpha
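As an example of step 2, here is a minimal grid-search sketch over max_depth and min_child_weight, in the same style as the learning-rate search above and reusing its X, Y, and kfold (the candidate values are illustrative choices, not tuned results):

# illustrative grid over two tree parameters, with the learning rate
# fixed at the best value found above
param_grid = dict(max_depth=[3, 5, 7, 9], min_child_weight=[1, 3, 5])
grid_search = GridSearchCV(XGBClassifier(learning_rate=0.1), param_grid,
                           scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, Y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))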
# a typical starting configuration before tuning
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
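With n_estimators set high (1000), a common next step is to let cross-validated early stopping pick the effective number of trees before tuning the remaining parameters. A sketch using xgboost's built-in cv routine, continuing from xgb1 and the X, Y above (the 5-fold setup and 50-round patience are my assumptions, not values from this post):

import xgboost as xgb
# cross-validated early stopping to choose n_estimators; fold count and
# patience below are illustrative choices
dtrain = xgb.DMatrix(X, label=Y)
cvresult = xgb.cv(xgb1.get_xgb_params(), dtrain,
                  num_boost_round=1000, nfold=5,
                  metrics='logloss', early_stopping_rounds=50)
# keep only as many trees as survived early stopping
xgb1.set_params(n_estimators=cvresult.shape[0])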