import xgboost as xgb
import numpy as np
import pickle
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error #混淆矩阵,均方误差
from sklearn.datasets import load_iris, load_digits, load_boston
#鸢尾花数据集,鸢尾花种类预测,属于分类,分类问题用混淆矩阵
#样本数据集,属于分类,分类问题用混淆矩阵
#波士顿房价数据集,线性回归,回归问题用MSE
# Shared random state: the same RandomState instance seeds every shuffled
# KFold in this script, so the folds are reproducible from run to run.
rng = np.random.RandomState(31337)

# Binary classification (digits 0 vs. 1), evaluated with a confusion matrix.
print("数字0和1的二分类问题")
# n_class is passed by keyword: scikit-learn made this parameter
# keyword-only, so the positional form load_digits(2) raises a TypeError
# on current releases.
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
# shuffle=True randomly reorders the samples before they are split into folds.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    # Fit a fresh default classifier on the training fold only.
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("混淆矩阵:")
    print(confusion_matrix(actuals, predictions))
'''
数字0和1的二分类问题
在2折数据上的交叉验证
[22:28:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:
[[87 0]
[ 1 92]]
[22:28:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:
[[91 0]
[ 2 87]]
'''
# Multi-class classification (iris species), evaluated with a confusion matrix.
print("\nIris: 多分类")
iris = load_iris()
y = iris['target']
X = iris['data']
# Reuses the script-level `rng`, so these folds depend on how much random
# state the earlier sections have already consumed.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    # Fit a fresh default classifier on the training fold only.
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("混淆矩阵:")
    print(confusion_matrix(actuals, predictions))
'''
Iris: 多分类
在2折数据上的交叉验证
[22:12:50] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:
[[24 0 0]
[ 0 26 1]
[ 0 4 20]]
[22:12:50] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:
[[26 0 0]
[ 0 20 3]
[ 0 1 25]]
'''
# 结论:鸢尾花数据集和样本分类集,除了数据源不一样,其他的套路都是一样的。所以此类问题,都可以通过此方法解决。
# Regression (Boston house prices), evaluated with mean squared error.
print("\n波士顿房价回归预测问题")
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; this section only runs on older releases. Switching to another dataset
# would change the reported numbers, so it is left as-is here.
boston = load_boston()
y = boston['target']
X = boston['data']
# Reuses the script-level `rng` shared with the classification sections.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    # Fit a fresh default regressor on the training fold only.
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("MSE:", mean_squared_error(actuals, predictions))
'''
波士顿房价回归预测问题
在2折数据上的交叉验证
MSE: 21.88594990885867
MSE: 14.807648754688827
'''
# 网格搜索查找最优超参数
# Tuning recipe for the second training style: a scikit-learn interface
# regressor combined with GridSearchCV.
print("参数最优化:")
X, y = boston['data'], boston['target']
xgb_model = xgb.XGBRegressor()
# Search grid: three tree depths crossed with three ensemble sizes.
param_dict = dict(max_depth=[2, 4, 6], n_estimators=[50, 100, 200])
# verbose controls log output ("the higher, the more messages"):
#   0 = silent, 1 = condensed progress, 2 = more detailed, ...
# Level 2 is already quite chatty, and too much logging slows the run down.
clf = GridSearchCV(estimator=xgb_model, param_grid=param_dict, verbose=1)
clf.fit(X, y)
# Best cross-validated score first, then the parameters that achieved it.
print(clf.best_score_, clf.best_params_, sep="\n")
'''
参数最优化:
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.6839859272017424
{'max_depth': 2, 'n_estimators': 100}
'''
# early-stopping 早停:
# Tuning recipe usable with either training style: early stopping.
# The model is fitted on the training split one tree at a time while the
# validation split is scored; once the validation metric stops improving,
# no further trees are added.
X, y = digits['data'], digits['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
# NOTE(review): newer XGBoost releases expect early_stopping_rounds and
# eval_metric on the estimator constructor rather than on fit() -- confirm
# against the installed version before upgrading.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    early_stopping_rounds=10,
)
'''
输出结果
[0] validation_0-auc:0.99950
[1] validation_0-auc:0.99975
[2] validation_0-auc:0.99975
[3] validation_0-auc:0.99975
[4] validation_0-auc:0.99975
[5] validation_0-auc:0.99975
[6] validation_0-auc:1.00000
[7] validation_0-auc:1.00000
[8] validation_0-auc:1.00000
[9] validation_0-auc:1.00000
[10] validation_0-auc:1.00000
[11] validation_0-auc:1.00000
[12] validation_0-auc:1.00000
[13] validation_0-auc:1.00000
[14] validation_0-auc:1.00000
[15] validation_0-auc:1.00000
[16] validation_0-auc:1.00000
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
'''
# 特征重要度
# Train a default classifier on the full iris dataset so that its
# per-feature importances can be inspected.
iris = load_iris()
X, y = iris['data'], iris['target']
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X, y)
# Bare expression kept from the notebook cell: it displays the importance
# array in an interactive session and prints nothing when run as a script.
xgb_model.feature_importances_
'''
[22:47:36] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0,
the default evaluation metric used with the objective 'multi:softprob' was changed from
'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
array([0.00959796, 0.01645038, 0.67658573, 0.29736587], dtype=float32)
'''
# Rank the iris features by the fitted model's importances and plot them.
print('特征排序:')
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
feature_importances = xgb_model.feature_importances_
# argsort is ascending; reversing it yields indices from most to least important.
indices = np.argsort(feature_importances)[::-1]
for index in indices:
    print("特征 %s 重要度为 %f" % (feature_names[index], feature_importances[index]))

# The notebook's `%matplotlib inline` line is an IPython magic, not Python
# syntax, so it is left out here; run it in a notebook cell if inline
# rendering needs to be enabled explicitly.
import matplotlib.pyplot as plt

plt.figure(figsize=(16, 8))
plt.title("feature importances")
# Bars and tick labels both follow the descending-importance order.
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
# Explicit show() so the figure is rendered when run outside a notebook.
plt.show()
'''
特征排序:
特征 petal_length 重要度为 0.676586
特征 petal_width 重要度为 0.297366
特征 sepal_width 重要度为 0.016450
特征 sepal_length 重要度为 0.009598
([<matplotlib.axis.XTick at 0x16e8bc09820>,
<matplotlib.axis.XTick at 0x16e8bc09850>,
<matplotlib.axis.XTick at 0x16e8bcc6a30>,
<matplotlib.axis.XTick at 0x16e8c015bb0>],
[Text(0, 0, 'petal_length'),
Text(0, 0, 'petal_width'),
Text(0, 0, 'sepal_width'),
Text(0, 0, 'sepal_length')])
'''
# XGBoost 的 early_stopping
# - 在训练集上学习模型,一棵一棵树添加,在验证集上看效果,当验证集效果不再提升,停止树的添加与生长
# - 通过指定 early_stopping 可以避免过拟合
# 通过 XGBoost 可以计算特征的重要程度
混淆矩阵: