XGBoost建模，sklearn评估：分类问题用混淆矩阵，回归问题用MSE

最新推荐文章于 2024-07-16 12:38:33 发布

缘源园

最新推荐文章于 2024-07-16 12:38:33 发布

阅读量4.5k

点赞数 2

分类专栏：数据分析　文章标签：python、机器学习、数据挖掘、数据分析、深度学习

本文链接：https://blog.csdn.net/weixin_48135624/article/details/115102615

版权

数据分析专栏收录该内容

54 篇文章 8 订阅

订阅专栏

import xgboost as xgb
import numpy as np
import pickle
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error #混淆矩阵，均方误差
from sklearn.datasets import load_iris, load_digits, load_boston 
#鸢尾花数据集（load_iris）：鸢尾花种类预测，属于多分类，分类问题用混淆矩阵评估
#手写数字数据集（load_digits）：属于分类，分类问题用混淆矩阵评估
#波士顿房价数据集（load_boston）：房价预测，属于回归，回归问题用MSE评估

rng = np.random.RandomState(31337)

# Binary classification (digits 0 vs 1), evaluated with a confusion matrix.
print("数字0和1的二分类问题")
# n_class is passed by keyword: scikit-learn deprecated positional use of this
# argument and newer releases reject it, while the keyword form works on both.
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
# shuffle=True randomly permutes the samples before they are split into folds.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    # Fit on the training fold, then compare predictions on the held-out fold
    # with the true labels.
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("混淆矩阵:")
    print(confusion_matrix(actuals, predictions))
'''
数字0和1的二分类问题
在2折数据上的交叉验证
[22:28:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:
[[87  0]
 [ 1 92]]
[22:28:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:

[[91  0]
 [ 2 87]]
'''

# Multi-class classification on the iris data, evaluated with a confusion matrix.
print("\nIris: 多分类")
iris = load_iris()
X, y = iris['data'], iris['target']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    # Train a fresh classifier on this fold's training rows.
    xgb_model = xgb.XGBClassifier()
    xgb_model = xgb_model.fit(X[train_index], y[train_index])
    # Score the held-out rows against their true labels.
    actuals = y[test_index]
    predictions = xgb_model.predict(X[test_index])
    print("混淆矩阵:")
    print(confusion_matrix(actuals, predictions))
'''
Iris: 多分类
在2折数据上的交叉验证
[22:12:50] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:
[[24  0  0]
 [ 0 26  1]
 [ 0  4 20]]
[22:12:50] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
混淆矩阵:
[[26  0  0]
 [ 0 20  3]
 [ 0  1 25]]

'''

结论：鸢尾花数据集和手写数字数据集除了数据源不同，建模与评估的流程完全一致，因此这类分类问题都可以用同样的方法解决。

# Regression on the Boston housing data, evaluated with mean squared error.
print("\n波士顿房价回归预测问题")
# NOTE(review): load_boston is reported as removed from newer scikit-learn
# releases — confirm the installed version still provides it.
boston = load_boston()
X, y = boston['data'], boston['target']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    # Train a fresh regressor on this fold's training rows.
    xgb_model = xgb.XGBRegressor()
    xgb_model = xgb_model.fit(X[train_index], y[train_index])
    # Compare held-out targets with the model's predictions.
    actuals = y[test_index]
    predictions = xgb_model.predict(X[test_index])
    print("MSE:", mean_squared_error(actuals, predictions))
'''
波士顿房价回归预测问题
在2折数据上的交叉验证
MSE: 21.88594990885867
MSE: 14.807648754688827
'''

网格搜索查找最优超参数

# Hyper-parameter tuning for the sklearn-style regressor: grid search with
# GridSearchCV over tree depth and number of trees.
print("参数最优化：")
X, y = boston['data'], boston['target']
xgb_model = xgb.XGBRegressor()
param_dict = {
    'max_depth': [2, 4, 6],
    'n_estimators': [50, 100, 200],
}

# verbose=1 asks GridSearchCV for its short progress log.
clf = GridSearchCV(estimator=xgb_model, param_grid=param_dict, verbose=1)
'''
verbose是控制日志输出的, 'controls the verbosity: the higher, the more messages'
verbose = 0没有输出；verbose = 1 简化版日志输出；verbose=2 更细致的日志输出...
一般设置到2就很多信息了，日志输出太多会影响运行的速度。
'''
clf.fit(X, y)
# Report the best cross-validated score and the parameters that achieved it.
print(clf.best_score_)
print(clf.best_params_)
'''
参数最优化：
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.6839859272017424
{'max_depth': 2, 'n_estimators': 100}

'''

early-stopping 早停:

# Tuning via early stopping: trees are added one at a time while the metric is
# tracked on a validation set; training stops once that metric no longer
# improves.
# NOTE(review): newer XGBoost releases appear to expect early_stopping_rounds
# and eval_metric on the estimator constructor rather than fit() — confirm
# against the installed version before upgrading.
X, y = digits['data'], digits['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    early_stopping_rounds=10,
)

输出结果
[0]	validation_0-auc:0.99950
[1]	validation_0-auc:0.99975
[2]	validation_0-auc:0.99975
[3]	validation_0-auc:0.99975
[4]	validation_0-auc:0.99975
[5]	validation_0-auc:0.99975
[6]	validation_0-auc:1.00000
[7]	validation_0-auc:1.00000
[8]	validation_0-auc:1.00000
[9]	validation_0-auc:1.00000
[10]	validation_0-auc:1.00000
[11]	validation_0-auc:1.00000
[12]	validation_0-auc:1.00000
[13]	validation_0-auc:1.00000
[14]	validation_0-auc:1.00000
[15]	validation_0-auc:1.00000
[16]	validation_0-auc:1.00000
'''
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


'''

特征重要度

# Fit a classifier on the full iris data and look at its per-feature
# importance scores.
iris = load_iris()
X, y = iris['data'], iris['target']
xgb_model = xgb.XGBClassifier()
xgb_model = xgb_model.fit(X, y)
# Bare expression: its value is only shown in an interactive/notebook session.
xgb_model.feature_importances_
'''
[22:47:36] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, 
the default evaluation metric used with the objective 'multi:softprob' was changed from 
'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
array([0.00959796, 0.01645038, 0.67658573, 0.29736587], dtype=float32)
'''

# Print every feature with its importance score, highest score first.
print('特征排序：')
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
feature_importances = xgb_model.feature_importances_

# argsort is ascending; reversing it gives a descending ranking.
indices = np.argsort(feature_importances)[::-1]

for index in indices:
    name, score = feature_names[index], feature_importances[index]
    print("特征 %s 重要度为 %f" % (name, score))

%matplotlib inline
import matplotlib.pyplot as plt
# Bar chart of the importance scores, drawn in the order given by `indices`,
# with the matching feature names as x tick labels.
plt.figure(figsize=(16, 8))
plt.title("feature importances")
plt.bar(
    range(len(feature_importances)),
    feature_importances[indices],
    color='b',
)
plt.xticks(
    range(len(feature_importances)),
    np.array(feature_names)[indices],
    color='b',
)
'''
特征排序：
特征 petal_length 重要度为 0.676586
特征 petal_width 重要度为 0.297366
特征 sepal_width 重要度为 0.016450
特征 sepal_length 重要度为 0.009598
([<matplotlib.axis.XTick at 0x16e8bc09820>,
  <matplotlib.axis.XTick at 0x16e8bc09850>,
  <matplotlib.axis.XTick at 0x16e8bcc6a30>,
  <matplotlib.axis.XTick at 0x16e8c015bb0>],
 [Text(0, 0, 'petal_length'),
  Text(0, 0, 'petal_width'),
  Text(0, 0, 'sepal_width'),
  Text(0, 0, 'sepal_length')])

'''