XGBoost for modeling, sklearn for evaluation: classification problems use a confusion matrix, regression problems use MSE.

import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error  # confusion matrix, mean squared error
from sklearn.datasets import load_iris, load_digits, load_boston
# load_digits: handwritten digit images -- classification, evaluated with a confusion matrix
# load_iris: iris species prediction -- classification, evaluated with a confusion matrix
# load_boston: Boston housing prices -- regression, evaluated with MSE
#   (note: load_boston was removed in scikit-learn 1.2; see the California housing sketch below)
rng = np.random.RandomState(31337)

# Binary classification: confusion matrix
print("Binary classification on digits 0 and 1")
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)  # shuffle=True randomly permutes the samples before splitting
print("2-fold cross-validation")
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("混淆矩阵:")
    print(confusion_matrix(actuals, predictions))
'''
Binary classification on digits 0 and 1
2-fold cross-validation
[22:28:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Confusion matrix:
[[87  0]
 [ 1 92]]
[22:28:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Confusion matrix:
[[91  0]
 [ 2 87]]
'''
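As an aside (not in the original post), the 2x2 confusion matrix unpacks directly into the usual metrics; a minimal sketch reusing actuals/predictions from the last fold above:

# Minimal sketch: derive accuracy/precision/recall from the binary confusion matrix.
tn, fp, fn, tp = confusion_matrix(actuals, predictions).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print("accuracy=%.4f  precision=%.4f  recall=%.4f" % (accuracy, precision, recall))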
# Multiclass classification: confusion matrix
print("\nIris: multiclass classification")
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("混淆矩阵:")
    print(confusion_matrix(actuals, predictions))
'''
Iris: multiclass classification
2-fold cross-validation
[22:12:50] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Confusion matrix:
[[24  0  0]
 [ 0 26  1]
 [ 0  4 20]]
[22:12:50] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Confusion matrix:
[[26  0  0]
 [ 0 20  3]
 [ 0  1 25]]
'''
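Because the sklearn wrapper is a standard estimator, the manual KFold loop above can also be collapsed into a single cross_val_score call; a minimal sketch (not from the original post):

# Minimal sketch: the same 2-fold evaluation in one call.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb.XGBClassifier(), X, y,
                         cv=KFold(n_splits=2, shuffle=True, random_state=rng))
print("fold accuracies:", scores)  # default scoring for classifiers is accuracy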

Takeaway: the digits and iris examples differ only in the data source; the modeling and evaluation routine is identical, so any classification problem of this shape can be handled the same way.

# Regression: MSE
print("\nBoston housing price regression")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("MSE:",mean_squared_error(actuals, predictions))
'''
Boston housing price regression
2-fold cross-validation
MSE: 21.88594990885867
MSE: 14.807648754688827
'''
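Note that load_boston was removed in scikit-learn 1.2. On a recent scikit-learn the same experiment can be reproduced with the California housing dataset, the usual stand-in; a minimal sketch (not from the original post):

# Minimal sketch: the same regression loop on California housing (scikit-learn >= 1.2).
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()  # downloaded on first call
Xc, yc = housing.data, housing.target
for train_index, test_index in KFold(n_splits=2, shuffle=True, random_state=rng).split(Xc):
    model = xgb.XGBRegressor().fit(Xc[train_index], yc[train_index])
    print("MSE:", mean_squared_error(yc[test_index], model.predict(Xc[test_index])))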

Grid search for the best hyperparameters

# Tuning approach for the sklearn-style API: regressor + GridSearchCV
print("Hyperparameter optimization:")
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
param_dict = {'max_depth': [2,4,6],
              'n_estimators': [50,100,200]}

clf = GridSearchCV(xgb_model, param_dict, verbose=1)
'''
verbose controls logging: "controls the verbosity: the higher, the more messages".
verbose=0: no output; verbose=1: brief progress log; verbose=2: more detailed log; and so on.
2 is usually plenty; overly verbose logging slows the run down.
'''
clf.fit(X,y)
print(clf.best_score_)
print(clf.best_params_)
'''
Hyperparameter optimization:
Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.6839859272017424
{'max_depth': 2, 'n_estimators': 100}
'''
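With the default refit=True, GridSearchCV retrains the winning configuration on all of X, y and exposes it directly; a minimal usage sketch (not from the original post):

# Minimal sketch: use the refit best model for prediction.
best_model = clf.best_estimator_  # XGBRegressor refit with max_depth=2, n_estimators=100
print(best_model.predict(X[:5]))  # clf.predict(X[:5]) is equivalent: GridSearchCV delegates to best_estimator_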

Early stopping:

# Tuning approach that works with both training APIs: early stopping
# Fit on the training set, adding trees one by one; monitor a validation set and
# stop adding trees as soon as the validation metric stops improving.
X = digits['data']
y = digits['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
        eval_set=[(X_val, y_val)])

Output:
'''
[0]	validation_0-auc:0.99950
[1]	validation_0-auc:0.99975
[2]	validation_0-auc:0.99975
[3]	validation_0-auc:0.99975
[4]	validation_0-auc:0.99975
[5]	validation_0-auc:0.99975
[6]	validation_0-auc:1.00000
[7]	validation_0-auc:1.00000
[8]	validation_0-auc:1.00000
[9]	validation_0-auc:1.00000
[10]	validation_0-auc:1.00000
[11]	validation_0-auc:1.00000
[12]	validation_0-auc:1.00000
[13]	validation_0-auc:1.00000
[14]	validation_0-auc:1.00000
[15]	validation_0-auc:1.00000
[16]	validation_0-auc:1.00000
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
'''
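Training stops at round 16 because the best validation AUC first appears at round 6 and early_stopping_rounds=10 more rounds pass without improvement. Also note that passing early_stopping_rounds to fit matches the XGBoost 1.x API used here; from XGBoost 1.6 it is deprecated in fit in favor of constructor arguments (and rejected in recent releases). A minimal sketch of the newer form, assuming XGBoost >= 1.6:

# Minimal sketch (XGBoost >= 1.6): early stopping configured on the estimator.
clf = xgb.XGBClassifier(n_estimators=100, early_stopping_rounds=10, eval_metric="auc")
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
print("best iteration:", clf.best_iteration)  # round with the best validation AUC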

Feature importance

iris = load_iris()
y = iris['target']
X = iris['data']
xgb_model = xgb.XGBClassifier().fit(X,y)
xgb_model.feature_importances_
'''
[22:47:36] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, 
the default evaluation metric used with the objective 'multi:softprob' was changed from 
'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
array([0.00959796, 0.01645038, 0.67658573, 0.29736587], dtype=float32)
'''
print('Feature ranking:')
feature_names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
feature_importances = xgb_model.feature_importances_

indices = np.argsort(feature_importances)[::-1]

for index in indices:
    print("特征 %s 重要度为 %f" %(feature_names[index], feature_importances[index]))

%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(16,8))
plt.title("feature importances")
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
'''
Feature ranking:
Feature petal_length has importance 0.676586
Feature petal_width has importance 0.297366
Feature sepal_width has importance 0.016450
Feature sepal_length has importance 0.009598
([<matplotlib.axis.XTick at 0x16e8bc09820>,
  <matplotlib.axis.XTick at 0x16e8bc09850>,
  <matplotlib.axis.XTick at 0x16e8bcc6a30>,
  <matplotlib.axis.XTick at 0x16e8c015bb0>],
 [Text(0, 0, 'petal_length'),
  Text(0, 0, 'petal_width'),
  Text(0, 0, 'sepal_width'),
  Text(0, 0, 'sepal_length')])
'''
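xgboost also ships a built-in plotting helper that reads importances straight off the trained booster, so the manual bar chart above is optional; a minimal sketch (not from the original post):

# Minimal sketch: xgboost's bundled importance plot.
# importance_type can be "weight", "gain", or "cover"; the model repr above shows
# the sklearn wrapper's feature_importances_ uses importance_type='gain'.
xgb.plot_importance(xgb_model, importance_type="gain")
plt.show()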

 

XGBoost early stopping

  • Fit on the training set, adding trees one by one; monitor the validation set and stop adding trees once the validation metric no longer improves

  • Specifying early stopping helps guard against overfitting

XGBoost can also report how important each feature is

 

 

