sklearn中XGBoost算法介绍和XGBClassifier方法的代码实现
xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, objective='binary:logistic',
booster='gbtree', tree_method='auto',n_jobs=1, gpu_id=-1, gamma=0,min_child_weight=1,
max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,colsample_bynode=1,
reg_alpha=0,reg_lambda=1,scale_pos_weight=1,base_score=0.5,random_state=0,missing=None,**kwargs)
参数:
代码:
# 导入库
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from xgboost import plot_importance
### Load the handwritten-digits dataset
digits = datasets.load_digits()

### Inspect the data dimensions
print(digits.data.shape)
print(digits.target.shape)

### Split into train / test sets (70% / 30%)
x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.3, random_state=33
)
### Train the model
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,           # number of boosting rounds (trees)
    max_depth=6,                # maximum depth of each tree
    min_child_weight=1,         # minimum sum of instance weight in a leaf
    gamma=0.,                   # minimum loss reduction required to split
    subsample=0.8,              # use 80% of rows per tree
    colsample_bytree=0.8,       # use 80% of features per tree
                                # (fixed: was misspelled `colsample_btree`,
                                # which xgboost silently ignores, so the
                                # intended column subsampling never happened)
    objective='multi:softmax',  # multi-class classification objective
    scale_pos_weight=1,         # class-imbalance weight (relevant for binary tasks)
    random_state=27             # seed for reproducibility
)
# Fit with early stopping monitored on the held-out test set
model.fit(x_train, y_train,
          eval_set=[(x_test, y_test)],
          eval_metric="mlogloss",
          early_stopping_rounds=10,
          verbose=True)
### Plot feature importances of the fitted model
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, height=0.5, ax=ax, max_num_features=64)
plt.show()

### Predict on the held-out set
y_pred = model.predict(x_test)

### Report classification accuracy
accuracy = accuracy_score(y_test, y_pred)
print("准确率: %.2f%%" % (accuracy * 100.0))
输出结果:
(1797, 64) #数据总共1797条,每条64个变量
(1797,) #目标值个数对应数据条数
准确率: 95.37%
xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA, prediction = FALSE, showsd = TRUE,
metrics = list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = TRUE,
print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...)
使用交叉验证获取最好的参数:
import xgboost as xgb

# Extract the booster parameters from the fitted sklearn wrapper.
xgb_param = model.get_xgb_params()
# Fix: the native xgb.cv API requires `num_class` for the
# `multi:softmax` objective. The sklearn wrapper sets it internally,
# but get_xgb_params() does not include it, so xgb.cv would fail on
# this multi-class problem without it.
xgb_param['num_class'] = len(set(y_train))

xgtrain = xgb.DMatrix(x_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=5,
                  metrics=['mlogloss'], early_stopping_rounds=50,
                  stratified=True, seed=1301)
# Number of boosting rounds that survived early stopping
print('Best number of trees = {}'.format(cvresult.shape[0]))
# Store the cross-validated tree count on the estimator
model.set_params(n_estimators=cvresult.shape[0])
# Re-evaluate with the tuned number of trees.
# Fix: set_params() only changes the hyper-parameter on the estimator;
# the already-fitted booster is unchanged until fit() is called again.
# Without this refit, the predictions below would still come from the
# old 100-tree model and the tuning step would have no effect.
model.fit(x_train, y_train)

### Plot feature importances of the refitted model
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, height=0.5, ax=ax, max_num_features=64)
plt.show()

### Predict on the held-out set
y_pred = model.predict(x_test)

### Report classification accuracy
accuracy = accuracy_score(y_test, y_pred)
print("准确率: %.2f%%" % (accuracy * 100.0))