在下面我将逐渐补充XGBoost学习过程中使用的资料等内容。
官方资料
链接中是Python API源代码以及C++源代码。
官方的说明在这里,其中Python的样例以及API说明、参数调节教程比较好。
比较好的博客资料
一些样例代码
读入数据的方式有多种,比如 load_svmlight_file() 以及 DMatrix(),这两种方式的效果是一样的。模型也有多种使用方法,比如 xgboost.train() 和 xgboost.XGBClassifier().fit()。其中 load_svmlight_file() 是 scikit-learn(sklearn.datasets)提供的数据读取函数,xgboost.XGBClassifier().fit() 是 xgboost 的 Scikit-Learn Wrapper 接口,二者通常搭配使用;另外两种(DMatrix() 和 xgboost.train())是 xgboost 原生的 Python 接口。
下面是对比两种包装器以及两种数据加载方法的代码,比较乱,可以忽略。
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets,svm
from sklearn.datasets import load_svmlight_file
# Load the iris dataset; only its first two feature columns are used below.
iris = datasets.load_iris()

# The same samples are bound under three pairs of names: (X_train, y_train)
# is what the XGBClassifier wrapper is fitted on, while (train_X, train_Y) and
# (test_X, test_Y) are what the DMatrix objects are built from.  The "test"
# data is the training data itself, so every score computed on it is a
# training-set score, not a held-out one.
X_train, y_train = iris.data[:, :2], iris.target
train_X, train_Y = iris.data[:, :2], iris.target
test_X, test_Y = iris.data[:, :2], iris.target
### first xgboost: scikit-learn style wrapper (XGBClassifier) on in-memory arrays
xgm = xgb.XGBClassifier()
xgm.fit(X_train, y_train)
# Predict on the very data the model was fitted on, so the accuracy printed
# below is a training accuracy.
y_pred = xgm.predict(X_train)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_train, predictions)
# Function-call form of print: valid on both Python 2 and Python 3, and
# consistent with the other print(...) calls in this script.
print(xgm)
print("XGBoost Accuracy: %.2f%%" % (accuracy * 100.0))
### second xgboost: native API (DMatrix + xgb.train) on the same arrays
xg_train = xgb.DMatrix(train_X, label=train_Y)
xg_test = xgb.DMatrix(test_X, label=test_Y)


def _error_rate(predicted, expected):
    """Return the fraction of entries whose int-cast prediction differs from its label."""
    mismatches = sum(int(p) != t for p, t in zip(predicted, expected))
    return mismatches / float(len(expected))


# setup parameters for xgboost: softmax multi-class classification over the
# 3 iris classes, with eta = 0.1 and trees limited to depth 3
param = {
    'objective': 'multi:softmax',
    'eta': 0.1,
    'max_depth': 3,
    'silent': True,
    'nthread': -1,
    'num_class': 3,
}

# Both DMatrix objects are handed to xgb.train() as named evaluation sets.
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 100
bst = xgb.train(param, xg_train, num_round, watchlist)

# get prediction
pred = bst.predict(xg_test)
print('predicting, classification error=%f' % _error_rate(pred, test_Y))

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
n_rows = test_Y.shape[0]
yprob = bst.predict(xg_test).reshape(n_rows, 3)
# The predicted label is the class with the highest probability in each row.
ylabel = np.argmax(yprob, axis=1)
print('predicting, classification error=%f' % _error_rate(ylabel, test_Y))
## compare two data input method.
# Here the svmlight/libsvm file is parsed by scikit-learn's
# load_svmlight_file() and the result is fed to the XGBClassifier wrapper.
X_train, y_train=load_svmlight_file('/Users/AureDi/Desktop/heart_scale') # loading datasets in the svmlight/libsvm format.
xgm = xgb.XGBClassifier(max_depth=3)
xgm.fit(X_train, y_train)
# Predictions are made on the data used for fitting, so the accuracy printed
# below is a training accuracy.
y_pred = xgm.predict(X_train)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_train, predictions)
# Function-call form of print: valid on both Python 2 and Python 3, and
# consistent with the other print(...) calls in this script.
print(xgm)
print("XGBoost Accuracy: %.2f%%" % (accuracy * 100.0))
# Let xgboost read the svmlight/libsvm file directly into DMatrix objects;
# the same file serves as both the training and the evaluation data.
heart_scale_path = '/Users/AureDi/Desktop/heart_scale'
xg_train = xgb.DMatrix(heart_scale_path)
xg_test = xgb.DMatrix(heart_scale_path)

# setup parameters for xgboost: binary logistic objective, with the remaining
# settings spelled out explicitly instead of being left to defaults.
# NOTE(review): 'missing' and 'n_estimators' look like XGBClassifier
# constructor arguments mirrored here, and 'num_class' is set although the
# objective is binary -- confirm whether xgb.train() uses them.  The number
# of boosting rounds actually passed to xgb.train() is num_round.
param = {
    'objective': 'binary:logistic',  # 'multi:softprob' #
    'learning_rate': 0.1,
    'max_depth': 3,
    'silent': True,
    'nthread': -1,
    'num_class': 1,
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'max_delta_step': 0,
    'min_child_weight': 1,
    'missing': None,
    'n_estimators': 100,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': 0,
    'subsample': 1,
}

# Both DMatrix objects are handed to xgb.train() as named evaluation sets.
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 100
bst = xgb.train(param, xg_train, num_round, watchlist)
其中 xgboost.XGBClassifier() 利用构造函数的参数设置模型参数,xgboost.train() 则利用 param 字典(dict)设置模型参数。