XGBoost有两种建模训练的方法:使用xgboost原生库(xgb.train),或使用sklearn风格的接口(XGBClassifier / XGBRegressor)
分类问题
1、使用xgboost原生库进行训练
import xgboost as xgb
from sklearn.metrics import accuracy_score
# Train a binary classifier with the native XGBoost API and report the
# training accuracy, then plot feature importance.
from matplotlib import pyplot  # needed for pyplot.show() below
from xgboost import plot_importance  # feature-importance plot helper

# Wrap features and labels in XGBoost's native DMatrix container.
dtrain = xgb.DMatrix(f_train, label=l_train)
# The held-out split is wrapped the same way; it is not used further in this snippet.
dtest = xgb.DMatrix(f_test, label=l_test)

# 'verbosity' replaces the deprecated 'silent' flag; silent=0 left the
# logging level at its default, which is verbosity=1.
param = {
    'max_depth': 2,
    'eta': 1,
    'verbosity': 1,
    'objective': 'binary:logistic',
}
num_round = 2  # number of boosting rounds
bst = xgb.train(param, dtrain, num_round)

# binary:logistic outputs probabilities; threshold them at 0.5 to get 0/1 labels.
train_preds = bst.predict(dtrain)
train_predictions = (train_preds > 0.5).astype(int)

# Compare the thresholded predictions against the true labels with sklearn.
train_accuracy = accuracy_score(l_train, train_predictions)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

# Draw and display the feature-importance chart for the trained booster.
plot_importance(bst)
pyplot.show()
2、使用XGBClassifier进行训练
# 未设定早停止, 未进行矩阵变换
from xgboost import XGBClassifier
from sklearn.datasets import load_svmlight_file #用于直接读取svmlight文件形式, 否则就需要使用xgboost.DMatrix(文件名)来读取这种格式的文件
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
# Train a binary classifier through the sklearn-style XGBClassifier wrapper,
# report train/test accuracy, then plot feature importance.
from xgboost import plot_importance  # feature-importance plot helper

num_round = 100  # number of boosted trees (n_estimators)

# verbosity=0 replaces the deprecated silent=True flag.
bst1 = XGBClassifier(
    max_depth=2,
    learning_rate=1,
    n_estimators=num_round,
    verbosity=0,
    objective='binary:logistic',
)
bst1.fit(f_train, l_train)

# Accuracy on the training split.
train_preds = bst1.predict(f_train)
train_accuracy = accuracy_score(l_train, train_preds)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

# Accuracy on the held-out split.
preds = bst1.predict(f_test)
test_accuracy = accuracy_score(l_test, preds)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

# Draw and display the feature-importance chart for the fitted model.
plot_importance(bst1)
pyplot.show()
当把参数字典param传给XGBClassifier(或XGBRegressor)构建模型时,param前要加**进行解包,例如XGBClassifier(**param),不然会报错
参考:XGBRegressor 参数调优
回归问题
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np
# 5-fold cross-validation of an XGBRegressor on the PD == 1 rows,
# reporting the mean RMSE across folds.

SEED = 111  # single seed shared by the fold splitter and the model

# random_state only takes effect when shuffle=True; recent scikit-learn
# raises ValueError if random_state is given while shuffle is False.
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)

param = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.22,
    'max_depth': 9,
    'min_child_weight': 1.5,
    'gamma': 3.12,
    'subsample': 1,
    'colsample_bytree': 0.87,
}
num_round = 20  # number of boosted trees (n_estimators)

# Keep only the defaulted samples (rows where PD == 1).
train_Default = train[train.PD == 1]
test_Default = test[test.PD == 1]
X_train = train_Default[feature_columns]
y_train = train_Default[['EAD.']]
X_test = test_Default[feature_columns]
y_test = test_Default[['EAD.']]

rmse_Score = []
for train_index, test_index in folds.split(X_train, y_train):
    # Slice the training data into this fold's fit / validation parts.
    fold_xtrain, fold_ytrain = X_train.values[train_index], y_train.values[train_index]
    fold_xtest, fold_ytest = X_train.values[test_index], y_train.values[test_index]

    # Build a fresh model per fold so no state leaks between folds.
    bst = XGBRegressor(**param, n_estimators=num_round, random_state=SEED)
    bst.fit(fold_xtrain, fold_ytrain)  # train
    preds = bst.predict(fold_xtest)  # predict on the validation part

    rmse = np.sqrt(mean_squared_error(fold_ytest, preds))
    rmse_Score.append(rmse)

# Average the per-fold RMSE values and report to four decimals.
RMSE = round(np.mean(rmse_Score), 4)
print('RMSE = %.4f' % RMSE)