xgboost是提升树方法的一种,算法由GBDT改进而来,在训练时支持并行计算,速度更快。xgboost库提供了与sklearn接口兼容的分类模型(XGBClassifier)和回归模型(XGBRegressor),本文对二分类问题采用xgboost进行训练。
一、数据准备
1、样本
- 正样本:1.5万(约15,000条)
- 负样本:10万(约100,000条)
- 5个特征
2、分训练集和测试集
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the sample table from disk.
df = pd.read_csv('data.csv')
# Label: column 0, selected by position as a 1-D Series.
# `DataFrame.ix` was removed in pandas 1.0; `iloc` is the positional indexer.
label = df.iloc[:, 0]
# Features: columns 1 through 5 (the five feature columns), selected by position.
features = df.iloc[:, 1:6]
# Split into training and test sets: 20% held out for testing,
# with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=3)
二、训练模型
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn import metrics
# Configure the XGBoost classifier for the binary classification task.
model = XGBClassifier(
    learning_rate=0.01,
    n_estimators=10,      # number of boosted trees (10 trees in the ensemble)
    max_depth=4,          # maximum depth of each tree
    min_child_weight=1,   # minimum sum of instance weight required in a leaf
    gamma=0.,             # coefficient on the leaf count in the regularization term
    subsample=1,          # every tree is built from all training samples
    colsample_bytree=1,   # every tree is built from all features (was misspelled `colsample_btree`)
    scale_pos_weight=1,   # positive-class weight for imbalanced data; 1 means no re-weighting
    random_state=27,      # random seed
    verbosity=1,          # replaces the misspelled/deprecated `slient = 0`; 1 prints warnings only
)
# Fit on the training split.
model.fit(X_train, y_train)
三、预测
# Prediction and evaluation.
# Hard class predictions on the held-out test set, scored by accuracy.
y_pred = model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : %.4g" % test_accuracy)
# Probability scores taken from column 1 of predict_proba
# (presumably the positive class -- confirm against model.classes_), scored by ROC AUC.
y_train_proba = model.predict_proba(X_train)[:, 1]
y_proba = model.predict_proba(X_test)[:, 1]
print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_train_proba))
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_proba))