Identifying Happy Customer Bank Target Customers with xgboost

My instructor recently assigned a project, which was my first hands-on experience with xgboost. Until now I had only heard the theory in lectures; working through this assignment gave me a much deeper understanding of the model. Assignment: identify target customers for Happy Customer Bank.

https://discuss.analyticsvidhya.com/t/hackathon-3-x-predict-customer-worth-for-happy-customer-bank/3802

I won't describe the project itself in detail here.

I spent half a day working out how to convert string features into numeric types for training. It is genuinely tedious, for example representing "female" and "male" in numeric form:

Female  Male
    1     0

I used enumeration to convert the multi-valued string features to integer codes; the code is as follows:

array = ['Gender', 'City', 'DOB', 'Lead_Creation_Date', 'Employer_Name', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form', 'Device_Type', 'Var2', 'Source']
#array = train.columns  # would list all the column labels
#print len(array)  # can be used to check the length of the array
for i in range(len(array)):
    class_mapping = {label:idx for idx,label in enumerate(set(train[array[i]]))}
    train[array[i]] = train[array[i]].map(class_mapping)
#    print train[array[i]]
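
One caveat with building the mapping from set() separately for each file is that the integer assigned to a given category can differ between train and test. A minimal sketch of a more consistent alternative, assuming the same train/test DataFrames and column list as above, is to fit sklearn's LabelEncoder on the union of both columns:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

for col in array:
    le = LabelEncoder()
    # fit on the union of train and test values so the codes agree across files
    le.fit(pd.concat([train[col], test[col]]).astype(str))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))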

Code for computing the error:

#-*-coding:utf-8-*-
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import time
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
start_time = time.time()
path = "D:\\"
#read in the data
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

#Sex_ohe_1 = pd.get_dummies(train['Gender'])
#print Sex_ohe_1.head()
array = ['Gender', 'City', 'DOB', 'Lead_Creation_Date', 'Employer_Name', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form', 'Device_Type', 'Var2', 'Source']
#array = train.columns  # would list all the column labels
#print len(array)  # can be used to check the length of the array
for i in range(len(array)):
    class_mapping = {label:idx for idx,label in enumerate(set(train[array[i]]))}
    train[array[i]] = train[array[i]].map(class_mapping)
#    print train[array[i]]
for i in range(len(array)):
    class_mapping = {label:idx for idx,label in enumerate(set(test[array[i]]))}
    test[array[i]] = test[array[i]].map(class_mapping)
#    print test[array[i]]

#split the data with train_test_split: 80% training, 20% validation
train1, val = train_test_split(train, test_size = 0.2, random_state=1)
y = train1.Disbursed
x = train1.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
test = test.drop('ID', axis=1)
valy = val.Disbursed
valx = val.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
xgb_val = xgb.DMatrix(valx, label = valy)
xgb_train1 = xgb.DMatrix(x, label = y)
xgb_test = xgb.DMatrix(test)
print xgb_test.feature_names
params = {
    'booster':'gbtree',
    'objective': 'multi:softmax', # multi-class classification
    'num_class':10, # number of classes, used together with multi:softmax
    'gamma':0.1,  # controls post-pruning; the larger, the more conservative (typically 0.1 or 0.2)
    'max_depth':12, # tree depth; deeper trees overfit more easily
    'lambda':2,  # L2 regularization on leaf weights; larger values make the model less prone to overfitting
    'subsample':0.7, # row subsampling of the training instances
    'colsample_bytree':0.7, # column subsampling when building each tree
    'min_child_weight':3,
    # defaults to 1: the minimum sum of instance Hessians (h) required in a leaf.
    # For imbalanced 0-1 classification, if h is around 0.01, min_child_weight=1
    # means a leaf needs at least 100 samples. This parameter strongly affects the
    # result; smaller values make overfitting more likely.
    'silent':0, # 1 suppresses runtime messages; 0 is usually preferable
    'eta': 0.01, # acts like a learning rate
    'seed':1000,
    'nthread':7, # number of CPU threads
    #'eval_metric': 'auc'
}
plst = list(params.items())
num_iter = 5000
watchList = [(xgb_train1, "train"), (xgb_val, "val")]
model = xgb.train(plst, xgb_train1, num_iter, watchList, early_stopping_rounds=100)
model.save_model(path + "xgboost.model")
print "best_ntree_limit", model.best_ntree_limit
predict = model.predict(xgb_test, ntree_limit = model.best_ntree_limit)
# note: this writes sequential row numbers as the ID column, not the original test IDs
np.savetxt(path + 'result.csv', np.c_[range(1, len(test)+1), predict], delimiter=',',header='ID, Disbursed',comments='',fmt='%d')
cost_time = time.time() - start_time
print "run time:", cost_time

Test results:

train-merror:0.014695	val-merror:0.014365
best_ntree_limit 1
run time: 72.9449999332
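
Note that best_ntree_limit came out as 1, which suggests the validation merror stopped improving almost immediately under early stopping. Also, Disbursed is a binary label, so 'multi:softmax' with 'num_class':10 is wasteful here. A minimal sketch of a more conventional setup, reusing xgb_train1 and watchList from the script above (the parameter values are illustrative, not tuned):

params_binary = {
    'booster': 'gbtree',
    'objective': 'binary:logistic', # binary target instead of multi:softmax
    'eval_metric': 'auc',
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 1000,
}
model = xgb.train(params_binary, xgb_train1, num_boost_round=5000,
                  evals=watchList, early_stopping_rounds=100)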

Code for computing accuracy:

#-*-coding:utf-8-*-
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
import time
from sklearn import preprocessing
from sklearn import metrics
from matplotlib import pyplot as plt
from xgboost import plot_importance

start_time = time.time()
path = "D:\\"
#read in the data
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
#Sex_ohe_1 = pd.get_dummies(train['Gender'])
#print Sex_ohe_1.head()
array = ['Gender', 'City', 'DOB', 'Lead_Creation_Date', 'Employer_Name', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form', 'Device_Type', 'Var2', 'Source']
#array = train.columns  # would list all the column labels
#print len(array)  # can be used to check the length of the array
for i in range(len(array)):
    class_mapping = {label:idx for idx,label in enumerate(set(train[array[i]]))}
    train[array[i]] = train[array[i]].map(class_mapping)
#    print train[array[i]]
for i in range(len(array)):
    class_mapping = {label:idx for idx,label in enumerate(set(test[array[i]]))}
    test[array[i]] = test[array[i]].map(class_mapping)
#    print test[array[i]]
#split the data with train_test_split: 80% training, 20% validation
train1, val = train_test_split(train, test_size = 0.2, random_state=1)
print len(train1), len(val)
y = train1.Disbursed
x = train1.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
test = test.drop('ID', axis=1)
valy = val.Disbursed
valx = val.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
#standardize the data
scaler = preprocessing.StandardScaler().fit(x)
x = scaler.transform(x)
#x = preprocessing.scale(x)
print(x)
scaler = preprocessing.StandardScaler().fit(valx)
valx = scaler.transform(valx)
#valx = preprocessing.scale(valx)
print(valx)
scaler = preprocessing.StandardScaler().fit(test)
test = scaler.transform(test)
#test = preprocessing.scale(test)
print(test)
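# Note: each split is standardized with its own scaler above. A sketch of the
# more standard approach would reuse the statistics fitted on the training split:
#scaler = preprocessing.StandardScaler().fit(x)
#x = scaler.transform(x)
#valx = scaler.transform(valx)
#test = scaler.transform(test)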
xgb_val = xgb.DMatrix(valx, label = valy)
xgb_train1 = xgb.DMatrix(x, label = y)
xgb_test = xgb.DMatrix(test)
print xgb_test.feature_names
params = {
    'booster':'gbtree',
    'objective': 'multi:softmax', # multi-class classification
    'num_class':2, # number of classes, used together with multi:softmax
    'gamma':0.1,  # controls post-pruning; the larger, the more conservative (typically 0.1 or 0.2)
    'max_depth':6, # tree depth; deeper trees overfit more easily
    'lambda':2,  # L2 regularization on leaf weights; larger values make the model less prone to overfitting
    'subsample':0.7, # row subsampling of the training instances
    'colsample_bytree':0.7, # column subsampling when building each tree
    'min_child_weight':3,
    # defaults to 1: the minimum sum of instance Hessians (h) required in a leaf.
    # For imbalanced 0-1 classification, if h is around 0.01, min_child_weight=1
    # means a leaf needs at least 100 samples. This parameter strongly affects the
    # result; smaller values make overfitting more likely.
    'silent':0, # 1 suppresses runtime messages; 0 is usually preferable
    'eta': 0.05, # acts like a learning rate
    'seed':1000,
    'nthread':7, # number of CPU threads
    #'eval_metric': 'auc'
}
plst = list(params.items())
num_iter = 10000
watchlist = [(xgb_train1,'train'),(xgb_val,'val')]
bst = xgb.train(plst,xgb_train1,num_boost_round=num_iter,evals = watchlist)
#bst = xgb.train(plst, xgb_train1, num_iter)
train_preds = bst.predict(xgb_train1)
#print "train_preds", train_preds

train_predictions = [round(value) for value in train_preds]
#print "train_predictions", train_predictions

y_train = xgb_train1.get_label()
#print "y_train", y_train
# count how many training samples are predicted as positive
count = 0
for i in range(len(train_predictions)):
    if train_predictions[i] == 1:
        count = count + 1
print "count:", count
train_acc = accuracy_score(y_train, train_predictions)
print "Train Accuary:%.2f%%" %(train_acc * 100.0)
print xgb_train1
print xgb_test.feature_names
preds = bst.predict(xgb_val)
test_predictions = [round(value) for value in preds]
y_test = xgb_val.get_label()
test_acc = accuracy_score(y_test, test_predictions)
print "Test Accuary:%.2f%%" %(test_acc * 100.0)
#To check model robustness: macro-averaging first computes the metric for each class,
#then takes the arithmetic mean over classes; micro-averaging pools every instance,
#regardless of class, into one global confusion matrix and computes the metric from
#that. (Source: 谈谈评价指标中的宏平均和微平均)
print set(y_train)
print set(train_predictions)
train_f1 = metrics.f1_score(y_train, train_preds, average="weighted", labels=np.unique(train_preds))
print "Train F1 score:%.6f" %train_f1
#labels=np.unique(preds): you decide you are not interested in scores for labels that
#were never predicted, and explicitly restrict the score to the labels you care about
#(those predicted at least once)
print set(y_test)
print set(test_predictions)
test_f1 = metrics.f1_score(y_test, preds, average="weighted", labels=np.unique(preds))
print "Test F1 score:%.6f" %test_f1
bst.save_model(path + "xgboost.model")
print "best_ntree_limit", bst.best_ntree_limit
predict = bst.predict(xgb_test, ntree_limit = bst.best_ntree_limit)
# note: this writes sequential row numbers as the ID column, not the original test IDs
np.savetxt(path + 'result.csv', np.c_[range(1, len(test)+1), predict], delimiter=',',header='ID, Disbursed',comments='',fmt='%d')
cost_time = time.time() - start_time
print "run time:", cost_time
plot_importance(bst)
plt.show()

Test results:

Train Accuracy:99.99%
Test Accuracy:98.54%
Train F1 score:0.999871
Test F1 score:0.978390
best_ntree_limit 10000
run time: 2621.94099998
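
Since the model was saved with save_model, it can be reloaded later without retraining. A minimal sketch, assuming the same path and xgb_test as in the script above:

bst2 = xgb.Booster()
bst2.load_model(path + "xgboost.model")
reloaded_preds = bst2.predict(xgb_test) # predictions from the reloaded model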

Feedback is very welcome. If you spot any mistakes, please point them out.
