数据科学 案例10 组合模型之宽带营销(代码)

组合模型

from sklearn.model_selection import train_test_split
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV #网格搜索  
import matplotlib.pyplot as plt

1、导入数据

# Load the broadband marketing dataset (GBK-encoded CSV with customer usage
# features and the binary target column BROADBAND).
model_data = pd.read_csv(r'.\data\broadband.csv',encoding='gbk')
# Preview the first rows to verify the file parsed correctly.
model_data.head()
CUST_IDGENDERAGETENURECHANNELAUTOPAYARPB_3MCALL_PARTY_CNTDAY_MOUAFTERNOON_MOUNIGHT_MOUAVG_CALL_LENGTHBROADBAND
063134272020300.00.00.03.041
164062581036000.01910.00.03.301
26513955303040437.2200.30.04.920
36613955303040437.2182.80.04.920
46713955303040437.2214.50.04.920
# Separate the label from the feature matrix.
target = model_data["BROADBAND"]
# Keep columns 1..-3, i.e. drop CUST_ID and the last two columns.
# NOTE(review): this also excludes AVG_CALL_LENGTH from the features — confirm intentional.
orgData1 = model_data.iloc[:, 1:-2]

# Visualize the class balance of the target (the data set is imbalanced).
plt.figure(figsize=[5, 4])
class_counts = target.value_counts()
class_counts.plot(kind='bar')
plt.show()

（图：目标变量 BROADBAND 的类别分布柱状图，可见正负样本不均衡）

# 60/40 train/test split; fixed random_state makes the split reproducible.
train_data, test_data, train_target, test_target = train_test_split(
    orgData1, target, test_size=0.4, train_size=0.6, random_state=12345)  # split into training and test sets

2、决策树算法

2.1 构建模型

# Hyper-parameter search space for the decision tree.
param_grid = {
    'criterion': ['entropy', 'gini'],            # split-quality measure
    'max_depth': list(range(2, 9)),              # tree depth 2..8
    'min_samples_split': list(range(4, 29, 4)),  # min samples required to split an internal node
}

# 4-fold cross-validated grid search, selecting the model by ROC AUC.
clf = tree.DecisionTreeClassifier()
clfcv = GridSearchCV(estimator=clf, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
clfcv.fit(train_data, train_target)

GridSearchCV(cv=4, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8],
                         'min_samples_split': [4, 8, 12, 16, 20, 24, 28]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

2.2 打印结果

# Evaluate the tuned decision tree on the held-out test set.
test_est = clfcv.predict(test_data)
print("decision tree accuracy:")

print(metrics.classification_report(test_target,test_est))
print("decision tree AUC:")
# FIX: ROC/AUC must be computed from continuous scores, not hard 0/1 labels.
# Feeding predict() output to roc_curve collapses the curve to a single
# operating point and understates AUC; use the positive-class probability.
test_score = clfcv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_score)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
decision tree accuracy:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       359
           1       0.79      0.22      0.34        87

    accuracy                           0.84       446
   macro avg       0.82      0.60      0.62       446
weighted avg       0.83      0.84      0.80       446

decision tree AUC:
AUC = 0.6022
print(metrics.classification_report(test_target,test_est))
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       359
           1       0.80      0.45      0.57        87

    accuracy                           0.87       446
   macro avg       0.84      0.71      0.75       446
weighted avg       0.86      0.87      0.86       446

3、随机森林

3.1 构建模型

# Hyper-parameter search space for the random forest.
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [7, 8, 10, 12],
    'n_estimators': [11, 13, 15],          # number of trees — forest-specific parameter
    'max_features': [0.2, 0.3, 0.4, 0.5],  # fraction of features per tree — forest-specific parameter
    'min_samples_split': [4, 8, 12, 16],
}

# Same 4-fold, AUC-scored grid search as used for the single tree.
rfc = ensemble.RandomForestClassifier()
rfccv = GridSearchCV(estimator=rfc, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
rfccv.fit(train_data, train_target)
test_est = rfccv.predict(test_data)

3.2 打印结果

# Evaluate the tuned random forest on the held-out test set.
print("random forest accuracy:")
print(metrics.classification_report(test_target,test_est))
print("random forest AUC:")
# FIX: compute the ROC curve from predicted probabilities rather than hard
# class labels — labels yield a single-point "curve" and understate AUC.
test_score = rfccv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_score)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
random forest accuracy:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92       359
           1       0.75      0.45      0.56        87

    accuracy                           0.86       446
   macro avg       0.81      0.71      0.74       446
weighted avg       0.85      0.86      0.85       446

random forest AUC:
AUC = 0.7060

3.3 打印最优参数

  • 由于一般缺乏对网格搜索参数的经验,建议把最优参数打印出来,看看取值是否在边界上
  • 如果在边界上,就需要扩大搜索范围
  • 网格搜索需要由宽到细多进行几次

‘max_depth’: 12是边界,得放大

‘n_estimators’: 15是边界,得放大

‘min_samples_split’: 4是边界,得缩小

rfccv.best_params_
{'criterion': 'gini',
 'max_depth': 12,
 'max_features': 0.3,
 'min_samples_split': 4,
 'n_estimators': 15}

4、Adaboost算法

? GridSearchCV

4.1 构建模型

# AdaBoost: only the learning rate is tuned; 100 SAMME-boosted base learners.
param_grid = {'learning_rate': [0.1, 0.3, 0.5, 0.7, 1]}

abc = ensemble.AdaBoostClassifier(n_estimators=100, algorithm='SAMME')
abccv = GridSearchCV(estimator=abc, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
abccv.fit(train_data, train_target)
test_est = abccv.predict(test_data)

4.2 打印结果

# Evaluate the tuned AdaBoost classifier on the held-out test set.
print("abc classifier accuracy:")
print(metrics.classification_report(test_target,test_est))
print("abc classifier AUC:")
# FIX: use positive-class probabilities for the ROC curve; hard predict()
# labels give only one operating point and understate AUC.
test_score = abccv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_score)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
abc classifier accuracy:
              precision    recall  f1-score   support

           0       0.85      0.96      0.90       359
           1       0.64      0.29      0.40        87

    accuracy                           0.83       446
   macro avg       0.74      0.62      0.65       446
weighted avg       0.81      0.83      0.80       446

abc classifier AUC:
AUC = 0.6242

4.3 打印最优参数

abccv.best_params_
{'learning_rate': 0.3}

5、GBDT

5.1 构建模型

# Hyper-parameter search space for gradient boosting.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
    'n_estimators': [10, 15, 20, 30],            # number of boosting stages — GBDT-specific parameter
    'max_depth': [1, 2, 3],                      # depth of each individual tree — GBDT-specific parameter
    'min_samples_split': [2, 4, 8, 12, 16, 20],
}

# 4-fold, AUC-scored grid search, consistent with the previous models.
gbc = ensemble.GradientBoostingClassifier()
gbccv = GridSearchCV(estimator=gbc, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
gbccv.fit(train_data, train_target)
test_est = gbccv.predict(test_data)

5.2 打印结果

# Evaluate the tuned gradient-boosting model on the held-out test set.
print("gradient boosting accuracy:")
print(metrics.classification_report(test_target,test_est))
print("gradient boosting AUC:")
# FIX: ROC/AUC needs continuous scores; predict_proba's positive-class
# column, not hard predict() labels which understate AUC.
test_score = gbccv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_score)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
gradient boosting accuracy:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       359
           1       0.80      0.45      0.57        87

    accuracy                           0.87       446
   macro avg       0.84      0.71      0.75       446
weighted avg       0.86      0.87      0.86       446

gradient boosting AUC:
AUC = 0.7102

5.3 打印最优参数

# Why print the best parameters?  To check whether any chosen value sits on
# the boundary of the search grid — if it does, the grid should be widened
# and the search repeated.
gbccv.best_params_
{'learning_rate': 0.3,
 'loss': 'deviance',
 'max_depth': 3,
 'min_samples_split': 2,
 'n_estimators': 30}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

irober

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值