数据科学 案例10 组合模型之宽带营销(代码)
组合模型
from sklearn.model_selection import train_test_split
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV #网格搜索
import matplotlib.pyplot as plt
1、导入数据
# Load the broadband marketing dataset (GBK-encoded CSV from a Chinese data source).
model_data = pd.read_csv(r'.\data\broadband.csv',encoding='gbk')
# Preview the first rows to confirm the columns loaded as expected.
model_data.head()
CUST_ID | GENDER | AGE | TENURE | CHANNEL | AUTOPAY | ARPB_3M | CALL_PARTY_CNT | DAY_MOU | AFTERNOON_MOU | NIGHT_MOU | AVG_CALL_LENGTH | BROADBAND | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 34 | 27 | 2 | 0 | 203 | 0 | 0.0 | 0.0 | 0.0 | 3.04 | 1 |
1 | 64 | 0 | 62 | 58 | 1 | 0 | 360 | 0 | 0.0 | 1910.0 | 0.0 | 3.30 | 1 |
2 | 65 | 1 | 39 | 55 | 3 | 0 | 304 | 0 | 437.2 | 200.3 | 0.0 | 4.92 | 0 |
3 | 66 | 1 | 39 | 55 | 3 | 0 | 304 | 0 | 437.2 | 182.8 | 0.0 | 4.92 | 0 |
4 | 67 | 1 | 39 | 55 | 3 | 0 | 304 | 0 | 437.2 | 214.5 | 0.0 | 4.92 | 0 |
# Label: whether the customer subscribed to broadband (BROADBAND column).
target = model_data["BROADBAND"]
# Features: columns 2 through third-from-last, i.e. drops CUST_ID,
# AVG_CALL_LENGTH and the BROADBAND label itself.
orgData1 = model_data.iloc[:, 1:-2]

# Visualize the class distribution to check how imbalanced the label is.
plt.figure(figsize=[5, 4])
class_counts = target.value_counts()
class_counts.plot(kind='bar')
plt.show()

# Hold out 40% of the rows as a test set; the fixed seed keeps the split reproducible.
train_data, test_data, train_target, test_target = train_test_split(
    orgData1, target, test_size=0.4, train_size=0.6, random_state=12345)
2、决策树算法
2.1 构建模型
# Hyper-parameter grid for the decision tree: split criterion, maximum tree
# depth, and the minimum number of samples required to split an internal node
# (note: min_samples_split is a split threshold, not a leaf size).
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'min_samples_split': [4, 8, 12, 16, 20, 24, 28],
}
dtree = tree.DecisionTreeClassifier()
# 4-fold cross-validated grid search optimizing ROC-AUC.
clfcv = GridSearchCV(estimator=dtree, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
clfcv.fit(train_data, train_target)
GridSearchCV(cv=4, error_score=nan,
estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort='deprecated',
random_state=None,
splitter='best'),
iid='deprecated', n_jobs=None,
param_grid={'criterion': ['entropy', 'gini'],
'max_depth': [2, 3, 4, 5, 6, 7, 8],
'min_samples_split': [4, 8, 12, 16, 20, 24, 28]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='roc_auc', verbose=0)
2.2 打印结果
# Evaluate the tuned decision tree on the held-out test set.
test_est = clfcv.predict(test_data)
print("decision tree accuracy:")
print(metrics.classification_report(test_target,test_est))
print("decision tree AUC:")
# FIX: compute the ROC curve from the positive-class probability scores, not
# from hard 0/1 predictions — label-based ROC collapses to a single operating
# point and systematically understates the AUC.
test_prob = clfcv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_prob)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
decision tree accuracy:
precision recall f1-score support
0 0.84 0.99 0.91 359
1 0.79 0.22 0.34 87
accuracy 0.84 446
macro avg 0.82 0.60 0.62 446
weighted avg 0.83 0.84 0.80 446
decision tree AUC:
AUC = 0.6022
print(metrics.classification_report(test_target,test_est))
precision recall f1-score support
0 0.88 0.97 0.92 359
1 0.80 0.45 0.57 87
accuracy 0.87 446
macro avg 0.84 0.71 0.75 446
weighted avg 0.86 0.87 0.86 446
3、随机森林
3.1 构建模型
# Hyper-parameter grid for the random forest. n_estimators (number of trees)
# and max_features (fraction of features considered per tree) are the
# forest-specific knobs; the rest mirror the single-tree search.
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [7, 8, 10, 12],
    'n_estimators': [11, 13, 15],
    'max_features': [0.2, 0.3, 0.4, 0.5],
    'min_samples_split': [4, 8, 12, 16],
}
forest = ensemble.RandomForestClassifier()
# 4-fold cross-validated grid search optimizing ROC-AUC.
rfccv = GridSearchCV(estimator=forest, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
rfccv.fit(train_data, train_target)
test_est = rfccv.predict(test_data)
3.2 打印结果
# Evaluate the tuned random forest on the held-out test set.
print("random forest accuracy:")
print(metrics.classification_report(test_target,test_est))
print("random forest AUC:")
# FIX: use positive-class probabilities for the ROC curve instead of hard
# predicted labels, which would understate the AUC.
test_prob = rfccv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_prob)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
random forest accuracy:
precision recall f1-score support
0 0.88 0.96 0.92 359
1 0.75 0.45 0.56 87
accuracy 0.86 446
macro avg 0.81 0.71 0.74 446
weighted avg 0.85 0.86 0.85 446
random forest AUC:
AUC = 0.7060
3.3 打印最优参数
- 由于一般缺乏对网格搜索参数的经验,建议把最优参数打印出来,看看取值是否在边界上
- 如果在边界上,就需要扩大搜索范围;
- 网格搜索需要由宽到细多进行几次。
‘max_depth’: 12是边界,得放大
‘n_estimators’: 15是边界,得放大
‘min_samples_split’: 4是边界,得缩小
rfccv.best_params_
{'criterion': 'gini',
'max_depth': 12,
'max_features': 0.3,
'min_samples_split': 4,
'n_estimators': 15}
4、Adaboost算法
? GridSearchCV
4.1 构建模型
# AdaBoost: only the learning rate is tuned here; the base estimator is left
# at scikit-learn's default, with 100 boosting rounds and the discrete
# SAMME algorithm.
param_grid = {
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
}
booster = ensemble.AdaBoostClassifier(n_estimators=100, algorithm='SAMME')
# 4-fold cross-validated grid search optimizing ROC-AUC.
abccv = GridSearchCV(estimator=booster, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
abccv.fit(train_data, train_target)
test_est = abccv.predict(test_data)
4.2 打印结果
# Evaluate the tuned AdaBoost classifier on the held-out test set.
print("abc classifier accuracy:")
print(metrics.classification_report(test_target,test_est))
print("abc classifier AUC:")
# FIX: use positive-class probabilities for the ROC curve instead of hard
# predicted labels, which would understate the AUC.
test_prob = abccv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_prob)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
abc classifier accuracy:
precision recall f1-score support
0 0.85 0.96 0.90 359
1 0.64 0.29 0.40 87
accuracy 0.83 446
macro avg 0.74 0.62 0.65 446
weighted avg 0.81 0.83 0.80 446
abc classifier AUC:
AUC = 0.6242
4.3 打印最优参数
abccv.best_params_
{'learning_rate': 0.3}
5、GBDT
5.1 构建模型
# Hyper-parameter grid for gradient boosting. n_estimators (number of trees)
# and max_depth (depth of each weak learner) are the GBDT-specific knobs.
# NOTE(review): 'deviance' was renamed 'log_loss' and later removed in newer
# scikit-learn releases — update this grid if the library is upgraded.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
    'n_estimators': [10, 15, 20, 30],
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 4, 8, 12, 16, 20],
}
gbm = ensemble.GradientBoostingClassifier()
# 4-fold cross-validated grid search optimizing ROC-AUC.
gbccv = GridSearchCV(estimator=gbm, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
gbccv.fit(train_data, train_target)
test_est = gbccv.predict(test_data)
5.2 打印结果
# Evaluate the tuned gradient boosting classifier on the held-out test set.
print("gradient boosting accuracy:")
print(metrics.classification_report(test_target,test_est))
print("gradient boosting AUC:")
# FIX: use positive-class probabilities for the ROC curve instead of hard
# predicted labels, which would understate the AUC.
test_prob = gbccv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_prob)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
gradient boosting accuracy:
precision recall f1-score support
0 0.88 0.97 0.92 359
1 0.80 0.45 0.57 87
accuracy 0.87 446
macro avg 0.84 0.71 0.75 446
weighted avg 0.86 0.87 0.86 446
gradient boosting AUC:
AUC = 0.7102
5.3 打印最优参数
# Why print the best parameters? To check whether any winner sits on the edge
# of its grid — if so, the search range must be extended and the search rerun.
gbccv.best_params_
{'learning_rate': 0.3,
'loss': 'deviance',
'max_depth': 3,
'min_samples_split': 2,
'n_estimators': 30}