Data Science, Case 5: Neural Networks for Telecom Customer Churn (Code)
This case uses a BP (back-propagation) neural network model.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
1. Importing and cleaning the data
churn = pd.read_csv(r'.\data\telecom_churn.csv',encoding='gbk')
churn.head()
| | subscriberID | churn | gender | AGE | edu_class | incomeCode | duration | feton | peakMinAv | peakMinDiff | posTrend | negTrend | nrProm | prom | curPlan | avgplan | planChange | posPlanChange | negPlanChange | call_10086 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19164958.0 | 1.0 | 0.0 | 20.0 | 2.0 | 12.0 | 16.0 | 0.0 | 113.666667 | -8.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 39244924.0 | 1.0 | 1.0 | 20.0 | 0.0 | 21.0 | 5.0 | 0.0 | 274.000000 | -371.0 | 0.0 | 1.0 | 2.0 | 1.0 | 3.0 | 2.0 | 2.0 | 1.0 | 0.0 | 1.0 |
| 2 | 39578413.0 | 1.0 | 0.0 | 11.0 | 1.0 | 47.0 | 3.0 | 0.0 | 392.000000 | -784.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | 40992265.0 | 1.0 | 0.0 | 43.0 | 0.0 | 4.0 | 12.0 | 0.0 | 31.000000 | -76.0 | 0.0 | 1.0 | 2.0 | 1.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 4 | 43061957.0 | 1.0 | 1.0 | 60.0 | 0.0 | 9.0 | 14.0 | 0.0 | 129.333333 | -334.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 |
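The heading mentions data cleaning, but no cleaning step appears in the original code. As a minimal sketch (reusing the churn DataFrame loaded above), a few sanity checks before modelling might look like this:
# Hypothetical sanity checks, not part of the original notebook
print(churn.shape)                                # rows and columns
print(churn.isnull().sum().sum())                 # total number of missing cells
print(churn['subscriberID'].duplicated().sum())   # duplicated customer IDs, if any
print(churn['churn'].value_counts())              # class distribution of the target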
2. Neural network
2.1 Splitting into training and test sets
from sklearn.model_selection import train_test_split
data = churn.iloc[:, 2:]    # features: drop subscriberID and the target column
target = churn['churn']     # target: the churn label
train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.4, train_size=0.6, random_state=123)
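A quick check (not in the original code) that the 60/40 split roughly preserves the churn ratio:
# Churn rates in the training and test sets should be similar
print(train_target.value_counts(normalize=True))
print(test_target.value_counts(normalize=True))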
2.2 Min-max scaling (required for neural networks)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_data)
scaled_train_data = scaler.transform(train_data)
scaled_test_data = scaler.transform(test_data)
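MinMaxScaler rescales each feature to x' = (x - min) / (max - min), with the minima and maxima learned from the training set only and then reused on the test set. An illustrative check, not in the original code:
# Scaled training features lie in [0, 1]; test features may fall slightly outside
# because the scaler was fitted on the training data only
print(scaled_train_data.min(), scaled_train_data.max())
print(scaled_test_data.min(), scaled_test_data.max())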
2.3 Building the neural network classifier
from sklearn.neural_network import MLPClassifier  # multi-layer perceptron
mlp = MLPClassifier(hidden_layer_sizes=(10,),
                    activation='logistic', alpha=0.1, max_iter=1000)
mlp.fit(scaled_train_data,train_target)
mlp
MLPClassifier(activation='logistic', alpha=0.1, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(10,), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
random_state=None, shuffle=True, solver='adam', tol=0.0001,
validation_fraction=0.1, verbose=False, warm_start=False)
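With the adam solver it is worth confirming that training actually converged within max_iter=1000 iterations. A minimal sketch using the fitted mlp (an added check, not in the original code):
# Iterations actually run and the training loss curve
print(mlp.n_iter_)
plt.plot(mlp.loss_curve_)
plt.xlabel('iteration')
plt.ylabel('training loss')
plt.show()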
3. Prediction
3.1 Predicting class labels
train_predict = mlp.predict(scaled_train_data)
test_predict = mlp.predict(scaled_test_data)
3.2 Predicting probabilities
# Compute the probability of each class and keep the probability of label 1 (churn)
train_proba = mlp.predict_proba(scaled_train_data)[:, 1]
test_proba = mlp.predict_proba(scaled_test_data)[:, 1]
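For this binary classifier, predict should be equivalent to thresholding the churn probability at 0.5; a quick illustrative check (not in the original code):
# Labels recovered by thresholding the probability should match mlp.predict
manual_predict = (test_proba > 0.5).astype(int)
print((manual_predict == test_predict).all())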
4. Validation
from sklearn import metrics
print(metrics.confusion_matrix(test_target, test_predict, labels=[0,1]))
print(metrics.classification_report(test_target, test_predict))
[[667 134]
[ 96 489]]
precision recall f1-score support
0.0 0.87 0.83 0.85 801
1.0 0.78 0.84 0.81 585
micro avg 0.83 0.83 0.83 1386
macro avg 0.83 0.83 0.83 1386
weighted avg 0.84 0.83 0.83 1386
4.1 Mean accuracy
mlp.score(scaled_test_data, test_target) # Mean accuracy
0.834054834054834
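This agrees with the confusion matrix above: 667 + 489 correct predictions out of 1386 test customers.
# Accuracy recomputed from the confusion matrix
print((667 + 489) / 1386)   # ≈ 0.8341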
4.2 ROC curve
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_proba)
plt.figure(figsize=[4, 4])
plt.plot(fpr_test, tpr_test, 'b-')
plt.plot(fpr_train, tpr_train, 'r-')
plt.title('ROC curve')
plt.show()
print('AUC = %6.4f' %metrics.auc(fpr_test, tpr_test))
AUC = 0.9207
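Comparing the test AUC with the training AUC (the red curve above) gives a rough indication of overfitting; an added check using the arrays already computed:
# Train vs. test AUC; a large gap would suggest overfitting
print('train AUC = %6.4f' % metrics.auc(fpr_train, tpr_train))
print('test  AUC = %6.4f' % metrics.auc(fpr_test, tpr_test))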
4.3 Model tuning
The grid search below took about seven minutes on my machine, so be patient.
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
param_grid = {
    'hidden_layer_sizes': [(10,), (15,), (20,), (5, 5)],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]
}
mlp = MLPClassifier(max_iter=1000)
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring='roc_auc', cv=4, n_jobs=-1)
gcv.fit(scaled_train_data, train_target)
GridSearchCV(cv=4, error_score='raise-deprecating',
estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(100,), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
random_state=None, shuffle=True, solver='adam', tol=0.0001,
validation_fraction=0.1, verbose=False, warm_start=False),
fit_params=None, iid='warn', n_jobs=-1,
param_grid={'hidden_layer_sizes': [(10,), (15,), (20,), (5, 5)], 'activation': ['logistic', 'tanh', 'relu'], 'alpha': [0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=0)
gcv.best_score_
0.9223851258606411
gcv.best_params_
{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (15,)}
gcv.best_estimator_
MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(15,), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
random_state=None, shuffle=True, solver='adam', tol=0.0001,
validation_fraction=0.1, verbose=False, warm_start=False)
gcv.score(scaled_test_data, test_target) # with scoring='roc_auc' this is the test-set AUC, not mean accuracy
0.9218668971477961
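To inspect the tuned model's accuracy and confusion matrix on the test set, the refitted best estimator can be evaluated directly (a hedged sketch, reusing the objects above):
# Evaluate the refitted best estimator on the held-out test set
best_mlp = gcv.best_estimator_
best_test_predict = best_mlp.predict(scaled_test_data)
print(metrics.confusion_matrix(test_target, best_test_predict, labels=[0, 1]))
print('test accuracy = %.4f' % best_mlp.score(scaled_test_data, test_target))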