Python数据分析与机器学习-用户流失预警

import pandas as pd
import numpy as np

pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

churn_df = pd.read_csv('churn.csv')
'''
  State  Account Length  Area Code     Phone Int'l Plan VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls  Churn?
0    KS             128        415  382-4657         no        yes             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1  False.
1    OH             107        415  371-7191         no        yes             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1  False.
2    NJ             137        415  358-1921         no         no              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0  False.
3    OH              84        408  375-9999        yes         no              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2  False.
4    OK              75        415  330-6626        yes         no              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3  False.

'''
churn_feat_space = churn_df.drop(['State', 'Area Code', 'Phone', 'Churn?'], axis=1)
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# features = churn_feat_space.columns
# print(churn_feat_space.head())
'''
   Account Length  Int'l Plan  VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls
0             128       False        True             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1
1             107       False        True             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1
2             137       False       False              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0
3              84        True       False              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2
4              75        True       False              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3
'''
X = churn_feat_space.as_matrix().astype(np.float)
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
# print(X[0])
'''
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
'''

'''交叉验证通用函数'''
from sklearn.cross_validation import KFold


# X,y,选择的分类器,参数
def run_cv(X, y, clf_class, **kwargs):
    # Construct a kfolds object
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN


# 精度
def accuracy(y_true, y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)


print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))


# 客户流失的概率
def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print pred_prob[0]
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)
# print counts

# calculate true probabilities
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
    true_prob = pd.Series(true_prob)

# pandas-fu
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值