Python数据分析与机器学习-用户流失预警

最新推荐文章于 2022-11-08 10:53:02 发布

未来，值得期待

最新推荐文章于 2022-11-08 10:53:02 发布

阅读量2.1k

点赞数 1

分类专栏： python 文章标签： Python数据分析与机器学习

本文链接：https://blog.csdn.net/adam_wzs/article/details/79222162

版权

python 专栏收录该内容

30 篇文章 2 订阅

订阅专栏

import pandas as pd
import numpy as np

# Widen pandas' console output so the wide churn table prints without
# truncation. The former 'display.height' option is not set here: it is no
# longer a registered pandas option (set_option raises OptionError for it),
# and 'display.max_rows' already controls how many rows are shown.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

# Load the raw customer table; 'churn.csv' is resolved relative to the
# current working directory.
churn_df = pd.read_csv('churn.csv')
# Sample of the first rows of churn_df:
'''
  State  Account Length  Area Code     Phone Int'l Plan VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls  Churn?
0    KS             128        415  382-4657         no        yes             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1  False.
1    OH             107        415  371-7191         no        yes             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1  False.
2    NJ             137        415  358-1921         no         no              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0  False.
3    OH              84        408  375-9999        yes         no              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2  False.
4    OK              75        415  330-6626        yes         no              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3  False.

'''
# Feature table: everything except State, Area Code, Phone and the
# 'Churn?' target column.
churn_feat_space = churn_df.drop(['State', 'Area Code', 'Phone', 'Churn?'], axis=1)
# Turn the 'yes'/'no' plan columns into booleans so they can be cast to
# floats below.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# features = churn_feat_space.columns
# print(churn_feat_space.head())
'''
   Account Length  Int'l Plan  VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls
0             128       False        True             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1
1             107       False        True             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1
2             137       False       False              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0
3              84        True       False              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2
4              75        True       False              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3
'''
# DataFrame.as_matrix() and the np.float alias have both been removed from
# current pandas / NumPy releases; .values and the builtin float produce the
# same float64 matrix and also work on older versions.
X = churn_feat_space.values.astype(float)
churn_result = churn_df['Churn?']
# Binary target: 1 where the raw label is the literal string 'True.',
# 0 for every other value.
y = np.where(churn_result == 'True.', 1, 0)

from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix: learn the scaling parameters from X, then
# replace X with its transformed version. The fitted scaler stays available
# as `scaler`.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
# print(X[0])
'''
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
'''

'''交叉验证通用函数'''
from sklearn.cross_validation import KFold


# Arguments: X, y, the classifier class to use, and its keyword parameters
def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class predictions from shuffled 5-fold CV.

    Args:
        X: Feature array; rows are selected with integer index arrays.
        y: Label array with one entry per row of ``X``.
        clf_class: Classifier class. A fresh instance is created for every
            fold and must provide ``fit(X, y)`` and ``predict(X)``.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        A copy of ``y`` in which every entry has been overwritten by the
        prediction of the fold model that was not trained on that row.
    """
    # The old sklearn.cross_validation.KFold(n, n_folds=...) API was removed
    # in scikit-learn 0.20. Its replacement lives in sklearn.model_selection,
    # takes n_splits, and yields index pairs from split().
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN


# Accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of positions where the two label sequences agree.

    Args:
        y_true: Reference labels (array or plain sequence).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Mean of the element-wise equality mask, a float in [0, 1].
    """
    # Coerce to arrays first: comparing two plain lists with == yields a
    # single bool, which would silently turn the result into 0.0 or 1.0.
    matches = np.asarray(y_true) == np.asarray(y_pred)
    # NumPy interprets True and False as 1. and 0.
    return np.mean(matches)


# Print the cross-validated accuracy of each candidate classifier, one
# heading line followed by the score rounded to three decimals.
for heading, clf_class in (
    ("Support vector machines:", SVC),
    ("Random forest:", RF),
    ("K-nearest-neighbors:", KNN),
):
    print(heading)
    print("%.3f" % accuracy(y, run_cv(X, y, clf_class)))


# Probability of customer churn
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from shuffled 5-fold CV.

    Args:
        X: Feature array; rows are selected with integer index arrays.
        y: Label array with one entry per row of ``X``.
        clf_class: Classifier class. A fresh instance is created for every
            fold and must provide ``fit(X, y)`` and ``predict_proba(X)``.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        Array of shape ``(len(y), 2)`` holding, for every row, the
        ``predict_proba`` output of the fold model that was not trained on
        that row. The two-column layout means the classifier is expected to
        report exactly two classes.
    """
    # The old sklearn.cross_validation.KFold(n, n_folds=...) API was removed
    # in scikit-learn 0.20. Its replacement lives in sklearn.model_selection,
    # takes n_splits, and yields index pairs from split().
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print pred_prob[0]
# Second predict_proba column, taken here as the probability of class 1.
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
counts = pd.Series(pred_churn).value_counts()
# print counts

# Observed churn rate among the rows that received each distinct predicted
# probability. The Series is built once from the finished mapping; the
# earlier version converted it inside the loop, turning the dict into a
# Series after the first iteration.
true_prob = pd.Series(
    {prob: np.mean(is_churn[pred_churn == prob]) for prob in counts.index}
)

# Side-by-side table: predicted probability, how often it was assigned, and
# the observed churn rate for it.
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)

未来，值得期待

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
Python数据分析与机器学习-用户流失预警

import pandas as pd; import numpy as np; pd.set_option('display.height', 9999); pd.set_option('display.max_rows', 9999); pd.set_option('display.max_columns', 9999); pd.set_option('display.width', 9999); c…
复制链接

扫一扫