机器学习项目实战之用户流失预警

from __future__ import division
import pandas as pd
import numpy as np

churn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv")
col_names = churn_df.columns.tolist()

print "Column_names:"
print col_names

to_show = col_names[:6]+col_names[-6:]
print "\nSample_data:"
churn_df[to_show].head(3)
Column_names: [‘State’, ‘Account Length’, ‘Area Code’, ‘Phone’, “Int’l Plan”, ‘VMail Plan’, ‘VMail Message’, ‘Day Mins’, ‘Day Calls’, ‘Day Charge’, ‘Eve Mins’, ‘Eve Calls’, ‘Eve Charge’, ‘Night Mins’, ‘Night Calls’, ‘Night Charge’, ‘Intl Mins’, ‘Intl Calls’, ‘Intl Charge’, ‘CustServ Calls’, ‘Churn?’] Sample_data:
StateAccount LengthArea CodePhoneInt’l PlanVMail PlanNight ChargeIntl MinsIntl CallsIntl ChargeCustServ CallsChurn?
0KS128415382-4657noyes11.0110.032.701False.
1OH107415371-7191noyes11.4513.733.701False.
2NJ137415358-1921nono7.3212.253.290False.
#将字符改变成数值,便于分析
#Churn是客户量流失的意思
churn_result = churn_df["Churn?"]
y = np.where(churn_result == 'True.',1,0)

#去掉一些特征
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

#将这些yes和no转化为布尔值
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

feaures = churn_feat_space.columns

X = churn_feat_space.as_matrix().astype(np.float)

#重点:不同的特征项有不同的值,如1-2,3万到4万,不同特征间的数值上的巨大差异会影响我们的分析
#例如作图的时候,所以我们需要统一将这些数据压缩到一定的区间上
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print "Feature space holds %d observations and %d features"% X.shape
print "Unique target labels:",np.unique(y)
print X[0]
print len(y[y == 0])
Feature space holds 3333 observations and 17 features Unique target labels: [0 1] [ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202] 2850
from sklearn.cross_validation import KFold

#交叉验证函数:X是特征数据,y是label,clf_class是你选择的分类器,kwargs指定的参数
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
from sklearn.svm import SVC  #支持向量机
from sklearn.ensemble import RandomForestClassifier as RF  #随机森林
from sklearn.neighbors import KNeighborsClassifier as KNN  #k最近邻

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

#尝试使用多种分类器来验证效果
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))
Support vector machines: 0.913 Random forest: 0.942 K-nearest-neighbors: 0.897
#以上的准确率的意义并不大,对于客户来说,重要的是ROC指标FN,即我预测错了,认为客户不会流失,但是客户流失了

from sklearn.cross_validation import KFold

#交叉验证函数:X是特征数据,y是label,clf_class是你选择的分类器,kwargs指定的参数
def run_prob_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_prob = np.zeros((len(y),2))

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
import warnings
warnings.filterwarnings('ignore')

pred_prob = run_prob_cv(X,y,RF,n_estimators=10)

pred_churn = pred_prob[:,1]
is_churn = y == 1

counts = pd.value_counts(pred_churn)

true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
    true_prob = pd.Series(true_prob)

counts = pd.concat([counts,true_prob],axis=1).reset_index()
counts.columns = ["pred_prob","count","true_prob"]
counts
#通过观测以下数据进行预警,当实际的可能性是百分之30或者40时,对应的真实的用户流失情况,由用户选择阈值进行预警
pred_probcounttrue_prob
00.017790.029230
10.16960.020115
20.22650.060377
30.31260.142857
40.8910.978022
50.9750.960000
60.4730.438356
70.7650.953846
80.5570.561404
91.0560.982143
100.6500.820000
  • 2
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 7
    评论
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值