机器学习项目实战之用户流失预警

最新推荐文章于 2024-06-29 16:56:12 发布

一个人的旅行qiu

最新推荐文章于 2024-06-29 16:56:12 发布

阅读量5.7k

点赞数 2

分类专栏： python机器学习项目实战文章标签：机器学习

本文链接：https://blog.csdn.net/qiujiahao123/article/details/65018822

版权

python机器学习项目实战专栏收录该内容

8 篇文章 1 订阅

订阅专栏

from __future__ import division
import pandas as pd
import numpy as np

# Load the churn data set and list its columns.
# NOTE(review): the path is a hard-coded absolute Windows location; point it
# at wherever churn.csv lives before running.
churn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv")
col_names = churn_df.columns.tolist()

# Single-argument print(...) produces the same output whether it is parsed as
# a Python 2 print statement or a Python 3 function call.
print("Column_names:")
print(col_names)

# Preview only the first six and the last six columns to keep the sample narrow.
to_show = col_names[:6] + col_names[-6:]
print("\nSample_data:")
# Bare expression: its value is only displayed in an interactive/notebook session.
churn_df[to_show].head(3)

Column_names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample_data:

	State	Account Length	Area Code	Phone	Int’l Plan	VMail Plan	Night Charge	Intl Mins	Intl Calls	Intl Charge	CustServ Calls	Churn?
0	KS	128	415	382-4657	no	yes	11.01	10.0	3	2.70	1	False.
1	OH	107	415	371-7191	no	yes	11.45	13.7	3	3.70	1	False.
2	NJ	137	415	358-1921	no	no	7.32	12.2	5	3.29	0	False.

# Turn the string label into a number so it is easier to analyse.
# "Churn" means customer attrition: 'True.' becomes 1, anything else 0.
churn_result = churn_df["Churn?"]
y = np.where(churn_result == 'True.', 1, 0)

# Drop some features, together with the label column itself.
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# Convert the 'yes'/'no' strings into booleans.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

# NOTE(review): the name is misspelled ("feaures"); it is kept unchanged in
# case code outside this section refers to it.
feaures = churn_feat_space.columns

# DataFrame.as_matrix() was removed in pandas 1.0 and the np.float alias in
# NumPy 1.24. `.values` and the builtin `float` give the same float matrix and
# work on old and new versions alike.
X = churn_feat_space.values.astype(float)

# Important: features live on very different scales (e.g. 1-2 versus
# 30,000-40,000), and such large numeric differences distort the analysis
# (plots, for example), so every feature is rescaled onto a common range.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

# print(...) with one pre-formatted argument prints the same text on
# Python 2 and Python 3.
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels: %s" % (np.unique(y),))
print(X[0])
# Number of samples labelled 0 (not churned).
print(len(y[y == 0]))

Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202]
2850

from sklearn.cross_validation import KFold

def run_cv(X,y,clf_class,**kwargs):
    """Cross-validate a classifier and return one prediction per sample.

    X is the feature data, y holds the labels, clf_class is the classifier
    class to use and kwargs are the keyword arguments handed to its
    constructor.

    The samples are split into 5 shuffled folds; for every fold a new
    classifier is fitted on the training indices and its predictions for the
    test indices are written into the result.
    """
    # Shuffled 5-fold splitter over the sample indices.
    folds = KFold(len(y), n_folds=5, shuffle=True)
    # Work on a copy of y so the caller's labels are never overwritten.
    predictions = y.copy()

    for fit_idx, held_out_idx in folds:
        fit_features = X[fit_idx]
        held_out_features = X[held_out_idx]
        fit_labels = y[fit_idx]
        # Fresh classifier for every fold, built from the keyword arguments.
        model = clf_class(**kwargs)
        model.fit(fit_features, fit_labels)
        predictions[held_out_idx] = model.predict(held_out_features)
    return predictions

from sklearn.svm import SVC  #支持向量机
from sklearn.ensemble import RandomForestClassifier as RF  #随机森林
from sklearn.neighbors import KNeighborsClassifier as KNN  #k最近邻

def accuracy(y_true,y_pred):
    """Return the fraction of positions where y_true and y_pred are equal.

    Both arguments may be NumPy arrays or plain sequences (lists, tuples).
    They are converted to arrays first, because `==` on two plain sequences
    yields a single bool and would make the result 0.0 or 1.0 instead of the
    share of matching elements.
    """
    matches = np.asarray(y_true) == np.asarray(y_pred)
    # NumPy interprets True and False as 1. and 0.
    return np.mean(matches)

# Try several classifiers and compare their cross-validated accuracy.
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X,y,SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X,y,RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X,y,KNN)))

Support vector machines:
0.913
Random forest:
0.942
K-nearest-neighbors:
0.897

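# The accuracy figures above are not very meaningful on their own. What matters
# to the business is the number of false negatives (FN): cases where the model
# predicted that a customer would not churn, but the customer actually did.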

from sklearn.cross_validation import KFold

def run_prob_cv(X,y,clf_class,**kwargs):
    """Cross-validate a classifier and return predicted probabilities.

    X is the feature data, y holds the labels, clf_class is the classifier
    class to use and kwargs are the keyword arguments handed to its
    constructor.

    Returns an array with len(y) rows and 2 columns; each row is filled with
    the predict_proba output of the classifier fitted for the fold in which
    that sample was a test sample.
    NOTE(review): the fixed width of 2 presumably assumes a binary target.
    """
    # Shuffled 5-fold splitter over the sample indices.
    folds = KFold(len(y), n_folds=5, shuffle=True)
    probabilities = np.zeros((len(y), 2))

    for fit_idx, held_out_idx in folds:
        fit_features = X[fit_idx]
        held_out_features = X[held_out_idx]
        fit_labels = y[fit_idx]
        # Fresh classifier for every fold, built from the keyword arguments.
        model = clf_class(**kwargs)
        model.fit(fit_features, fit_labels)
        probabilities[held_out_idx] = model.predict_proba(held_out_features)
    return probabilities

import warnings
# Suppresses every warning raised from this point on.
warnings.filterwarnings('ignore')

pred_prob = run_prob_cv(X,y,RF,n_estimators=10)

# Second probability column; presumably the probability of the churn class (label 1).
pred_churn = pred_prob[:,1]
is_churn = y == 1

# How many samples received each distinct predicted probability.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
counts = pd.Series(pred_churn).value_counts()

# For each distinct predicted probability, the share of those samples that
# really churned. The Series is built once, on the same index as `counts`, so
# the concat below lines the two up row by row and keeps the order of `counts`.
true_prob = pd.Series(
    [np.mean(is_churn[pred_churn == prob]) for prob in counts.index],
    index=counts.index,
)

counts = pd.concat([counts,true_prob],axis=1).reset_index()
counts.columns = ["pred_prob","count","true_prob"]
# Bare expression: its value is only displayed in an interactive/notebook session.
counts
# Use the resulting table for early warning: for each predicted probability
# (for example 30% or 40%) it shows the churn rate actually observed, so the
# user can choose the threshold at which to raise an alert.

	pred_prob	count	true_prob
0	0.0	1779	0.029230
1	0.1	696	0.020115
2	0.2	265	0.060377
3	0.3	126	0.142857
4	0.8	91	0.978022
5	0.9	75	0.960000
6	0.4	73	0.438356
7	0.7	65	0.953846
8	0.5	57	0.561404
9	1.0	56	0.982143
10	0.6	50	0.820000