Hands-On Project: Customer Churn Early Warning

1 Predicting Customer Churn

Data preprocessing: drop columns that are useless as features, scale each feature to a comparable range, and convert yes/no and True/False values to 0/1.

Three classifiers are compared: SVC, Random Forest (RF), and KNN.

Looking only at overall prediction accuracy is not enough; Recall matters too, because the model is only useful if it catches the customers who actually churn (a recall check is sketched after the accuracy comparison in the listing below).

Prediction accuracy can also be broken down by the predicted churn probability: for customers predicted to churn with 70% probability, the prediction turns out to be correct about 94% of the time.

#! /usr/bin/python
# -*-coding:utf-8 -*-
from __future__ import division
__author__ = "chunming"
import pandas as pd
import numpy as np
import warnings
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed; model_selection is its replacement
churn_df = pd.read_csv('C://churn.csv')
col_names = churn_df.columns.tolist()
print("Column names:")
print(col_names)
to_show = col_names[:6] + col_names[-6:]
print("\nSample data:")
print(churn_df[to_show].head(6))

churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

features = churn_feat_space.columns
X = churn_feat_space.values.astype(float)  # convert the DataFrame to a numeric matrix (as_matrix() and np.float are deprecated)


scaler = StandardScaler()
X = scaler.fit_transform(X)

print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels:", np.unique(y))
print(X[0])
print(len(y[y == 0]))  # number of non-churn samples (label 0)

def run_cv(X,y,clf_class,**kwargs):
    # Construct a KFold object (5 shuffled folds)
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    # Iterate through folds, filling in out-of-fold predictions
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X,y,SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X,y,RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X,y,KNN)))


def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y),2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
warnings.filterwarnings('ignore')

# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
#print pred_prob[0]
pred_churn = pred_prob[:,1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation
counts = pd.Series(pred_churn).value_counts()  # pd.value_counts is deprecated; call value_counts on a Series instead
#print counts
# calculate true probabilities
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)  # convert once, after the loop
# line up the prediction counts with the observed churn rates
counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)
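
The counts table maps each predicted churn probability to how often those customers actually churned (the exact numbers vary with the random folds). As a small follow-up sketch, not part of the original listing, sorting it by predicted probability makes it easier to read off the pattern behind the "70% predicted probability, about 94% observed accuracy" remark above:

print(counts.sort_values('pred_prob'))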
