from __future__ import division
import pandas as pd
import numpy as np
churn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv" )
col_names = churn_df.columns.tolist()
print "Column_names:"
print col_names
to_show = col_names[:6 ]+col_names[-6 :]
print "\nSample_data:"
churn_df[to_show].head(3 )
# Notebook output (commented out so the file parses; curly quotes repaired):
# Column_names: ['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?'] Sample_data:
# State Account Length Area Code Phone Int'l Plan VMail Plan Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn? 0 KS 128 415 382-4657 no yes 11.01 10.0 3 2.70 1 False. 1 OH 107 415 371-7191 no yes 11.45 13.7 3 3.70 1 False. 2 NJ 137 415 358-1921 no no 7.32 12.2 5 3.29 0 False.
churn_result = churn_df["Churn?" ]
y = np.where(churn_result == 'True.' ,1 ,0 )
to_drop = ['State' ,'Area Code' ,'Phone' ,'Churn?' ]
churn_feat_space = churn_df.drop(to_drop,axis=1 )
yes_no_cols = ["Int'l Plan" ,"VMail Plan" ]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
feaures = churn_feat_space.columns
X = churn_feat_space.as_matrix().astype(np.float)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:" ,np.unique(y)
print X[0 ]
print len(y[y == 0 ])
# Notebook output (commented out so the file parses):
# Feature space holds 3333 observations and 17 features Unique target labels: [0 1] [ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202] 2850
from sklearn.cross_validation import KFold
def run_cv(X, y, clf_class, **kwargs):
    """Return 5-fold cross-validated predictions, one per row of X.

    A fresh clf_class(**kwargs) is trained on each fold's training split
    and used to predict that fold's held-out split, so every sample gets
    exactly one out-of-sample prediction.
    """
    folds = KFold(len(y), n_folds=5, shuffle=True)
    predictions = y.copy()
    for train_idx, test_idx in folds:
        model = clf_class(**kwargs)
        model.fit(X[train_idx], y[train_idx])
        predictions[test_idx] = model.predict(X[test_idx])
    return predictions
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
def accuracy(y_true, y_pred):
    """Return the fraction of predictions equal to the true labels."""
    hits = y_true == y_pred
    return np.mean(hits)
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))
# Notebook output (commented out so the file parses):
# Support vector machines: 0.913 Random forest: 0.942 K-nearest-neighbors: 0.897
from sklearn.cross_validation import KFold
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return 5-fold cross-validated class probabilities, shape (n, 2).

    Mirrors run_cv, but collects predict_proba output — column 0 is the
    probability of class 0, column 1 the probability of class 1.
    """
    folds = KFold(len(y), n_folds=5, shuffle=True)
    prob = np.zeros((len(y), 2))
    for train_idx, test_idx in folds:
        model = clf_class(**kwargs)
        model.fit(X[train_idx], y[train_idx])
        prob[test_idx] = model.predict_proba(X[test_idx])
    return prob
import warnings
warnings.filterwarnings('ignore' )
pred_prob = run_prob_cv(X,y,RF,n_estimators=10 )
pred_churn = pred_prob[:,1 ]
is_churn = y == 1
counts = pd.value_counts(pred_churn)
true_prob = {}
for prob in counts.index:
true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)
counts = pd.concat([counts,true_prob],axis=1 ).reset_index()
counts.columns = ["pred_prob" ,"count" ,"true_prob" ]
counts
# Notebook output (commented out so the file parses):
# pred_prob count true_prob 0 0.0 1779 0.029230 1 0.1 696 0.020115 2 0.2 265 0.060377 3 0.3 126 0.142857 4 0.8 91 0.978022 5 0.9 75 0.960000 6 0.4 73 0.438356 7 0.7 65 0.953846 8 0.5 57 0.561404 9 1.0 56 0.982143 10 0.6 50 0.820000