"""Customer-churn classification, part 1.

Loads the churn dataset, encodes the target and the yes/no feature
columns, standard-scales the feature matrix, and compares SVC and
random-forest classifiers by 5-fold cross-validated accuracy.
"""
import pandas as pda
import numpy as np
import missingno
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# sklearn.cross_validation was removed; KFold now lives in model_selection
# and uses the n_splits= / .split(X) API.
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

userData = pda.read_csv("churn.csv")
print(userData.shape)

# Target: the "Churn?" column holds the literal strings "True." / "False.".
data = userData["Churn?"]
y = np.where(data == "True.", 1, 0)

# Drop identifier-like columns and the target itself from the features.
drop_column = ["State", "Area Code", "Phone", "Churn?"]
userData = userData.drop(drop_column, axis=1)

# Encode the yes/no plan columns as 1/0.
# (DataFrame.ix was removed from pandas; .loc is the replacement.)
for col in ["Int'l Plan", "VMail Plan"]:
    userData.loc[userData[col] == "yes", col] = 1
    userData.loc[userData[col] == "no", col] = 0

features = userData.columns  # kept for reference/inspection

# .as_matrix() was removed from pandas (use .to_numpy());
# np.float was removed from NumPy (the builtin float is equivalent).
X = userData.to_numpy().astype(float)
X = StandardScaler().fit_transform(X)
print(X[0])
print(X.shape)
print(len(y[y == 0]))


def run_cv(X, y, model, **kwargs):
    """Return out-of-fold predictions for ``model`` under 5-fold CV.

    X : 2-D feature array.
    y : 1-D integer label array.
    model : estimator class (not an instance); a fresh estimator is
        constructed per fold with ``kwargs`` forwarded to it.
    """
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = model(**kwargs)
        clf.fit(X_train, y_train)
        # Every sample appears in exactly one test fold, so this fills
        # y_pred completely with out-of-sample predictions.
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


def accuracy(y_true, y_pred):
    """Fraction of predictions that match the true labels."""
    return np.mean(y_pred == y_true)


print("SVC", accuracy(y, run_cv(X, y, SVC)))
print("RF", accuracy(y, run_cv(X, y, RF)))
print("KNN", accuracy(y, run_cv(X, y, KNN)))

# Local import keeps this section self-contained: sklearn.cross_validation
# was removed; KFold now lives in model_selection with the
# n_splits= / .split(X) API.
from sklearn.model_selection import KFold


def run_prob_cv(X, y, model, **kwargs):
    """Return out-of-fold class-probability estimates under 5-fold CV.

    X : 2-D feature array.
    y : 1-D integer label array (assumed binary here — the result is
        allocated with 2 columns: P(class 0), P(class 1)).
    model : estimator class supporting predict_proba; a fresh estimator
        is constructed per fold with ``kwargs`` forwarded to it.
    """
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = model(**kwargs)
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


# Calibration check: for each distinct predicted churn probability,
# compare it with the churn rate actually observed at that level.
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_p = pred_prob[:, 1]  # P(churn) per customer
print(pred_p)
is_lost = y == 1

counts = pda.value_counts(pred_p)  # customers per predicted-probability level
print("counts:")
print(counts)

true_prob = {}
print("===========================================")
for prob in counts.index:
    # Observed churn fraction among customers predicted at exactly `prob`.
    true_prob[prob] = np.mean(is_lost[pred_p == prob])
    print(true_prob[prob])

true_prob = pda.Series(true_prob)
counts = pda.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ["pred_prob", "count", "true_prob"]
print(counts)

# np.float was removed from NumPy; the builtin float is equivalent here.
counts["pred_prob"] = counts["pred_prob"].astype(float)
sortcounts = counts.sort_values(by=["pred_prob"], ascending=[True])
print(sortcounts)