文章目录
📈Overview
The tasks completed by this kernel are as follows:
-
clean and manipulate the data with Python and figure out the distribution of each feature.
-
use matplotlib & seaborn to visualize some features which might be significant in building the model.
-
use statistical knowledge (covariance matrix, hypothesis tests, correspondence analysis, PCA, FA, etc.) to analyze some features which I am really interested in.
-
use sklearn to build some models, for instance SVM, KNN, naive Bayes, random forest, GBDT, XGBoost, LightGBM and so on, and measure the performance of each model — especially the SVM model.
dataset:
🛠️Import Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # For creating plots
import matplotlib.ticker as mtick # For specifying the axes tick format
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
import scikitplot as skplt
import optuna
import tensorflow as tf
sns.set(style = 'white')
# Input data files are available in the "../input/" directory.
import os
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.
# Load the Telco Customer Churn dataset shipped with the Kaggle kernel.
telecom_cust = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
📋 Have a quick glance at the dataset
- use .info() to see the structure of the dataset and use .describe() to get the statistical descriptions of the continuous features
- use .sample() instead of .tail() or .head() to grab some random records from the dataset.
# .describe(): count/mean/std/quantile summary of the numeric columns.
telecom_cust.describe()
# .info(): column dtypes and non-null counts — the structure of the frame.
telecom_cust.info()
# .sample(5): five random rows — less order-biased than .head()/.tail().
telecom_cust.sample(5)
overview of the dataset:
We found that the dataset does not contain “NaN” values and that most of the 18 features are categorical (only a few are continuous).
📋 use profile_report we can find the following characteristics in features:
- Percentage of missing value in each feature
- Distinct values in each categorized feature and frequency in each distinct value
- Histogram in each categorized and continuous feature
- Statistical describe, such as maximum, minimum, mean, etc, in each continuous feature
- Correlations Matrix between each two features in the dataset
# pandas-profiling renders an interactive HTML EDA report: missing values,
# distinct-value frequencies, histograms, descriptive statistics and the
# correlation matrix for every column.
import pandas_profiling
telecom_cust.profile_report()
📋 Data procession:
make a deep copy
%matplotlib inline
df=telecom_cust.copy()
df_Churn_yes=df[df['Churn']=='Yes'].copy()
df_Churn_no=df[df['Churn']=='No'].copy()
df_Churn_yes['TotalCharges']=pd.to_numeric(df_Churn_yes['TotalCharges'],errors='coerce')
df_Churn_no['TotalCharges']=pd.to_numeric(df_Churn_no['TotalCharges'],errors='coerce')
# convert TotalCharges
# use mean of TotalCharges to fill nan
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')
df['TotalCharges']=df['TotalCharges'].fillna(df['TotalCharges'].mean())
Alter ‘No internet service’ and ‘No phone service’ to ‘No’
# Collapse 'No internet service' / 'No phone service' into a plain 'No'
# for the add-on service columns.
services_alert = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                  'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
dict_services = {'No internet service': 'No',
                 'No phone service': 'No',
                 'No': 'No',
                 'Yes': 'Yes'}
for column in services_alert:
    df[column] = df[column].map(dict_services)
# Sanity check: print the distinct values of every column.
for column in df.columns:
    print(column, ':', pd.unique(df[column]))
📋 Try to make some visualizations
Some visualizations have been done in profile_report, just give some demos
Target Distribution: Churn
Method1: use axes
# Method 1: draw the Churn class distribution through the axes API.
fig, ax = plt.subplots(figsize=(8,6))
feature_data = df['Churn'].value_counts()
plt.bar(feature_data.index, feature_data)
ax.set_title('Distribution in Churn')
ax.set_ylabel('count')
# Write the count inside each bar, a little below its top edge.
for label in feature_data.index:
    ax.text(label, feature_data[label] - 300, feature_data[label],
            va='center', ha='center', fontsize=15, color='white')
# Hide the top/right spines to reduce non-data ink.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
fig, ax = plt.subplots(figsize=(8, 6))
feature_data = df['gender'].value_counts()
# BUG FIX: the original passed x=np.unique(...) (alphabetical order) with
# y=value_counts() (descending-count order), so bars could be paired with
# the wrong labels. Take both x and y from the same Series, and draw on the
# axes we created (explicit ax=).
sns.barplot(x=feature_data.index, y=feature_data.values, ax=ax)
ax.set_title('Distribution in gender')
ax.set_ylabel('count')
# Annotate each bar with its count. Use .iloc for positional access:
# feature_data[i] with an integer key on a string-labelled Series relies on
# a deprecated positional fallback (removed in recent pandas).
for i in range(len(feature_data)):
    ax.text(i, feature_data.iloc[i] - 300, feature_data.iloc[i],
            va='center', ha='center', fontsize=15, color='white')
# remember reduce non-data ink
for s in ['top', 'right']:
    ax.spines[s].set_visible(False)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='gender', hue='Churn', data=df, ax=ax)
# Print the count near the top of each bar.
# BUG FIX: the original passed both color='k' and its alias c='white' to
# ax.text — matplotlib rejects duplicate specifications of the same
# property. Keep a single color (white, as in the earlier bar plot).
for patch in ax.patches:
    ax.text(patch.get_x() + patch.get_width() / 2, patch.get_height() * 0.90,
            patch.get_height(), ha='center', va='center',
            color='white', fontsize=15)
# remember reduce non-data ink
for s in ['top', 'right']:
    ax.spines[s].set_visible(False)
# Pie chart of the senior-citizen share.
# BUG FIX: the original passed y=feature_data (a DataFrame-only argument)
# to Series.plot.pie, and a second figsize even though a figure had already
# been created. Create one 6x6 figure and draw on its axes explicitly.
fig, ax = plt.subplots(figsize=(6, 6))
feature_data = df['SeniorCitizen'].value_counts() * 100 / len(df)
feature_data.plot.pie(ax=ax, shadow=True, autopct='%.1f%%',
                      labels=['No', 'Yes'], fontsize=15, explode=[0, 0.05])
ax.set_title('Percentage of Senior Citizens')
ax.set_ylabel('')
ax.legend()
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Left panel: overlaid tenure histograms for churned vs retained customers.
plt.sca(axes[0])
plt.hist(df_Churn_yes['tenure'], bins=50, alpha=0.9, label='Yes')
plt.hist(df_Churn_no['tenure'], bins=50, alpha=0.5, label='No')
plt.legend()
plt.title('Histgram of tenure')
# Right panel: stacked, density-normalised histogram plus an overall KDE.
plt.sca(axes[1])
plt.hist([df_Churn_yes['tenure'], df_Churn_no['tenure']], bins=50,
         histtype='barstacked', density=True)
combined_tenure = np.concatenate((df_Churn_yes['tenure'], df_Churn_no['tenure']))
sns.kdeplot(combined_tenure)
plt.title('Histgram of tenure with barstacked method')
KDE of Monthly Charges & Total Charges
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Left: KDE of MonthlyCharges split by churn status.
plt.sca(axes[0])
sns.kdeplot(df_Churn_yes['MonthlyCharges'], shade=True, alpha=0.2, legend=True, label='Churn')
sns.kdeplot(df_Churn_no['MonthlyCharges'], shade=True, alpha=0.2, legend=True, label='Not Churn')
plt.legend()
# FIX: title typo 'Chanrges' -> 'Charges'.
plt.title('The KDE of Monthly Charges categorized by Churn')
# Right: histogram of the same split.
plt.sca(axes[1])
sns.histplot(df_Churn_yes['MonthlyCharges'], bins=20, color='red', legend=True, label='Churn')
sns.histplot(df_Churn_no['MonthlyCharges'], bins=20, alpha=0.5, legend=True, label='Not Churn')
plt.legend()
# FIX: title typos 'histgram'/'Chanrges'.
plt.title('The histogram of Monthly Charges categorized by Churn')
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Left: KDE of TotalCharges split by churn status.
plt.sca(axes[0])
sns.kdeplot(df_Churn_yes['TotalCharges'], shade=True, alpha=0.2, legend=True, label='Churn')
sns.kdeplot(df_Churn_no['TotalCharges'], shade=True, alpha=0.2, legend=True, label='Not Churn')
plt.legend()
plt.title('The KDE of Total Charges categorized by Churn')
# Right: histogram of the same split.
plt.sca(axes[1])
sns.histplot(df_Churn_yes['TotalCharges'], bins=20, color='red', legend=True, label='Churn')
sns.histplot(df_Churn_no['TotalCharges'], bins=20, alpha=0.5, legend=True, label='Not Churn')
plt.legend()
# FIX: title typo 'histgram' -> 'histogram'.
plt.title('The histogram of Total Charges categorized by Churn')
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# Histogram + KDE of each charge column, coloured by churn status.
charge_specs = [('TotalCharges', 'ocean'), ('MonthlyCharges', 'CMRmap')]
for axis, (column, cmap) in zip(axes, charge_specs):
    sns.histplot(x=df[column], hue=df['Churn'], ax=axis, kde=True, bins=20, palette=cmap)
plt.tight_layout()
plt.show()
# Treat SeniorCitizen (0/1) as a categorical feature from here on.
df['SeniorCitizen'] = df['SeniorCitizen'].astype(str)
plt.figure(figsize=(10, 6))
# FIX: pass numeric_only=True — df contains many string columns and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently dropped those columns).
sns.heatmap(df.corr(numeric_only=True), annot = True)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Contract', hue='Churn', data=df, ax=ax)
# Print the count near the top of each bar.
# BUG FIX: the original passed both color='k' and its alias c='k' to
# ax.text — matplotlib rejects duplicate specifications of the same
# property. Keep a single color.
for patch in ax.patches:
    ax.text(patch.get_x() + patch.get_width() / 2, patch.get_height() * 0.90,
            patch.get_height(), ha='center', va='center', color='k', fontsize=15)
# remember reduce non-data ink
for s in ['top', 'right']:
    ax.spines[s].set_visible(False)
services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
            'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
# One churn countplot per service, on a 3x3 grid, bars ordered by frequency.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 18))
for idx, ser in enumerate(services):
    # FIX: the original reused `i` both as the subplot counter and as the
    # patch variable of the inner loop, shadowing the outer name; use
    # distinct names and divmod for the grid position.
    row, col = divmod(idx, 3)
    ax = axes[row, col]
    sns.countplot(x=ser, hue='Churn', data=df, ax=ax, order=df[ser].value_counts().index)
    # Annotate each bar with its height.
    # FIX: drop the duplicate colour kwarg (color='k' and its alias c='k'
    # were both passed — matplotlib rejects the duplicate).
    for patch in ax.patches:
        ax.text(patch.get_x() + patch.get_width() / 2, patch.get_height(),
                patch.get_height(), ha='center', color='k', fontsize=10)
    # remember reduce non-data ink
    for s in ['top', 'right']:
        ax.spines[s].set_visible(False)
📋 Let’s do analysis
First, try correspondence analysis between the various services and Churn.
Actually, you can choose any features you are really interested in and substitute them for the parameter ‘feature’ below.
import scipy
from scipy import stats
### hypothesis test and Chi square independence test
H0: the churn distribution is the same across the levels of the specified feature (the feature and Churn are independent)
H1: the churn distribution differs across the levels of the specified feature
def Chi_square_test(data, use_method=1, alpha=0.05):
    """Chi-square independence test on a two-way contingency table.

    Parameters
    ----------
    data : pd.DataFrame
        Contingency table of observed counts (rows = feature levels,
        columns = target levels).
    use_method : int
        1 -> plain chi-square via stats.chisquare on the flattened table
             (intended for degrees of freedom > 1);
        2 -> Yates continuity-corrected chi-square, recommended when the
             degrees of freedom equal 1.
    alpha : float
        Significance level used for the rejection boundary.

    Prints the statistic, the p-value and the accept/reject conclusion;
    returns None.
    """
    data = data.astype(int)
    data_j = data.sum() / data.sum().sum()  # column marginal probabilities
    data_i = data.sum(axis=1)               # row marginal totals (counts)
    # Degrees of freedom of an r x c table: (c - 1) * (r - 1).
    k = (len(data.columns) - 1) * (len(data) - 1)
    # Rejection boundary. NOTE(review): 1 - alpha/2 is a two-sided quantile;
    # the usual one-sided chi-square test would use 1 - alpha. Kept as-is to
    # preserve the notebook's published numbers — confirm intent.
    rej_boundary = stats.chi2.ppf(1 - alpha / 2, k)
    # Expected counts under independence: outer product of the marginals.
    data_exp = np.dot(data_i.values.reshape(-1, 1), data_j.values.reshape(1, -1))
    data_exp = pd.DataFrame(data_exp, index=data_i.index, columns=data_j.index)
    data_obj = data.values.flatten()
    data_exp = data_exp.values.flatten()

    def calculate_chi_val(use_method):
        if use_method == 1:
            # stats.chisquare uses df = len(f_obs) - 1 - ddof, so choose ddof
            # so that the effective df equals (c - 1) * (r - 1).
            chi_val, p_val = stats.chisquare(
                f_obs=data_obj, f_exp=data_exp,
                ddof=len(data_obj) - 1 - (len(data.columns) - 1) * (len(data) - 1))
            return chi_val, p_val
        elif use_method == 2:
            # Yates continuity correction: X_c^2 = sum((|O - E| - 0.5)^2 / E);
            # avoids overstating significance when df == 1.
            chi_val = 0
            for i in range(len(data_obj)):
                chi_val += (abs(data_obj[i] - data_exp[i]) - 0.5) ** 2 / data_exp[i]
            p_val = scipy.stats.chi2.sf(abs(chi_val), k)
            return chi_val, p_val

    chi_val, p_val = calculate_chi_val(use_method)
    print('The Chi square value of the sample is:{:.3f}, The corresponding degrees of freedom is: {}, \nwhen alpha={}, p-value is :{:.8f}, Reject domain boundary is: {:.3f}'
          .format(chi_val, k, alpha, p_val, rej_boundary))
    # BUG FIX: the conclusion previously hard-coded 'alpha=0.05' even when a
    # different alpha was passed (the loop below calls with alpha=0.01);
    # report the actual level, and fix the 'accpet' typo.
    if chi_val > rej_boundary:
        print(f'Conclusion: alpha={alpha}, refuse H0, accept H1. There are essential differences between {data.index.values} in {data.index.name}')
        print('-' * 120)
    else:
        print(f'Conclusion: alpha={alpha}, accept H0, refuse H1. There is no essential difference between {data.index.values} in {data.index.name}')
        print('-' * 120)
services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
            'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
# Test each service for independence against Churn.
for service in services:
    feature = [service, 'Churn']
    # Contingency table: rows = service levels, columns = churn label.
    data = df[feature].pivot_table(index=feature[0], columns=feature[1],
                                   aggfunc=len, fill_value=0)
    # Customers without internet/phone cannot choose the add-on service,
    # so drop those rows from the comparison.
    for absent_level in ('No internet service', 'No phone service'):
        if absent_level in data.index.values:
            data.drop(absent_level, inplace=True)
    print(f'data matrix:\n{data}\n')
    Chi_square_test(data, len(data.index.values) - 1, 0.01)
data matrix:
Churn No Yes
PhoneService
No 512 170
Yes 4662 1699
The Chi square value of the sample is:1.004, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.31624613, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, accpet H0, refuse H1. There is no essential difference between [‘No’ ‘Yes’] in PhoneService
data matrix:
Churn No Yes
MultipleLines
No 3053 1019
Yes 2121 850
The Chi square value of the sample is:11.326, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.00076412, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘No’ ‘Yes’] in MultipleLines
data matrix:
Churn No Yes
InternetService
DSL 1962 459
Fiber optic 1799 1297
No 1413 113
The Chi square value of the sample is:730.154, The corresponding degrees of freedom is: 2,
when alpha=0.01, p-value is :0.00000000, Reject domain boundary is: 10.597
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘DSL’ ‘Fiber optic’ ‘No’] in InternetService
data matrix:
Churn No Yes
OnlineSecurity
No 3450 1574
Yes 1724 295
The Chi square value of the sample is:206.490, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.00000000, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘No’ ‘Yes’] in OnlineSecurity
data matrix:
Churn No Yes
OnlineBackup
No 3268 1346
Yes 1906 523
The Chi square value of the sample is:47.652, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.00000000, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘No’ ‘Yes’] in OnlineBackup
data matrix:
Churn No Yes
DeviceProtection
No 3297 1324
Yes 1877 545
The Chi square value of the sample is:30.828, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.00000003, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘No’ ‘Yes’] in DeviceProtection
data matrix:
Churn No Yes
TechSupport
No 3440 1559
Yes 1734 310
The Chi square value of the sample is:190.988, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.00000000, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘No’ ‘Yes’] in TechSupport
data matrix:
Churn No Yes
StreamingTV
No 3281 1055
Yes 1893 814
The Chi square value of the sample is:28.156, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.00000011, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘No’ ‘Yes’] in StreamingTV
data matrix:
Churn No Yes
StreamingMovies
No 3260 1051
Yes 1914 818
The Chi square value of the sample is:26.536, The corresponding degrees of freedom is: 1,
when alpha=0.01, p-value is :0.00000026, Reject domain boundary is: 7.879
Conclusion: alpha=0.05, refuse H0, accpet H1. There are essential differences between [‘No’ ‘Yes’] in StreamingMovies
Conclusion:
We found that at significance level \alpha = 0.01:
1. Only "Phone Service" shows no essential difference in the churn percentage between customers who do and do not choose the service (H0 was accepted for it alone).
2. The other 8 services (including "Streaming TV", per the test output above) show an essential difference in churn percentage depending on whether the customer chose them.
para = 2
# Step 2: turn the contingency table into a probability matrix and
# standardize it against the independence expectation.
def calculate_probaility_matrix(data):
    """Return marginals, the joint probability matrix and its standardized
    residuals for a contingency table.

    Returns (row marginals, column marginals, probability matrix,
    standardized residual DataFrame, same residuals as an ndarray);
    also prints the probability matrix.
    """
    total = data.sum().sum()
    data_p = data / total                # joint probability matrix
    data_j = data_p.sum(axis=0)          # column marginal probabilities
    data_i = data_p.sum(axis=1)          # row marginal probabilities
    # Standardized residuals: (P - P_expected) / sqrt(P_expected), where the
    # expectation under independence is the outer product of the marginals.
    expected = np.outer(data_i.values, data_j.values)
    data_z = (data_p - expected) / np.sqrt(expected)
    data_z = pd.DataFrame(data_z, index=data.index, columns=data.columns)
    print(data_p)
    return data_i, data_j, data_p, data_z, data_z.to_numpy()
Q-R factor analysis and Correspondence Analysis
def Correspondence_Analysis(data, para=3):
    """Simple correspondence analysis of a two-way contingency table.

    Eigen-decomposes the standardized residual matrix to obtain the R-
    (column) and Q- (row) factor loading matrices, prints the inertia
    decomposition of each, and draws a 2-D positioning map that places row
    and column categories in the same factor space.

    Parameters
    ----------
    data : pd.DataFrame
        Contingency table; data.index.name / data.columns.name are used as
        the dimension labels on the plot.
    para : int
        Number of common factors to keep (must be >= 2 for the c1/c2 plot).
    """
    data_i, data_j, data_p, data_z, data_z_np = calculate_probaility_matrix(data)
    #3 R-factor load matrix: eigen-decompose Z'Z (column space).
    S_R = np.dot(data_z_np.T, data_z_np)
    eig_val_R, eig_fea_R = np.linalg.eig(S_R)  # eigenvalues/eigenvectors of the R matrix
    # Inertia table: contribution of each positive eigenvalue.
    dim_matrix = pd.DataFrame(sorted([i for i in eig_val_R if i > 0], reverse=True), columns=['inertia'])
    dim_matrix['eigenvalue'] = np.sqrt(dim_matrix['inertia'])
    dim_matrix['Corresponding part'] = dim_matrix['inertia'] / dim_matrix['inertia'].sum()
    dim_matrix['accumulative total'] = dim_matrix['Corresponding part'].cumsum()
    print(dim_matrix)
    # Keep the `para` largest eigenvalues; scale eigenvectors by sqrt(lambda).
    com_fea_index = [x[0] for x in sorted(enumerate(eig_val_R), reverse=True, key=lambda x: x[1])][:para]
    cols = ['c' + str(i + 1) for i in range(len(com_fea_index))]
    eig_fea_R = np.multiply(eig_fea_R[:, com_fea_index], np.sqrt(eig_val_R[com_fea_index]))
    R_matrix = pd.DataFrame(eig_fea_R, index=data_j.index, columns=cols)
    # Divide each loading by sqrt of the marginal probability.
    R_matrix['tmp'] = np.sqrt(data_j)
    for col in cols:
        R_matrix[col] = R_matrix[col] / R_matrix['tmp']
    R_matrix.drop('tmp', axis=1, inplace=True)
    #4 Q-factor load matrix: eigen-decompose ZZ' (row space), same recipe.
    S_Q = np.dot(data_z_np, data_z_np.T)
    eig_val_Q, eig_fea_Q = np.linalg.eig(S_Q)  # eigenvalues/eigenvectors of the Q matrix
    dim_matrix = pd.DataFrame(sorted([i for i in eig_val_Q if i > 0], reverse=True), columns=['inertia'])
    dim_matrix['eigenvalue'] = np.sqrt(dim_matrix['inertia'])
    dim_matrix['Corresponding part'] = dim_matrix['inertia'] / dim_matrix['inertia'].sum()
    dim_matrix['accumulative total'] = dim_matrix['Corresponding part'].cumsum()
    print(dim_matrix)
    com_fea_index = [x[0] for x in sorted(enumerate(eig_val_Q), reverse=True, key=lambda x: x[1])][:para]
    cols = ['c' + str(i + 1) for i in range(len(com_fea_index))]
    eig_fea_Q = np.multiply(eig_fea_Q[:, com_fea_index], np.sqrt(eig_val_Q[com_fea_index]))
    Q_matrix = pd.DataFrame(eig_fea_Q, index=data_i.index, columns=cols)
    Q_matrix = Q_matrix.astype(float, copy=False)  # eigenvectors can come back complex with a zero imaginary part
    Q_matrix['tmp'] = np.sqrt(data_i)
    for col in cols:
        Q_matrix[col] = Q_matrix[col] / Q_matrix['tmp']
    Q_matrix.drop('tmp', axis=1, inplace=True)
    #5 Visualization.
    # BUG FIX: the original read the module-level variable `feature` here, so
    # the function only worked if the caller happened to set that global;
    # derive the dimension names from the table itself instead. (The
    # in-function '%matplotlib inline' magic, invalid in plain Python, was
    # removed.)
    dimension_name = [data.index.name, data.columns.name]
    plt.figure(figsize=(10, 8))
    # Positioning map on the first two common factors c1/c2.
    plot_data = pd.concat([Q_matrix[['c1', 'c2']], R_matrix[['c1', 'c2']]], axis=0)
    plot_data.index = [dimension_name[0] + '_' + i for i in list(data_i.index)] + [dimension_name[1] + '_' + i for i in list(data_j.index)]
    plot_data['style'] = [dimension_name[0]] * data_i.shape[0] + [dimension_name[1]] * data_j.shape[0]
    plot_data = plot_data.fillna(0)
    # Rows drawn as circles, columns as squares.
    marks = {dimension_name[0]: 'o', dimension_name[1]: 's'}
    ax = sns.scatterplot(x='c2', y='c1', hue='style', style='style', markers=marks, data=plot_data)
    ax.set_xlim(left=plot_data['c2'].min() - 0.1, right=plot_data['c2'].max() + 0.1)
    ax.set_ylim(bottom=plot_data['c1'].min() - 0.1, top=plot_data['c1'].max() + 0.1)
    ax.axhline(0, color='k', lw=0.5, ls='--')
    ax.axvline(0, color='k', lw=0.5, ls='--')
    # Label every point with its category name, slightly offset.
    for idx in plot_data.index:
        ax.text(plot_data.loc[idx, 'c2'] + 0.005, plot_data.loc[idx, 'c1'] + 0.005, idx)
    plt.show()
# Example run: correspondence analysis of "Contract" against "Churn".
feature = ['Contract', 'Churn']
data = df[feature].pivot_table(index=feature[0], columns=feature[1],
                               aggfunc=len, fill_value=0)
Correspondence_Analysis(data, 2)
conclusion:
We found that if the user’s contract is Month-to-Month, he/she has a greater probability of becoming a churned customer. On the other hand, if users sign contracts every two years or once a year, they are more inclined not to churn.