电信用户流失分析完整流程
客户流失分析流程
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline
import seaborn as sns
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
1.数据预处理
# Load the raw customer table from a CSV file located in the working directory.
data = pd.read_csv("./data2.csv")
# Notebook-style preview of the first rows.
data.head()
- ‘customerID’,用户ID
- ‘gender’,性别
- ‘SeniorCitizen’,是否老年人
- ‘Partner’,是否有伴侣
- ‘Dependents’, 是否有家属
- ‘tenure’,在网时长
- ‘PhoneService’,电话服务
- ‘MultipleLines’,多线路
- ‘InternetService’,互联网服务
- ‘OnlineSecurity’, 在线安全
- ‘OnlineBackup’,在线备份
- ‘DeviceProtection’,设备保护
- ‘TechSupport’,技术支持
- ‘StreamingTV’,电视流
- ‘StreamingMovies’,电影流
- ‘Contract’,订阅方式
- ‘PaperlessBilling’,无纸计费
- ‘PaymentMethod’,支付方式
- ‘MonthlyCharges’,月费
- ‘TotalCharges’,总消费
- ‘Churn’,流失与否
描述统计
- 数据形状
- 数据字段
- 缺失值
# Structural overview of the frame (columns, dtypes, non-null counts).
data.info()
# Shape, column names, total number of missing cells and per-column cardinality.
print("row:", len(data))
print("columns", len(data.columns))
print("\nFeatures:\n", list(data.columns))
print("\nMissing value:\n", data.isna().sum().sum())
print("\nUnique value:\n", data.nunique())
-
异常值
- 查找空格值
在用 astype 将 TotalCharges 转化为数值列时报错,经查原因是该列包含空格值;可以将空格值所在的行删除,也可以用 pd.to_numeric(data['TotalCharges'], errors='coerce') 先把无法解析的值转换为 NaN 再处理
- 查找空格值
# TotalCharges is read as text because some entries are blank. Coerce every
# value that cannot be parsed as a number to NaN; unlike replacing the exact
# string " ", this also covers "" and runs of several spaces, which would
# otherwise still break the float conversion below.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Drop the rows without a usable TotalCharges and rebuild a clean 0..n-1 index
# (the old index is discarded rather than kept as a column).
data = data.dropna(subset=['TotalCharges']).reset_index(drop=True)
# Guarantee a float column even when every value parsed as an integer.
data['TotalCharges'] = data['TotalCharges'].astype(float)

# For the internet add-on services, fold "No internet service" into plain "No"
# so that each of these columns has only two categories.
replace_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies']
for i in replace_cols:
    data[i] = data[i].replace({'No internet service': 'No'})
- 去重值
- 每列数据值的种类
- 数值描述统计
数据转换
-
二分类转换
-
连续数据分组转换
-
连续数据离散化处理
- 多值分类变量进行分组
- 二元分类转化为数值
- 连续变量转化为分组变量
def tenure_lab(data):
    """Return the tenure-bucket label for one record.

    Parameters
    ----------
    data : mapping-like
        Any object supporting ``data['tenure']`` (e.g. a DataFrame row).

    Returns
    -------
    str or None
        One of ``"Tenure_0-12"``, ``"Tenure_12-24"``, ``"Tenure_24-48"``,
        ``"Tenure_48-60"`` or ``"Tenure_gt_60"``. ``None`` is returned when
        no comparison holds (for example when the tenure is NaN).
    """
    tenure = data['tenure']
    # Inclusive upper bounds, checked in ascending order; the first match wins.
    upper_bounds = (
        (12, "Tenure_0-12"),
        (24, "Tenure_12-24"),
        (48, "Tenure_24-48"),
        (60, "Tenure_48-60"),
    )
    for upper, label in upper_bounds:
        if tenure <= upper:
            return label
    return "Tenure_gt_60" if tenure > 60 else None
# Label every row with its tenure bucket (row-wise application of tenure_lab).
data['tenure_group'] = data.apply(tenure_lab, axis=1)
- 定义函数,手动分组
- Kmeans聚类转化
2.数据可视化
# Two separate frames, one per outcome, used by the comparison plots.
churn = data[data['Churn'].eq('Yes')]
not_churn = data[data['Churn'].eq('No')]
单列分析
- 将单列数值计数,标签和计数值分离
# Identifier and target columns are tracked separately from the features.
Id_col = ['customerID']
target_col = ["Churn"]
# Categorical features: fewer than 6 distinct values, target excluded.
cat_cols = [
    col
    for col, n_unique in data.nunique().items()
    if n_unique < 6 and col not in target_col
]
# Numerical features: everything that is not categorical, target or id.
num_cols = [
    col
    for col in data.columns
    if col not in cat_cols + target_col + Id_col
]
-
分类变量分析
- 定义视图函数,遍历分类列,饼图
# Donut chart of the overall churn split.
# `lab` / `val` were referenced but never defined in the visible source; they
# are derived here from the value counts of the Churn column (labels and
# counts separated).
lab = data["Churn"].value_counts().keys().tolist()
val = data["Churn"].value_counts().values.tolist()

trace = go.Pie(labels=lab,
               values=val,
               # 'royalbue' was a typo for the CSS colour name 'royalblue'.
               marker=dict(colors=['royalblue', 'lime'],
                           line=dict(color='white', width=1.3)
                           ),
               rotation=90,
               hoverinfo="label + value+text",
               hole=.5
               )
layout = go.Layout(dict(title="Customer attrition in data",
                        plot_bgcolor="rgb(243,243,243)",
                        paper_bgcolor="rgb(243,243,243)",
                        )
                   )
data2 = [trace]
fig = go.Figure(data=data2, layout=layout)
py.iplot(fig)
def plot_pie(column):
    """Draw two side-by-side donut charts of `column`.

    The left donut shows the value distribution among churned customers
    (module-level `churn`), the right one among retained customers
    (module-level `not_churn`). The figure is rendered with `py.iplot`;
    nothing is returned.
    """
    panel_bg = "rgb(243,243,243)"

    def donut(frame, trace_name, x_domain):
        # One donut: category labels and their counts within `frame`.
        counts = frame[column].value_counts()
        return go.Pie(
            values=counts.values.tolist(),
            labels=counts.keys().tolist(),
            hoverinfo="label+percent+name",
            domain=dict(x=x_domain),
            name=trace_name,
            marker=dict(line=dict(width=2, color=panel_bg)),
            hole=.6,
        )

    def centre_label(caption, x_pos):
        # Caption annotation for one donut.
        return dict(text=caption, font=dict(size=13),
                    showarrow=False, x=x_pos, y=.5)

    traces = [
        donut(churn, "Churn Customers", [0, .48]),
        donut(not_churn, "Non churn customers", [.52, 1]),
    ]
    layout = go.Layout(
        title=column + " distribution in customer attrition ",
        plot_bgcolor=panel_bg,
        paper_bgcolor=panel_bg,
        annotations=[
            centre_label("churn customers", .15),
            centre_label("Non churn customers", .88),
        ],
    )
    py.iplot(go.Figure(data=traces, layout=layout))


# One pair of donuts per categorical feature.
for i in cat_cols:
    plot_pie(i)
-
数值型变量
- 定义视图函数,直方图
- 自身对比,以流失和未流失为方向,并列柱状图,颜色按流失未流失,可以直接定义百分比
def histogram(column):
    """Overlay percent-normalised histograms of `column` for both outcomes.

    One trace is built from the module-level `churn` frame and one from
    `not_churn`. The figure is rendered with `py.iplot`; nothing is returned.
    """
    panel_bg = "rgb(243,243,243)"

    def percent_hist(frame, trace_name):
        # Histogram whose bar heights are percentages of that group.
        return go.Histogram(
            x=frame[column],
            histnorm="percent",
            name=trace_name,
            marker=dict(line=dict(width=.5, color="black")),
            opacity=.9,
        )

    def styled_axis(axis_title):
        # Shared axis styling; only the title differs between x and y.
        return dict(gridcolor='rgb(255, 255, 255)',
                    title=axis_title,
                    zerolinewidth=1,
                    ticklen=5,
                    gridwidth=2)

    traces = [
        percent_hist(churn, "Churn Customers"),
        percent_hist(not_churn, "Non churn customers"),
    ]
    layout = go.Layout(
        title=column + " distribution in customer attrition ",
        plot_bgcolor=panel_bg,
        paper_bgcolor=panel_bg,
        xaxis=styled_axis(column),
        yaxis=styled_axis("percent"),
    )
    py.iplot(go.Figure(data=traces, layout=layout))


# One histogram figure per numerical feature.
for i in num_cols:
    histogram(i)
多列分析
- 数值型变量两两相关性分析
def scatter_matrix(df):
    """Render a scatter-plot matrix of tenure, MonthlyCharges and TotalCharges.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Churn", "tenure", "MonthlyCharges" and
        "TotalCharges".

    Points are coloured by their "Churn" value and the same value is shown
    as hover text. The figure is rendered with `py.iplot`; nothing is
    returned.
    """
    df = df.sort_values(by="Churn", ascending=True)
    classes = df["Churn"].unique().tolist()
    # Integer colour code per class; works for any number of classes instead
    # of assuming exactly two.
    class_code = {label: code for code, label in enumerate(classes)}
    color_vals = [class_code[cl] for cl in df["Churn"]]
    pl_colorscale = "Portland"
    # Hover text must follow the *sorted* row order used for the plotted
    # values. Looking labels up by 0..n-1 would return them in the original
    # order and attach the wrong text to the points.
    text = df["Churn"].tolist()

    trace = go.Splom(dimensions=[dict(label="tenure",
                                      values=df["tenure"]),
                                 dict(label='MonthlyCharges',
                                      values=df['MonthlyCharges']),
                                 dict(label='TotalCharges',
                                      values=df['TotalCharges'])],
                     text=text,
                     marker=dict(color=color_vals,
                                 colorscale=pl_colorscale,
                                 size=3,
                                 showscale=False,
                                 line=dict(width=.1,
                                           color='rgb(230,230,230)'
                                           )
                                 )
                     )
    # Common styling applied to each of the three x/y axis pairs.
    axis = dict(showline=True,
                zeroline=False,
                gridcolor="#fff",
                ticklen=4
                )
    layout = go.Layout(dict(title=
                            "Scatter plot matrix for Numerical columns for customer attrition",
                            autosize=False,
                            height=800,
                            width=800,
                            dragmode="select",
                            hovermode="closest",
                            plot_bgcolor='rgba(240,240,240, 0.95)',
                            xaxis1=dict(axis),
                            yaxis1=dict(axis),
                            xaxis2=dict(axis),
                            yaxis2=dict(axis),
                            xaxis3=dict(axis),
                            yaxis3=dict(axis),
                            )
                       )
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)
分组计数分析
- 按在网时长分组计数,并列条形图,颜色按流失与否
# Customers per tenure bucket, computed separately for each outcome.
tg_ch = (churn["tenure_group"].value_counts()
         .rename_axis("tenure_group")
         .reset_index(name="count"))
tg_nch = (not_churn["tenure_group"].value_counts()
          .rename_axis("tenure_group")
          .reset_index(name="count"))

# Bars for churned customers.
trace1 = go.Bar(
    x=tg_ch["tenure_group"],
    y=tg_ch["count"],
    name="Churn Customers",
    marker=dict(line=dict(width=.5, color="black")),
    opacity=.9,
)
# Bars for retained customers.
trace2 = go.Bar(
    x=tg_nch["tenure_group"],
    y=tg_nch["count"],
    name="Non Churn Customers",
    marker=dict(line=dict(width=.5, color="black")),
    opacity=.9,
)
layout = go.Layout(
    title="Customer attrition in tenure groups",
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
    xaxis=dict(gridcolor='rgb(255, 255, 255)',
               title="tenure group",
               zerolinewidth=1, ticklen=5, gridwidth=2),
    yaxis=dict(gridcolor='rgb(255, 255, 255)',
               title="count",
               zerolinewidth=1, ticklen=5, gridwidth=2),
)
data3 = [trace1, trace2]
fig = go.Figure(data=data3, layout=layout)
py.iplot(fig)
- 按在网时长分组,分析月费和总消费的散点图,颜色按在网时长分组
- 按流失与否分组,分析月费和总消费的散点图,颜色按流失与否
def plot_tenure_scatter(tenure_group, color):
    """Build a MonthlyCharges vs TotalCharges scatter trace for one tenure bucket.

    Rows are taken from the module-level `data` frame where "tenure_group"
    equals `tenure_group`; markers are drawn in `color` and the trace is
    named after the bucket. Returns the `go.Scatter` trace.
    """
    rows = data[data["tenure_group"] == tenure_group]
    marker_style = dict(
        line=dict(color="black", width=.2),
        size=4,
        color=color,
        symbol="diamond-dot",
    )
    return go.Scatter(
        x=rows["MonthlyCharges"],
        y=rows["TotalCharges"],
        mode="markers",
        marker=marker_style,
        name=tenure_group,
        opacity=.9,
    )
def plot_churncharges_scatter(churn, color):
    """Build a MonthlyCharges vs TotalCharges scatter trace for one churn value.

    `churn` is the value to match in the "Churn" column of the module-level
    `data` frame (it is also concatenated into the trace name, so it must be
    a string). Note that this parameter shadows the module-level `churn`
    frame inside the function. Returns the `go.Scatter` trace.
    """
    rows = data[data["Churn"] == churn]
    marker_style = dict(
        line=dict(color="black", width=.2),
        size=4,
        color=color,
        symbol="diamond-dot",
    )
    return go.Scatter(
        x=rows["MonthlyCharges"],
        y=rows["TotalCharges"],
        mode="markers",
        marker=marker_style,
        name="Churn - " + churn,
        opacity=.9,
    )
# One scatter trace per tenure bucket, each with its own marker colour.
trace1 = plot_tenure_scatter("Tenure_0-12","#FF3300")
trace2 = plot_tenure_scatter("Tenure_12-24","#6666FF")
trace3 = plot_tenure_scatter("Tenure_24-48","#99FF00")
trace4 = plot_tenure_scatter("Tenure_48-60","#996600")
trace5 = plot_tenure_scatter("Tenure_gt_60","grey")
# One scatter trace per churn outcome.
trace6 = plot_churncharges_scatter("Yes","red")
trace7 = plot_churncharges_scatter("No","blue")
# Trace lists for the two figures: by tenure bucket, and by churn outcome
# (the "No" trace is listed before the "Yes" trace).
data1 = [trace1,trace2,trace3,trace4,trace5]
data2 = [trace7,trace6]
def layout_title(title):
    """Return the shared layout for the monthly-vs-total-charges scatter plots.

    Only the figure title varies; the axis titles ("Monthly charges",
    "Total Charges"), colours and the 600 px height are fixed.
    """
    def charge_axis(axis_title):
        # Identical styling for both axes apart from the title.
        return dict(gridcolor='rgb(255, 255, 255)',
                    title=axis_title,
                    zerolinewidth=1, ticklen=5, gridwidth=2)

    return go.Layout(
        title=title,
        plot_bgcolor="rgb(243,243,243)",
        paper_bgcolor="rgb(243,243,243)",
        xaxis=charge_axis("Monthly charges"),
        yaxis=charge_axis("Total Charges"),
        height=600,
    )
# Same layout template for both figures; only the title differs.
layout1 = layout_title("Monthly Charges & Total Charges by Tenure group")
layout2 = layout_title("Monthly Charges & Total Charges by Churn group")
# Assemble and render the tenure-bucket figure and the churn-outcome figure.
fig1 = go.Figure(data = data1,layout = layout1)
fig2 = go.Figure(data = data2,layout = layout2)
py.iplot(fig1)
py.iplot(fig2)
多重分组分析
- 按在网时长分组、流失与否两个指标分组,并列条形图,计算平均月费和在网时长分组的关系,颜色流失与否
- 按在网时长分组、流失与否两个指标分组,并列条形图,计算平均总消费和在网时长分组的关系,颜色流失与否
# Mean monthly and total charges for every (tenure bucket, churn) pair.
avg_tgc = (
    data.groupby(["tenure_group", "Churn"])[["MonthlyCharges", "TotalCharges"]]
    .mean()
    .reset_index()
)


def mean_charges(column, aggregate):
    """Return a bar trace of the mean of `column` per tenure bucket.

    Only the rows of the module-level `avg_tgc` table whose "Churn" value
    equals `aggregate` are used; `aggregate` is also the trace name.
    """
    selected = avg_tgc[avg_tgc["Churn"] == aggregate]
    return go.Bar(
        x=selected["tenure_group"],
        y=selected[column],
        name=aggregate,
        marker=dict(line=dict(width=1)),
        text="Churn",
    )
def layout_plot(title, xaxis_lab, yaxis_lab):
    """Return a plot layout with the given figure title and axis titles.

    Background and grid styling are fixed; `xaxis_lab` and `yaxis_lab`
    become the x- and y-axis titles.
    """
    def labelled_axis(axis_title):
        # Identical styling for both axes apart from the title.
        return dict(gridcolor='rgb(255, 255, 255)',
                    title=axis_title,
                    zerolinewidth=1, ticklen=5, gridwidth=2)

    return go.Layout(
        title=title,
        plot_bgcolor="rgb(243,243,243)",
        paper_bgcolor="rgb(243,243,243)",
        xaxis=labelled_axis(xaxis_lab),
        yaxis=labelled_axis(yaxis_lab),
    )
# Figure 1: mean monthly charges per tenure bucket, churned vs retained.
trace1 = mean_charges("MonthlyCharges","Yes")
trace2 = mean_charges("MonthlyCharges","No")
layout1 = layout_plot("Average Monthly Charges by Tenure groups",
"Tenure group","Monthly Charges")
data1 = [trace1,trace2]
fig1 = go.Figure(data=data1,layout=layout1)
# Figure 2: mean total charges per tenure bucket, churned vs retained.
trace3 = mean_charges("TotalCharges","Yes")
trace4 = mean_charges("TotalCharges","No")
layout2 = layout_plot("Average Total Charges by Tenure groups",
"Tenure group","Total Charges")
data2 = [trace3,trace4]
fig2 = go.Figure(data=data2,layout=layout2)
# Render both figures.
py.iplot(fig1)
py.iplot(fig2)
3D图
- 月费,总费,在网时长分组的关系
# Keep a snapshot that still contains the tenure_group helper column, then
# remove that column from the working frame.
tel_df = data.copy()
data = data.drop(columns="tenure_group")
# 3D scatter of monthly charges, total charges and tenure, one trace per
# churn outcome; the hover text carries the customer id.
trace1 = go.Scatter3d(x=churn["MonthlyCharges"],
                      y=churn["TotalCharges"],
                      z=churn["tenure"],
                      mode="markers",
                      name="Churn customers",
                      text="Id : " + churn["customerID"],
                      marker=dict(size=1, color="red")
                      )
trace2 = go.Scatter3d(x=not_churn["MonthlyCharges"],
                      y=not_churn["TotalCharges"],
                      z=not_churn["tenure"],
                      name="Non churn customers",
                      text="Id : " + not_churn["customerID"],
                      mode="markers",
                      marker=dict(size=1, color="green")
                      )


def _scene_axis(axis_title):
    # Shared styling of the three scene axes; only the title differs.
    return dict(title=axis_title,
                gridcolor='rgb(255, 255, 255)',
                zerolinecolor='rgb(255, 255, 255)',
                showbackground=True,
                backgroundcolor='rgb(230, 230,230)')


layout = go.Layout(dict(title="Monthly charges,total charges & tenure in customer attrition",
                        scene=dict(
                            # The camera "up" vector must be non-zero to define
                            # an orientation; (0, 0, 0) was degenerate. Use
                            # plotly's documented default, z pointing up.
                            camera=dict(up=dict(x=0, y=0, z=1),
                                        center=dict(x=0, y=0, z=0),
                                        eye=dict(x=1.25, y=1.25, z=1.25)),
                            xaxis=_scene_axis("monthly charges"),
                            yaxis=_scene_axis("total charges"),
                            zaxis=_scene_axis("tenure"),
                        ),
                        height=700,
                        )
                   )
data4 = [trace1, trace2]
fig = go.Figure(data=data4, layout=layout)
py.iplot(fig)
3.数据处理
构建建模数据
-
将列分类
-
去掉ID列,和目标列
-
分类变量列
-
二分类列
- 2个分类值
-
多分类列
- 小于6个分类值
-
-
数值列
- 超过6个不同值,不包括ID、流失
-
-
对数值列做标准化处理
-
分类列做特征化处理
-
替换原先数值列,加入标准化后数值列
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Column roles, recomputed on the current frame.
Id_col = ['customerID']
target_col = ["Churn"]
# Categorical features: fewer than 6 distinct values, target excluded.
cat_cols = data.nunique()[data.nunique() < 6].keys().tolist()
cat_cols = [x for x in cat_cols if x not in target_col]
# Numerical features: everything that is not categorical, target or id.
num_cols = [x for x in data.columns if x not in cat_cols + target_col + Id_col]
# Binary columns: exactly two distinct values (the target is not excluded
# here, so it is label-encoded too if it has two values).
bin_cols = data.nunique()[data.nunique() == 2].keys().tolist()
# Multi-class columns: categorical but not binary.
multi_cols = [i for i in cat_cols if i not in bin_cols]

# Label-encode every binary column as integers.
le = LabelEncoder()
for i in bin_cols:
    data[i] = le.fit_transform(data[i])

# One-hot encode the multi-class columns.
data = pd.get_dummies(data=data, columns=multi_cols)

# Standardise the numerical columns (zero mean, unit variance).
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
# Carry over data's index explicitly: the scaled frame is merged back on the
# index below, and a default 0..n-1 index would silently misalign rows (or
# yield NaN) whenever data is not indexed 0..n-1.
scaled = pd.DataFrame(scaled, columns=num_cols, index=data.index)

# Keep an unscaled copy for the summary statistics, then swap the original
# numerical columns for their standardised versions.
df_telcom_og = data.copy()
data = data.drop(columns=num_cols)
data = data.merge(scaled, left_index=True, right_index=True, how="left")
# Descriptive statistics of every non-id column of the unscaled frame,
# one row per feature, rounded to 3 decimals.
summary = (
    df_telcom_og[[col for col in df_telcom_og.columns if col not in Id_col]]
    .describe()
    .transpose()
    .reset_index()
    .rename(columns={"index": "feature"})
)
summary = np.around(summary, 3)

# Table cells are supplied column by column.
val_lst = [
    summary[stat]
    for stat in ('feature', 'count', 'mean', 'std',
                 'min', '25%', '50%', '75%', 'max')
]

table_header = dict(
    values=summary.columns.tolist(),
    line=dict(color=['#506784']),
    fill=dict(color=['#119DFF']),
)
table_cells = dict(
    values=val_lst,
    line=dict(color=['#506784']),
    fill=dict(color=["lightgrey", '#F5F8FF']),
)
trace = go.Table(
    header=table_header,
    cells=table_cells,
    columnwidth=[200, 60, 100, 100, 60, 60, 80, 80, 80],
)
layout = go.Layout(title="Variable Summary")
figure = go.Figure(data=[trace], layout=layout)
py.iplot(figure)
统计分析,所有数值保留3位小数
新变量相关矩阵分析
# Pairwise Pearson correlation of the modelling features.
# Restrict the frame to numeric/boolean columns explicitly: the string
# customerID column makes DataFrame.corr() raise on pandas >= 2.0 (older
# releases silently skipped non-numeric columns).
correlation = data.select_dtypes(include=["number", "bool"]).corr()
# Tick labels.
matrix_cols = correlation.columns.tolist()
# Correlation values as a plain array.
corr_array = np.array(correlation)

# Heatmap of the correlation matrix.
trace = go.Heatmap(z=corr_array,
                   x=matrix_cols,
                   y=matrix_cols,
                   colorscale="Viridis",
                   # The flat `titleside` attribute was removed from recent
                   # plotly releases; the nested title.side form is used.
                   colorbar=dict(title=dict(text="Pearson Correlation coefficient",
                                            side="right")
                                 ),
                   )
layout = go.Layout(dict(title="Correlation Matrix for variables",
                        autosize=False,
                        height=720,
                        width=800,
                        # Wide left/bottom margins leave room for the long
                        # feature names used as tick labels.
                        margin=dict(r=0, l=210,
                                    t=25, b=210,
                                    ),
                        yaxis=dict(tickfont=dict(size=9)),
                        xaxis=dict(tickfont=dict(size=9))
                        )
                   )
data5 = [trace]
fig = go.Figure(data=data5, layout=layout)
py.iplot(fig)
主成分分析与可视化
- 散点图,颜色流失与否
二分类变量可视化
- 定义极坐标函数,变量分类计数,颜色1,0
# Columns with exactly two distinct values, and the sub-frame holding them.
# The frame is called `data` in this file; `telcom` is not defined anywhere in
# the visible source and would raise NameError.
bi_cs = data.nunique()[data.nunique() == 2].keys()
dat_rad = data[bi_cs]
def plot_radar(df, aggregate, title):
    """Render a radar chart of the binary-feature counts for one churn class.

    Rows of `df` whose "Churn" equals `aggregate` are selected. For every
    column in the module-level `bi_cs` (except "Churn") the column sum is
    plotted as "count 1" and the selected row count minus that sum as
    "count 0". The figure is rendered with `py.iplot`; nothing is returned.
    """
    subset = df[df["Churn"] == aggregate]

    # Per-feature sums ("yes") and the complement up to the row count ("no").
    counts = subset[bi_cs].sum().reset_index()
    counts.columns = ["feature", "yes"]
    counts["no"] = subset.shape[0] - counts["yes"]
    counts = counts[counts["feature"] != "Churn"]

    def polar_trace(count_column, trace_name):
        # Closed, filled polygon over all features for one count column.
        return go.Scatterpolar(
            r=counts[count_column].values.tolist(),
            theta=counts["feature"].tolist(),
            fill="toself",
            name=trace_name,
            mode="markers+lines",
            marker=dict(size=5),
        )

    ones_trace = polar_trace("yes", "count 1")
    zeros_trace = polar_trace("no", "count 0")

    radial_axis = dict(visible=True,
                       side="counterclockwise",
                       showline=True,
                       linewidth=2,
                       tickwidth=2,
                       gridcolor="white",
                       gridwidth=2)
    angular_axis = dict(tickfont=dict(size=10),
                        layer="below traces")
    layout = go.Layout(
        polar=dict(radialaxis=radial_axis,
                   angularaxis=angular_axis,
                   bgcolor="rgb(243,243,243)"),
        paper_bgcolor="rgb(243,243,243)",
        title=title,
        height=700,
    )
    # The "count 0" trace is listed first, then "count 1".
    traces = [zeros_trace, ones_trace]
    py.iplot(go.Figure(data=traces, layout=layout))
# Draw one radar chart for rows with Churn == 1 and one for rows with Churn == 0.
plot_radar(dat_rad,1,"Churn")
plot_radar(dat_rad,0,"No Churn")
4.数据建模
定义算法视图函数
- 混淆视图
- ROC曲线
- 特征重要性
- 阈值图
回归模型
- baseline
特征smote
回归模型-smote处理
- 克服样本不均衡
回归模型-RFE
- 递归特征消除
决策树分类
-
特征选取
-
SelectKBest,Chi
-
分类变量得分
- 散点图
-
数值变量得分
- 柱形图
-
-
-
模型拟合
-
取分类变量、数值变量得分的前三特征
-
数值变量
- gini
-
分类变量
- entropy
-
KNN分类器
-
定义新的视图函数
-
knn拟合
- smote特征
随机森林
-
定义随机森林视图函数
-
10棵决策树
-
原始特征
- entropy
-
-
10棵决策树
-
rfe特征
- gini
-
高斯朴素贝叶斯
- smote特征
SVM支持向量机
-
smote特征
- linear核函数
-
调参
-
smote特征
-
非线性超平面
- rbf核函数
-
-
LGBM
- smote特征
XGBoost
- smote特征
XMind: ZEN - Trial Version