# Data visualization demo.
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Switch on seaborn's default theme for the figures drawn in this script.
sns.set()
# Suppress all warnings raised from here on.
warnings.filterwarnings('ignore')

# Location of the telecom churn CSV used throughout the demo.
CHURN_CSV_URL = 'https://labfile.oss.aliyuncs.com/courses/1283/telecom_churn.csv'
df = pd.read_csv(CHURN_CSV_URL)

# Preview of the first rows; the returned frame is neither stored nor printed.
df.head()
# --- Univariate visualisation ------------------------------------------------
# The two features used by the single-variable examples in this section.
features = ['Total day minutes', 'Total intl calls']

# Histograms with DataFrame.hist():
#df[features].hist(figsize=(10,4))

# Density plots:
#df[features].plot(kind='density',subplots=True,layout=(1,2),sharex=False,figsize=(10,4),legend=False,title=features)

# Values accepted by the `kind` argument of DataFrame.plot() (kind : str):
#   'line'    : line plot (default)
#   'bar'     : vertical bar plot
#   'barh'    : horizontal bar plot
#   'hist'    : histogram
#   'box'     : boxplot
#   'kde'     : Kernel Density Estimation plot
#   'density' : same as 'kde'
#   'area'    : area plot
#   'pie'     : pie plot
#   'scatter' : scatter plot
#   'hexbin'  : hexbin plot

# Histogram and density estimate together (normalised) with seaborn's distplot():
#sns.distplot(df['Total intl calls'])

# Box plot:
#sns.boxplot(x='Total intl calls',data=df)

# Box plot next to a violin plot:
#_,axes = plt.subplots(1,2,sharey=True,figsize=(6,4))
# sharey = True or 'all': x- or y-axis will be shared among all subplots.
#sns.boxplot(data=df['Total intl calls'],ax=axes[0])
# ax: Axes object to draw the plot onto, otherwise uses the current Axes.
#sns.violinplot(data=df['Total intl calls'],ax=axes[1])

# Descriptive statistics for the selected features.
# The result is neither stored nor printed.
df[features].describe()
# Distribution of the target: number of rows for each 'Churn' value.
df['Churn'].value_counts()

# Bar charts for inspecting categorical features:
# _,axes = plt.subplots(1,2,sharey=True,figsize=(10,4))
# sns.countplot(x='Churn',data=df,ax=axes[0])
# sns.countplot(x='Customer service calls',data=df,ax=axes[1])
# --- Multivariate visualisation ----------------------------------------------
# Columns left out of the numeric feature list (non-numeric / boolean /
# categorical ones). The label is 'State' with a capital S, matching the
# df['State'] lookups used for the crosstab and group-by further down.
non_numerical = {
    'State', 'International plan', 'Voice mail plan',
    'Area code', 'Churn', 'Customer service calls',
}
# Filter df.columns directly instead of going through set arithmetic so the
# features keep the column order of the DataFrame (a set has no stable order).
numerical = [col for col in df.columns if col not in non_numerical]

# corr() computes the correlation between every pair of features:
#corr_matrix = df[numerical].corr()
#sns.heatmap(corr_matrix)

# According to that correlation matrix the four charge columns can be derived
# from other variables, so they are dropped as redundant.
redundant_charges = {
    'Total day charge', 'Total eve charge',
    'Total night charge', 'Total intl charge',
}
numerical = [col for col in numerical if col not in redundant_charges]

# corr_matrix = df[numerical].corr()
# sns.heatmap(corr_matrix)
# Scatter plot:
# plt.scatter(df['Total day minutes'],df['Total night minutes'])

# seaborn's jointplot() draws the scatter plot together with the two marginal
# histograms:
# sns.jointplot(x='Total day minutes',y='Total night minutes',data=df,kind='scatter')

# Smoothed (kernel density) version of the joint plot.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally. No `hue` is given; the plot is drawn in the single colour
# requested through `color`.
sns.jointplot(x='Total day minutes', y='Total night minutes', data=df,
              kind='kde', color='g')
# kind{ "scatter" | "kde" | "hist" | "hex" | "reg" | "resid" }

# Scatter plot matrix:
# %config InlineBackend.figure_format = 'png'
# sns.pairplot(df[numerical])

# Quantitative vs. categorical: scatter of the two features coloured by 'Churn'
# (fit_reg=False turns off the regression line).
sns.lmplot(x='Total day minutes', y='Total night minutes', data=df,
           hue='Churn', fit_reg=False)
# Box plots of every feature in `numerical`, one subplot per feature, with the
# observations grouped by 'Churn' on the x axis.
# 'Customer service calls' is added back to the list for this grid.
numerical.append('Customer service calls')
n_cols = 4
fig, axes = plt.subplots(3, n_cols, figsize=(10, 6))
for idx, feat in enumerate(numerical):
    # Fill the grid row by row: idx -> (row, column).
    row, col = divmod(idx, n_cols)
    ax = axes[row, col]
    sns.boxplot(x='Churn', y=feat, data=df, ax=ax)
    # The x label is blanked on every subplot; the y label names the feature.
    ax.set_xlabel('')
    ax.set_ylabel(feat)
# Adjust the subplot spacing once, after all plots have been drawn.
fig.tight_layout()
# Box plot and violin plot of 'Total day minutes' for each 'Churn' value,
# drawn side by side on a shared y axis.
_, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(10, 6))
box_ax, violin_ax = axes
sns.boxplot(data=df, x='Churn', y='Total day minutes', ax=box_ax)
sns.violinplot(data=df, x='Churn', y='Total day minutes', ax=violin_ax)

# How the quantity changes across two categorical dimensions: one box-plot
# panel per 'Customer service calls' value, restricted to rows with fewer
# than 8 calls.
under_eight_calls = df[df['Customer service calls'] < 8]
sns.catplot(
    data=under_eight_calls,
    x='Churn',
    y='Total day minutes',
    col='Customer service calls',
    kind='box',
    col_wrap=4,
    height=3,
    aspect=.8,
)
# Categorical vs. categorical: number of rows per 'Customer service calls'
# value, split by 'Churn'. `x` is passed by keyword because recent seaborn
# releases no longer accept it positionally.
sns.countplot(x='Customer service calls', hue='Churn', data=df)

# Contingency table of 'State' against 'Churn', transposed so the states run
# along the columns. The result is neither stored nor printed.
pd.crosstab(df['State'], df['Churn']).T

# Churn rate per state (mean of 'Churn' within each state), highest first.
# The aggregation is named by the string 'mean' rather than the np.mean
# callable; the resulting column is still labelled 'mean', which is what
# sort_values() refers to.
df.groupby(['State'])['Churn'].agg(['mean']).sort_values(by='mean', ascending=False).T
# Some of the resulting figures are shown below.