import numpy as np
import pandas as pd
import warnings
#加载模块绘制图形
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
# Load the telecom churn dataset from the hosted CSV file.
DATA_URL = 'https://labfile.oss.aliyuncs.com/courses/1283/telecom_churn.csv'
df = pd.read_csv(DATA_URL)

df.head(5)  # first five rows
df.shape  # dimensions of the table as (rows, columns)
df.columns  # column labels
# df.info()  # overall summary information about the DataFrame
df.describe()  # basic statistics of the numeric features
df.describe(include=['object', 'bool'])  # statistics of the non-numeric features

# Value distribution of the 'Churn' column: absolute counts, then proportions.
churn = df['Churn']
churn.value_counts()
churn.value_counts(normalize=True)
# Sorting
# Rows with the largest daily charge first; sort_values orders ascending
# (smallest to largest) unless told otherwise.
df.sort_values('Total day charge', ascending=False).head(5)
# 'Churn' ascending, then 'Total day charge' descending within each Churn value.
df.sort_values(
    by=['Churn', 'Total day charge'],
    ascending=[True, False],
).head(5)
# Indexing and retrieving data
df['Churn'].mean()  # churn rate
# Mean of every numeric feature among churned customers. numeric_only=True is
# needed because the frame also holds text columns (e.g. 'State'); pandas >= 2.0
# raises TypeError on them instead of silently skipping them.
df[df['Churn'] == 1].mean(numeric_only=True)
# Average daytime call minutes of churned customers.
df[df['Churn'] == 1]['Total day minutes'].mean()
# Longest international usage among loyal customers without an international
# plan. NOTE: the comparison against 'No' only matches while the
# 'International plan' column still holds its raw 'Yes'/'No' strings.
df[(df['Churn'] == 0) & (df['International plan'] == 'No')]['Total intl minutes'].max()
# Label-based selection: .loc slices include both endpoints, so this returns
# rows 0 through 5 and columns 'State' through 'Area code'.
df.loc[0:5, 'State':'Area code']
# Position-based selection: first five rows, first three columns.
df.iloc[0:5, 0:3]
df[:1]  # first row
df[-1:]  # last row
# Applying functions to cells, columns and rows
df.apply(np.max)  # maximum of every column
# Select the states whose name starts with 'W'. str.startswith is used instead
# of indexing state[0] so that an empty string yields False rather than
# raising IndexError.
df[df['State'].apply(lambda state: state.startswith('W'))].head()
# Replacing values
d = {'No': False, 'Yes': True}
# Way 1: Series.map translates every value of the column through the dict.
df['International plan'] = df['International plan'].map(d)
df.head()
# Way 2: DataFrame.replace with a {column: mapping} dict.
# replace() leaves an object column of Python bools; relying on it to downcast
# to bool implicitly is deprecated since pandas 2.2, so infer_objects() makes
# the conversion explicit.
df = df.replace({'Voice mail plan': d}).infer_objects()
df.head()
# Grouping (groupby)
columns_to_show = ['Total day minutes', 'Total eve minutes', 'Total night minutes']
# Summary statistics per 'Churn' group, with an empty percentile list.
df.groupby(['Churn'])[columns_to_show].describe(percentiles=[])
# Aggregations are named by string rather than passed as NumPy callables:
# pandas >= 2.1 deprecates np.mean/np.std/np.min/np.max here and will later
# call them directly, which would switch std from ddof=1 to NumPy's ddof=0.
# The result columns are labelled 'mean', 'std', 'min' and 'max'.
df.groupby(['Churn'])[columns_to_show].agg(['mean', 'std', 'min', 'max'])
# Summary tables
# Pivot table: mean number of day, evening and night calls per area code.
df.pivot_table(
    values=['Total day calls', 'Total eve calls', 'Total night calls'],
    index=['Area code'],
    aggfunc='mean',
)
# Cross tabulation of 'Churn' against each plan column; the second table is
# normalized so the cells are proportions of all rows instead of counts.
pd.crosstab(index=df['Churn'], columns=df['International plan'])
pd.crosstab(index=df['Churn'], columns=df['Voice mail plan'], normalize=True)
# Adding rows and columns to a DataFrame
total_calls = (
    df['Total day calls']
    + df['Total eve calls']
    + df['Total night calls']
    + df['Total intl calls']
)
# `loc` is the position at which the new column is inserted; passing the
# current number of columns places 'Total calls' after the last column.
df.insert(loc=df.shape[1], column='Total calls', value=total_calls)
df.head()
# Alternative way: plain column assignment.
# df['Total calls'] = df['Total day calls']+df['Total eve calls']+df['Total night calls']+df['Total intl calls']
# Removing columns and rows with drop()
# Drop the column; inplace=True modifies the current DataFrame.
df.drop(columns=['Total calls'], inplace=True)
# Drop the rows labelled 1 and 2; without inplace the result is a new frame.
df.drop(index=[1, 2]).head()
# Each chart gets its own figure: without plt.figure(), consecutive seaborn
# calls in a script draw onto the same current axes and the second chart
# overlays the first.
plt.figure()
sns.countplot(x='International plan', hue='Churn', data=df)
# Counts of churn per number of customer service calls, with row/column totals.
pd.crosstab(df['Churn'], df['Customer service calls'], margins=True)
plt.figure()
sns.countplot(x='Customer service calls', hue='Churn', data=df)
# Display the figures when the file is run as a plain script.
plt.show()
# Machine learning — pandas
# Latest recommended article published on 2024-06-12 21:47:21