请处理该数据中的缺失值和异常值。
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
# Load the GBK-encoded sales CSV and discard every row that contains at
# least one missing value.
data = pd.read_csv("某超市的销售数据.csv", encoding='gbk').dropna(axis=0, how="any")

# Sanity checks after the drop: leading rows, remaining (rows, columns),
# and the per-column count of nulls that are still present.
for summary in (data.head(), data.shape, data.isnull().sum()):
    print(summary)
解析
import numpy as np
import pandas as pd
# Read the sales table (GBK-encoded CSV).
data = pd.read_csv("某超市的销售数据.csv", encoding='gbk')
# Keep only the rows in which every column is populated.
complete_rows = data.notna().all(axis=1)
data = data[complete_rows]
# Show the first five rows of the cleaned table.
print(data.head())
数据统计处理
进行数据统计处理,统计各大类商品的销售总额。
销售总额=销售数量*销售金额
import numpy as np
import pandas as pd
# Load the sales records, drop rows with any missing value, and cast the
# amount and quantity columns to float before they are multiplied.
data = (
    pd.read_csv("某超市的销售数据.csv", encoding='gbk')
    .dropna(axis=0, how='any')
    .astype({'销售金额': float, '销售数量': float})
)
# Per-row total = amount * quantity, stored as a new column.
data['销售总额'] = data['销售金额'].mul(data['销售数量'])
print(data.head())  # first five rows, now including the total column

# Sum the per-row totals within each top-level category.
dl = data.groupby('大类名称')
dlsum = dl['销售总额'].sum()
print(dlsum)
数据可视化
绘制各大类商品的销售总额的条形图
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# Select a font that can render Chinese labels, and turn off the Unicode
# minus glyph so negative tick labels still display with that font.
mpl.rcParams.update({
    'font.sans-serif': ['KaiTi'],
    'axes.unicode_minus': False,
})

# Load the sales records, drop rows with any missing value, and cast the
# amount and quantity columns to float before they are multiplied.
data = (
    pd.read_csv("某超市的销售数据.csv", encoding='gbk')
    .dropna(axis=0, how='any')
    .astype({'销售金额': float, '销售数量': float})
)
# Per-row total = amount * quantity, stored as a new column.
data['销售总额'] = data['销售金额'].mul(data['销售数量'])
print(data.head())  # first five rows, now including the total column

# Sum the per-row totals within each top-level category.
dl = data.groupby('大类名称')
dlsum = dl['销售总额'].sum()
print(dlsum)

# Bar chart: one bar per category, bar height = summed total.
fig, ax = plt.subplots(figsize=(10, 10), dpi=80)
ax.set_xlabel("大类名称")
ax.set_ylabel("销售总额")
ax.set_title("大类销售总额图")
ax.bar(dlsum.index, dlsum.values)
plt.show()
饼图绘制
按月绘制各大类商品销售总额的占比饼图(每个月的数据绘制一个子图,如下图所示)。
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Select a font that can render Chinese labels, and turn off the Unicode
# minus glyph so negative tick labels still display with that font.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

# Load the sales records and drop every row that has a missing value.
data = pd.read_csv("某超市的销售数据.csv", encoding='gbk')
data.dropna(axis=0, how="any", inplace=True)
del data['顾客编号']  # this column is not read by anything below
data["销售金额"] = data["销售金额"].astype(float)
data["销售数量"] = data["销售数量"].astype(float)
# Per-row total = quantity * amount.
data['销售总额'] = data['销售数量'] * data['销售金额']
print(data['销售总额'])

# Overall total per top-level category, shown as a bar chart.
x = data.groupby(['大类名称'])
y = x['销售总额'].sum()
print(y)
plt.figure()
plt.bar(x=y.index, height=y.values)
plt.show()

# Totals keyed by (month, category); selecting a month on the first index
# level yields a Series indexed by category.
# NOTE(review): assumes the month column holds values such as 201501 - confirm
# against the CSV.
dl = data.groupby(['销售月份', '大类名称'])
dlsum = dl['销售总额'].sum()
print(dlsum[201501])
print(dlsum[201501].index)
print(dlsum[201501].values)

# One pie per month on a 2x2 grid. Each subplot slot is created exactly once;
# drawing the first month separately before the loop would put a second pie
# and a second title into slot 1.
p = plt.figure(figsize=(20, 20), dpi=600)
months = [201501, 201502, 201503, 201504]
for position, month in enumerate(months, start=1):
    monthly = dlsum[month]
    p.add_subplot(2, 2, position)
    plt.title(f"{position}月大类销售总额图")
    plt.pie(monthly.values, labels=monthly.index, autopct='%1.1f%%')
plt.show()