一.读取与处理新零售智能销售数据
import pandas as pd data4 = pd.read_csv('../data/订单表2018-4.csv', encoding='gbk') data5 = pd.read_csv('../data/订单表2018-5.csv', encoding='gbk') data6 = pd.read_csv('../data/订单表2018-6.csv', encoding='gbk') data7 = pd.read_csv('../data/订单表2018-7.csv', encoding='gbk') data8 = pd.read_csv('../data/订单表2018-8.csv', encoding='gbk') data9 = pd.read_csv('../data/订单表2018-9.csv', encoding='gbk') goods_info = pd.read_excel('../data/商品表.xlsx') print(data4.shape, data5.shape, data6.shape, data7.shape, data8.shape, data9.shape, goods_info.shape)
data = pd.concat([data4, data5, data6, data7, data8, data9], ignore_index=True) print('订单表合并后的形状为', data.shape) print('订单表各列的缺失值数目为:\n', data.isnull().sum()) # 删除缺失值 print('未做删除缺失值前订单表行列数目为:', data.shape) data = data.dropna(how='any') # 删除 print('删除完缺失值后订单表行列数目为:', data.shape) # 清洗商品表 print('商品表各列的缺失值数目为:\n', goods_info.isnull().sum()) # 删除缺失值 print('未做删除缺失值前商品表行列数目为:', goods_info.shape) goods_info = goods_info.dropna(how='any') print('删除完缺失值后商品表行列数目为:', goods_info.shape) # 从省市区中提取市的信息,并创建新列 data['市'] = data['省市区'].str[3: 6] print('经过处理后前5行为:\n', data.head())
# 定义一个需剔除的字符的list error_str = [' ', '(', ')', '(', ')', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'g', 'l', 'm', 'M', 'L', '听', '特', '饮', '罐', '瓶', '只', '装', '欧', '式', '&', '%', 'X', 'x', ';'] # 使用循环剔除指定字符 for i in error_str: data['商品详情'] = data['商品详情'].str.replace(i, '') # 新建一列 商品名称用于新数据存放 data['商品名称'] = data['商品详情'] data['商品名称'][0: 5]
# 删除金额较少的订单前的数据量
print(data.shape)
# 删除金额较少的订单后的数据量
data = data[data['总金额(元)'] >= 0.5]
print(data.shape)
# 将商品名称表中的部分商品进行名字统一 goods_info['商品名称'] = goods_info['商品名称'].str.replace('可口可乐', '可乐') goods_info['商品名称'] = goods_info['商品名称'].str.replace(' ', '') goods_info['商品名称'] = goods_info['商品名称'].str.replace('可比克薯片烧烤味', '可比克烧烤味') goods_info['商品名称'] = goods_info['商品名称'].str.replace('可比克薯片牛肉味', '可比克牛肉味') goods_info['商品名称'] = goods_info['商品名称'].str.replace('可比克薯片番茄味', '可比克番茄味') goods_info['商品名称'] = goods_info['商品名称'].str.replace('阿沙姆奶茶', '阿萨姆奶茶') goods_info['商品名称'] = goods_info['商品名称'].str.replace('罐装百威', '罐装百威啤酒') print(goods_info['商品名称']) goods_info.to_csv('../tmp/goods_info.csv', index=False, encoding = 'gbk')
# 降维订单数据 data = data.drop(['手续费(元)', '收款方', '软件版本', '省市区', '商品详情', '退款金额(元)'], axis=1) print('降维后,数据列为:\n', data.columns.values)
# 归约订单数据字段 # 将时间格式的字符串转换为标准的时间 data['下单时间'] = pd.to_datetime(data['下单时间']) data['小时'] = data['下单时间'].dt.hour # 提取时间中的小时,将其赋给新列 小时 data['月份'] = data['下单时间'].dt.month data['下单时间段'] = 'time' # 新增一列下单时间段,并将其初始化为time exp1 = data['小时'] <= 5 # 判断小时是否小于等于5 # 条件为真则时间段为凌晨 data.loc[exp1, '下单时间段'] = '凌晨' # 判断小时是否大于5且小于等于8 exp2 = (5 < data['小时']) & (data['小时'] <= 8) # 条件为真则时间段为早晨 data.loc[exp2, '下单时间段'] = '早晨' # 判断小时是否大于8且小于等于11 exp3 = (8 < data['小时']) & (data['小时'] <= 11) # 条件为真则时间段为上午 data.loc[exp3, '下单时间段'] = '上午' # 判断小时是否小大于11且小于等于13 exp4 = (11 < data['小时']) & (data['小时'] <= 13) # 条件为真则时间段为中午 data.loc[exp4, '下单时间段'] = '中午' # 判断小时是否大于13且小于等于16 exp5 = (13 < data['小时']) & (data['小时'] <= 16) # 条件为真则时间段为下午 data.loc[exp5, '下单时间段'] = '下午' # 判断小时是否大于16且小于等于19 exp6 = (16 < data['小时']) & (data['小时'] <= 19) # 条件为真则时间段为傍晚 data.loc[exp6, '下单时间段'] = '傍晚' # 判断小时是否大于19且小于等于24 exp7 = (19 < data['小时']) & (data['小时'] <= 24) # 条件为真则时间段为晚上 data.loc[exp7, '下单时间段'] = '晚上' print('处理完成后的订单表前5行为:\n', data.head()) data.to_csv('../tmp/order.csv', index=False, encoding = 'gbk')
二. 绘制可视化图形
# 销售额和新零售智能销售设备数量之间的关系 import pandas as pd import numpy as np from pyecharts.charts import Line from pyecharts import options as opts import matplotlib.pyplot as plt from pyecharts.charts import Bar from pyecharts.charts import Pie from pyecharts.charts import Grid data = pd.read_csv('../tmp/order.csv', encoding='gbk') def f(x): return len(list(set((x.values)))) groupby1 = data.groupby(by='月份', as_index=False).agg( {'设备编号': f, '总金额(元)': np.sum}) groupby1.columns = ['月份', '设备数量', '销售额'] line = (Line() .add_xaxis([str(i) for i in groupby1['月份'].values.tolist()]) .add_yaxis('销售额', np.round(groupby1['销售额'].values.tolist(), 2)) .add_yaxis('设备数量', groupby1['设备数量'].values.tolist(), yaxis_index=1) .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='top', font_size=10)) .set_global_opts( xaxis_opts=opts.AxisOpts( name='月份', name_location='center', name_gap=25), title_opts=opts.TitleOpts( title='销售额和新零售智能销售设备数量之间的关系'), yaxis_opts=opts.AxisOpts( name='销售额(元)', name_location='center', name_gap=60, axislabel_opts=opts.LabelOpts( formatter='{value}'))) .extend_axis( yaxis=opts.AxisOpts( name='设备数量(台)', name_location='center', name_gap=40, axislabel_opts=opts.LabelOpts( formatter='{value}'), interval=50)) ) line.render_notebook()
# 订单量和新零售智能销售设备数量的关系 groupby2 = data.groupby(by='月份', as_index=False).agg( {'设备编号': f, '订单编号': f}) groupby2.columns = ['月份', '设备数量', '订单数量'] # 绘制图形 plt.figure(figsize=(10, 4)) plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False fig, ax1 = plt.subplots() # 使用subplots函数创建窗口 ax1.plot(groupby2['月份'], groupby2['设备数量'], '--') ax1.set_yticks(range(0, 350, 50)) # 设置y1轴的刻度范围 ax1.legend(('设备数量',), loc='upper left', fontsize=10) ax2 = ax1.twinx() # 创建第二个坐标轴 ax2.plot(groupby2['月份'], groupby2['订单数量']) ax2.set_yticks(range(0, 100000, 10000)) # 设置y2轴的刻度范围 ax2.legend(('订单数量',), loc='left', fontsize=10) ax1.set_xlabel('月份') ax1.set_ylabel('设备数量(台)') ax2.set_ylabel('订单数量(单)') plt.title('订单数量和新零售智能销售设备数量之间的关系') plt.show()
# 各城市新零售智能设备平均销售总额条形图 gruop3 = data.groupby(by='市', as_index=False).agg({'总金额(元)':sum, '设备编号':f}) gruop3['销售总额'] = np.round(gruop3['总金额(元)'], 2) gruop3['平均销售总额'] = np.round(gruop3['销售总额'] / gruop3['设备编号'], 2) plt.bar(gruop3['市'].values.tolist(), gruop3['平均销售总额'].values.tolist(), color='#483D8B') # 给条形图添加数据标注 for x, y in enumerate(gruop3['平均销售总额'].values): plt.text(x - 0.4, y + 100, '%s' %y, fontsize=8) plt.title('各市新零售智能设备平均销售总额') plt.show()
from pyecharts.charts import Bar from pyecharts import options as opts import numpy as np group4 = data.groupby(by='商品名称', as_index=False)['总金额(元)'].sum() group4.sort_values(by='总金额(元)', ascending=False, inplace=True) d = group4.iloc[: 10] x_data = d['商品名称'].values.tolist() y_data = np.round(d['总金额(元)'].values, 2).tolist() bar = (Bar() .add_xaxis(x_data) .add_yaxis('', y_data, color='#CD853F') .set_global_opts(title_opts=opts.TitleOpts(title='畅销前10的商品'), xaxis_opts=opts.AxisOpts( type_='category', name_rotate='45', axislabel_opts={'interval': '0'}))) bar.render_notebook()
# 销售金额后10的商品及其金额 group4 = data.groupby(by='商品名称', as_index=False)['总金额(元)'].sum() group4.sort_values(by='总金额(元)', ascending=False, inplace=True) d = group4.iloc[-10: ] x_data = d['商品名称'].values.tolist() y_data = np.round(d['总金额(元)'].values, 2).tolist() bar = (Bar() .add_xaxis(x_data) .add_yaxis('', y_data, label_opts=opts.LabelOpts(position='right')) .set_global_opts(title_opts=opts.TitleOpts( title='滞销后10的商品'), xaxis_opts=opts.AxisOpts( axislabel_opts={'interval': '0'})) .reversal_axis() ) grid=Grid(init_opts=opts.InitOpts(width='600px',height='400px')) grid.add(bar,grid_opts=opts.GridOpts(pos_left='18%')) grid.render_notebook()
# 绘制各个城市销售金额的饼图 # 销售金额前10的商品及其占比 from pyecharts.charts import Pie from pyecharts import options as opts import numpy as np group5 = data.groupby(by=['市', '商品名称'], as_index=False)['总金额(元)'].sum() group5.sort_values(by='总金额(元)', ascending=False, inplace=True) citys = list(set(group5['市'].values)) for j in range(len(citys)): city = group5[group5['市'] == citys[j]] city = city.iloc[:10] d = [[city.iloc[i][1], np.round(city.iloc[i][2], 2)] for i in range(len(city))] pie = (Pie(init_opts=opts.InitOpts(width='800px', height='600px')) .add('', d, radius=[20, 180], rosetype='radius', center=[400, 300], color =['#FF3366','#FF00CC','#666FF','#FFCC00','#FFCCCC', '#CCFF33','#33FF99','#999900','#99FFFF','#CCCCCC']) .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{d}%')) .set_global_opts(title_opts=opts.TitleOpts( title=citys[j], pos_bottom='10%', pos_left='50%'), legend_opts=opts.LegendOpts(is_show=False)) ) pie.render(citys[j]+'.html') pie.render_notebook()
# 绘制各自动售货机的销售总金额 group6 = data.groupby(by=['市', '设备编号'], as_index=False)['总金额(元)'].sum() group6.sort_values(by='总金额(元)', ascending=False, inplace=True) b = group6[: 10] label = [] # 前10的设备编号以及所在市 for i in range(len(b)): a=b.iloc[i, 0] + str(b.iloc[i, 1]) label.append(a) x = np.round(b['总金额(元)'], 2).values.tolist() y = range(10) plt.bar(x=0, bottom=y, height=0.4, width=x, orientation='horizontal') plt.xticks(range(0, 80000, 10000)) # 设置x轴的刻度范围 plt.yticks(range(10), label) for y, x in enumerate(np.round(b['总金额(元)'], 2).values): plt.text(x + 500, y - 0.2, "%s" %x) plt.xlabel('总金额(元)') plt.title('销售额前10的设备以及其所在市') plt.show()
# 绘制各新零售智能销售设备的销售总金额 group6 = data.groupby(by=['市', '设备编号'], as_index=False)['总金额(元)'].sum() group6.sort_values(by='总金额(元)', ascending=False, inplace=True) b = group6[-10: ] label1 = [] # 前10的设备编号以及所在市 for i in range(len(b)): a = b.iloc[i, 0] + str(b.iloc[i, 1]) label1.append(a) x = np.round(b['总金额(元)'], 2).values.tolist() y = range(10) plt.bar(x=0, bottom=y, height=0.4, width=x, orientation='horizontal') plt.xticks(range(0, 4, 1)) # 设置x轴的刻度范围 plt.yticks(range(10), label1) for y, x in enumerate(np.round(b['总金额(元)'], 2).values): plt.text(x, y, "%s" %x) plt.xlabel('总金额(元)') plt.title('销售额后10的设备以及其所在市') plt.show()
# 统计各城市销售金额小于100的设备数理 l_b = group6[group6['总金额(元)'] < 100] lb = l_b.groupby(by='市', as_index=False)['设备编号'].count() x_data = lb['市'].values.tolist() y_data = lb['设备编号'].values.tolist() bar = (Bar(init_opts=opts.InitOpts(width='500px', height='400px')) .add_xaxis(x_data) .add_yaxis('', y_data) .set_global_opts(title_opts=opts.TitleOpts( title='各市销售额小于100的设备数量')) ) bar.render_notebook()
# 计算售罄率 # 售罄率 = 销售量/进货量 goods_info = pd.read_csv('../tmp/goods_info.csv', encoding='gbk') sale_out = goods_info.groupby('月份').agg( {'销售数量': sum})['销售数量'] / goods_info.groupby('月份').agg( {'进货数量': sum})['进货数量'] # print('各月份的售罄率为:\n',sale_out) # 绘制售罄率月走势折线图 x_data = [str(i) + '月' for i in sale_out.index.tolist()] y_data = np.round(sale_out, 4).values.tolist() plt.plot(x_data, y_data) for i in range(len(y_data)): plt.text(x_data[i], y_data[i], '%s' %round(y_data[i],3), fontsize=10) plt.title('售罄率月走势') plt.show()
# 计算各个月库存成本 # 库存成本 = 销售单价 * 库存量 goods_info['库存成本'] = goods_info['销售金额'] / goods_info['销售数量'] * ( goods_info['库存数量']) goods_cost = goods_info.groupby('月份').agg({'库存成本': sum}) x_data = [str(i) + '月' for i in goods_cost.index.tolist()] y_data = np.round(goods_cost, 2).values.tolist() line = (Line() .add_xaxis(x_data) .add_yaxis('', y_data) .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='left')) .set_global_opts(title_opts=opts.TitleOpts( title='各个月库存成本走势')) ) line.render_notebook()
# 各个月销售数量,库存数量,进货数量的折线图 sale_in_out = goods_info.groupby( by='月份')['销售数量', '库存数量', '进货数量'].sum() x_data = [str(i) + '月' for i in sale_in_out.index.tolist()] line = (Line() .add_xaxis(x_data) .add_yaxis('销售数量', sale_in_out['销售数量'].values.tolist(), color='red', label_opts=opts.LabelOpts(is_show=False)) .add_yaxis('库存数量', sale_in_out['库存数量'].values.tolist(), color='blue', label_opts=opts.LabelOpts(is_show=False)) .add_yaxis('进货数量', sale_in_out['进货数量'].values.tolist(), color='green', label_opts=opts.LabelOpts(is_show=False)) .set_global_opts(title_opts=opts.TitleOpts( title='进货数量、库存数量和销售数量月走势')) ) line.render_notebook()
# 用户支付方式饼图 group7 = data.groupby(by='支付状态')['支付状态'].count() method = group7.index.tolist() num = group7.values.tolist() pie_data = [(i, j) for i, j in zip(method, num)] pie = (Pie() .add('', pie_data, label_opts=opts.LabelOpts(formatter='{b}:{c}({d}%)')) .set_global_opts(title_opts=opts.TitleOpts(title='用户支付方式'))) pie.render_notebook()
# 各区域用户数目饼图 group8 = data.groupby(by='市')['购买用户'].count() cities = group8.index.tolist() num = group8.values.tolist() pie_data_2 = [(i, j) for i, j in zip(cities, num)] pie=(Pie() .add('', pie_data_2,label_opts=opts.LabelOpts( formatter='{b}:{c}({d}%)'), radius=[20, 100]) .set_global_opts(title_opts=opts.TitleOpts(title='用户所在城市')) ) pie.render_notebook()
# 用户的消费时段饼图 group9 = data.groupby(by='下单时间段')['购买用户'].count() times = group9.index.tolist() num = group8.values.tolist() pie_data_2 = [(i, j) for i, j in zip(times, num)] pie = (Pie() .add('', pie_data_2, label_opts=opts.LabelOpts(formatter='{b}:{c}({d}%)'), radius=[60, 200], rosetype='radius', is_clockwise=False) .set_global_opts(title_opts=opts.TitleOpts(title='用户消费时间段')) ) pie.render_notebook()