【数据分析】京东订单数据分析思路及Python代码

最新推荐文章于 2023-08-21 14:25:20 发布

♚人间海

最新推荐文章于 2023-08-21 14:25:20 发布

阅读量3.5k

点赞数 7

文章标签：数据分析 python 数据挖掘

本文链接：https://blog.csdn.net/weixin_44727383/article/details/125019521

版权

京东订单数据介绍
● 2020年5月25日
● 10%抽样数据
● 大家电-家用电器-冰箱
● 70k+

	用户属性
user_log_acct	用户账号
user_site_city_id	用户城市ID
user_site_province_id	用户省份ID
user_lv_cd	用户等级

	订单属性
sale_ord_id	订单ID
sale_ord_tm	订单创建时间
sale_ord_valid_flag	订单是否有效
cancel_flag	订单是否被取消
check_account_tm	支付时间

	商品属性
item_name	商品名称
brandname	品牌名称
sale_qtty	产品数量
before_prefr_unit_price	优惠前单价
after_prefr_unit_price	优惠后单价
user_actual_pay_amount	实际支付价格
total_offer_amount	总优惠金额

数据清理
缺失值处理
用户城市和省份信息有部分缺失，猜测原因是部分用户隐藏了IP地址，不影响分析
部分订单的支付时间（check_account_tm）是空值，原因是订单尚未支付，可以将其赋特殊值（比如-1）或者不处理，不影响分析
数据逻辑错误处理
有用户的城市ID为-1
优惠前冰箱的最低价格为288，数据中存在大量的低价订单，其中绝大部分是保修、返现等无价值订单，一小部分是异常订单，可以忽略
订单数据分析的基本思路
数据指标
订单总量（有效订单量、取消订单量、待支付订单量）
GMV （所有有效订单的总交易额）
实际销售额
客单价
商品销售数量
用户数
复购率
支付时长（支付时长过长会影响用户体验、造成用户流失）

数据维度

商品维度	不同类目的商品、同一类目不同净值的商品
时间维度	趋势变化、节假日及特殊日期
用户维度	商品偏好
地区维度	用户行为、用户画像

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
from matplotlib.ticker import FuncFormatter
plt.rcParams['font.sans-serif']=['Arial Unicode MS']

import warnings
warnings.filterwarnings('ignore')

订单数据指标描述
t.user_log_acct, --用户账号
t.parent_sale_ord_id, --父订单号
t.sale_ord_id, --订单号
t.sale_ord_tm, --订单时间
t.sale_ord_dt, --订单日期
t.item_sku_id, --商品sku
t.item_name, --商品名称
t.brandname, --品牌名称
t.sale_qtty, --商品数量
t.item_first_cate_name, --一级品类名称
t.item_second_cate_name, --二级品类名称
t.item_third_cate_name, --三级品类名称
t.before_prefr_unit_price, --优惠前单价
t.after_prefr_unit_price, --优惠后单价
t.user_actual_pay_amount, --实际支付价格
t.sale_ord_valid_flag, --订单有效标志
t.cancel_flag, --订单取消标志
t.check_account_tm, --支付时间
t.total_offer_amount, --总优惠金额
t.self_ord_flag, --自营标志
t.user_site_city_id, --用户所在城市编号
t.user_site_province_id, --用户所在省份编号
t.user_lv_cd --用户会员等级

# Load the raw order export. The file is tab-separated, and every column is
# read as a string; numeric and datetime columns are converted explicitly
# further down.
order = 'course_order_d.csv'
df = pd.read_csv(
    order,
    sep='\t',
    encoding="utf-8",
    dtype=str,
)
df.head()

在这里插入图片描述
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76631 entries, 0 to 76630
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   user_log_acct            76631 non-null  object
 1   parent_sale_ord_id       76631 non-null  object
 2   sale_ord_id              76631 non-null  object
 3   sale_ord_tm              76631 non-null  object
 4   sale_ord_dt              76631 non-null  object
 5   item_sku_id              76631 non-null  object
 6   item_name                76631 non-null  object
 7   brandname                76631 non-null  object
 8   sale_qtty                76631 non-null  object
 9   item_first_cate_name     76631 non-null  object
 10  item_second_cate_name    76631 non-null  object
 11  item_third_cate_name     76631 non-null  object
 12  before_prefr_unit_price  76631 non-null  object
 13  after_prefr_unit_price   76631 non-null  object
 14  user_actual_pay_amount   76631 non-null  object
 15  sale_ord_valid_flag      76631 non-null  object
 16  cancel_flag              76631 non-null  object
 17  check_account_tm         53360 non-null  object
 18  total_offer_amount       76631 non-null  object
 19  self_ord_flag            76631 non-null  object
 20  user_site_city_id        38441 non-null  object
 21  user_site_province_id    38598 non-null  object
 22  user_lv_cd               76631 non-null  object
dtypes: object(23)
memory usage: 13.4+ MB

# Count missing values per column, most incomplete columns first.

df.isnull().sum().sort_values(ascending=False)

# Convert the string columns to their proper dtypes.

# Quantities and 0/1 flags become integers.
for column in ('sale_qtty', 'sale_ord_valid_flag', 'cancel_flag', 'self_ord_flag'):
    df[column] = df[column].astype('int')

# Prices and amounts become floats.
for column in ('before_prefr_unit_price', 'after_prefr_unit_price',
               'user_actual_pay_amount', 'total_offer_amount'):
    df[column] = df[column].astype('float')

# Timestamps become datetimes. The key must be exactly 'check_account_tm'
# (no trailing space); otherwise a brand-new column is created and the real
# payment-time column stays as strings. Whole-column assignment is used so the
# converted dtype replaces the old one.
for column in ('check_account_tm', 'sale_ord_tm', 'sale_ord_dt'):
    df[column] = pd.to_datetime(df[column])

# Missing-value and outlier handling.

# The lowest pre-discount refrigerator price is 288 yuan; orders priced below
# that are treated as abnormal. Count them first.
df['before_prefr_unit_price'].lt(288).sum()

# Count negative values in the remaining monetary columns.
df['after_prefr_unit_price'].lt(0).sum()

df['user_actual_pay_amount'].lt(0).sum()

df['total_offer_amount'].lt(0).sum()

# Keep only the rows at or above the 288-yuan floor.
df = df.loc[df['before_prefr_unit_price'].ge(288)]
print('删除异常值后：',df.shape)

# Flag rows whose order ID repeats an earlier row.

df.sale_ord_id.duplicated()
# Drop rows with a duplicated order ID, keeping the first occurrence.
# `subset` must be passed as a keyword argument. The result is re-assigned
# rather than dropped in place because `df` is itself a filtered slice.
df = df.drop_duplicates(subset=['sale_ord_id'], keep='first')
df.info()

# Replace missing city / province IDs with an explicit placeholder.

df['user_site_city_id'] = df['user_site_city_id'].fillna('Not Given')
df['user_site_province_id'] = df['user_site_province_id'].fillna('Not Given')

# Summary statistics of the numeric columns.

df.describe()

# Total paid amount per order line: quantity times discounted unit price.

df['total_actual_pay'] = df['sale_qtty'].mul(df['after_prefr_unit_price'])
df

宏观分析

把握总体特征

订单总量: 62162
总用户数：42464
GMV：52716192 元
销售额: 46375448 元

# Number of cancelled orders (cancel_flag == 1).
order_cancel = df.loc[df['cancel_flag'].eq(1), 'sale_ord_id'].count()
order_cancel
# Total number of orders (non-null order IDs).
order_num = df.loc[:, 'sale_ord_id'].count()
order_num

解决matplotlib中文乱码


# Use the SimHei font so Chinese labels render, and keep the minus sign
# displayable with that font.
matplotlib.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Pie chart: cancelled versus not-cancelled orders.
labels = ['取消','未取消']
X = [order_cancel, order_num - order_cancel]
fig = plt.figure()
# autopct formats each wedge's share; '%1.2f%%' shows two decimals and a '%'.
plt.pie(X, labels=labels, autopct='%1.2f%%')
plt.title("订单总数")

在这里插入图片描述
# df2 holds only valid orders: flagged valid, not cancelled, and with a
# non-zero pre-discount unit price.

df2 = df.copy()
# Every condition must compare a column. Comparing the bare string
# 'before_prefr_unit_price' with 0 is always True and filters nothing.
df2 = df2[(df2['sale_ord_valid_flag'] == 1)&(df2['cancel_flag'] == 0)&(df2['before_prefr_unit_price'] != 0)]
# Number of valid orders.
order_vaild = df2['sale_ord_id'].count()
order_vaild
# Number of paid orders (actual payment is non-zero).
order_payed = df2['sale_ord_id'][df2['user_actual_pay_amount'] != 0].count()
order_payed
# Number of unpaid orders (actual payment is zero).
order_unpay = df2['sale_ord_id'][df2['user_actual_pay_amount'] == 0].count()
order_unpay

# Pie chart: paid versus unpaid among the valid orders.
labels = ['支付','未支付']
Y = [order_payed, order_unpay]
fig = plt.figure()
plt.pie(Y,labels=labels,autopct='%1.2f%%')
plt.title("有效订单总数")

在这里插入图片描述
订单的价格分布

# Discounted unit price of every valid order.
price_series = df2['after_prefr_unit_price']
price_series
price_series_num = price_series.count()

# Bucket the prices into 80 bins, then turn the per-bin counts into a running
# total expressed as a fraction of all orders.
hist, bin_edges = np.histogram(price_series, bins=80)
hist_sum = hist.cumsum()
hist_per = hist_sum / price_series_num

separator = '*' * 100
print('hist:{}'.format(hist))
print(separator)
print('bin_edges:{}'.format(bin_edges))
print(separator)
print('hist_sum:{}'.format(hist_sum))

# Plot each cumulative share against the right edge of its bin, so the
# left-most edge is left out.
bin_edges_plot = bin_edges[1:]

plt.figure(figsize=(20,8), dpi=80)
plt.xlabel('订单价格')
plt.ylabel('百分比')

plt.style.use('ggplot')


def to_percent(temp, position):
    """Format a tick value given as a fraction as a whole-number percentage.

    `position` is accepted because FuncFormatter passes it; it is not used.
    """
    whole_percent = '%1.0f' % (100 * temp)
    return whole_percent + '%'


plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))

plt.plot(bin_edges_plot, hist_per, color='blue')

在这里插入图片描述

通过整体的价格分布我们可以发现，约80%的订单在3000元以下。

微观分析

# Valid orders per hour of day.

df3 = df2.copy()


def _hour_bucket(timestamp):
    """Return the hour of `timestamp` as an 'HH:00:00' label."""
    return timestamp.strftime('%H:00:00')


df3['order_time_hms'] = df3['sale_ord_tm'].apply(_hour_bucket)
pay_time_df = df3['sale_ord_id'].groupby(df3['order_time_hms']).count()
pay_time_df

x = pay_time_df.index
y = pay_time_df.values

# Bar chart: one bar per hour bucket.
plt.figure(figsize=(20, 8), dpi=80)
plt.style.use('ggplot')
plt.xlabel('时间')
plt.ylabel("有效订单量")
plt.xticks(range(len(x)), x, rotation=90)
rect = plt.bar(x, y, width=0.3, color=['#6699CC'])

在这里插入图片描述

通过对时间维度进行深挖，我们发现0时是订单数量最多的时间，在这个小时内我们有超过4000个订单，远远超过其他任何时间的订单数。
除了0时之外，上午十点和晚上八点也是订单的高峰期。
0时的订单量是不是因为异常值呢？比如某几个顾客下了很多笔订单？

# Split the orders by hour of day: valid orders per distinct user.

# Dict-style renaming in SeriesGroupBy.agg ({'new_name': 'func'}) raises
# SpecificationError on pandas >= 1.0, so aggregate first and name the
# resulting column explicitly.
order_time_df = df3.groupby('order_time_hms')['sale_ord_id'].count().to_frame(name='order_num')

user_time_df = df3.groupby('order_time_hms')['user_log_acct'].nunique().to_frame(name='user_num')

# Both frames are indexed by the hour bucket, so the division aligns by hour.
order_num_per_user = order_time_df['order_num'] / user_time_df['user_num']

x = order_num_per_user.index
y = order_num_per_user.values

plt.figure(figsize=(20,8),dpi=80)
plt.style.use('ggplot')
plt.xlabel('时间')
plt.ylabel("人均有效订单量")
plt.xticks(range(len(x)),x,rotation=90)
plt.plot(x, y)

在这里插入图片描述

我们可以用时间维度上的人均有效订单量来验证我们的猜想。
从数据来看，0时的人均有效订单量确实偏高（1.06），但是低于峰值（1时的1.08）并且和11时的数值非常接近。
这说明0时出现异常高的单人订单量的可能性很小。

# Spend per customer and average order value, by hour of day.

# Aggregate first and name the column explicitly: dict-style renaming in
# SeriesGroupBy.agg raises SpecificationError on pandas >= 1.0.
total_pay_time_df = df3.groupby('order_time_hms')['total_actual_pay'].sum().to_frame(name='total_pay')

pay_per_user = total_pay_time_df['total_pay'] / user_time_df['user_num'] # spend per customer: sales / distinct users
pay_per_order = total_pay_time_df['total_pay'] / order_time_df['order_num'] # average order value: sales / orders

x = pay_per_user.index
y = pay_per_user.values
y2 = pay_per_order.values

plt.figure(figsize=(20,8),dpi=80)
plt.style.use('ggplot')
plt.xlabel('时间')
plt.ylabel("价格")
plt.xticks(range(len(x)),x,rotation=90)

plt.plot(x, y, color='red',linewidth=2.0,linestyle='--')
plt.plot(x, y2, color='blue',linewidth=3.0,linestyle='-.')
plt.legend(['客单价','平均订单价'])

在这里插入图片描述

从客单价和平均订单价格来看，0时的客单价和平均订单价格是全天中最高的。这表明在这个时间段下单的用户净值较高
那么问题又来了, 会不会是一小部分用户的极端购物行为产生了这样的数据呢？

# Cumulative price distribution: build independent slices for the 00:00 and
# 20:00 hour buckets.

df4 = df3.loc[df3['order_time_hms'].eq('00:00:00')].copy()
df5 = df3.loc[df3['order_time_hms'].eq('20:00:00')].copy()

def plot_acc_line(price_series, bin_num):
    """Plot the cumulative percentage distribution of a price series.

    Args:
        price_series: pandas Series of prices; its non-null count is the
            denominator of the percentages.
        bin_num: number of histogram bins.

    The curve is drawn on a new figure; nothing is returned.
    """
    total_count = price_series.count()
    counts, edges = np.histogram(price_series, bins=bin_num)
    cumulative_pct = np.cumsum(counts) / total_count * 100
    # Prepend 0% so there is one y value for every bin edge.
    cumulative_pct_plot = np.insert(cumulative_pct, 0, 0)

    plt.figure(figsize=(20, 8), dpi=80)
    plt.xlabel('订单价格')
    plt.ylabel('百分比')

    plt.plot(edges, cumulative_pct_plot, color='blue')

# Cumulative price distribution line chart for orders placed in the 00:00 hour.

price_series_0 = df4['after_prefr_unit_price']
plot_acc_line(price_series_0, 100)

在这里插入图片描述
# Cumulative price distribution line chart for orders placed in the 20:00 hour.

price_series_20 = df5['after_prefr_unit_price']
# Plot the 20:00 series itself, not the 00:00 one again.
plot_acc_line(price_series_20, 100)

在这里插入图片描述

我们把0时下的订单的价格累计分布和20时（除了0时外订单数量最多的时间）的订单价格累计分布来进行对比：
约25%的0时用户的订单在2000元以下，而20时用户2000元以下的订单占约75%
这说明0时的用户并不是只有一小部分的订单价格很高，而是大部分0时下单的用户都具有更高的客单价

从时间维度对订单进行拆分

# Orders placed in the 00:00 hour that received a discount.
offer_order_0 = df4.loc[df4['total_offer_amount'].gt(0), 'sale_ord_id'].count()

# All orders placed in the 00:00 hour.
order_num_0 = df4.loc[:, 'sale_ord_id'].count()

# Share of 00:00 orders that received a discount.
offer_order_per_0 = offer_order_0 / order_num_0

summary_0 = '0时的优惠订单数:{}, 0时的订单数:{}, 优惠订单比例：{}'.format(offer_order_0, order_num_0, offer_order_per_0)
print(summary_0)

0时的优惠订单数:3788, 0时的订单数:4032, 优惠订单比例：0.939484126984127

# Discounted orders across the whole day.
offer_order_all = df3.loc[df3['total_offer_amount'].gt(0), 'sale_ord_id'].count()

# All orders across the whole day.
order_all = df3.loc[:, 'sale_ord_id'].count()

# Discounted orders outside the 00:00 hour.
offer_order_other = offer_order_all - offer_order_0

# All orders outside the 00:00 hour.
order_num_other = order_all - order_num_0

# Share of discounted orders outside the 00:00 hour.
offer_order_per_other = offer_order_other / order_num_other

summary_other = '其他时间的优惠订单数:{}, 其他时间的订单数:{}, 其他时间优惠订单比例：{}'.format(offer_order_other, order_num_other, offer_order_per_other)
print(summary_other)

其他时间的优惠订单数:25983, 其他时间的订单数:29814, 其他时间优惠订单比例：0.8715033205876433

# Bar chart: share of discounted orders in the 00:00 hour versus all other hours.

plt.figure(figsize=(8, 6), dpi=80)
N = 2
index = ('0时', '除了0时以外')
data = (offer_order_per_0, offer_order_per_other)
width = 0.35
plt.ylabel("优惠订单占比")


def to_percent(temp, position):
    """Format a tick value given as a fraction as a whole-number percentage.

    `position` is accepted because FuncFormatter passes it; it is not used.
    """
    whole_percent = '%1.0f' % (100 * temp)
    return whole_percent + '%'


y_axis = plt.gca().yaxis
y_axis.set_major_formatter(FuncFormatter(to_percent))

p2 = plt.bar(index, data, width, color='#6699CC')

在这里插入图片描述

# Total discount amount per hour bucket. Aggregate first and name the column
# explicitly: dict-style renaming in SeriesGroupBy.agg raises
# SpecificationError on pandas >= 1.0.
total_pay_time_df = df3.groupby('order_time_hms')['total_offer_amount'].sum().to_frame(name='total_offer_amount')

# Select the 00:00 bucket by its label instead of by position, so the value is
# guaranteed to belong to that hour.
offer_amount_0 = total_pay_time_df.loc['00:00:00', 'total_offer_amount']

# Sum of the discount amounts of every other hour bucket.
offer_amount_other = total_pay_time_df['total_offer_amount'].drop('00:00:00').sum()

# Average discount per discounted order.
offer_amount_0_avg = offer_amount_0 / offer_order_0
offer_amount_other_avg = offer_amount_other / offer_order_other

print('0时平均优惠价格:{}, 其他时间平均优惠价格:{}'.format(offer_amount_0_avg, offer_amount_other_avg))

0时平均优惠价格:732.0646779303062, 其他时间平均优惠价格:456.9508043720895

# Bar chart: average discount amount in the 00:00 hour versus all other hours.
plt.figure(figsize=(8, 6), dpi=80)
N = 2  # not referenced by the plotting calls below
index = ('0时', '除了0时以外')

# Bar heights, in the same order as `index`.
values = (offer_amount_0_avg, offer_amount_other_avg)
width = 0.35

plt.ylabel("优惠价格／元")

p2 = plt.bar(index, values, width, color='#6699CC')

在这里插入图片描述

从优惠价格上来看，在0时，接近94%的订单都有优惠，而在其他时间，只有约87%的订单可以得到价格优惠。
在得到价格优惠的订单里，0时下单的订单的平均优惠价格约732元，而在其他时间下单的订单，平均只能获得约457元的优惠，明显低于0时下单的订单

从地区维度对订单进行拆分

把眼光投向地区维度，首先可以帮助我们确定头部的市场

df6 = df2.copy()

# Valid-order count per province. Aggregate first and name the column
# explicitly: dict-style renaming in SeriesGroupBy.agg raises
# SpecificationError on pandas >= 1.0.
order_area_df = df6.groupby('user_site_province_id')['sale_ord_id'].count().reset_index(name='order_num')
order_area_df.columns = ['province_id','order_num']
order_area_df

# Remove the placeholder row for unknown provinces by value instead of by a
# hard-coded row number, then convert the IDs to integers for the merge.
order_area_df = order_area_df[order_area_df['province_id'] != 'Not Given'].copy()
order_area_df['province_id']=order_area_df['province_id'].astype('int')

# Lookup table with province names.
city = 'city_level.csv'
df_city = pd.read_csv(city,sep = ',', encoding="gbk", dtype=str)
df_city['province_id'] = df_city['province_id'].astype('int')

# Keep the first row per province_id, leaving one row per province.
df_city = df_city.drop_duplicates(subset=['province_id'], keep='first')

df_city = df_city[['province_id','dim_province_name']].sort_values(by='province_id',ascending=True).reset_index(drop=True)

# Attach province names and rank the provinces by order count.
order_province_df = pd.merge(order_area_df, df_city, on='province_id').sort_values(by='order_num', ascending=False)

# Bar chart: valid orders per province.

plt.style.use('ggplot')

x = order_province_df['dim_province_name']
y = order_province_df['order_num']

plt.figure(figsize=(20,8),dpi=80)
# The x axis carries province names, so label it as provinces rather than time.
plt.xlabel('省份')
plt.ylabel("有效订单量")
plt.xticks(range(len(x)), x, rotation=90)
rect = plt.bar(x, y, width=0.3, color=['#6699CC'])

在这里插入图片描述
# Pie chart: share of valid orders per province.


plt.figure(figsize=(6,9)) 
labels = order_province_df['dim_province_name']

plt.pie(order_province_df['order_num'], labels=labels,autopct='%1.2f%%') # autopct formats each wedge's share; '%1.2f%%' shows two decimals followed by '%'

plt.axis('equal')
plt.show()

在这里插入图片描述
#各省份客单价对比

通过对客单价进行比较，可以找出购买力比较强的地区



# Total paid amount per province. Aggregate first and name the column
# explicitly: dict-style renaming in SeriesGroupBy.agg raises
# SpecificationError on pandas >= 1.0.
cust_price_df = df6.groupby('user_site_province_id')['total_actual_pay'].sum().reset_index(name='total_pay')
cust_price_df.columns = ['province_id','total_pay']
# Remove the unknown-province placeholder by value, not by row number.
cust_price_df = cust_price_df[cust_price_df['province_id'] != 'Not Given'].copy()
cust_price_df['province_id'] = cust_price_df['province_id'].astype('int')
cust_price_df = pd.merge(cust_price_df, df_city, on='province_id').sort_values(by='total_pay', ascending=False)
# Look up each province's order count by province_id instead of relying on
# the two frames happening to share row labels.
cust_price_df['order_num'] = cust_price_df['province_id'].map(order_province_df.set_index('province_id')['order_num'])

# Distinct users per province.
cust_df = df6.groupby('user_site_province_id')['user_log_acct'].nunique().reset_index(name='user_num')
cust_df.columns = ['province_id','user_num']
cust_df = cust_df[cust_df['province_id'] != 'Not Given'].copy()
cust_df['province_id'] = cust_df['province_id'].astype('int')

cust_price_df = pd.merge(cust_price_df, cust_df, on='province_id')
cust_price_df['cust_price'] = cust_price_df['total_pay'] / cust_price_df['user_num'] # spend per customer
# Keep the ten provinces with the most orders, then rank them by spend per customer.
cust_price_df = cust_price_df.sort_values(by='order_num', ascending=False)
cust_price_df = cust_price_df[:10]
cust_price_df = cust_price_df.sort_values(by='cust_price', ascending=False)

cust_price_df

# Bar chart: spend per customer for the selected provinces.
plt.style.use('ggplot')

x = cust_price_df['dim_province_name']
y = cust_price_df['cust_price']

plt.figure(figsize=(20,8),dpi=80)
# The x axis carries province names, so label it as provinces rather than time.
plt.xlabel('省份')
plt.ylabel("客单价")
rect = plt.bar(x, y, width=0.3, color=['#6699CC'])

在这里插入图片描述

# Bubble chart: spend per customer (x) against order count (y), one bubble per province.
plt.figure(figsize = (15,10))

x = cust_price_df['cust_price']
y = cust_price_df['order_num']

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. alpha is the bubble opacity; bubble size scales with x.
ax = sns.scatterplot(x=x, y=y, alpha=0.5, s=x*3, c=['#6699CC'])
ax.set_xlabel("客单价",fontsize=12)
ax.set_ylabel("订单数量",fontsize=12)

# Label every plotted province. The row labels are taken from the data
# instead of a hard-coded list, so they always match cust_price_df.
province_list = list(cust_price_df.index)
for line in province_list:
    ax.text(x[line], y[line], cust_price_df['dim_province_name'][line], horizontalalignment='center', size='large', color='black', weight='semibold')

在这里插入图片描述
#头部省份的四个品牌的渗透率

# Sales amount and units sold per brand, highest sales first.
df7 = df2.copy()

brand_totals = df7.groupby('brandname', as_index=False).agg({'total_actual_pay':'sum', 'sale_qtty':'sum'})
brand_sale_df = brand_totals.sort_values(by='total_actual_pay', ascending=False)
brand_sale_df

# The same breakdown restricted to province id '1' (Beijing).
df8 = df7.copy()
df8 = df8.loc[df8['user_site_province_id'].eq('1')]

bj_totals = df8.groupby('brandname', as_index=False).agg({'total_actual_pay':'sum', 'sale_qtty':'sum'})
brand_sale_df_bj = bj_totals.sort_values(by='total_actual_pay', ascending=False)
# Keep only the four brands under comparison.
brand_sale_df_bj = brand_sale_df_bj[brand_sale_df_bj['brandname'].isin(['海尔（Haier）', '容声（Ronshen）', '西门子（SIEMENS）', '美的（Midea）'])]
brand_sale_df_bj

# Penetration of Haier in the five selected provinces.
df8 = df7.copy()

df8 = df8[df8['brandname'] == '海尔（Haier）']

brand_sale_df_haier = df8.groupby('user_site_province_id', as_index=False).agg({'total_actual_pay':'sum', 'sale_qtty':'sum'}).sort_values(by='total_actual_pay', ascending=False)
# Province IDs are still strings here; .copy() detaches the slice before it is modified.
brand_sale_df_haier = brand_sale_df_haier[brand_sale_df_haier['user_site_province_id'].isin(['1', '2', '12', '22', '19'])].copy()
brand_sale_df_haier['user_site_province_id'] = brand_sale_df_haier['user_site_province_id'].astype('int')
brand_sale_df_haier.columns = ['province_id','total_actual_pay', 'sale_qtty']
brand_sale_df_haier.sort_values(by='province_id')

# Order counts of the same five provinces.
order_num_df = cust_price_df[['province_id', 'order_num']][cust_price_df['province_id'].isin([1, 2, 12, 19, 22])]
order_num_df = order_num_df.sort_values(by='province_id')
order_num_df

brand_sale_df_haier = pd.merge(brand_sale_df_haier, order_num_df, on='province_id')
# Penetration = units of the brand sold / orders in the province.
brand_sale_df_haier['渗透率'] = brand_sale_df_haier['sale_qtty'] / brand_sale_df_haier['order_num']
# Keep the sorted result so the rows are in the same province order as the
# frames returned by province_shentou when they are plotted side by side.
brand_sale_df_haier = brand_sale_df_haier.sort_values(by='province_id')
brand_sale_df_haier

def province_shentou(df, brandname, cust_price_df):
    """Compute one brand's penetration rate in the five selected provinces.

    Args:
        df: order-level DataFrame with the columns 'brandname',
            'user_site_province_id' (province IDs as strings),
            'total_actual_pay' and 'sale_qtty'.
        brandname: brand to analyse; matched exactly against 'brandname'.
        cust_price_df: DataFrame with an integer 'province_id' column and an
            'order_num' column holding the order count of each province.

    Returns:
        DataFrame with the columns 'province_id', 'total_actual_pay',
        'sale_qtty', 'order_num' and '渗透率' (sale_qtty / order_num), sorted
        by 'province_id'. Only province IDs 1, 2, 12, 19 and 22 that appear in
        both inputs are included.
    """
    province_ids = [1, 2, 12, 19, 22]

    brand_orders = df[df['brandname'] == brandname]

    brand_sale_df = (
        brand_orders
        .groupby('user_site_province_id', as_index=False)
        .agg({'total_actual_pay': 'sum', 'sale_qtty': 'sum'})
        .sort_values(by='total_actual_pay', ascending=False)
    )
    # Province IDs are strings in the order data; .copy() detaches the
    # filtered slice before its columns are modified.
    in_scope = brand_sale_df['user_site_province_id'].isin([str(pid) for pid in province_ids])
    brand_sale_df = brand_sale_df[in_scope].copy()
    brand_sale_df['user_site_province_id'] = brand_sale_df['user_site_province_id'].astype('int')
    brand_sale_df.columns = ['province_id', 'total_actual_pay', 'sale_qtty']

    order_num = cust_price_df.loc[cust_price_df['province_id'].isin(province_ids), ['province_id', 'order_num']]
    order_num = order_num.sort_values(by='province_id')

    # Merge with the order counts derived from the cust_price_df argument,
    # not with a module-level variable.
    brand_sale_df = pd.merge(brand_sale_df, order_num, on='province_id')
    brand_sale_df['渗透率'] = brand_sale_df['sale_qtty'] / brand_sale_df['order_num']
    brand_sale_df = brand_sale_df.sort_values(by='province_id')

    return brand_sale_df

# Work on a copy of the valid-order data.
df9 = df7.copy()

# Per-province penetration tables for the three remaining brands.
brand_sale_df_rs = province_shentou(df9, '容声（Ronshen）', cust_price_df)
brand_sale_df_siem = province_shentou(df9, '西门子（SIEMENS）', cust_price_df)
brand_sale_df_mi = province_shentou(df9, '美的（Midea）', cust_price_df)

# Display the Siemens table.
brand_sale_df_siem

在这里插入图片描述

# Grouped bar chart: penetration of four brands in five provinces.
plt.style.use('ggplot')

x = np.arange(5)

y1 = brand_sale_df_siem['渗透率']
y2 = brand_sale_df_rs['渗透率']
y3 = brand_sale_df_haier['渗透率']
y4 = brand_sale_df_mi['渗透率']

# NOTE(review): assumes every frame lists the provinces in the order
# 1, 2, 12, 19, 22 and that these IDs correspond to the names below — confirm.
tick_label=['北京', '上海', '江苏', '广东', '四川']

# Each group is total_width wide and holds n bars; shift x so that the group
# is centred on the original integer position.
total_width, n = 0.8, 4
width = total_width / n
x = x - (total_width - width) / 2

plt.figure(figsize=(20,8),dpi=80)
plt.ylabel("渗透率")

bar_width = 0.2
plt.bar(x, y1, width=bar_width, color=['red'], label='西门子（SIEMENS）')
plt.bar(x+width, y2, width=bar_width, color=['yellow'], label='容声（Ronshen）')
plt.bar(x+2*width, y3, width=bar_width, color=['green'], label='海尔（Haier）')
plt.bar(x+3*width, y4, width=bar_width, color=['blue'], label='美的（Midea）')

# The bars of a group are centred at x, x+width, ..., x+(n-1)*width, so the
# middle of the group is x + (n-1)*width/2; put the province tick there.
plt.xticks(x + (n - 1) * width / 2, tick_label)
# Without a legend the four colours cannot be matched to brands.
plt.legend()

在这里插入图片描述

把地区维度和品牌维度结合起来，可以让我们清楚的看到不同地区的品牌偏好性和市场渗透率。
因为不同的品牌有不同的定价和营销策略，通过不同地区用户的品牌偏好，我们也可以在产品定价和营销策略上发现洞察。

# Bar chart: average unit price per brand, most expensive brand first.
plt.style.use('ggplot')

# Unit price = total paid amount / units sold.
brand_sale_df['单价'] = brand_sale_df['total_actual_pay'] / brand_sale_df['sale_qtty']
brand_sale_df = brand_sale_df.sort_values(by='单价', ascending=False)

x = brand_sale_df['brandname']
y = brand_sale_df['单价']

plt.figure(figsize=(20,8),dpi=80)
plt.xlabel('品牌')
# The plotted metric is the unit price, not the spend per customer.
plt.ylabel("单价")

plt.xticks(range(len(x)), x, rotation=90)
rect = plt.bar(x, y, width=0.6, color=['#6699CC'])

plt.show()