在(一)中已经初步清洗完了数据,接下来进行实战分析
RFM电商数据分析(1)
RFM电商数据分析(3)
RFM电商数据分析(4)
数据准备
df['event_time'] = pd.to_datetime(df['event_time'])
df['order_id'] = df['order_id'].astype('object')
df['product_id'] = df['product_id'].astype('object')
df['category_id'] = df['category_id'].astype('object')
df['user_id'] = df['user_id'].astype('object')
df['age'] = df['age'].astype('int64')
df['date'] = pd.to_datetime(df['date'])
df.info()
# 销售额GMV
GMV = df.amount.sum()
out:117320205.17000003
# 计算GMV随时间变化增长曲线
s0 = df.groupby('date').agg(销量 = ('buy_count', 'sum'), 销售额 = ('amount', 'sum')).reset_index()
s0
计算字段
sql 语句
累计销量:
select sum('销量') over(order by date) as '累计销量' from s0
累计销售额:
select sum('销售额') over(order by date) as '累计销售额' from s0
销售增长:
select (s2.'销量' - s1.'销量') as '销售增长'
from (select *, row_number() over(order by date) as t_rank from s0) as s1
left join (select *, row_number() over(order by date) as t_rank from s0) as s2
on s1.date = s2.date
where s2.t_rank = s1.t_rank + 1
order by s1.date asc
第二种方法:
行补偿函数lag('字段名', 行补偿数, 默认0)
select *, (销量 - lag(销量, 1, 0)) as '销售增长' from s0
python实现
s0['累计销量'] = s0.销量.cumsum()
s0['累计销售额'] = s0.销售额.cumsum()
s0['销量增长'] = s0.销量.diff()
s0['销量增长率'] = s0.销量.pct_change() * 100
s0['销售额增长'] = s0.销售额.diff()
s0['销售额增长率'] = s0.销售额.pct_change() * 100
s0
x = s0['date'].dt.strftime('%m-%d').to_list()
y1 = s0.累计销量.to_list()
y2 = [round(x / 10000) for x in s0.累计销售额]
def charts0():
line = (
Line(init_opts= opts.InitOpts(theme=ThemeType.CHALK))
.add_xaxis(x)
.add_yaxis(
series_name='累计销量(件)',
y_axis=y1,
linestyle_opts=opts.LineStyleOpts(),
)
.add_yaxis(
series_name="累计销售额(万元)",
y_axis=y2,
yaxis_index=1,
linestyle_opts=opts.LineStyleOpts(),
)
.extend_axis(
yaxis=opts.AxisOpts(
name="销售额(万元)",
name_location="end",
type_="value",
)
)
.set_global_opts(
datazoom_opts=opts.DataZoomOpts(is_show=True),
xaxis_opts= opts.AxisOpts(axislabel_opts= opts.LabelOpts(rotate=-15)),
title_opts= opts.TitleOpts(title="累计销量/销售额"),
tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
yaxis_opts=opts.AxisOpts(name="销量(件)", type_="value"),
toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='28%', pos_top='5%'),
)
.set_series_opts(
label_opts=opts.LabelOpts(is_show=False) # color="rgba(255, 255, 255, 0.5)"
)
.render_notebook()
)
return line
charts0()
# 按月增长的曲线
s1 = df.groupby('month').agg(GMV=('amount','sum'))
s1['环比增长'] = s1.GMV.diff()
s1['环比增长率'] = s1.GMV.pct_change() * 100
s1
def charts1():
bar = (
Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
.add_xaxis([str(x) + '月' for x in s1.index.tolist()])
.add_yaxis("月_GMV(万)", [round(x/10000,2) for x in s1.GMV.to_list()],)
.add_yaxis("环比增长(万)", [round(x/10000,2) for x in s1.环比增长.to_list()])
.set_global_opts(
toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='28%', pos_top='5%'),
datazoom_opts=opts.DataZoomOpts(is_show=True),
tooltip_opts=opts.TooltipOpts(trigger="axis"),
# tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
title_opts= opts.TitleOpts(title="2020每月GMV"),
)
)
line = (
Line()
.add_xaxis([str(x)+'月' for x in s1.index.to_list()])
.add_yaxis("环比增长率(%)", [round(x,2) for x in s1.环比增长率.to_list()],)
)
bar.overlap(line)
return bar.render_notebook()
charts1()
# 每月销量
s2 = df.groupby('month').agg(销量=('buy_count','sum'))
s2['环比增长'] = s2.销量.diff()
s2['环比增长率'] = s2.销量.pct_change() * 100
s2
def charts2():
bar = (
Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
.add_xaxis([str(x) + '月' for x in s2.index.tolist()])
.add_yaxis("月_销量(件)", [x for x in s2.销量.to_list()],)
.add_yaxis("环比增长(万)", [x for x in s2.环比增长.to_list()])
.set_global_opts(
toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='28%', pos_top='5%'),
datazoom_opts=opts.DataZoomOpts(is_show=True),
tooltip_opts=opts.TooltipOpts(trigger="axis"),
# tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
title_opts= opts.TitleOpts(title="2020每月销量"),
)
)
line = (
Line()
.add_xaxis([str(x)+'月' for x in s1.index.to_list()])
.add_yaxis("环比增长率(%)", [round(x,2) for x in s2.环比增长率.to_list()],)
)
bar.overlap(line)
return bar.render_notebook()
charts2()
# 销量前十商品,订单|销售额占比
s3 = df.groupby('product_id').agg(销量=('buy_count', 'sum'), 销售额=('amount', 'sum'))
s3.sort_values(by='销量', ascending=False, inplace=True)
s3 = s3[:10]
s3['销量占比%'] = s3.销量 / (s3.销量.sum()) * 100
s3['销售额占比%'] = s3.销售额 / (s3.销售额.sum()) * 100
s3
# 销量前10类别 销量/销售额/占比
s4 = df.groupby('category_code').agg(销量=('buy_count', 'sum'), 销售额=('amount', 'sum') )
s4.sort_values(by='销量', ascending=False, inplace=True)
s4 = s4[:10]
s4['销量占比'] = s4.销量 / s4.销量.sum() * 100
s4['销售额占比%'] = s4.销售额 / s4.销售额.sum() * 100
s4
# 订单量排名前十的品牌 销量|销售额|占比
s5 = df.groupby('brand').agg(销量=('buy_count','sum'), 销售额=('amount','sum'))
s5.sort_values(by='销量', ascending=False, inplace=True)
s5 = s5[:10]
s5['销量占比'] = s5.销量 / s5.销量.sum() * 100
s5['销售额占比'] = s5.销售额 / s5.销售额.sum() * 100
s5
# 性别分布
s6 = df[df['brand']=='samsung'].groupby('sex').agg(人数=('user_id','nunique'))
s6
sex
女 17543
男 17672
#年龄分布
s7 = df[df['brand']=='samsung'].groupby('age').agg(人数=('user_id','nunique')).reset_index()
bins = [15,20,25,30,35,40,45,50]
s7['age_bin'] = pd.cut(s7.age, bins=bins)
s7.T
s8 = s7.groupby('age_bin').agg(人数=('人数','sum'))
s8
data_pair = [[str(index) + '岁', value] for index, value in s8['人数'].items()]
def charts4():
pie = (
Pie(init_opts= opts.InitOpts(theme=ThemeType.WONDERLAND))
.add(
series_name="年龄特征分布",
data_pair=data_pair,
radius=["40%", "55%"],
label_opts=opts.LabelOpts(
position="outside",
formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c} {per|{d}%} ",
background_color="#eee",
border_color="#aaa",
border_width=1,
border_radius=4,
rich={
"a": {"color": "#999", "lineHeight": 22, "align": "center"},
"abg": {
"backgroundColor": "#e3e3e3",
"width": "100%",
"align": "right",
"height": 22,
"borderRadius": [4, 4, 0, 0],
},
"hr": {
"borderColor": "#aaa",
"width": "100%",
"borderWidth": 0.5,
"height": 0,
},
"b": {"fontSize": 16, "lineHeight": 33},
"per": {
"color": "#eee",
"backgroundColor": "#334455",
"padding": [2, 4],
"borderRadius": 2,
},
},
),
)
.set_global_opts(title_opts=opts.TitleOpts(title="三星消费者年龄特征分布", pos_top="7%", pos_left='center'))
.render_notebook()
)
return pie
charts4()
# 用户年龄分布
s9 = df.groupby('age').agg(人数=('user_id','nunique')).reset_index()
bins = [15,20,25,30,35,40,45,50]
s9['age_bin'] = pd.cut(s9.age, bins=bins)
s9.T
# 用户性别分布
s10 = df.groupby('sex').agg(人数=('user_id','nunique'))
s10
sex
女 47819
男 48147
data_pair = [[str(index) + '性', value] for index, value in s10['人数'].items()]
def charts5():
pie = (
Pie(init_opts= opts.InitOpts(theme=ThemeType.CHALK))
.add(
series_name="性别特征分布",
data_pair=data_pair,
radius=["40%", "55%"],
)
.set_global_opts(title_opts=opts.TitleOpts(title="性别特征分布", pos_top="7%", pos_left='center'))
.set_series_opts(
tooltip_opts=opts.TooltipOpts(
trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
),
label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.8)", font_size=18, formatter="{b}: {d}%"),
)
.render_notebook()
)
return pie
charts5()
s11 = df.groupby('local').agg(人数=('user_id','nunique')).sort_values('人数',ascending=False)
s11.T
local 广东 上海 北京 江苏 四川 海南 浙江 重庆 天津 湖北 湖南
人数 21636 16253 16121 5635 5518 5508 5428 5396 5393 5393 5387
data_pair = [[index, value] for index, value in s11['人数'].items()]
def charts6():
pie = (
Pie(init_opts= opts.InitOpts(theme=ThemeType.CHALK))
.add(
series_name="地区特征分布",
data_pair=data_pair,
radius=["40%", "55%"],
)
.set_global_opts(title_opts=opts.TitleOpts(title="消费者地区特征分布", pos_top="7%", pos_left='center'))
.set_series_opts(
tooltip_opts=opts.TooltipOpts(
trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
),
label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.8)", font_size=18, formatter="{b}: {d}%")
)
.render_notebook()
)
return pie
charts6()
s13 = df.groupby('user_id').agg(消费金额=('amount', 'sum'))
s13.sort_values('消费金额', ascending=False, inplace=True)
s13['累计消费总额'] = s13.消费金额.cumsum()
s13['累计消费总额占比'] = s13.累计消费总额 / s13.消费金额.sum() * 100
s13['rank'] = s13.消费金额.rank(ascending=False)
s13['rank'] = s13['rank'].astype('int64')
s13['累计用户占比'] = s13['rank'] / (s13['rank'].max()) * 100
s13.reset_index()
y = [round(x, 2) for x in s13.累计消费总额占比[::1000]]
x = [round(x, 2) for x in s13.累计用户占比[::1000]]
def charts7():
line = (
Line(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
.add_xaxis(x)
.add_yaxis('累计消费总额占比', y)
.set_global_opts(
title_opts=opts.TitleOpts(title="消费者消费总额占比"),
tooltip_opts=opts.TooltipOpts(trigger="axis"),)
.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
.render_notebook()
)
return line
charts7()
# 头部贡献超过80%的用户
s14 = s13[s13['累计消费总额'] / s13.消费金额.sum() < 0.8].reset_index()
s14.shape[0] / s13.shape[0]
out:0.27227664723000994
结论: 前27.23%的用户贡献了80%的销售额,这些客户需要重点维护