前言
点击获取数据源:数据源
通过分析用户对商品的四类行为(浏览、收藏、加购物车、购买),找出用户在每个环节的转化率。
数据说明
数据表 tianchi_fresh_comp_train_user_2w 记录了用户在商品全集上的移动端行为数据,包含如下字段:
字段 | 字段说明 | 提取说明 |
---|---|---|
user_id | 用户标识 | 抽样&字段脱敏 |
item_id | 商品标识 | 字段脱敏 |
behavior_type | 用户对商品的行为类型,浏览、收藏、加购物车、购买 ==> 1、2、3、4 | |
user_geohash | 用户位置的空间标识,可以为空 | 由经纬度通过保密的算法生成 |
item_category | 商品分类标识 | 字段脱敏 |
time | 行为时间 | 精确到小时级别 |
数据预览
# 加载库
import numpy as np
import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.charts import Grid
了解数据
加载数据
# Load the raw behaviour log into a DataFrame. '文件路径' ("file path") is a
# placeholder and must be replaced with the actual location of the CSV file.
data = pd.read_csv('文件路径')
查看数据结构
# Print the index range, per-column dtypes and memory footprint of the frame.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15463110 entries, 0 to 15463109
Data columns (total 6 columns):
# Column Dtype
--- ------ -----
0 user_id int64
1 item_id int64
2 behavior_type int64
3 user_geohash object
4 item_category int64
5 time object
dtypes: int64(4), object(2)
memory usage: 707.8+ MB
# Preview the first five rows to check the column layout and value formats.
data.head(5)
user_id | item_id | behavior_type | user_geohash | item_category | time | |
---|---|---|---|---|---|---|
0 | 10001082 | 285259775 | 1 | 97lk14c | 4076 | 2014-12-08 18 |
1 | 10001082 | 4368907 | 1 | NaN | 5503 | 2014-12-12 12 |
2 | 10001082 | 53616768 | 4 | NaN | 9762 | 2014-12-02 15 |
3 | 10001082 | 9947871 | 1 | NaN | 2825 | 2014-11-28 20 |
4 | 10001082 | 150720867 | 1 | 95qoghe | 3200 | 2014-12-15 08 |
统计缺失值
# Count the missing values in each column.
data.isna().sum()
user_id 0
item_id 0
behavior_type 0
user_geohash 8207386
item_category 0
time 0
dtype: int64
因为本文不做地理位置相关的分析,user_geohash 这一列的缺失值不做处理。
处理数据
删除重复数据
# Remove rows that are identical in every column, keeping the last occurrence.
# NOTE(review): `time` only has hour resolution, so fully identical rows may be
# genuine repeated actions within the same hour rather than logging
# duplicates - confirm that dropping them is intended.
data = data.drop_duplicates(keep='last')
将time转换为datetime格式
# Convert the `time` strings (e.g. "2014-12-08 18": date plus hour) to
# datetime64. An explicit format makes the parse deterministic and avoids
# per-value format inference on a very large column.
data["time"] = pd.to_datetime(data["time"], format="%Y-%m-%d %H")
data.head(3)
user_id | item_id | behavior_type | user_geohash | item_category | time | |
---|---|---|---|---|---|---|
0 | 10001082 | 285259775 | 1 | 97lk14c | 4076 | 2014-12-08 18:00:00 |
1 | 10001082 | 4368907 | 1 | NaN | 5503 | 2014-12-12 12:00:00 |
2 | 10001082 | 53616768 | 4 | NaN | 9762 | 2014-12-02 15:00:00 |
提取出日期、月份和小时
# Derive helper columns from the parsed timestamp:
#   dates - calendar date of the event
#   month - the date truncated to month precision
#   hours - hour of day
data['dates'] = data['time'].dt.date
data['month'] = data['dates'].to_numpy().astype('datetime64[M]')
data['hours'] = data['time'].dt.hour
data.head(3)
user_id | item_id | behavior_type | user_geohash | item_category | time | dates | month | hours | |
---|---|---|---|---|---|---|---|---|---|
0 | 10001082 | 285259775 | 1 | 97lk14c | 4076 | 2014-12-08 18:00:00 | 2014-12-08 | 2014-12-01 | 18 |
1 | 10001082 | 4368907 | 1 | NaN | 5503 | 2014-12-12 12:00:00 | 2014-12-12 | 2014-12-01 | 12 |
2 | 10001082 | 53616768 | 4 | NaN | 9762 | 2014-12-02 15:00:00 | 2014-12-02 | 2014-12-01 | 15 |
转换数据类型
# The behaviour code and the user/item identifiers are labels, not quantities,
# so store them as strings. astype(str) converts each column in one
# column-level call instead of invoking str() once per row through apply().
for id_column in ('behavior_type', 'user_id', 'item_id'):
    data[id_column] = data[id_column].astype(str)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15463110 entries, 0 to 15463109
Data columns (total 9 columns):
# Column Dtype
--- ------ -----
0 user_id object
1 item_id object
2 behavior_type object
3 user_geohash object
4 item_category int64
5 time datetime64[ns]
6 dates object
7 month datetime64[ns]
8 hours int64
dtypes: datetime64[ns](2), int64(2), object(5)
memory usage: 1.2+ GB
数据可视化
统计每日PV和UV数据
# Filter the browse events (behavior_type '1') once and reuse the result,
# rather than scanning the full frame separately for PV and UV.
browse_events = data[data.behavior_type == '1']
# PV: number of browse events per day.
pv_day = browse_events.groupby("dates")["behavior_type"].count()
# UV: number of distinct users with at least one browse event per day.
uv_day = browse_events.groupby('dates')['user_id'].nunique()
分析每天的pv与uv的趋势
# Plot the daily PV and UV trends on one chart with two y-axes.
# Both series are scaled to units of 10,000 to match the axis label formatters,
# and converted with .tolist() because pyecharts expects plain Python
# containers rather than NumPy arrays.
attr = list(pv_day.index)
pv_scaled = np.around(pv_day.values / 10000, decimals=2).tolist()
uv_scaled = np.around(uv_day.values / 10000, decimals=2).tolist()
pv = (
    Line(init_opts=opts.InitOpts(width="1000px", height="500px"))
    .add_xaxis(xaxis_data=attr)
    .add_yaxis(
        "pv",
        pv_scaled,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .add_yaxis(
        series_name="uv",
        # Index 1 refers to the secondary axis registered by extend_axis below.
        yaxis_index=1,
        y_axis=uv_scaled,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="uv",
            type_="value",
            min_=0,
            max_=1.6,
            interval=0.4,
            axislabel_opts=opts.LabelOpts(formatter="{value} 万人"),
        )
    )
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(
            is_show=True, trigger="axis", axis_pointer_type="cross"
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name="pv",
            type_="value",
            min_=0,
            max_=100,
            interval=20,
            axislabel_opts=opts.LabelOpts(formatter="{value} 万次"),
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        title_opts=opts.TitleOpts(title="pv与uv趋势图"),
    )
)
pv.render_notebook()
pv、uv差异分析(by day)
# Day-over-day change in PV and UV, drawn as two overlaid lines that use
# separate y-axes.
pv_uv = pd.concat([pv_day, uv_day], join='outer', axis=1)
pv_uv.columns = ['pv_day', 'uv_day']
# diff() subtracts the previous row, so the first day has no value (NaN).
new_day = pv_uv.diff()
new_day.columns = ['new_pv', 'new_uv']
# Hand pyecharts plain Python lists rather than pandas Index/Series objects.
attr = new_day.index.tolist()
v = new_day.new_uv.tolist()
w = new_day.new_pv.tolist()
li = (
    Line(init_opts=opts.InitOpts(width="1000px", height="500px"))
    .add_xaxis(xaxis_data=attr)
    .add_yaxis(
        "新增pv",
        w,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="新增uv",
            type_="value",
            min_=-2000,
            max_=1600,
            interval=400,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )
    )
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(
            is_show=True, trigger="axis", axis_pointer_type="cross"
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name="新增pv",
            type_="value",
            min_=-350000,
            max_=250000,
            interval=100000,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        title_opts=opts.TitleOpts(title="pv、uv差异分析"),
    )
)
il = (
    Line()
    .add_xaxis(xaxis_data=attr)
    # The axis index is an integer (1 = the axis added by extend_axis above),
    # matching how the PV/UV trend chart selects its secondary axis.
    .add_yaxis("新增uv", v, yaxis_index=1, label_opts=opts.LabelOpts(is_show=False))
)
c = li.overlap(il)
c.render_notebook()
不同时期用户行为分析
# Per-day event counts for add-to-cart ('3'), favourite ('2') and
# purchase ('4') behaviour, plotted as three lines on one chart.
behavior_code = data['behavior_type']
shopping_cart = data[behavior_code == '3'].groupby('dates')['behavior_type'].count()
collect = data[behavior_code == '2'].groupby('dates')['behavior_type'].count()
buy = data[behavior_code == '4'].groupby('dates')['behavior_type'].count()

# The x-axis labels come from the add-to-cart series' dates.
attr_a = shopping_cart.index.tolist()
v_1 = shopping_cart.tolist()
v_2 = collect.tolist()
v_3 = buy.tolist()

b = Line().add_xaxis(xaxis_data=attr_a)
for series_label, series_counts in (
    ("加购人数", v_1),
    ("收藏人数", v_2),
    ("购买人数", v_3),
):
    b.add_yaxis(
        series_label,
        series_counts,
        label_opts=opts.LabelOpts(is_show=False),
    )
b.set_global_opts(title_opts=opts.TitleOpts(title="不同时期用户行为数据"))
b.render_notebook()
活动期间不同时段的用户行为分析
# Split the log into the three promotion days (2014-12-11 to 2014-12-13) and
# all remaining, ordinary days.
data['dates'] = pd.to_datetime(data['dates'])
# Compare against real timestamps: matching a datetime column against plain
# strings in isin() is deprecated in recent pandas releases.
active_days = pd.to_datetime(["2014-12-11", "2014-12-12", "2014-12-13"])
# Build the membership mask once and reuse it for both halves of the split.
is_active_day = data["dates"].isin(active_days)
active = data[is_active_day]
daily = data[~is_active_day]
from pyecharts.charts import Bar
# Promotion-period data: per-hour event counts for each behaviour type.
active_code = active['behavior_type']
cart_h = active[active_code == '3'].groupby('hours')['behavior_type'].count()
collect_h = active[active_code == '2'].groupby('hours')['behavior_type'].count()
buy_h = active[active_code == '4'].groupby('hours')['behavior_type'].count()
uv_h = active[active_code == '1'].groupby('hours')['user_id'].count()

attr_h = list(cart_h.index)
# Average over the 3 promotion days and round to whole numbers.
h1, h2, h3, h4 = (
    np.around(hourly.values / 3, decimals=0).tolist()
    for hourly in (cart_h, collect_h, buy_h, uv_h)
)

# Lower panel: cart / favourite / purchase lines.
h = Line(init_opts=opts.InitOpts(width="1000px", height="500px"))
h.add_xaxis(xaxis_data=attr_h)
for line_label, line_values in (
    ("加购人数", h1),
    ("收藏人数", h2),
    ("购买人数", h3),
):
    h.add_yaxis(
        line_label,
        line_values,
        label_opts=opts.LabelOpts(is_show=False),
    )
h.set_global_opts(
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=15)),
    title_opts=opts.TitleOpts(title="日均各时段活动用户行为", pos_top="48%"),
    legend_opts=opts.LegendOpts(pos_top="48%"),
)

# Upper panel: browse counts as bars.
bar = Bar()
bar.add_xaxis(xaxis_data=attr_h)
bar.add_yaxis(
    "浏览人数",
    h4,
    label_opts=opts.LabelOpts(is_show=False),
)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="活动pv对比数据"),
)

# Stack the two panels vertically in a single grid.
ggrid = Grid()
ggrid.add(bar, grid_opts=opts.GridOpts(pos_bottom="60%"))
ggrid.add(h, grid_opts=opts.GridOpts(pos_top="60%"))
ggrid.render_notebook()
from pyecharts.charts import Funnel
# Promotion-period conversion funnel: each stage is expressed as a percentage
# of the number of browse events.
a_pv = active[active.behavior_type == "1"]["user_id"].count()
a_cart = active[active.behavior_type == "3"]["user_id"].count()
a_collect = active[active.behavior_type == "2"]["user_id"].count()
a_buy = active[active.behavior_type == "4"]["user_id"].count()
a_attr = ["点击", "加入购物车", "收藏", "购买"]
# float() turns the NumPy scalars into native Python numbers for pyecharts.
values = [
    float(np.around(stage_count / a_pv * 100, 2))
    for stage_count in (a_pv, a_cart, a_collect, a_buy)
]
# Deliberately not named `data`: that name holds the behaviour DataFrame used
# by the rest of this script and must not be overwritten.
active_funnel_pairs = [[stage, share] for stage, share in zip(a_attr, values)]
a = (
    Funnel()
    .add(
        series_name="用户行为",
        data_pair=active_funnel_pairs,
        gap=2,
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b} : {c}%", is_show=True),
        label_opts=opts.LabelOpts(is_show=True, position="outside"),
        itemstyle_opts=opts.ItemStyleOpts(border_color="#fff", border_width=1),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="用户转化漏斗", subtitle="活动"))
)
a.render_notebook()
日常期间不同时段的用户行为分析
# Ordinary-day data: per-hour event counts for each behaviour type.
cart_d = daily[daily.behavior_type == '3'].groupby('hours')['behavior_type'].count()
collect_d = daily[daily.behavior_type == '2'].groupby('hours')['behavior_type'].count()
buy_d = daily[daily.behavior_type == '4'].groupby('hours')['behavior_type'].count()
uv_d = daily[daily.behavior_type == '1'].groupby('hours')['user_id'].count()
attr_d = list(cart_d.index)
# Average every series over the same 28 ordinary days. The browse series
# previously used the promotion period's divisor of 3, which inflated it
# relative to the other three series.
d1 = np.around(cart_d.values / 28, decimals=0).tolist()
d2 = np.around(collect_d.values / 28, decimals=0).tolist()
d3 = np.around(buy_d.values / 28, decimals=0).tolist()
d4 = np.around(uv_d.values / 28, decimals=0).tolist()
d = (
    Line(init_opts=opts.InitOpts(width="1000px", height="500px"))
    .add_xaxis(xaxis_data=attr_d)
    .add_yaxis(
        "加购人数",
        d1,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .add_yaxis(
        "收藏人数",
        d2,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .add_yaxis(
        "购买人数",
        d3,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=15)),
        # This chart shows ordinary days, so the title says "日常" rather than
        # the promotion chart's "活动".
        title_opts=opts.TitleOpts(title="日均各时段日常用户行为", pos_top="48%"),
        legend_opts=opts.LegendOpts(pos_top="48%"),
    )
)
y = (
    Bar()
    .add_xaxis(xaxis_data=attr_d)
    .add_yaxis(
        "浏览人数",
        d4,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="日常pv对比数据"),
    )
)
# Stack the bar panel above the line panel in a single grid.
ggrid = (
    Grid()
    .add(y, grid_opts=opts.GridOpts(pos_bottom="60%"))
    .add(d, grid_opts=opts.GridOpts(pos_top="60%"))
)
ggrid.render_notebook()
# Ordinary-day conversion funnel: each stage is expressed as a percentage of
# the number of browse events.
l_pv = daily[daily.behavior_type == "1"]["user_id"].count()
l_cart = daily[daily.behavior_type == "3"]["user_id"].count()
l_collect = daily[daily.behavior_type == "2"]["user_id"].count()
l_buy = daily[daily.behavior_type == "4"]["user_id"].count()
l_attr = ["点击", "加入购物车", "收藏", "购买"]
# float() turns the NumPy scalars into native Python numbers for pyecharts.
valuel = [
    float(np.around(stage_count / l_pv * 100, 2))
    for stage_count in (l_pv, l_cart, l_collect, l_buy)
]
datal = [[stage, share] for stage, share in zip(l_attr, valuel)]
dy = (
    Funnel()
    .add(
        series_name="用户行为",
        data_pair=datal,
        gap=2,
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b} : {c}%", is_show=True),
        label_opts=opts.LabelOpts(is_show=True, position="outside"),
        itemstyle_opts=opts.ItemStyleOpts(border_color="#fff", border_width=1),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="用户转化漏斗", subtitle="日常"))
)
dy.render_notebook()