数据分析部分

我不是老白干

于 2021-07-02 22:44:30 发布

阅读量566

点赞数 2

文章标签：数据分析

本文链接：https://blog.csdn.net/qdkgood/article/details/118424432

版权

第一部分

1、数据处理

1.1读取数据

取前500万条数据

# Read the first 5,000,000 rows of the behaviour log, supplying the column
# names and column types explicitly.
data <- read_csv(
  file = "E:/UserBehavior.csv",
  col_names = c(
    "user_id", "item_id", "item_category", "behavior_type", "time"
  ),
  # compact column spec, one letter per column: d = double, c = character
  col_types = "dddcd",
  n_max = 5000000
)
# Inspect the structure of the imported table
str(data)

输出结果

spec_tbl_df [5,000,000 x 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ user_id      : num [1:5000000] 1 1 1 1 1 1 1 1 1 1 ...
 $ item_id      : num [1:5000000] 2268318 2333346 2576651 3830808 4365585 ...
 $ item_category: num [1:5000000] 2520377 2520771 149192 4181361 2520377 ...
 $ behavior_type: chr [1:5000000] "pv" "pv" "pv" "pv" ...
 $ time         : num [1:5000000] 1.51e+09 1.51e+09 1.51e+09 1.51e+09 1.51e+09 ...
 - attr(*, "spec")=
  .. cols(
  ..   user_id = col_double(),
  ..   item_id = col_double(),
  ..   item_category = col_double(),
  ..   behavior_type = col_character(),
  ..   time = col_double()
  .. )

字段解释

字段名	解释
user_id：	用户ID
item_id：	商品ID
behavior_type：	行为类型，包括"pv"、"cart"、"fav"、"buy"，分别对应"点击"、"加入购物车"、"收藏"、"支付"
item_category：	商品所属的品类ID
time：	行为发生时间,是时间戳格式

1.2查看数据集缺失值情况

# Missing values per column: first as a summary table (count and percentage),
# then as a chart with the percentage shown.
naniar::miss_var_summary(data = data)
naniar::gg_miss_var(x = data, show_pct = TRUE)

输出结果

# A tibble: 5 x 3
  variable      n_miss pct_miss
  <chr>          <int>    <dbl>
1 user_id            0        0
2 item_id            0        0
3 item_category      0        0
4 behavior_type      0        0
5 time               0        0

表示没有缺失值，图就不放了

1.3time格式转换

第一步：转换格式

第二步：将time列分裂为date和hour列，并转换数据类型

# Convert the epoch-second `time` column into a calendar date and an hour.
# The time zone is pinned so that the derived date/hour do not depend on the
# time zone of the machine running the script.
# NOTE(review): Asia/Shanghai (UTC+8) is assumed for this log — confirm.
event_time <- as.POSIXct(
  data$time,
  origin = "1970-01-01",
  tz = "Asia/Shanghai"
)

# Format each part explicitly instead of splitting the printed timestamp on a
# space: from R 4.3.0, as.character() omits the time part of a POSIXct value
# that falls exactly on midnight, which would leave `hour` as NA for such rows.
data$date <- format(event_time, "%Y-%m-%d")
data$hour <- as.integer(format(event_time, "%H"))

# Drop the raw timestamp; `date` (character) and `hour` (integer) remain as
# the last two columns.
data$time <- NULL

转换后的数据

> head(data)
# A tibble: 6 x 6
  user_id item_id item_category behavior_type date        hour
    <dbl>   <dbl>         <dbl> <chr>         <chr>      <int>
1       1 2268318       2520377 pv            2017-11-25     1
2       1 2333346       2520771 pv            2017-11-25     6
3       1 2576651        149192 pv            2017-11-25     9
4       1 3830808       4181361 pv            2017-11-25    15
5       1 4365585       2520377 pv            2017-11-25    15
6       1 4606018       2735466 pv            2017-11-25    21

1.4除去时间异常值

# Keep only rows dated strictly between the two bounds, i.e. 2017-11-26
# through 2017-12-02. The bounds are single-quoted SQL string literals: a
# double-quoted token is an identifier in SQL, and sqldf's default SQLite
# backend only treats it as a string as a fallback when no such column exists.
data <- sqldf(
  "SELECT * FROM data WHERE date > '2017-11-25' AND date < '2017-12-03'"
)

2.用户行为分析

2.1日访问量分析

PV(访问量)：即Page View，指网站页面的浏览量或点击量，页面被刷新一次就计算一次。

日访问量分析

# Daily page-view (PV) analysis
# Lazy data.table views of the cleaned data, without rows whose date / hour is
# missing. An explicit is.na() filter is used because na.omit(x, date) does
# not restrict the check to the `date` column, and on a lazy_dt object it
# appears to fall through to the default method, which returns the object
# unchanged — TODO confirm.
daily <- data %>%
  lazy_dt() %>%
  filter(!is.na(date))
hourly <- data %>%
  lazy_dt() %>%
  filter(!is.na(hour))

# number of behaviour records on each calendar day
pv.daily <- daily %>%
  group_by(date) %>%
  summarise(pv = n()) %>%
  as_tibble()

# step chart of daily PV; the x-axis title is blanked
p1 <- ggplot(pv.daily, aes(as.Date(date), pv)) +
  geom_step(size = 1) +
  theme_bw() +
  labs(x = "")

2.2日访客分析

UV(独立访客)：即Unique Visitor，访问网站的一台电脑客户端为一个访客。

日访客分析

# Daily unique-visitor (UV) analysis: reduce to one row per (date, user),
# then count the rows for each date.
uv.daily <- daily %>%
  distinct(date, user_id) %>%
  group_by(date) %>%
  summarise(uv = n()) %>%
  as_tibble()

# step chart of daily UV; the x-axis title is blanked
p2 <- ggplot(uv.daily, aes(as.Date(date), uv)) +
  geom_step(size = 1) +
  theme_bw() +
  labs(x = "")

# UV of the busiest day as a share of all distinct users in the data
peak.uv <- max(uv.daily$uv)
total.users <- length(unique(data$user_id))
ratio <- peak.uv / total.users
print(scales::percent(ratio, accuracy = 0.1))

# PV chart stacked above the UV chart
p1 + p2 + plot_layout(ncol = 1)

在这里插入图片描述

2.3时段访问量分析

# Hourly analysis: number of records per hour of day and behaviour type
pv.hour <- hourly %>%
  group_by(hour, behavior_type) %>%
  summarise(pv = n()) %>%
  as_tibble()

# every behaviour type except "pv", used by the layers of the lower panel
pv.hour.rest <- subset(pv.hour, behavior_type != "pv")

# axis, theme and label settings shared by both panels
panel.style <- list(
  scale_x_continuous(breaks = 0:23),
  theme_bw(),
  theme(legend.position = "bottom"),
  labs(x = "")
)

# upper panel: all behaviour types
p3 <- ggplot(pv.hour, aes(hour, pv, col = behavior_type)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  panel.style

# lower panel: same mapping, but the layers draw only the non-"pv" rows
p4 <- ggplot(pv.hour, aes(hour, pv, col = behavior_type)) +
  geom_line(data = pv.hour.rest, size = 1) +
  geom_point(data = pv.hour.rest, size = 2) +
  panel.style

# stack the two panels in a single column
p3 + p4 + plot_layout(ncol = 1)

输出结果

在这里插入图片描述

四条线分别代表点击、收藏、加购物车和支付。从上图中可以看到点击的次数远高于其他三种行为，以至于其他三种行为的趋势看不出来，所以下图中去掉了行为点击的曲线。
从整体上看，四种行为的波动情况基本一致，并且在晚上7点-10点间pv量最高。
同时，从下图中也可以看到，加购物车的总量高于收藏，收藏又高于支付。

3.用户消费行为分析

3.1活跃用户每天购买次数情况分析

 # Purchase-count analysis: number of "buy" records for each user
buy.daily <- daily %>%
  filter(behavior_type == "buy") %>%
  group_by(user_id) %>%
  summarise(n = n()) %>%
  as_tibble()

# Bar chart of how many users have each purchase count. geom_bar() counts the
# rows at each x value by default; geom_histogram(stat = "count") draws the
# same bars but also passes binning parameters that the count stat ignores.
ggplot(buy.daily, aes(n)) +
  geom_bar() +
  theme_bw() +
  labs(x = "")

在这里插入图片描述

用户消费次数分布

淘宝用户消费次数普遍在10次以下，因此需要重点关注消费次数在10次以上的用户。

3.2活跃用户每天人均消费次数

每天消费总次数 / 每天消费总人数

 # Average purchases per buyer, by day: purchase records / distinct buyers
# number of "buy" records on each date
daily.totle <- daily %>%
  filter(behavior_type == "buy") %>%
  group_by(date) %>%
  summarise(totle = n()) %>%
  as_tibble()

# number of distinct users with a "buy" record on each date
daily.count <- daily %>%
  filter(behavior_type == "buy") %>%
  distinct(date, user_id) %>%
  group_by(date) %>%
  summarise(count = n()) %>%
  as_tibble()

# The two tables have only the `date` column in common, so the join matches
# on it; freq is purchases per buyer for that day.
daily.freq <- mutate(
  full_join(daily.totle, daily.count),
  freq = totle / count
)
ggplot(daily.freq, aes(as.Date(date), freq)) +
  geom_line(size = 1) +
  theme_bw() +
  labs(x = "", y = "")

在这里插入图片描述

可以看到每天的平均消费次数一般都在1.52次左右,11.29日的最高

3.3付费率

每日消费总人数 / 每日总活跃人数（每日有操作行为的人数）

 # Daily paying rate: distinct buyers / distinct active users
# distinct users with at least one record of any behaviour type, per date
daily.user <- daily %>%
  distinct(date, user_id) %>%
  group_by(date) %>%
  summarise(s = n()) %>%
  as_tibble()

# distinct users with at least one "buy" record, per date
# (a count of buyers, not of purchase records)
daily.buy <- daily %>%
  filter(behavior_type == "buy") %>%
  distinct(date, user_id) %>%
  group_by(date) %>%
  summarise(b = n()) %>%
  as_tibble()

# The two tables have only the `date` column in common, so the join matches
# on it; prob is the share of that day's active users who bought.
daily.rate <- mutate(
  full_join(daily.buy, daily.user),
  prob = b / s
)
ggplot(daily.rate, aes(as.Date(date), prob)) +
  geom_line(size = 1) +
  theme_bw() +
  labs(x = "", y = "")

在这里插入图片描述

用户付费率在19%左右

3.4同一时间段用户消费次数分布

 # Distribution of purchase counts across users over the analysed period
# number of "buy" records for each user
user.buy <- data %>%
  lazy_dt() %>%
  filter(behavior_type == "buy") %>%
  group_by(user_id) %>%
  summarise(n = n()) %>%
  as_tibble()

# min / quartiles / mean / max of the per-user purchase counts
summary(user.buy$n)

# density curve of the per-user purchase counts, filled and outlined in gray
ggplot(user.buy, aes(n)) +
  geom_density(
    outline.type = "full",
    fill = "gray",
    size = 2,
    col = "gray"
  ) +
  theme_bw() +
  labs(x = "", y = "")

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   2.000   3.008   4.000  84.000

在这里插入图片描述

同时段消费次数分布

大多数用户消费次数为1次，平均消费次数为3.008，75%用户消费次数在4次以下。

4.复购情况分析

复购率 = 有复购行为的用户数 / 有购买行为的用户总数

> head(daily %>% arrange(date) %>% as_tibble() %>% .[c(1,5,6)])
# A tibble: 6 x 3
  user_id date        hour
    <dbl> <chr>      <int>
1       1 2017-11-26     5
2     100 2017-11-26    15
3     100 2017-11-26    15
4     100 2017-11-26    16
5     100 2017-11-26    16
6     100 2017-11-26    16
> tail(daily %>% arrange(date) %>% as_tibble() %>% .[c(1,5,6)])
# A tibble: 6 x 3
  user_id date        hour
    <dbl> <chr>      <int>
1  309813 2017-12-02    21
2  309813 2017-12-02    21
3  309813 2017-12-02    21
4  309813 2017-12-02    21
5  309813 2017-12-02    21
6  309816 2017-12-02    19

先按date和user_id去重，然后按user_id计数，大于1时即为复购行为：

# One row per (date, buyer), then the number of distinct purchase dates for
# each user.
daily.rebuy <- daily %>%
  filter(behavior_type == "buy") %>%
  distinct(date, user_id) %>%
  group_by(user_id) %>%
  summarise(n = n()) %>%
  as_tibble()

# share of buyers who purchased on more than one date
rebuy.ratio <- mean(daily.rebuy$n > 1)
scales::percent(rebuy.ratio, accuracy = 0.1)

[1] "45.8%"

# Bar chart: number of buyers for each count of distinct purchase dates
ggplot(data = daily.rebuy, mapping = aes(x = n)) +
  geom_bar(stat = "count") +
  scale_x_continuous(breaks = seq_len(30)) +
  theme_bw() +
  labs(x = "", y = "")

在这里插入图片描述

复购情况分布

统计周期（2017-11-26 至 2017-12-02，共7天）内的复购率为45.8%，多数用户在此期间有购买行为的天数为1-5天。

5.漏斗流失分析

反映用户行为状态从起点（点击）到终点（支付）各阶段的转化率情况。
将数据按商品（item_id）和用户行为分组，然后计算各行为的次数。

# Funnel analysis
# number of records for every (item, behaviour type) pair
data.category <- data %>%
  lazy_dt() %>%
  group_by(item_id, behavior_type) %>%
  summarise(n = n()) %>%
  as_tibble()

# Share of all records contributed by each behaviour type. The per-pair counts
# in `n` are summed here; tabulating data.category$behavior_type directly
# would instead count how many items show each behaviour and ignore `n`.
# NOTE(review): figures read off the earlier, item-level chart will differ
# from the shares produced here.
behavior.share <- data.category %>%
  group_by(behavior_type) %>%
  summarise(Freq = sum(n)) %>%
  mutate(Freq = Freq / sum(Freq))

ggplot(behavior.share, aes(behavior_type, Freq)) +
  geom_bar(stat = "identity", fill = "gray60") +
  geom_text(
    aes(label = scales::percent(Freq, accuracy = 0.1)),
    col = "blue"
  ) +
  # `limits` puts the bars in funnel order instead of alphabetical order
  scale_x_discrete(
    limits = c("pv", "cart", "fav", "buy"),
    breaks = c("pv", "cart", "fav", "buy"),
    labels = c("点击", "加入购物车", "收藏", "支付")
  ) +
  theme_bw() +
  labs(x = "", y = "")

在这里插入图片描述

> scales::percent(12.7 / 74.5,accuracy = 0.001)
[1] "17.047%"
> scales::percent(5.1 / 12.7,accuracy = 0.001)
[1] "40.157%"

用户点击后，大约有17.047%的概率会加入购物车，而加入购物车后大约有40.157%的概率会支付。

参考：
作者：wonphen
链接：https://www.jianshu.com/p/530f62ee4e8e

我不是老白干

关注

2
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
数据分析部分

第一部分1、数据处理1.1读取数据取前500万条数据#读取指定的文件并规定行数data<-read_csv("E:/UserBehavior.csv", n_max = 5000000, col_names = c("user_id","item_id" ,"item_category" ,"behavior_type" ,"time"), col_types = list(col_double(), col_double(),
复制链接

扫一扫