需求分析
需求:求用户1日、3日、7日留存率
概念:
第N日活跃用户留存率:以基准日的活跃用户数据为主,第N日后依然活跃的用户占基准日活跃用户的比例
- 第1日留存率(即次日留存):(在基准日当天活跃的用户中,基准日之后的第1天还活跃的用户数)/基准日当天活跃用户数
- 第3日留存率:(在基准日当天活跃的用户中,基准日之后的第3天还活跃的用户数)/基准日当天活跃用户数
- 第7日留存率:(在基准日当天活跃的用户中,基准日之后的第7天还活跃的用户数)/基准日当天活跃用户数
数据如下:
解题步骤
1、创建表
CREATE EXTERNAL TABLE user_log
(
`user_id` STRING COMMENT '用户id',
`login_dt` STRING COMMENT '登录时间'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
1、进行表的自身关联,过滤出右表日期大于左表日期的数据
select t1.`user_id` t1_user_id,
t1.`dt` t1_dt,
t2.`user_id` t2_user_id,
t2.`dt` t2_dt
from (
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t1
left join
(
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t2
on t1.`user_id` = t2.`user_id`
where t2.`dt` >= t1.`dt`;
3、根据数据日期计算出N日留存用户数,核心就是datediff函数的使用
select `t1_user_id`,
datediff(end_date,start_date) dn,
`start_date`,
`end_date`
from (
select t1.`user_id` t1_user_id,
t1.`dt` start_date,
t2.`user_id` t2_user_id,
t2.`dt` end_date
from (
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t1
left join
(
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t2
on t1.`user_id` = t2.`user_id`
where t2.`dt` >= t1.`dt`
) t3;
4、计算用户留存数
select
`start_date`,
count( distinct t1_user_id) base_cnt,
count(if( dn = 1,t1_user_id,null)) remain_1d,
count(if( dn = 3,t1_user_id,null)) remain_3d,
count(if( dn = 7,t1_user_id,null)) remain_7d
from (
select `t1_user_id`,
datediff(end_date,start_date) dn,
`start_date`,
`end_date`
from (
select t1.`user_id` t1_user_id,
t1.`dt` start_date,
t2.`user_id` t2_user_id,
t2.`dt` end_date
from (
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t1
left join
(
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t2
on t1.`user_id` = t2.`user_id`
where t2.`dt` >= t1.`dt`
) t3
) t4
group by `start_date`;
5、最终计算n日留存率 = n日用户留存数/基准日活跃用户数
select `start_date`,
`base_cnt`,
`remain_1d`,
`remain_3d`,
`remain_7d`,
concat( round(remain_1d / base_cnt * 100,2),'%') remain_1d_ratio,
concat( round(remain_3d / base_cnt * 100,2),'%') remain_3d_ratio,
concat( round(remain_7d / base_cnt * 100,2),'%') remain_7d_ratio
from (
select `start_date`,
count(distinct t1_user_id) base_cnt,
count(if(dn = 1, t1_user_id, null)) remain_1d,
count(if(dn = 3, t1_user_id, null)) remain_3d,
count(if(dn = 7, t1_user_id, null)) remain_7d
from (
select `t1_user_id`,
datediff(end_date, start_date) dn,
`start_date`,
`end_date`
from (
select t1.`user_id` t1_user_id,
t1.`dt` start_date,
t2.`user_id` t2_user_id,
t2.`dt` end_date
from (
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t1
left join
(
select `user_id`,
to_date(`login_dt`) dt
from user_log
group by user_id, to_date(`login_dt`)
) t2
on t1.`user_id` = t2.`user_id`
where t2.`dt` >= t1.`dt`
) t3
) t4
group by `start_date`
) t5;
优化
with base_info as (
select
user_id,
to_date(login_dt) dt
from user_log
group by user_id, to_date(login_dt)
)
select
start_date,
remain1_cn,
remain3_cn,
remain7_cn,
concat(round(remain1_cn / base_count * 100,2),'%') remain1_ratio,
concat(round(remain3_cn / base_count * 100,2),'%') remain3_ratio,
concat(round(remain7_cn / base_count * 100,2),'%') remain7_ratio
from (
select
b.dt start_date,
count(b.user_id) base_count,
count(t1.user_id) remain1_cn,
count(t2.user_id) remain3_cn,
count(t3.user_id) remain7_cn
from base_info b
left join base_info t1 on b.user_id = t1.user_id and date_add(b.dt,1) = t1.dt -- 关联1日留存的数据
left join base_info t2 on b.user_id = t2.user_id and date_add(b.dt,3) = t2.dt -- 关联3日留存的数据
left join base_info t3 on b.user_id = t3.user_id and date_add(b.dt,7) = t3.dt -- 关联7日留存的数据
group by b.dt
) t4;