1、需求
使用hql对以下日志进行分析,加工出2021-01-01到2022-02-01期间的次日留存率。
2022-02-01的次日留存率=2022-02-01登录的人中也在2022-02-02登录的人数/2022-02-01登录的人数
user_id | login_time |
c769fefc10eda98b | 2021-01-01 12:00:00 |
c769fefc10eda98b | 2021-01-02 11:00:00 |
c769fefc10eda98b | 2021-01-03 11:10:00 |
ec5e97ab48c92879 | 2021-01-01 12:00:00 |
ec5e97ab48c92879 | 2021-01-02 12:00:00 |
3d610dc3a2414000 | 2021-01-01 12:00:00 |
2、思路
求2021-01-01的次日留存,join后分组是这样的,找到差为1的id再去重就可以了,如果是七日留存,则差为7
user_id | login_time | |
c769fefc10eda98b | 2021-01-01 | 2021-01-02 |
c769fefc10eda98b | 2021-01-01 | 2021-01-03 |
c769fefc10eda98b | 2021-01-02 | 2021-01-03 |
c769fefc10eda98b | 2021-01-03 | NULL |
ec597ab48c92879 | 2021-01-01 | 2021-01-02 |
ec5e97ab48c92879 | 2021-01-02 | NULL |
3d610dc3a241400 | 2021-01-01 | NULL |
3、建表
drop table tmp.test_liucun;
create table tmp.test_liucun(
user_id string,
login_time string
)
row format delimited
fields terminated by "\t";
insert into table tmp.test_liucun
select "c769fefc10eda98b","2021-01-01 12:00:00"
union select"c769fefc10eda98b","2021-01-02 11:00:00"
union select "c769fefc10eda98b","2021-01-03 11:10:00"
union select "ec5e97ab48c92879","2021-01-01 12:00:00"
union select "ec5e97ab48c92879","2021-01-02 12:00:00"
union select "3d610dc3a2414000","2021-01-01 12:00:00"
;
4、实现
select
t1.dt,
count(distinct if(datediff(t2.dt,t1.dt)=1,t2.user_id,null))/count(distinct t1.user_id) cnt
--count(distinct if(datediff(t2.dt,t1.dt)=7,t2.user_id,null))/count(distinct t1.user_id) cnt 如果求7日留存就是差为7
from
(
select user_id,date_format(login_time,"yyyy-MM-dd") dt
from tmp.test_liucun
where date_format(login_time,"yyyy-MM-dd")>="2021-01-01" and date_format(login_time,"yyyy-MM-dd")<=date_sub("2021-02-01",1)
)t1
left join
(
select user_id,date_format(login_time,"yyyy-MM-dd") dt
from tmp.test_liucun
where date_format(login_time,"yyyy-MM-dd")>="2021-01-02" and date_format(login_time,"yyyy-MM-dd")<="2021-02-01"
)t2
on t1.user_id=t2.user_id and t1.dt<t2.dt
group by t1.dt
;