经常遇到统计连续行为日期及连续天数的问题,之前的解决方案相对复杂,本次给出最简单的解决方案。
1、测试数据:
use xxx;
drop table test_serialdate;
create table if not exists test_serialdate (
user_id string comment '用户ID',
sign_date string comment '签到日期'
) stored as orcfile
;
insert into table test_serialdate
select 'u01' as user_id, '2022-01-01' as sign_date union all
select 'u01' as user_id, '2022-01-02' as sign_date union all
select 'u01' as user_id, '2022-01-05' as sign_date union all
select 'u01' as user_id, '2022-01-06' as sign_date union all
select 'u01' as user_id, '2022-01-08' as sign_date union all
select 'u01' as user_id, '2022-01-09' as sign_date union all
select 'u01' as user_id, '2022-01-10' as sign_date union all
select 'u01' as user_id, '2022-01-11' as sign_date union all
select 'u01' as user_id, '2022-01-17' as sign_date union all
select 'u01' as user_id, '2022-01-18' as sign_date
;
2、思路:
行为日期 | 排序 | date_sub(行为日期,序号) | 天数(分组count) |
---|---|---|---|
2022-01-02 | 1 | 2022-01-01 | 3天 |
2022-01-03 | 2 | 2022-01-01 | |
2022-01-04 | 3 | 2022-01-01 | |
2022-01-07 | 4 | 2022-01-03 | 1天 |
2022-01-15 | 5 | 2022-01-10 | 5天 |
2022-01-16 | 6 | 2022-01-10 | |
2022-01-17 | 7 | 2022-01-10 | |
2022-01-18 | 8 | 2022-01-10 | |
2022-01-19 | 9 | 2022-01-10 | |
2022-01-29 | 10 | 2022-01-19 | 1天 |
3、代码:
--统计每次连续打卡的次数
select user_id,sign_group,
min(sign_date) as sign_start_date,
max(sign_date) as sign_end_date,
count(1) as days
from
( --打卡日期-排序序号,若连续打卡则结果相同
select user_id,sign_date,
date_sub(sign_date,rk) as sign_group --连续行为则日期相同,用来分组
from
( --打卡记录(先排重),升序排序
select user_id,sign_date,
rank() over(partition by user_id order by sign_date asc) as rk
from test_serialdate
) tc1
) tc2
group by user_id,sign_group
;