间隔连续问题本质上是分区问题,即将对每个id每次连续登录分为一个区,然后根据最大登录日期减最小登陆日期 + 1
---首先按照id和登录时间(substring)去重,然后使用lag开窗
with
t1 as (
SELECT
user_id,
substring(login_datetime, 0, 10) as login_day
FROM
login_events
group by
user_id,
substring(login_datetime, 0, 10)
),
----然后使用lag开窗,按照user_id分区,login_day生序排序,取上一个日期,
---计算与当前login_day的差值,大于2则记为1都则为0
t2 as (
SELECT
user_id,
login_day,
if (
datediff (
login_day,
lag (login_day) over (
PARTITION by
user_id
order by
login_day asc
)
) > 2,
1,
0
) flag
FROM
t1
), ----按照id分组,day排序,使用sum函数累加flag,对每次连续登录分区号,
t3 as (
SELECT
user_id,
login_day,
sum(flag) over (
PARTITION by
user_id
order by
login_day asc
) as par
FROM
t2
),
-------按照user_id,par分组,由最大登录天-最小+1,求得每个用户的连续登录天数
t4 as (
select
user_id,
datediff (max(login_day), min(login_day)) + 1 as day_count
from
t3
group by
user_id,
par
)
--------按照user_id分组,求得每个用户的最长连续登录天数
SELECT
user_id,
max(day_count) as max_day_count
FROM
t4
GROUP by
user_id