间隔连续问题
题目
数据
1001 2021-12-12
1002 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1002 2021-12-16
1001 2021-12-19
1002 2021-12-17
1001 2021-12-20
建表
create table if not exists intervalcontinuous(
id int,
dt date
)row format delimited fields terminated by '\t';
加载数据
load data local inpath '/opt/module/data/hive-interviews/interval-continuous' into table intervalcontinuous;
解题步骤
步骤1 将上一行数据下移
sql语句
select
id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) lagdt
from intervalcontinuous
结果
id dt lagdt
1001 2021-12-12 1970-01-01
1001 2021-12-13 2021-12-12
1001 2021-12-14 2021-12-13
1001 2021-12-16 2021-12-14
1001 2021-12-19 2021-12-16
1001 2021-12-20 2021-12-19
1002 2021-12-12 1970-01-01
1002 2021-12-16 2021-12-12
1002 2021-12-17 2021-12-16
步骤2 将时间上相邻的两行数据相减求出间隔的天数
sql语句
select id,dt,datediff(dt,lagdt) intervaldays
from(
select
id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) lagdt
from intervalcontinuous
)t1
结果
id dt intervaldays
1001 2021-12-12 18973
1001 2021-12-13 1
1001 2021-12-14 1
1001 2021-12-16 2
1001 2021-12-19 3
1001 2021-12-20 1
1002 2021-12-12 18973
1002 2021-12-16 4
1002 2021-12-17 1
步骤3 开窗求第一行到当前行大于等于2的个数sumif
sql语句
select
id,dt,sum(if(intervaldays>=2,1,0)) over(partition by id order by dt) groupid
from(
select id,dt,datediff(dt,lagdt) intervaldays
from(
select
id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) lagdt
from intervalcontinuous
)t1
)t2
结果
id dt groupid
1001 2021-12-12 1
1001 2021-12-13 1
1001 2021-12-14 1
1001 2021-12-16 2
1001 2021-12-19 3
1001 2021-12-20 3
1002 2021-12-12 1
1002 2021-12-16 2
1002 2021-12-17 2
步骤4 按照id和groupid排序,求组内count
select id,count(groupid) days
from(
select
id,dt,sum(if(intervaldays>2,1,0)) over(partition by id order by dt) groupid
from(
select id,dt,datediff(dt,lagdt) intervaldays
from(
select
id,dt,lag(dt,1,‘1970-01-01’) over(partition by id order by dt) lagdt
from intervalcontinuous
)t1
)t2
)t3
group by id,groupid
id days
1001 4
1001 2
1002 1
1002 2
步骤5 求最大连续登陆日期数据
select t4.id,max(t4.days)
from(
select id,count(groupid) days
from(
select
id,dt,sum(if(intervaldays>2,1,0)) over(partition by id order by dt) groupid
from(
select id,dt,datediff(dt,lagdt) intervaldays
from(
select
id,dt,lag(dt,1,‘1970-01-01’) over(partition by id order by dt) lagdt
from intervalcontinuous
)t1
)t2
)t3
group by id,groupid
)t4
group by id;
t4.id _c1
1001 4
1002 2