流式问题
题目
数据
1001 2021-06-14 12:12:12 2021-06-14 18:12:12
1003 2021-06-14 13:12:12 2021-06-14 16:12:12
1004 2021-06-14 13:15:12 2021-06-14 20:12:12
1002 2021-06-14 15:12:12 2021-06-14 16:12:12
1005 2021-06-14 15:18:12 2021-06-14 20:12:12
1001 2021-06-14 20:12:12 2021-06-14 23:12:12
1006 2021-06-14 21:12:12 2021-06-14 23:15:12
1007 2021-06-14 22:12:12 2021-06-14 23:10:12
建表
create table if not exists simultaneous(
id int,
sdt timestamp,
edt timestamp
)row format delimited fields terminated by '\t';
加载数据
load data local inpath '/opt/module/data/hive-interviews/simultaneous' into table simultaneous;
解题步骤
解题步骤1 对数据进行分类,在开始数据后添加1,在下线数据后添加-1
sql语句
select id,sdt dt,1 p from simultaneous
union
select id,edt dt,-1 p from simultaneous
结果
_u1.id _u1.dt _u1.p
1001 2021-06-14 12:12:12 1
1001 2021-06-14 18:12:12 -1
1001 2021-06-14 20:12:12 1
1001 2021-06-14 23:12:12 -1
1002 2021-06-14 15:12:12 1
1002 2021-06-14 16:12:12 -1
1003 2021-06-14 13:12:12 1
1003 2021-06-14 16:12:12 -1
1004 2021-06-14 13:15:12 1
1004 2021-06-14 20:12:12 -1
1005 2021-06-14 15:18:12 1
1005 2021-06-14 20:12:12 -1
1006 2021-06-14 21:12:12 1
1006 2021-06-14 23:15:12 -1
1007 2021-06-14 22:12:12 1
1007 2021-06-14 23:10:12 -1
解题步骤2 按照时间顺序,累加计算人数
sql语句
select
id,dt,sum(p) over(order by dt) sum
from(
select id,sdt dt,1 p from simultaneous
union
select id,edt dt,-1 p from simultaneous
)t1
结果
id dt sum
1001 2021-06-14 12:12:12 1
1003 2021-06-14 13:12:12 2
1004 2021-06-14 13:15:12 3
1002 2021-06-14 15:12:12 4
1005 2021-06-14 15:18:12 5
1002 2021-06-14 16:12:12 3
1003 2021-06-14 16:12:12 3
1001 2021-06-14 18:12:12 2
1001 2021-06-14 20:12:12 1
1004 2021-06-14 20:12:12 1
1005 2021-06-14 20:12:12 1
1006 2021-06-14 21:12:12 2
1007 2021-06-14 22:12:12 3
1007 2021-06-14 23:10:12 2
1001 2021-06-14 23:12:12 1
1006 2021-06-14 23:15:12 0
解题步骤3 求同时在线最大人数
sql语句
select max(sum)
from(
select
id,dt,sum(p) over(order by dt) sum
from(
select id,sdt dt,1 p from simultaneous
union
select id,edt dt,-1 p from simultaneous
)t1
)t2
结果
OK
_c0
5