sql题:同时在线问题深度剖析
需求
如下为某直播平台主播开播及关播时间,根据该数据计算出平台最高峰同时在线的主播人数及时间段(stt: 2021-06-14 12:12:12 edt: 2021-06-14 20:12:12)
# id stt edte
1001 2021-06-14 12:12:12 2021-06-14 18:12:12
1003 2021-06-14 13:12:12 2021-06-14 16:12:12
1004 2021-06-14 13:15:12 2021-06-14 20:12:12
1002 2021-06-14 15:12:12 2021-06-14 16:12:12
1005 2021-06-14 15:18:12 2021-06-14 20:12:12
1007 2021-06-14 15:22:12 2021-06-14 15:45:56
1012 2021-06-14 15:45:56 2021-06-14 16:45:56
1008 2021-06-14 16:18:12 2021-06-14 18:32:43
1011 2021-06-14 18:32:43 2021-06-14 19:56:37
1001 2021-06-14 20:12:12 2021-06-14 23:12:12
1006 2021-06-14 21:12:12 2021-06-14 23:15:12
1007 2021-06-14 22:12:12 2021-06-14 23:10:12
1013 2021-06-15 00:10:12 2021-06-15 06:10:12
1014 2021-06-15 01:34:12 2021-06-15 05:08:12
1015 2021-06-15 01:34:12 2021-06-15 05:08:12
1016 2021-06-15 02:12:12 2021-06-15 04:40:12
1017 2021-06-15 04:19:12 2021-06-15 04:59:12
1018 2021-06-15 03:25:12 2021-06-15 04:43:12
解答
建库
create database if not exists test comment '测试用库';
建表
create table if not exists test.anchor_online_and_offline_record(
id bigint comment '主播id',
stt string comment '上线时间',
edt string comment '下线时间'
) comment '直播平台主播上下线记录表'
row format delimited fields terminated by '\t'
stored as orc;
插入数据
insert into table test.anchor_online_and_offline_record values(1001,'2021-06-14 12:12:12','2021-06-14 18:12:12');
insert into table test.anchor_online_and_offline_record values(1003,'2021-06-14 13:12:12','2021-06-14 16:12:12');
insert into table test.anchor_online_and_offline_record values(1004,'2021-06-14 13:15:12','2021-06-14 20:12:12');
insert into table test.anchor_online_and_offline_record values(1002,'2021-06-14 15:12:12','2021-06-14 16:12:12');
insert into table test.anchor_online_and_offline_record values(1005,'2021-06-14 15:18:12','2021-06-14 20:12:12');
insert into table test.anchor_online_and_offline_record values(1007,'2021-06-14 15:22:12','2021-06-14 15:45:56');
insert into table test.anchor_online_and_offline_record values(1012,'2021-06-14 15:45:56','2021-06-14 16:45:56');
insert into table test.anchor_online_and_offline_record values(1008,'2021-06-14 16:18:12','2021-06-14 18:32:43');
insert into table test.anchor_online_and_offline_record values(1011,'2021-06-14 18:32:43','2021-06-14 19:56:37');
insert into table test.anchor_online_and_offline_record values(1001,'2021-06-14 20:12:12','2021-06-14 23:12:12');
insert into table test.anchor_online_and_offline_record values(1006,'2021-06-14 21:12:12','2021-06-14 23:15:12');
insert into table test.anchor_online_and_offline_record values(1007,'2021-06-14 22:12:12','2021-06-14 23:10:12');
insert into table test.anchor_online_and_offline_record values(1013,'2021-06-15 00:10:12','2021-06-15 06:10:12');
insert into table test.anchor_online_and_offline_record values(1014,'2021-06-15 01:34:12','2021-06-15 05:08:12');
insert into table test.anchor_online_and_offline_record values(1015,'2021-06-15 01:34:12','2021-06-15 05:08:12');
insert into table test.anchor_online_and_offline_record values(1016,'2021-06-15 02:12:12','2021-06-15 04:40:12');
insert into table test.anchor_online_and_offline_record values(1017,'2021-06-15 04:19:12','2021-06-15 04:59:12');
insert into table test.anchor_online_and_offline_record values(1018,'2021-06-15 03:25:12','2021-06-15 04:43:12');
insert into table test.anchor_online_and_offline_record values(1001,'2021-06-16 12:12:12','2021-06-16 18:12:12');
insert into table test.anchor_online_and_offline_record values(1003,'2021-06-16 13:12:12','2021-06-16 16:12:12');
insert into table test.anchor_online_and_offline_record values(1004,'2021-06-16 13:15:12','2021-06-16 20:12:12');
insert into table test.anchor_online_and_offline_record values(1002,'2021-06-16 15:12:12','2021-06-16 16:12:12');
insert into table test.anchor_online_and_offline_record values(1005,'2021-06-16 15:18:12','2021-06-16 20:12:12');
insert into table test.anchor_online_and_offline_record values(1007,'2021-06-16 15:22:12','2021-06-16 15:45:56');
insert into table test.anchor_online_and_offline_record values(1012,'2021-06-16 15:45:56','2021-06-16 16:45:56');
insert into table test.anchor_online_and_offline_record values(1008,'2021-06-16 16:18:12','2021-06-16 18:32:43');
insert into table test.anchor_online_and_offline_record values(1011,'2021-06-16 18:32:43','2021-06-16 19:56:37');
insert into table test.anchor_online_and_offline_record values(1001,'2021-06-16 20:12:12','2021-06-16 23:12:12');
insert into table test.anchor_online_and_offline_record values(1006,'2021-06-16 21:12:12','2021-06-16 23:15:12');
insert into table test.anchor_online_and_offline_record values(1007,'2021-06-16 22:12:12','2021-06-16 23:10:12');
insert into table test.anchor_online_and_offline_record values(1013,'2021-06-17 00:10:12','2021-06-17 06:10:12');
insert into table test.anchor_online_and_offline_record values(1014,'2021-06-17 01:34:12','2021-06-17 05:08:12');
insert into table test.anchor_online_and_offline_record values(1015,'2021-06-17 01:34:12','2021-06-17 05:08:12');
insert into table test.anchor_online_and_offline_record values(1016,'2021-06-17 02:12:12','2021-06-17 04:40:12');
insert into table test.anchor_online_and_offline_record values(1017,'2021-06-17 04:19:12','2021-06-17 04:59:12');
insert into table test.anchor_online_and_offline_record values(1018,'2021-06-17 03:25:12','2021-06-17 04:43:12');
查询sql
-- 合并上线时间和下线时间,然后上线+1,下线-1,统计同时在线人数
--sum()排序累加数据的时候会出现桶一时间点上下线情况,比如,当前在线人数是最大的10人,在下一秒同时上线3个和下线3个,
--那么最高在线人数没变化
drop table if exists test.anchor_online_and_offline_record_temp;
create table if not exists test.anchor_online_and_offline_record_temp as
select id,dt,is_online,
sum(flag) over(order by dt) as online_num,
row_number() over(order by dt) as rn
from (select id,stt as dt,'Y' as is_online,1 as flag
from test.anchor_online_and_offline_record
union all
select id,edt as dt,'N' as is_online,-1 as flag
from test.anchor_online_and_offline_record) a;
-- 获取最大在线人数和最大人数所在的时间段,这里需要使用到自关联。
-- 这里获取的在线最大人数和时间段会出现开始时间和结束时间相等的情况,这种情况需要过滤掉
drop table if exists test.anchor_online_and_offline_record_temp01;
create table if not exists test.anchor_online_and_offline_record_temp01 as
select online_num,
begin_time,
end_time
from (select a.online_num,
b.dt as begin_time,
c.dt as end_time
from (select max(online_num) as online_num
from test.anchor_online_and_offline_record_temp) a
left join test.anchor_online_and_offline_record_temp b
on a.online_num = b.online_num
left join test.anchor_online_and_offline_record_temp c
on b.rn+1 = c.rn) d
where d.begin_time <> d.end_time;
--结果数据也会出现一条记录的结束时间等于另一条记录的开始时间,这种情况需要将两条记录合为一条记录,否则结果肯定是错误的
select online_num,
begin_time,
end_time
from (select online_num,begin_time,end_time,
row_number() over(partition by end_time order by begin_time) as rn
from (select b1.online_num,
b1.begin_time,
case when b2.end_time is not null
then b2.end_time
else b1.end_time
end as end_time
from test.anchor_online_and_offline_record_temp01 b1
left join test.anchor_online_and_offline_record_temp01 b2
on b1.end_time = b2.begin_time) a) b
where b.rn = 1;
总结
1.同时在线问题需要考虑结果的时间交叉问题,有交叉的两条结果记录需要合并
2.结果记录中开始时间等于结束时间的最高人数的记录属于假记录,需要过滤掉
生产上
若是生产上使用,那要多数据抽样查看,生产上数据的多样化、复杂化会出现很多不可预期的数据混乱现象