需求六-连续时间区间合并&间隔连续&日期交叉问题

小练习一

  • tableA 存储了一所大学所有人的所有科目的考试成绩,共4列:学号 ( stu_num ),考试科目(subject),考试成绩( score )等级 ( level )(共SABCD五个值),考试时间 ( time ),
  • 求:第一次考试的考试成绩等级为A,其它考试成绩等级都为B的学号。
select 
    stu_num
from (select 
        stu_num
        ,part_cnt
        ,sum(case when t.rank=1 and t.level='A' then 1 else flag_b end ) as flag_b
    from (select
            stu_num
            ,level
            ,case when level='B' then 1 else 0 as flag_b
            ,sum(stu_num) over (partition by stu_num) as part_cnt
            ,row_num()over(partition by stu_num order by time ) as rank
        from tableA
        )t
    group by stu_num
            ,part_cnt
    )t2
where t2.flag_b=t2.part_cnt
;

练习二:连续时间区间合并

  • 需求:连续时间区间合并
    • user_id 用户id
    • location_id 地点
    • start_time 停留开始时间(时间)
    • stop_mint 停留时间(分钟)
--数据
select *
from tmpdb.user_location 
;
--原始数据
-- user_location.user_id	user_location.location_id	user_location.start_time	user_location.stop_mint
-- 1	A	2018-01-0108:00:00	60
-- 1	A	2018-01-0109:00:00	60
-- 2	B	2018-01-0110:00:00	60
-- 1	A	2018-01-0111:00:00	60

--结果
-- user_id	location_id	start_time	stop_mint
-- 1	A	2018-01-01 08:00:00	120.0
-- 1	A	2018-01-01 11:00:00	60.0
-- 2	B	2018-01-01 10:00:00	60.0
  • 思路
    • step1:打标签is_first,该标签为向后开始合并的第一条
    • step2:开窗聚合标签,聚合出来group_i,即这些数据是需要合并的一组
    • step3:取到各组的最小开始时间,及最大结束时间,即为合并区间的起始和结束
  • 答案解析

select  user_id
       ,location_id
       ,start_time
       ,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint --step6:按组取到合并后的开始时间,结束时间
from
(
    select  user_id
           ,location_id
           ,grp
           ,min(start_time) as start_time
           ,max(end_time)   as end_time
    from
    (
        select  user_id
               ,location_id
               ,start_time
               ,end_time
               ,sum(is_first)over(partition by user_id,location_id order by start_time) as grp
        from
        (
            select  user_id
                   ,location_id
                   ,start_time
                   ,end_time
                   ,case when start_time > lag_end_time then 1                                                          -- 两类:连续的时间的起始 ;不连续的时间
                         else 0 end is_first
            from
            (
                select  user_id
                       ,location_id
                       ,start_time
                       ,end_time
                       ,lag(end_time,1,0)over(partition by user_id,location_id order by start_time)    as lag_end_time     -- 上一条的结束时间
                       ,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time  -- 下一条的开始时间
                from
                (
                    select  user_id
                           ,location_id
                           ,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss')                 as start_time -- 停留开始时间
                           ,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 ,'yyyy-MM-dd HH:mm:ss') as end_time -- 停留结束时间
                    from tmpdb.user_location
                ) time_fix  --step1:将时间整理成标准时间,并计算出停留结束时间
            ) time_fix_lag_lead--step2 :取到上一条结束时间,下一条开始时间
        ) pre_grp_flag--step3: 将 需要向后 合并的开始时间  置为 1
    ) pre_grp--step4: 将 需要合并的数据分组
    group by  user_id
             ,location_id
             ,grp
) grp--step5:按组取到合并后的开始时间,结束时间
;
  • 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
--step1:将时间整理成标准时间,并计算出停留结束时间
with time_fix as 
(
select user_id	
    ,location_id
    ,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss')  as start_time --停留开始时间
    ,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 , 'yyyy-MM-dd HH:mm:ss')  as end_time -- --停留结束时间
from tmpdb.user_location
)
-- user_id	location_id	start_time	end_time
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00

--step2 :取到上一条结束时间,下一条开始时间
, time_fix_lag_lead as
(
select user_id	
    ,location_id
    ,start_time
    ,end_time
    ,lag(end_time,1,0)over(partition by user_id,location_id order by start_time) as lag_end_time --上一条的结束时间
    ,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time --下一条的开始时间
from time_fix 
)
-- user_id	location_id	start_time	end_time	lag_end_time	lead_start_time
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00	0	2018-01-01 09:00:00
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00	2018-01-01 09:00:00	2018-01-01 11:00:00
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00	2018-01-01 10:00:00	0
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00	0	0

--step3: 将 需要向后 合并的开始时间  置为 1
,pre_grp_flag as 
(
select 
    user_id	
    ,location_id	
    ,start_time
    ,end_time
    ,case when start_time > lag_end_time then 1 --两类:连续的时间的起始 ;不连续的时间
        else 0 end is_first  
from time_fix_lag_lead
)
-- user_id	location_id	start_time	end_time	is_first
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00	1
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00	0
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00	1
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00	1

--step4: 将 需要合并的数据分组
,pre_grp as 
(
select 
    user_id	
    ,location_id	
    ,start_time
    ,end_time
    ,sum(is_first)over(partition by user_id,location_id order by start_time) as grp
from pre_grp_flag
)
-- user_id	location_id	start_time	end_time	grp
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00	1
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00	1
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00	2
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00	1

--step5:按组取到合并后的开始时间,结束时间
,grp as 
(
select 
    user_id
    ,location_id
    ,grp
    ,min(start_time) as start_time
    ,max(end_time) as end_time
from pre_grp
group by user_id
        ,location_id
        ,grp
)
-- user_id	location_id	grp	start_time	end_time
-- 1	A	1	2018-01-01 08:00:00	2018-01-01 10:00:00
-- 1	A	2	2018-01-01 11:00:00	2018-01-01 12:00:00
-- 2	B	1	2018-01-01 10:00:00	2018-01-01 11:00:00

--step6:按组取到合并后的开始时间,结束时间
select 
    user_id	
    ,location_id	
    ,start_time
    ,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint
from grp 
;
-- user_id	location_id	start_time	stop_mint
-- 1	A	2018-01-01 08:00:00	120.0
-- 1	A	2018-01-01 11:00:00	60.0
-- 2	B	2018-01-01 10:00:00	60.0

练习三:间隔连续问题

  • 需求:计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。
  • 结果 1001 5天 1002 2天
  • 准备数据
-- 创建数据
create table tmpdb.hanjiaxiaozhi as 
 select 1001 as id , '2021-12-12' as dt
union all select 1002 as id , '2021-12-12' as dt
union all select 1001 as id , '2021-12-13' as dt
union all select 1001 as id , '2021-12-14' as dt
union all select 1001 as id , '2021-12-16' as dt
union all select 1002 as id , '2021-12-16' as dt
union all select 1001 as id , '2021-12-19' as dt
union all select 1002 as id , '2021-12-17' as dt
union all select 1001 as id , '2021-12-20' as dt
;

-- 查看数据
select *
from tmpdb.hanjiaxiaozhi 
;
-- hanjiaxiaozhi.id	hanjiaxiaozhi.dt
-- 1001	2021-12-12
-- 1002	2021-12-12
-- 1001	2021-12-13
-- 1001	2021-12-14
-- 1001	2021-12-16
-- 1002	2021-12-16
-- 1001	2021-12-19
-- 1002	2021-12-17
-- 1001	2021-12-20
  • 解决
select  id
       ,max(max_login) as max_login
from
(
    select  id
           ,grp_id
           ,datediff(max(dt),min(dt))+1 as max_login
    from
    (
        select  id
               ,dt
               ,lag_dt
               ,sum(if(datediff(dt,lag_dt) > 2,1,0)) over (partition by id order by dt) as grp_id
        from
        (
            select  id
                   ,dt
                   ,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dt
            from tmpdb.hanjiaxiaozhi
        ) pre -- step1:拿到上一条dt
    )pre_3 --step23:做差值,这里是可以间隔1天,所以如果相减小于2,那么记为0,否则记为1
    group by  id
             ,grp_id
)t
group by  id --step4:同一grp_id下求同一组最大dt-最小dt,再求最大值 即为该用户最大连续登陆天数
;

  • 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
-- step1:拿到上一条dt
with pre as
(select
    id
    ,dt
    ,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dt
from tmpdb.hanjiaxiaozhi 
)
-- id	dt	lag_dt
-- 1001	2021-12-12	1970-01-01
-- 1001	2021-12-13	2021-12-12
-- 1001	2021-12-14	2021-12-13
-- 1001	2021-12-16	2021-12-14
-- 1001	2021-12-19	2021-12-16
-- 1001	2021-12-20	2021-12-19
-- 1002	2021-12-12	1970-01-01
-- 1002	2021-12-16	2021-12-12
-- 1002	2021-12-17	2021-12-16

--step2:做差值,这里是可以间隔1天,所以如果相减小于2,那么记为0,否则记为1
,pre_2 as
(
select 
    id 
    ,dt
    ,lag_dt
    ,if(datediff(dt,lag_dt)<=2,0,1) as is_first
from pre
)
-- id	dt	lag_dt	is_first
-- 1001	2021-12-12	1970-01-01	1
-- 1001	2021-12-13	2021-12-12	0
-- 1001	2021-12-14	2021-12-13	0
-- 1001	2021-12-16	2021-12-14	0
-- 1001	2021-12-19	2021-12-16	1
-- 1001	2021-12-20	2021-12-19	0
-- 1002	2021-12-12	1970-01-01	1
-- 1002	2021-12-16	2021-12-12	1
-- 1002	2021-12-17	2021-12-16	0

-- step3:sum开窗分组
,pre_3 as 
(
select 
    id
    ,dt
    ,lag_dt
    ,is_first
    ,sum(is_first) over (partition by id order by dt) as grp_id
from pre_2
)
-- id	dt	lag_dt	is_first	grp_id
-- 1001	2021-12-12	1970-01-01	1	1
-- 1001	2021-12-13	2021-12-12	0	1
-- 1001	2021-12-14	2021-12-13	0	1
-- 1001	2021-12-16	2021-12-14	0	1
-- 1001	2021-12-19	2021-12-16	1	2
-- 1001	2021-12-20	2021-12-19	0	2
-- 1002	2021-12-12	1970-01-01	1	1
-- 1002	2021-12-16	2021-12-12	1	2
-- 1002	2021-12-17	2021-12-16	0	2

--这里step2和3可以合并,之所以分开是为了看起来方便
-- ,pre_3 as 
-- (
-- select 
--     id
--     ,dt
--     ,lag_dt
--     ,sum(if(datediff(dt,lag_dt)>2,1,0)) over (partition by id order by dt) as grp_id
-- from pre
-- )

--step4:同一grp_id下求同一组最大dt-最小dt,再求最大值 即为该用户最大连续登陆天数
select
    id
    ,max(max_login) as max_login
from(select
        id
        ,grp_id
        ,datediff(max(dt),min(dt))+1 as max_login
    from pre_3
    group by id
            ,grp_id
    )t
group by id
;
-- id	max_login
-- 1001	5
-- 1002	2

练习四: 打折日期交叉问题

  • 需求
  • 如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
  • 计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,第一次活动时间为 2021-06-05 到 2021-06-15,第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15号为重复天数,只统计一次,即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天。
  • 准备数据
create table py_tmpdb.hanjiaxiaozhi as
 select 'oppo' as brand, '2021-06-05' as stt , '2021-06-09' as edt
union all select 'oppo' as brand, '2021-06-11' as stt , '2021-06-21' as edt
union all select 'vivo' as brand, '2021-06-05' as stt , '2021-06-15' as edt
union all select 'vivo' as brand, '2021-06-09' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-05' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'redmi' as brand, '2021-06-17' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-05' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'huawei' as brand, '2021-06-17' as stt , '2021-06-21' as edt
;

--查看数据
select * 
from py_tmpdb.hanjiaxiaozhi

-- hanjiaxiaozhi.brand	hanjiaxiaozhi.stt	hanjiaxiaozhi.edt
-- oppo	2021-06-05	2021-06-09
-- oppo	2021-06-11	2021-06-21
-- vivo	2021-06-05	2021-06-15
-- vivo	2021-06-09	2021-06-21
-- redmi	2021-06-05	2021-06-21
-- redmi	2021-06-09	2021-06-15
-- redmi	2021-06-17	2021-06-26
-- huawei	2021-06-05	2021-06-26
-- huawei	2021-06-09	2021-06-15
-- huawei	2021-06-17	2021-06-21
  • 结果
brand	dur_days
huawei	22
oppo	16
redmi	22
vivo	17
  • 思路
    • step1:取到本条之前 最大的结束时间
    • step2:如果stt大于之前最大的edt,不变,否则 把stt+1置为上一条的结束时间
    • step3:datediff(edt,stt),会出现负的,这些数据下一步去掉
    • step4:如果dur_days为负数,那么+0,否则加dur_days+1
  • 答案解析
select  brand
       ,sum(if(dur_days > 0,dur_days+1,0)) as dur_days
from
(
    select  brand
           ,stt
           ,edt
           ,datediff(edt,stt) as dur_days
    from
    (
        select  brand
               ,if(max_edt is null,stt,if(stt > max_edt,stt,date_add(max_edt,1))) as stt
               ,edt
        from
        (
            select  brand
                   ,stt
                   ,edt
                   ,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edt
            from py_tmpdb.hanjiaxiaozhi
        ) step_1 -- step1:取到本条之前  最大的结束时间
    ) step_2 --step2:如果stt大于之前最大的edt,不变,否则 把stt+1置为上一条的结束时间
) step_3 --step3:datediff(edt,stt),会出现负的,这些数据下一步去掉
group by  brand --step4:如果dur_days为负数,那么+0,否则加dur_days+1
;
  • 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
-- step1:取到本条之前  最大的结束时间
with step_1 as
(
select
    brand
    ,stt
    ,edt
    ,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edt
from py_tmpdb.hanjiaxiaozhi
)
-- brand	stt	edt	max_edt
-- huawei	2021-06-05	2021-06-26	NULL
-- huawei	2021-06-09	2021-06-15	2021-06-26
-- huawei	2021-06-17	2021-06-21	2021-06-26
-- oppo	2021-06-05	2021-06-09	NULL
-- oppo	2021-06-11	2021-06-21	2021-06-09
-- redmi	2021-06-05	2021-06-21	NULL
-- redmi	2021-06-09	2021-06-15	2021-06-21
-- redmi	2021-06-17	2021-06-26	2021-06-21
-- vivo	2021-06-05	2021-06-15	NULL
-- vivo	2021-06-09	2021-06-21	2021-06-15

--step2:如果stt大于之前最大的edt,不变,否则 把stt+1置为上一条的结束时间
,step_2 as
(
select 
    brand
    ,if(max_edt is null,stt,if(stt>max_edt,stt,date_add(max_edt,1))) as stt
    ,edt
from step_1
)
-- brand	stt	edt
-- huawei	2021-06-05	2021-06-26
-- huawei	2021-06-27	2021-06-15
-- huawei	2021-06-27	2021-06-21
-- oppo	2021-06-05	2021-06-09
-- oppo	2021-06-11	2021-06-21
-- redmi	2021-06-05	2021-06-21
-- redmi	2021-06-22	2021-06-15
-- redmi	2021-06-22	2021-06-26
-- vivo	2021-06-05	2021-06-15
-- vivo	2021-06-16	2021-06-21

--step3:datediff(edt,stt),会出现负的,这些数据下一步去掉,这里having会报错
,step_3 as
(
select
    brand
    ,stt
    ,edt
    ,datediff(edt,stt) as dur_days
from step_2
)
-- brand	stt	edt	dur_days
-- huawei	2021-06-05	2021-06-26	21
-- huawei	2021-06-27	2021-06-15	-12
-- huawei	2021-06-27	2021-06-21	-6
-- oppo	2021-06-05	2021-06-09	4
-- oppo	2021-06-11	2021-06-21	10
-- redmi	2021-06-05	2021-06-21	16
-- redmi	2021-06-22	2021-06-15	-7
-- redmi	2021-06-22	2021-06-26	4
-- vivo	2021-06-05	2021-06-15	10
-- vivo	2021-06-16	2021-06-21	5

--step4:如果dur_days为负数,那么+0,否则加dur_days+1
select
    brand
    ,sum(if(dur_days>0,dur_days+1,0)) as dur_days
from step_3
group by brand
;
-- brand	dur_days
-- huawei	22
-- oppo	16
-- redmi	22
-- vivo	17
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值