需求六-连续时间区间合并&间隔连续&日期交叉问题

韩家小志

已于 2022-12-12 18:23:01 修改

阅读量572

点赞数

分类专栏： # Hive函数实战文章标签： sql

于 2021-10-08 20:33:12 首次发布

本文链接：https://blog.csdn.net/qq_46893497/article/details/120659358

版权

Hive函数实战专栏收录该内容

7 篇文章 3 订阅

订阅专栏

文章目录

小练习一
练习二：连续时间区间合并
练习三：间隔连续问题
练习四：打折日期交叉问题

小练习一

tableA 存储了一所大学所有人的所有科目的考试成绩，共4列：学号（ stu_num ），考试科目（subject），考试成绩（ score ）等级（ level ）(共SABCD五个值)，考试时间（ time ），
求：第一次考试的考试成绩等级为A，其它考试成绩等级都为B的学号。

select 
    stu_num
from (select 
        stu_num
        ,part_cnt
        ,sum(case when t.rank=1 and t.level='A' then 1 else flag_b end ) as flag_b
    from (select
            stu_num
            ,level
            ,case when level='B' then 1 else 0 as flag_b
            ,sum(stu_num) over (partition by stu_num) as part_cnt
            ,row_num()over(partition by stu_num order by time ) as rank
        from tableA
        )t
    group by stu_num
            ,part_cnt
    )t2
where t2.flag_b=t2.part_cnt
;

练习二：连续时间区间合并

需求：连续时间区间合并
- user_id 用户id
- location_id 地点
- start_time 停留开始时间（时间）
- stop_mint 停留时间（分钟）

--数据
select *
from tmpdb.user_location 
;
--原始数据
-- user_location.user_id	user_location.location_id	user_location.start_time	user_location.stop_mint
-- 1	A	2018-01-0108:00:00	60
-- 1	A	2018-01-0109:00:00	60
-- 2	B	2018-01-0110:00:00	60
-- 1	A	2018-01-0111:00:00	60

--结果
-- user_id	location_id	start_time	stop_mint
-- 1	A	2018-01-01 08:00:00	120.0
-- 1	A	2018-01-01 11:00:00	60.0
-- 2	B	2018-01-01 10:00:00	60.0

思路
- step1:打标签is_first，该标签为向后开始合并的第一条
- step2:开窗聚合标签，聚合出来group_i，即这些数据是需要合并的一组
- step3:取到各组的最小开始时间，及最大结束时间，即为合并区间的起始和结束
答案解析


select  user_id
       ,location_id
       ,start_time
       ,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint --step6:按组取到合并后的开始时间，结束时间
from
(
    select  user_id
           ,location_id
           ,grp
           ,min(start_time) as start_time
           ,max(end_time)   as end_time
    from
    (
        select  user_id
               ,location_id
               ,start_time
               ,end_time
               ,sum(is_first)over(partition by user_id,location_id order by start_time) as grp
        from
        (
            select  user_id
                   ,location_id
                   ,start_time
                   ,end_time
                   ,case when start_time > lag_end_time then 1                                                          -- 两类：连续的时间的起始 ；不连续的时间
                         else 0 end is_first
            from
            (
                select  user_id
                       ,location_id
                       ,start_time
                       ,end_time
                       ,lag(end_time,1,0)over(partition by user_id,location_id order by start_time)    as lag_end_time     -- 上一条的结束时间
                       ,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time  -- 下一条的开始时间
                from
                (
                    select  user_id
                           ,location_id
                           ,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss')                 as start_time -- 停留开始时间
                           ,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 ,'yyyy-MM-dd HH:mm:ss') as end_time -- 停留结束时间
                    from tmpdb.user_location
                ) time_fix  --step1:将时间整理成标准时间，并计算出停留结束时间
            ) time_fix_lag_lead--step2 :取到上一条结束时间，下一条开始时间
        ) pre_grp_flag--step3: 将 需要向后 合并的开始时间  置为 1
    ) pre_grp--step4: 将 需要合并的数据分组
    group by  user_id
             ,location_id
             ,grp
) grp--step5:按组取到合并后的开始时间，结束时间
;

详细拆分（看懂上面的下面的请忽略，这里为了方便理解，分的步骤贼散）

--step1:将时间整理成标准时间，并计算出停留结束时间
with time_fix as 
(
select user_id	
    ,location_id
    ,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss')  as start_time --停留开始时间
    ,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 , 'yyyy-MM-dd HH:mm:ss')  as end_time -- --停留结束时间
from tmpdb.user_location
)
-- user_id	location_id	start_time	end_time
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00

--step2 :取到上一条结束时间，下一条开始时间
, time_fix_lag_lead as
(
select user_id	
    ,location_id
    ,start_time
    ,end_time
    ,lag(end_time,1,0)over(partition by user_id,location_id order by start_time) as lag_end_time --上一条的结束时间
    ,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time --下一条的开始时间
from time_fix 
)
-- user_id	location_id	start_time	end_time	lag_end_time	lead_start_time
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00	0	2018-01-01 09:00:00
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00	2018-01-01 09:00:00	2018-01-01 11:00:00
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00	2018-01-01 10:00:00	0
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00	0	0

--step3: 将 需要向后 合并的开始时间  置为 1
,pre_grp_flag as 
(
select 
    user_id	
    ,location_id	
    ,start_time
    ,end_time
    ,case when start_time > lag_end_time then 1 --两类：连续的时间的起始 ；不连续的时间
        else 0 end is_first  
from time_fix_lag_lead
)
-- user_id	location_id	start_time	end_time	is_first
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00	1
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00	0
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00	1
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00	1

--step4: 将 需要合并的数据分组
,pre_grp as 
(
select 
    user_id	
    ,location_id	
    ,start_time
    ,end_time
    ,sum(is_first)over(partition by user_id,location_id order by start_time) as grp
from pre_grp_flag
)
-- user_id	location_id	start_time	end_time	grp
-- 1	A	2018-01-01 08:00:00	2018-01-01 09:00:00	1
-- 1	A	2018-01-01 09:00:00	2018-01-01 10:00:00	1
-- 1	A	2018-01-01 11:00:00	2018-01-01 12:00:00	2
-- 2	B	2018-01-01 10:00:00	2018-01-01 11:00:00	1

--step5:按组取到合并后的开始时间，结束时间
,grp as 
(
select 
    user_id
    ,location_id
    ,grp
    ,min(start_time) as start_time
    ,max(end_time) as end_time
from pre_grp
group by user_id
        ,location_id
        ,grp
)
-- user_id	location_id	grp	start_time	end_time
-- 1	A	1	2018-01-01 08:00:00	2018-01-01 10:00:00
-- 1	A	2	2018-01-01 11:00:00	2018-01-01 12:00:00
-- 2	B	1	2018-01-01 10:00:00	2018-01-01 11:00:00

--step6:按组取到合并后的开始时间，结束时间
select 
    user_id	
    ,location_id	
    ,start_time
    ,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint
from grp 
;
-- user_id	location_id	start_time	stop_mint
-- 1	A	2018-01-01 08:00:00	120.0
-- 1	A	2018-01-01 11:00:00	60.0
-- 2	B	2018-01-01 10:00:00	60.0

练习三：间隔连续问题

需求：计算每个用户最大的连续登录天数，可以间隔一天。解释：如果一个用户在 1,3,5,6 登录游戏，则视为连续 6 天登录。
结果 1001 5天 1002 2天
准备数据

-- 创建数据
create table tmpdb.hanjiaxiaozhi as 
 select 1001 as id , '2021-12-12' as dt
union all select 1002 as id , '2021-12-12' as dt
union all select 1001 as id , '2021-12-13' as dt
union all select 1001 as id , '2021-12-14' as dt
union all select 1001 as id , '2021-12-16' as dt
union all select 1002 as id , '2021-12-16' as dt
union all select 1001 as id , '2021-12-19' as dt
union all select 1002 as id , '2021-12-17' as dt
union all select 1001 as id , '2021-12-20' as dt
;

-- 查看数据
select *
from tmpdb.hanjiaxiaozhi 
;
-- hanjiaxiaozhi.id	hanjiaxiaozhi.dt
-- 1001	2021-12-12
-- 1002	2021-12-12
-- 1001	2021-12-13
-- 1001	2021-12-14
-- 1001	2021-12-16
-- 1002	2021-12-16
-- 1001	2021-12-19
-- 1002	2021-12-17
-- 1001	2021-12-20

解决

select  id
       ,max(max_login) as max_login
from
(
    select  id
           ,grp_id
           ,datediff(max(dt),min(dt))+1 as max_login
    from
    (
        select  id
               ,dt
               ,lag_dt
               ,sum(if(datediff(dt,lag_dt) > 2,1,0)) over (partition by id order by dt) as grp_id
        from
        (
            select  id
                   ,dt
                   ,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dt
            from tmpdb.hanjiaxiaozhi
        ) pre -- step1:拿到上一条dt
    )pre_3 --step23:做差值，这里是可以间隔1天，所以如果相减小于2，那么记为0，否则记为1
    group by  id
             ,grp_id
)t
group by  id --step4:同一grp_id下求同一组最大dt-最小dt，再求最大值 即为该用户最大连续登陆天数
;

详细拆分（看懂上面的下面的请忽略，这里为了方便理解，分的步骤贼散）

-- step1:拿到上一条dt
with pre as
(select
    id
    ,dt
    ,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dt
from tmpdb.hanjiaxiaozhi 
)
-- id	dt	lag_dt
-- 1001	2021-12-12	1970-01-01
-- 1001	2021-12-13	2021-12-12
-- 1001	2021-12-14	2021-12-13
-- 1001	2021-12-16	2021-12-14
-- 1001	2021-12-19	2021-12-16
-- 1001	2021-12-20	2021-12-19
-- 1002	2021-12-12	1970-01-01
-- 1002	2021-12-16	2021-12-12
-- 1002	2021-12-17	2021-12-16

--step2:做差值，这里是可以间隔1天，所以如果相减小于2，那么记为0，否则记为1
,pre_2 as
(
select 
    id 
    ,dt
    ,lag_dt
    ,if(datediff(dt,lag_dt)<=2,0,1) as is_first
from pre
)
-- id	dt	lag_dt	is_first
-- 1001	2021-12-12	1970-01-01	1
-- 1001	2021-12-13	2021-12-12	0
-- 1001	2021-12-14	2021-12-13	0
-- 1001	2021-12-16	2021-12-14	0
-- 1001	2021-12-19	2021-12-16	1
-- 1001	2021-12-20	2021-12-19	0
-- 1002	2021-12-12	1970-01-01	1
-- 1002	2021-12-16	2021-12-12	1
-- 1002	2021-12-17	2021-12-16	0

-- step3:sum开窗分组
,pre_3 as 
(
select 
    id
    ,dt
    ,lag_dt
    ,is_first
    ,sum(is_first) over (partition by id order by dt) as grp_id
from pre_2
)
-- id	dt	lag_dt	is_first	grp_id
-- 1001	2021-12-12	1970-01-01	1	1
-- 1001	2021-12-13	2021-12-12	0	1
-- 1001	2021-12-14	2021-12-13	0	1
-- 1001	2021-12-16	2021-12-14	0	1
-- 1001	2021-12-19	2021-12-16	1	2
-- 1001	2021-12-20	2021-12-19	0	2
-- 1002	2021-12-12	1970-01-01	1	1
-- 1002	2021-12-16	2021-12-12	1	2
-- 1002	2021-12-17	2021-12-16	0	2

--这里step2和3可以合并，之所以分开是为了看起来方便
-- ,pre_3 as 
-- (
-- select 
--     id
--     ,dt
--     ,lag_dt
--     ,sum(if(datediff(dt,lag_dt)>2,1,0)) over (partition by id order by dt) as grp_id
-- from pre
-- )

--step4:同一grp_id下求同一组最大dt-最小dt，再求最大值 即为该用户最大连续登陆天数
select
    id
    ,max(max_login) as max_login
from(select
        id
        ,grp_id
        ,datediff(max(dt),min(dt))+1 as max_login
    from pre_3
    group by id
            ,grp_id
    )t
group by id
;
-- id	max_login
-- 1001	5
-- 1002	2

练习四：打折日期交叉问题

需求
如下为平台商品促销数据：字段为品牌，打折开始日期，打折结束日期
计算每个品牌总的打折销售天数，注意其中的交叉日期，比如 vivo 品牌，第一次活动时间为 2021-06-05 到 2021-06-15，第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15号为重复天数，只统计一次，即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天。
准备数据

create table py_tmpdb.hanjiaxiaozhi as
 select 'oppo' as brand, '2021-06-05' as stt , '2021-06-09' as edt
union all select 'oppo' as brand, '2021-06-11' as stt , '2021-06-21' as edt
union all select 'vivo' as brand, '2021-06-05' as stt , '2021-06-15' as edt
union all select 'vivo' as brand, '2021-06-09' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-05' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'redmi' as brand, '2021-06-17' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-05' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'huawei' as brand, '2021-06-17' as stt , '2021-06-21' as edt
;

--查看数据
select * 
from py_tmpdb.hanjiaxiaozhi

-- hanjiaxiaozhi.brand	hanjiaxiaozhi.stt	hanjiaxiaozhi.edt
-- oppo	2021-06-05	2021-06-09
-- oppo	2021-06-11	2021-06-21
-- vivo	2021-06-05	2021-06-15
-- vivo	2021-06-09	2021-06-21
-- redmi	2021-06-05	2021-06-21
-- redmi	2021-06-09	2021-06-15
-- redmi	2021-06-17	2021-06-26
-- huawei	2021-06-05	2021-06-26
-- huawei	2021-06-09	2021-06-15
-- huawei	2021-06-17	2021-06-21

结果

brand	dur_days
huawei	22
oppo	16
redmi	22
vivo	17

思路
- step1：取到本条之前最大的结束时间
- step2:如果stt大于之前最大的edt，不变，否则把stt+1置为上一条的结束时间
- step3:datediff(edt,stt),会出现负的，这些数据下一步去掉
- step4：如果dur_days为负数，那么+0，否则加dur_days+1
答案解析

select  brand
       ,sum(if(dur_days > 0,dur_days+1,0)) as dur_days
from
(
    select  brand
           ,stt
           ,edt
           ,datediff(edt,stt) as dur_days
    from
    (
        select  brand
               ,if(max_edt is null,stt,if(stt > max_edt,stt,date_add(max_edt,1))) as stt
               ,edt
        from
        (
            select  brand
                   ,stt
                   ,edt
                   ,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edt
            from py_tmpdb.hanjiaxiaozhi
        ) step_1 -- step1：取到本条之前  最大的结束时间
    ) step_2 --step2:如果stt大于之前最大的edt，不变，否则 把stt+1置为上一条的结束时间
) step_3 --step3:datediff(edt,stt),会出现负的，这些数据下一步去掉
group by  brand --step4：如果dur_days为负数，那么+0，否则加dur_days+1
;

详细拆分（看懂上面的下面的请忽略，这里为了方便理解，分的步骤贼散）

-- step1：取到本条之前  最大的结束时间
with step_1 as
(
select
    brand
    ,stt
    ,edt
    ,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edt
from py_tmpdb.hanjiaxiaozhi
)
-- brand	stt	edt	max_edt
-- huawei	2021-06-05	2021-06-26	NULL
-- huawei	2021-06-09	2021-06-15	2021-06-26
-- huawei	2021-06-17	2021-06-21	2021-06-26
-- oppo	2021-06-05	2021-06-09	NULL
-- oppo	2021-06-11	2021-06-21	2021-06-09
-- redmi	2021-06-05	2021-06-21	NULL
-- redmi	2021-06-09	2021-06-15	2021-06-21
-- redmi	2021-06-17	2021-06-26	2021-06-21
-- vivo	2021-06-05	2021-06-15	NULL
-- vivo	2021-06-09	2021-06-21	2021-06-15

--step2:如果stt大于之前最大的edt，不变，否则 把stt+1置为上一条的结束时间
,step_2 as
(
select 
    brand
    ,if(max_edt is null,stt,if(stt>max_edt,stt,date_add(max_edt,1))) as stt
    ,edt
from step_1
)
-- brand	stt	edt
-- huawei	2021-06-05	2021-06-26
-- huawei	2021-06-27	2021-06-15
-- huawei	2021-06-27	2021-06-21
-- oppo	2021-06-05	2021-06-09
-- oppo	2021-06-11	2021-06-21
-- redmi	2021-06-05	2021-06-21
-- redmi	2021-06-22	2021-06-15
-- redmi	2021-06-22	2021-06-26
-- vivo	2021-06-05	2021-06-15
-- vivo	2021-06-16	2021-06-21

--step3:datediff(edt,stt),会出现负的，这些数据下一步去掉，这里having会报错
,step_3 as
(
select
    brand
    ,stt
    ,edt
    ,datediff(edt,stt) as dur_days
from step_2
)
-- brand	stt	edt	dur_days
-- huawei	2021-06-05	2021-06-26	21
-- huawei	2021-06-27	2021-06-15	-12
-- huawei	2021-06-27	2021-06-21	-6
-- oppo	2021-06-05	2021-06-09	4
-- oppo	2021-06-11	2021-06-21	10
-- redmi	2021-06-05	2021-06-21	16
-- redmi	2021-06-22	2021-06-15	-7
-- redmi	2021-06-22	2021-06-26	4
-- vivo	2021-06-05	2021-06-15	10
-- vivo	2021-06-16	2021-06-21	5

--step4：如果dur_days为负数，那么+0，否则加dur_days+1
select
    brand
    ,sum(if(dur_days>0,dur_days+1,0)) as dur_days
from step_3
group by brand
;
-- brand	dur_days
-- huawei	22
-- oppo	16
-- redmi	22
-- vivo	17

韩家小志

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
需求六-连续时间区间合并&间隔连续&日期交叉问题

tableA 存储了一所大学所有人的所有科目的考试成绩，共4列：学号（ stu_num ），考试科目（subject），考试成绩（ score ）等级（ level ）(共SABCD五个值)，考试时间（ time ），求：第一次考试的考试成绩等级为A，其它考试成绩等级都为B的学号。select stu_numfrom (select stu_num ,part_cnt ,sum(case when t.rank=1 and t.le.
复制链接

扫一扫