小练习一
- tableA 存储了一所大学所有人的所有科目的考试成绩,共4列:学号 ( stu_num ),考试科目(subject),考试成绩( score )等级 ( level )(共SABCD五个值),考试时间 ( time ),
- 求:
第一次考试的考试成绩等级为A,其它考试成绩等级都为B的学号。
select
stu_num
from (select
stu_num
,part_cnt
,sum(case when t.rank=1 and t.level='A' then 1 else flag_b end ) as flag_b
from (select
stu_num
,level
,case when level='B' then 1 else 0 as flag_b
,sum(stu_num) over (partition by stu_num) as part_cnt
,row_num()over(partition by stu_num order by time ) as rank
from tableA
)t
group by stu_num
,part_cnt
)t2
where t2.flag_b=t2.part_cnt
;
练习二:连续时间区间合并
- 需求:连续时间区间合并
- user_id 用户id
- location_id 地点
- start_time 停留开始时间(时间)
- stop_mint 停留时间(分钟)
--数据
select *
from tmpdb.user_location
;
--原始数据
-- user_location.user_id user_location.location_id user_location.start_time user_location.stop_mint
-- 1 A 2018-01-0108:00:00 60
-- 1 A 2018-01-0109:00:00 60
-- 2 B 2018-01-0110:00:00 60
-- 1 A 2018-01-0111:00:00 60
--结果
-- user_id location_id start_time stop_mint
-- 1 A 2018-01-01 08:00:00 120.0
-- 1 A 2018-01-01 11:00:00 60.0
-- 2 B 2018-01-01 10:00:00 60.0
- 思路
- step1:打标签is_first,该标签为向后开始合并的第一条
- step2:开窗聚合标签,聚合出来group_i,即这些数据是需要合并的一组
- step3:取到各组的最小开始时间,及最大结束时间,即为合并区间的起始和结束
- 答案解析
select user_id
,location_id
,start_time
,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint --step6:按组取到合并后的开始时间,结束时间
from
(
select user_id
,location_id
,grp
,min(start_time) as start_time
,max(end_time) as end_time
from
(
select user_id
,location_id
,start_time
,end_time
,sum(is_first)over(partition by user_id,location_id order by start_time) as grp
from
(
select user_id
,location_id
,start_time
,end_time
,case when start_time > lag_end_time then 1 -- 两类:连续的时间的起始 ;不连续的时间
else 0 end is_first
from
(
select user_id
,location_id
,start_time
,end_time
,lag(end_time,1,0)over(partition by user_id,location_id order by start_time) as lag_end_time -- 上一条的结束时间
,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time -- 下一条的开始时间
from
(
select user_id
,location_id
,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss') as start_time -- 停留开始时间
,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 ,'yyyy-MM-dd HH:mm:ss') as end_time -- 停留结束时间
from tmpdb.user_location
) time_fix --step1:将时间整理成标准时间,并计算出停留结束时间
) time_fix_lag_lead--step2 :取到上一条结束时间,下一条开始时间
) pre_grp_flag--step3: 将 需要向后 合并的开始时间 置为 1
) pre_grp--step4: 将 需要合并的数据分组
group by user_id
,location_id
,grp
) grp--step5:按组取到合并后的开始时间,结束时间
;
- 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
--step1:将时间整理成标准时间,并计算出停留结束时间
with time_fix as
(
select user_id
,location_id
,from_unixtime(unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') ,'yyyy-MM-dd HH:mm:ss') as start_time --停留开始时间
,from_unixtime( unix_timestamp(start_time,'yyyy-MM-ddHH:mm:ss') + stop_mint*60 , 'yyyy-MM-dd HH:mm:ss') as end_time -- --停留结束时间
from tmpdb.user_location
)
-- user_id location_id start_time end_time
-- 1 A 2018-01-01 08:00:00 2018-01-01 09:00:00
-- 1 A 2018-01-01 09:00:00 2018-01-01 10:00:00
-- 2 B 2018-01-01 10:00:00 2018-01-01 11:00:00
-- 1 A 2018-01-01 11:00:00 2018-01-01 12:00:00
--step2 :取到上一条结束时间,下一条开始时间
, time_fix_lag_lead as
(
select user_id
,location_id
,start_time
,end_time
,lag(end_time,1,0)over(partition by user_id,location_id order by start_time) as lag_end_time --上一条的结束时间
,lead(start_time,1,0)over(partition by user_id,location_id order by start_time) as lead_start_time --下一条的开始时间
from time_fix
)
-- user_id location_id start_time end_time lag_end_time lead_start_time
-- 1 A 2018-01-01 08:00:00 2018-01-01 09:00:00 0 2018-01-01 09:00:00
-- 1 A 2018-01-01 09:00:00 2018-01-01 10:00:00 2018-01-01 09:00:00 2018-01-01 11:00:00
-- 1 A 2018-01-01 11:00:00 2018-01-01 12:00:00 2018-01-01 10:00:00 0
-- 2 B 2018-01-01 10:00:00 2018-01-01 11:00:00 0 0
--step3: 将 需要向后 合并的开始时间 置为 1
,pre_grp_flag as
(
select
user_id
,location_id
,start_time
,end_time
,case when start_time > lag_end_time then 1 --两类:连续的时间的起始 ;不连续的时间
else 0 end is_first
from time_fix_lag_lead
)
-- user_id location_id start_time end_time is_first
-- 1 A 2018-01-01 08:00:00 2018-01-01 09:00:00 1
-- 1 A 2018-01-01 09:00:00 2018-01-01 10:00:00 0
-- 1 A 2018-01-01 11:00:00 2018-01-01 12:00:00 1
-- 2 B 2018-01-01 10:00:00 2018-01-01 11:00:00 1
--step4: 将 需要合并的数据分组
,pre_grp as
(
select
user_id
,location_id
,start_time
,end_time
,sum(is_first)over(partition by user_id,location_id order by start_time) as grp
from pre_grp_flag
)
-- user_id location_id start_time end_time grp
-- 1 A 2018-01-01 08:00:00 2018-01-01 09:00:00 1
-- 1 A 2018-01-01 09:00:00 2018-01-01 10:00:00 1
-- 1 A 2018-01-01 11:00:00 2018-01-01 12:00:00 2
-- 2 B 2018-01-01 10:00:00 2018-01-01 11:00:00 1
--step5:按组取到合并后的开始时间,结束时间
,grp as
(
select
user_id
,location_id
,grp
,min(start_time) as start_time
,max(end_time) as end_time
from pre_grp
group by user_id
,location_id
,grp
)
-- user_id location_id grp start_time end_time
-- 1 A 1 2018-01-01 08:00:00 2018-01-01 10:00:00
-- 1 A 2 2018-01-01 11:00:00 2018-01-01 12:00:00
-- 2 B 1 2018-01-01 10:00:00 2018-01-01 11:00:00
--step6:按组取到合并后的开始时间,结束时间
select
user_id
,location_id
,start_time
,(unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') - unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss'))/60 as stop_mint
from grp
;
-- user_id location_id start_time stop_mint
-- 1 A 2018-01-01 08:00:00 120.0
-- 1 A 2018-01-01 11:00:00 60.0
-- 2 B 2018-01-01 10:00:00 60.0
练习三:间隔连续问题
- 需求:计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。
- 结果 1001 5天 1002 2天
- 准备数据
-- 创建数据
create table tmpdb.hanjiaxiaozhi as
select 1001 as id , '2021-12-12' as dt
union all select 1002 as id , '2021-12-12' as dt
union all select 1001 as id , '2021-12-13' as dt
union all select 1001 as id , '2021-12-14' as dt
union all select 1001 as id , '2021-12-16' as dt
union all select 1002 as id , '2021-12-16' as dt
union all select 1001 as id , '2021-12-19' as dt
union all select 1002 as id , '2021-12-17' as dt
union all select 1001 as id , '2021-12-20' as dt
;
-- 查看数据
select *
from tmpdb.hanjiaxiaozhi
;
-- hanjiaxiaozhi.id hanjiaxiaozhi.dt
-- 1001 2021-12-12
-- 1002 2021-12-12
-- 1001 2021-12-13
-- 1001 2021-12-14
-- 1001 2021-12-16
-- 1002 2021-12-16
-- 1001 2021-12-19
-- 1002 2021-12-17
-- 1001 2021-12-20
- 解决
select id
,max(max_login) as max_login
from
(
select id
,grp_id
,datediff(max(dt),min(dt))+1 as max_login
from
(
select id
,dt
,lag_dt
,sum(if(datediff(dt,lag_dt) > 2,1,0)) over (partition by id order by dt) as grp_id
from
(
select id
,dt
,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dt
from tmpdb.hanjiaxiaozhi
) pre -- step1:拿到上一条dt
)pre_3 --step23:做差值,这里是可以间隔1天,所以如果相减小于2,那么记为0,否则记为1
group by id
,grp_id
)t
group by id --step4:同一grp_id下求同一组最大dt-最小dt,再求最大值 即为该用户最大连续登陆天数
;
- 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
-- step1:拿到上一条dt
with pre as
(select
id
,dt
,lag(dt,1,'1970-01-01') over(partition by id order by dt) as lag_dt
from tmpdb.hanjiaxiaozhi
)
-- id dt lag_dt
-- 1001 2021-12-12 1970-01-01
-- 1001 2021-12-13 2021-12-12
-- 1001 2021-12-14 2021-12-13
-- 1001 2021-12-16 2021-12-14
-- 1001 2021-12-19 2021-12-16
-- 1001 2021-12-20 2021-12-19
-- 1002 2021-12-12 1970-01-01
-- 1002 2021-12-16 2021-12-12
-- 1002 2021-12-17 2021-12-16
--step2:做差值,这里是可以间隔1天,所以如果相减小于2,那么记为0,否则记为1
,pre_2 as
(
select
id
,dt
,lag_dt
,if(datediff(dt,lag_dt)<=2,0,1) as is_first
from pre
)
-- id dt lag_dt is_first
-- 1001 2021-12-12 1970-01-01 1
-- 1001 2021-12-13 2021-12-12 0
-- 1001 2021-12-14 2021-12-13 0
-- 1001 2021-12-16 2021-12-14 0
-- 1001 2021-12-19 2021-12-16 1
-- 1001 2021-12-20 2021-12-19 0
-- 1002 2021-12-12 1970-01-01 1
-- 1002 2021-12-16 2021-12-12 1
-- 1002 2021-12-17 2021-12-16 0
-- step3:sum开窗分组
,pre_3 as
(
select
id
,dt
,lag_dt
,is_first
,sum(is_first) over (partition by id order by dt) as grp_id
from pre_2
)
-- id dt lag_dt is_first grp_id
-- 1001 2021-12-12 1970-01-01 1 1
-- 1001 2021-12-13 2021-12-12 0 1
-- 1001 2021-12-14 2021-12-13 0 1
-- 1001 2021-12-16 2021-12-14 0 1
-- 1001 2021-12-19 2021-12-16 1 2
-- 1001 2021-12-20 2021-12-19 0 2
-- 1002 2021-12-12 1970-01-01 1 1
-- 1002 2021-12-16 2021-12-12 1 2
-- 1002 2021-12-17 2021-12-16 0 2
--这里step2和3可以合并,之所以分开是为了看起来方便
-- ,pre_3 as
-- (
-- select
-- id
-- ,dt
-- ,lag_dt
-- ,sum(if(datediff(dt,lag_dt)>2,1,0)) over (partition by id order by dt) as grp_id
-- from pre
-- )
--step4:同一grp_id下求同一组最大dt-最小dt,再求最大值 即为该用户最大连续登陆天数
select
id
,max(max_login) as max_login
from(select
id
,grp_id
,datediff(max(dt),min(dt))+1 as max_login
from pre_3
group by id
,grp_id
)t
group by id
;
-- id max_login
-- 1001 5
-- 1002 2
练习四: 打折日期交叉问题
- 需求
- 如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
- 计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,第一次活动时间为 2021-06-05 到 2021-06-15,第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15号为重复天数,只统计一次,即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天。
- 准备数据
create table py_tmpdb.hanjiaxiaozhi as
select 'oppo' as brand, '2021-06-05' as stt , '2021-06-09' as edt
union all select 'oppo' as brand, '2021-06-11' as stt , '2021-06-21' as edt
union all select 'vivo' as brand, '2021-06-05' as stt , '2021-06-15' as edt
union all select 'vivo' as brand, '2021-06-09' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-05' as stt , '2021-06-21' as edt
union all select 'redmi' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'redmi' as brand, '2021-06-17' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-05' as stt , '2021-06-26' as edt
union all select 'huawei' as brand, '2021-06-09' as stt , '2021-06-15' as edt
union all select 'huawei' as brand, '2021-06-17' as stt , '2021-06-21' as edt
;
--查看数据
select *
from py_tmpdb.hanjiaxiaozhi
-- hanjiaxiaozhi.brand hanjiaxiaozhi.stt hanjiaxiaozhi.edt
-- oppo 2021-06-05 2021-06-09
-- oppo 2021-06-11 2021-06-21
-- vivo 2021-06-05 2021-06-15
-- vivo 2021-06-09 2021-06-21
-- redmi 2021-06-05 2021-06-21
-- redmi 2021-06-09 2021-06-15
-- redmi 2021-06-17 2021-06-26
-- huawei 2021-06-05 2021-06-26
-- huawei 2021-06-09 2021-06-15
-- huawei 2021-06-17 2021-06-21
- 结果
brand dur_days
huawei 22
oppo 16
redmi 22
vivo 17
- 思路
- step1:取到本条之前 最大的结束时间
- step2:如果stt大于之前最大的edt,不变,否则 把stt+1置为上一条的结束时间
- step3:datediff(edt,stt),会出现负的,这些数据下一步去掉
- step4:如果dur_days为负数,那么+0,否则加dur_days+1
- 答案解析
select brand
,sum(if(dur_days > 0,dur_days+1,0)) as dur_days
from
(
select brand
,stt
,edt
,datediff(edt,stt) as dur_days
from
(
select brand
,if(max_edt is null,stt,if(stt > max_edt,stt,date_add(max_edt,1))) as stt
,edt
from
(
select brand
,stt
,edt
,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edt
from py_tmpdb.hanjiaxiaozhi
) step_1 -- step1:取到本条之前 最大的结束时间
) step_2 --step2:如果stt大于之前最大的edt,不变,否则 把stt+1置为上一条的结束时间
) step_3 --step3:datediff(edt,stt),会出现负的,这些数据下一步去掉
group by brand --step4:如果dur_days为负数,那么+0,否则加dur_days+1
;
- 详细拆分(看懂上面的下面的请忽略,这里为了方便理解,分的步骤贼散)
-- step1:取到本条之前 最大的结束时间
with step_1 as
(
select
brand
,stt
,edt
,max(edt)over(partition by brand order by stt rows between unbounded preceding and 1 preceding) as max_edt
from py_tmpdb.hanjiaxiaozhi
)
-- brand stt edt max_edt
-- huawei 2021-06-05 2021-06-26 NULL
-- huawei 2021-06-09 2021-06-15 2021-06-26
-- huawei 2021-06-17 2021-06-21 2021-06-26
-- oppo 2021-06-05 2021-06-09 NULL
-- oppo 2021-06-11 2021-06-21 2021-06-09
-- redmi 2021-06-05 2021-06-21 NULL
-- redmi 2021-06-09 2021-06-15 2021-06-21
-- redmi 2021-06-17 2021-06-26 2021-06-21
-- vivo 2021-06-05 2021-06-15 NULL
-- vivo 2021-06-09 2021-06-21 2021-06-15
--step2:如果stt大于之前最大的edt,不变,否则 把stt+1置为上一条的结束时间
,step_2 as
(
select
brand
,if(max_edt is null,stt,if(stt>max_edt,stt,date_add(max_edt,1))) as stt
,edt
from step_1
)
-- brand stt edt
-- huawei 2021-06-05 2021-06-26
-- huawei 2021-06-27 2021-06-15
-- huawei 2021-06-27 2021-06-21
-- oppo 2021-06-05 2021-06-09
-- oppo 2021-06-11 2021-06-21
-- redmi 2021-06-05 2021-06-21
-- redmi 2021-06-22 2021-06-15
-- redmi 2021-06-22 2021-06-26
-- vivo 2021-06-05 2021-06-15
-- vivo 2021-06-16 2021-06-21
--step3:datediff(edt,stt),会出现负的,这些数据下一步去掉,这里having会报错
,step_3 as
(
select
brand
,stt
,edt
,datediff(edt,stt) as dur_days
from step_2
)
-- brand stt edt dur_days
-- huawei 2021-06-05 2021-06-26 21
-- huawei 2021-06-27 2021-06-15 -12
-- huawei 2021-06-27 2021-06-21 -6
-- oppo 2021-06-05 2021-06-09 4
-- oppo 2021-06-11 2021-06-21 10
-- redmi 2021-06-05 2021-06-21 16
-- redmi 2021-06-22 2021-06-15 -7
-- redmi 2021-06-22 2021-06-26 4
-- vivo 2021-06-05 2021-06-15 10
-- vivo 2021-06-16 2021-06-21 5
--step4:如果dur_days为负数,那么+0,否则加dur_days+1
select
brand
,sum(if(dur_days>0,dur_days+1,0)) as dur_days
from step_3
group by brand
;
-- brand dur_days
-- huawei 22
-- oppo 16
-- redmi 22
-- vivo 17