1.需求:求order_info表中连续3天下单的用户id
运行结果
2.创建测试表,数据装载
--订单信息表
create table order_info
(
`order_id` string COMMENT '订单id',
`user_id` string COMMENT '用户id',
`create_date` string COMMENT '下单日期',
`total_amount` decimal(16, 2) COMMENT '订单总金额'
) COMMENT '订单表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--数据装载
insert overwrite table order_info
values ('1', '101', '2021-09-27', 29000.00),
('2', '101', '2021-09-28', 70500.00),
('3', '101', '2021-09-29', 43300.00),
('4', '101', '2021-09-30', 860.00),
('5', '102', '2021-10-01', 46180.00),
('6', '102', '2021-10-01', 50000.00),
('7', '102', '2021-10-01', 75500.00),
('8', '102', '2021-10-02', 6170.00),
('9', '103', '2021-10-02', 18580.00),
('10', '103', '2021-10-02', 28000.00),
('11', '103', '2021-10-02', 23400.00),
('12', '103', '2021-10-03', 5910.00),
('13', '104', '2021-10-03', 13000.00),
('14', '104', '2021-10-03', 69500.00),
('15', '104', '2021-10-03', 2000.00),
('16', '104', '2021-10-03', 5380.00),
('17', '105', '2021-10-04', 6210.00),
('18', '105', '2021-10-04', 68000.00),
('19', '105', '2021-10-04', 43100.00),
('20', '105', '2021-10-04', 2790.00),
('21', '106', '2021-10-04', 9390.00),
('22', '106', '2021-10-05', 58000.00),
('23', '106', '2021-10-05', 46600.00),
('24', '106', '2021-10-05', 5160.00),
('25', '107', '2021-10-05', 55350.00),
('26', '107', '2021-10-05', 14500.00),
('27', '107', '2021-10-06', 47400.00),
('28', '107', '2021-10-06', 6900.00),
('29', '108', '2021-10-06', 56570.00),
('30', '108', '2021-10-06', 44500.00),
('31', '108', '2021-10-07', 50800.00),
('32', '108', '2021-10-07', 3900.00),
('33', '109', '2021-10-07', 41480.00),
('34', '109', '2021-10-07', 88000.00),
('35', '109', '2020-10-08', 15000.00),
('36', '109', '2020-10-08', 9020.00),
('37', '1010', '2020-10-08', 9260.00),
('38', '1010', '2020-10-08', 12000.00),
('39', '1010', '2020-10-08', 23900.00),
('40', '1010', '2020-10-08', 6790.00);
3.答案
思路一:直接可以想到用lag()开窗/lead()开窗
3.1第一步,对order_info表的同一天多次下单的记录进行去重
select user_id, create_date
from order_info
group by user_id, create_date;
3.2第二步,操作上一步去重后的结果,求出两天前的订单创建时间create_date
select user_id,
create_date,
lag(create_date, 2, '1970-01-01') over (partition by user_id order by create_date) lag_2
from (select user_id, create_date
from order_info
group by user_id, create_date) t1;
3.3第三步,使用datediff()求出当前订单创建时间与两天前的订单时间的差值diff
select user_id, datediff(create_date, lag_2) diff
from (
select user_id,
create_date,
lag(create_date, 2, '1970-01-01') over (partition by user_id order by create_date) lag_2
from (select user_id, create_date
from order_info
group by user_id, create_date) t1
) t2;
3.4第四步,筛选出差值为2的记录,并对结果去重(因为这里同一个用户可能存在多个连续三天下单的时间区间)
select distinct user_id
from (
select user_id, datediff(create_date, lag_2) diff
from (
select user_id,
create_date,
lag(create_date, 2, '1970-01-01') over (partition by user_id order by create_date) lag_2
from (select user_id, create_date
from order_info
group by user_id, create_date) t1
) t2
) t3
where diff = 2;
思路二:针对去重后的结果,rank()求排名,使用当前的下单时间减去排名得到一个flag,如果下单日期连续,那得到的flag肯定是相同的
3.1第一步,对order_info表的同一天多次下单的记录进行去重
select user_id, create_date
from order_info
group by user_id, create_date;
3.2第二步,使用rank()开窗以user_id进行分组create_date进行升序排名
select user_id, create_date, rank() over (partition by user_id order by create_date) rk
from (
select user_id, create_date
from order_info
group by user_id, create_date
) t1;
3.3第三步,对上述结果,使用data_sub()求出下单时间与排名的差值flag
select user_id, create_date, date_sub(create_date, rk) flag
from (
select user_id, create_date, rank() over (partition by user_id order by create_date) rk
from (
select user_id, create_date
from order_info
group by user_id, create_date
) t1
) t2;
3.4第四步,对上述结果以user_id、flag进行分组,对分组结果进行一个count(*) >= 3判断,注意这里select字句要对结果user_id进行去重(原因和思路一一致,因为可能存在多个满足筛选条件的时间区间)
select distinct user_id
from (
select user_id, create_date, date_sub(create_date, rk) flag
from (
select user_id, create_date, rank() over (partition by user_id order by create_date) rk
from (
select user_id, create_date
from order_info
group by user_id, create_date
) t1
) t2
) t3
group by user_id, flag
having count(*) >= 3;