话不多说,直接上题目
有一个订单表,里面装载的是各个用户的下单信息,表结构如下
create table orders
(
id string,
order_date string
)
row format delimited fields terminated by ',';
数据如下
这里的最近14天内指的是"2020-03-07" ~ "2020-03-20"
1.请统计最近14天内连续3天购买的用户,以及连续购买的最大天数
首先我们从一个简单的题目入手,来观察题目存在的某些细节,这里求的是最近14天连续3天购买的用户,那么如何来表示连续3天这个给定需求呢?
经过思考发现,有一种"等差数列差"的方法可以很轻松的将其实现,
具体思路:对每个用户关于购买日期进行一个排序,然后将购买日期和这个排序进行相减,需求中要的是连续3天,那么我们进行相减后,根据用户分组和相减后的日期进行分组,求出相同日期 >=3的,即可得到连续3天这个要求,最终我们还需要再进行一个去重,根据用户去重,因为会出现一个用户在14天内出现好几次连续3天,那么最终结果将会产生重复,比如说一个用户在7,8,9号三天连续,同时在15,16,17号三天也连续了,那么这个用户最终将会被统计两次,这明显不是我们需要的结果。具体实现见下
1.1)给每个用户的下单日期进行一个排序(分组排序要开窗)
select id,
order_date,
rank() over (partition by id order by order_date) rn
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
得到下面
1.2)将每个用户的购买日期和日期排序进行一个相减
select id,
date_sub(order_date, rn) diff
from (
select id,
order_date,
rank() over (partition by id order by order_date) rn
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
得到下面
1.3)此时按照用户id和diff进行分组求出相同用户和diff下>=3的数据
select id,
diff,
count(*) days
from (
select id,
date_sub(order_date, rn) diff
from (
select id,
order_date,
rank() over (partition by id order by order_date) rn
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, diff
having count(*) >= 3
得出下面结果
1.4)最后再根据用户进行去重实现最终需求
select id,
max(days)
from (
select id,
diff,
count(*) days
from (
select id,
date_sub(order_date, rn) diff
from (
select id,
order_date,
rank() over (partition by id order by order_date) rn
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, diff
having count(*) >= 3
) t3
group by id;
最终结果如下
2.请统计最近14天内连续3天购买的用户,以及连续购买的最大天数,其中断1天也算连续
比如说,6,8,9三天都购买了,也算连续,连续天数就是4天,即9-6+1=4
2.1)将每个用户上一行购买时间拿出来,没有的补为"1970-01-01"
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
结果如下:
2.2)将order_date与last_date进行相减
select id,
order_date,
datediff(order_date, last_date) diff
from (
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
结果如下:
2.3)这里开始处理连续3天断1天也算连续,我们如何来保证"断1天也连续",经过观察,我们需要对每个用户在这14天内购买日期进行一个分组,根据中间断开大于等于2天的将每个用户进行一个标号,这样每个用户连续3天(断1天也连续)的日期都会分在不同组
select id,
order_date,
sum(`if`(diff > 2, 1, 0)) over (partition by id order by order_date) sign
from (
select id,
order_date,
datediff(order_date, last_date) diff
from (
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, order_date, diff
结果如下
2.4) 求出每个用户和sign里面order_date的最大值和最小值(这样后面进行相减)
select id,
max(order_date) over (partition by id,sign) maxdt,
min(order_date) over (partition by id,sign) mindt
from (
select id,
order_date,
sum(`if`(diff > 2, 1, 0)) over (partition by id order by order_date) sign
from (
select id,
order_date,
datediff(order_date, last_date) diff
from (
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, order_date, diff
) t3
结果如下
2.5)将每个用户的maxdt和mindt进行相减
select id,
datediff(maxdt, mindt) + 1 diff2
from (
select id,
max(order_date) over (partition by id,sign) maxdt,
min(order_date) over (partition by id,sign) mindt
from (
select id,
order_date,
sum(`if`(diff > 2, 1, 0)) over (partition by id order by order_date) sign
from (
select id,
order_date,
datediff(order_date, last_date) diff
from (
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, order_date, diff
) t3
) t4
结果如下
2.6)最后我们根据用户进行分组将每个用户最大连续购买天数取出,即可实现最终需求
select id,
max(diff2)
from (
select id,
datediff(maxdt, mindt) + 1 diff2
from (
select id,
max(order_date) over (partition by id,sign) maxdt,
min(order_date) over (partition by id,sign) mindt
from (
select id,
order_date,
sum(`if`(diff > 2, 1, 0)) over (partition by id order by order_date) sign
from (
select id,
order_date,
datediff(order_date, last_date) diff
from (
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, order_date, diff
) t3
) t4
) t5
where diff2 >= 3
group by id;
结果如下
3.请统计最近14天内连续3天购买的用户,以及连续购买的最大天数,其中断2天也算连续
比如说,6,9,10三天都购买了,其中断了2天也算连续,连续天数就是5天,即10-6+1=5
和上面第2个需求思路相同,代码如下
select id,
max(diff2)
from (
select id,
datediff(maxdt, mindt) + 1 diff2
from (
select id,
max(order_date) over (partition by id,sign) maxdt,
min(order_date) over (partition by id,sign) mindt
from (
select id,
order_date,
sum(`if`(diff > 3, 1, 0)) over (partition by id order by order_date) sign
from (
select id,
order_date,
datediff(order_date, last_date) diff
from (
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, order_date, diff
) t3
) t4
) t5
where diff2 >= 3
group by id;
最终结果如下
4.请统计最近14天内连续3天购买的用户,以及连续购买的最大天数,其中断N天也算连续(具有统计意义)
根据上面第二第三个需求我们不难发现,其中不管断多少天算连续,也只需要修改其中一个字段的值即可,其他部分原封不动,得到下面一个具有普遍性的结论:
select id,
max(diff2)
from (
select id,
datediff(maxdt, mindt) + 1 diff2
from (
select id,
max(order_date) over (partition by id,sign) maxdt,
min(order_date) over (partition by id,sign) mindt
from (
select id,
order_date,
sum(`if`(diff > n + 1, 1, 0)) over (partition by id order by order_date) sign
from (
select id,
order_date,
datediff(order_date, last_date) diff
from (
select id,
order_date,
lag(order_date, 1, '1970-01-01')
over (partition by id order by order_date) last_date
from orders
where order_date > date_sub('2020-03-20', 14)
and order_date <= '2020-03-20'
) t1
) t2
group by id, order_date, diff
) t3
) t4
) t5
where diff2 >= 3
group by id;