问题描述
求每个用户的最高连续签到天数
分析问题
问题的关键是给用户的连续签到做出标记,然后在按照用户和连续签到标记进行分组聚合就可以。
准备数据
-- 删表
drop table if exists signs;
-- 建表
CREATE TABLE signs(
user_name string COMMENT '姓名',
sign_date string comment '签到日期'
);
-- 插入数据
insert overwrite table signs values
('老王','2021-01-01'),
('老王','2021-01-02'),
('老王','2021-01-03'),
('老王','2021-01-07'),
('小张','2021-01-01'),
('小张','2021-01-03'),
('小张','2021-01-04');
-- 查询数据
select * from signs;
方案1 日期减排序
第一步:给每一组连续签到打标,方法是签到日期减去排名,如果是连续签到,返回的日期相同。比如老王1号、2号、3号连续签到,返回2020-12-31。
select user_name
,sign_date
,rank() over(partition by user_name order by sign_date asc) as `排名`
,date_sub(sign_date, rank() over(partition by user_name order by sign_date asc))as label -- 连续签到分组标签
from signs
order by user_name,sign_date
第二步:统计每个用户连续签到天数,并取出最大值
with t1 as (
-- 给每个用户每次连续签到打标签
select user_name
,sign_date
,rank() over(partition by user_name order by sign_date asc) as `排名`
,date_sub(sign_date, rank() over(partition by user_name order by sign_date asc))as label -- 连续签到分组标签
from signs
order by user_name,sign_date
),
t2 as (
-- 统计每个用户每次连续签到的次数
select user_name
,label -- 连续签到分组标签
,count(distinct sign_date) as days
from t1
group by user_name,label
)
-- 取每个用户的最大天数
select user_name
,max(days) as days
from t2
group by user_name
方案2 lag判断
1、找到每个用户前一天的签到日期
-- 查询数据
select user_name
,sign_date
,lag(sign_date,1) over(partition by user_name order by sign_date) as sign_d1
from signs
2、当sign_d1为空或者日期差大于1时,赋值为1 否则为0
with t1 as (
--
select user_name
,sign_date
,lag(sign_date,1) over(partition by user_name order by sign_date) as sign_d1
from signs
)
,t2 as (
select t1.*
,case
when sign_d1 is null then 1
when datediff(sign_date,sign_d1) >1 then 1
else 0 end as label
from t1
)
select * from t2
3、使用sum over滚动求和,这样就标记出了连续签到的分组
with t1 as (
-- 查询数据
select user_name
,sign_date
,lag(sign_date,1) over(partition by user_name order by sign_date) as sign_d1
from signs
)
,t2 as (
select t1.*
,sum(case
when sign_d1 is null then 1
when datediff(sign_date,sign_d1) >1 then 1
else 0 end) over(partition by user_name order by sign_date) as label
from t1
)
select * from t2
3、按照用户和连续签到标记聚合,求出每个用户每次连续签到天数
with t1 as (
-- 查询数据
select user_name
,sign_date
,lag(sign_date,1) over(partition by user_name order by sign_date) as sign_d1
from signs
)
,t2 as (
select t1.*
,sum(case
when sign_d1 is null then 1
when datediff(sign_date,sign_d1) >1 then 1
else 0 end) over(partition by user_name order by sign_date) as label
from t1
)
,t3 as (
-- 按照用户和连续签到标记聚合
select user_name,label
,count(distinct sign_date) as days
from t2
group by user_name,label
)
select * from t3
4、按照用户聚合,求出每个用户连续签到天数的最大值
with t1 as (
-- 查询数据
select user_name
,sign_date
,lag(sign_date,1) over(partition by user_name order by sign_date) as sign_d1
from signs
)
,t2 as (
select t1.*
,sum(case
when sign_d1 is null then 1
when datediff(sign_date,sign_d1) >1 then 1
else 0 end) over(partition by user_name order by sign_date) as label
from t1
)
,t3 as (
-- 按照用户和连续签到标记聚合
select user_name,label
,count(distinct sign_date) as days
from t2
group by user_name,label
)
,t4 as (
-- 求出每个用户最高连续签到天数
select user_name,max(days) as days
from t3
group by user_name
)
select * from t4