SparkSQL—最大连续登录天数、当前连续登录天数、最大连续未登录天数问题
目录
3、当前连续登录天数(求用户最后一次获取活跃的连续登录天数)
6.2、 row_number() over() 另一种实现
1、数据准备
- 表字段:用户、新增日期、活跃日期
- 原数据表:user_active表
1.1、测试数据:比较date和timestamp类型
CREATE TABLE test_user_active
(
name string,
install_date timestamp,
active_date timestamp
);
insert overwrite table test_user_active
select '张三' as name, '2021-03-10' as install_date, '2021-03-11' as active_date;
select * from test_user_active;
张三 2021-03-10 00:00:00 2021-03-11 00:00:00
Time taken: 0.08 seconds, Fetched 1 row(s)
1.2、正式准备数据
CREATE TABLE user_active
(
name string,
install_date date,
active_date date
);
insert overwrite table user_active
select '张三' as name, '2021-01-01' as install_date, '2021-01-01' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-02' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-03' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-04' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-08' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-09' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-10' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-11' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-17' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-18' as active_date
union all
select '张三' as name, '2021-01-01' as install_date, '2021-01-19' as active_date
union all
select '李四' as name, '2021-01-01' as install_date, '2021-01-01' as active_date
union all
select '李四' as name, '2021-01-01' as install_date, '2021-01-02' as active_date
union all
select '李四' as name, '2021-01-01' as install_date, '2021-01-15' as active_date
union all
select '李四' as name, '2021-01-01' as install_date, '2021-01-16' as active_date
union all
select '李四' as name, '2021-01-01' as install_date, '2021-01-17' as active_date
union all
select '李四' as name, '2021-01-01' as install_date, '2021-01-18' as active_date
union all
select '李四' as name, '2021-01-01' as install_date, '2021-01-19' as active_date;
spark-sql> select * from user_active;
张三 2021-01-01 2021-01-01
张三 2021-01-01 2021-01-02
张三 2021-01-01 2021-01-03
张三 2021-01-01 2021-01-04
张三 2021-01-01 2021-01-08
张三 2021-01-01 2021-01-09
张三 2021-01-01 2021-01-10
张三 2021-01-01 2021-01-11
张三 2021-01-01 2021-01-17
张三 2021-01-01 2021-01-18
张三 2021-01-01 2021-01-19
李四 2021-01-01 2021-01-01
李四 2021-01-01 2021-01-02
李四 2021-01-01 2021-01-15
李四 2021-01-01 2021-01-16
李四 2021-01-01 2021-01-17
李四 2021-01-01 2021-01-18
李四 2021-01-01 2021-01-19
Time taken: 4.184 seconds, Fetched 18 row(s)
2、求最大连续登录天数
关键点:diff-rank,diff表示每用户每活跃日期距新增日期间隔天数,rank表示每用户按活跃日期生序排的序号,两者相减可理解为计算用户是否连续活跃的衡量指标,如果连续活跃则相减值value应该相等。
--2、求单个用户最大连续登录天数
select name,max(activedays) as `最大连续登录天数`
from
( -- 1、求单个用户每一次连续登录的天数
select name,diff-rank AS value,count(1) as activedays
from
(
select name,active_date,row_number() over(partition by name order by active_date) as rank,datediff(active_date,install_date) as diff
from user_active
) group by 1,2,diff-rank
) group by 1
order by 1
张三 4
李四 5
Time taken: 8.198 seconds, Fetched 2 row(s)
模拟示例:
模拟示例:
install_date name active_date rank diff(active_date-install_date) value(diff-rank) values2
20210309 1 20210310 1 1 0 20210309
20210309 1 20210311 2 2 0 20210309
20210309 1 20210312 3 3 0 20210309
20210309 1 20210315 4 6 2 2021031220210309 1 20210319 5 10 6 20210312
20210309 1 20210320 6 11 6 20210312
20210309 1 20210321 7 12 6 20210312
20210309 1 20210322 8 13 6 20210312
20210309 1 20210323 9 14 6 20210312
20210309 1 20210328 10 19 9 20210312
name,value, activedaysselect name,max(activedays) as 最大连续登录天数
3、当前连续登录天数(求用户最后一次获取活跃的连续登录天数)
with active as
(
select name,active_date,rank,diff,diff-rank as value
from
(
select name,active_date
,row_number() over(partition by name order by active_date) as rank
,datediff(active_date,install_date) as diff
from user_active
)tmp
)
select t1.name,count(1) as `当前连续登录天数`
from
active as t1
join
( -- 没用用户的当前的最后一次活跃日期记录
select name,max(value) lastactive
from active
group by 1
) as t2
on t1.name = t2.name and t1.value = t2.lastactive
group by t1.name
order by 1
张三 3
李四 5
Time taken: 10.833 seconds, Fetched 2 row(s)
4、最大连续未登录天数
关键点:
1).同上,即 diff-rank,diff表示每用户每活跃日期距新增日期间隔天数,rank表示每用户按活跃日期生序排的序号,两者相减可理解为计算用户是否连续活跃的衡量指标,如果连续活跃则相减值value应该相等。
2).计算用户"最大连续未登录天数",还需考虑用户最后一次活跃日期距当前日期的时间间隔,然后再取最大值
--获取当前日期
select current_date;
2021-04-25
Time taken: 0.026 seconds, Fetched 1 row(s)
--设置日期为某个特定的计算日期
set current_date='2021-01-23';
with active as
(
select name,active_date,rank,diff,diff-rank as value
from
(
select name,active_date
,row_number() over(partition by name order by active_date) as rank
,datediff(active_date,install_date) as diff
from user_active
)tmp
)
select name,max(noactivedays) as `最大连续未登录天数`
from
(
select t1.name,t1.active_date as start_date,t2.active_date as end_date ,t2.value - t1.value as noactivedays
from active as t1
join active as t2
on t1.name = t2.name and t1.rank = t2.rank - 1
union all
select name,max(active_date)as start_date,${current_date} as end_date,datediff(${current_date},max(active_date)) as noactivedays
from user_active
group by 1
)
group by 1
order by 1;
张三 5
李四 12
Time taken: 3.904 seconds, Fetched 2 row(s)
5、用户自从安装app以来的连续未登录天数历史
--设置日期为某个特定的计算日期
set current_date='2021-01-23';
with active as
(
select name,active_date,rank,diff,diff-rank as value
from
(
select name,active_date
,row_number() over(partition by name order by active_date) as rank
,datediff(active_date,install_date) as diff
from user_active
)tmp
)
select name,start_date,end_date,noactivedays as `连续未登录天数`
from
(
select t1.name,t1.active_date as start_date,t2.active_date as end_date ,t2.value - t1.value as noactivedays
from active as t1
join active as t2
on t1.name = t2.name and t1.rank = t2.rank - 1
union all
select name,max(active_date)as start_date,${current_date} as end_date,datediff(${current_date},max(active_date)) as noactivedays
from user_active
group by 1
)
where noactivedays>0
order by 1;
张三 2021-01-04 2021-01-08 3
张三 2021-01-11 2021-01-17 5
张三 2021-01-19 2021-01-23 4
李四 2021-01-02 2021-01-15 12
李四 2021-01-19 2021-01-23 4
Time taken: 4.35 seconds, Fetched 5 row(s)
6、连续登陆3天用户
6.1、 row_number() over() 一种实现
模拟示例
username logindate rank current_date-logindate sum
1 20210410 1 15 16
1 20210411 2 14 16
1 20210412 3 13 16
1 20210415 4 10 14
set current_date='2021-01-23';
select distinct name
from
(
select name,active_date,row_number() over(partition by name order by active_date) + datediff(${current_date},date(active_date)) as diff
from user_active
) group by name,diff
having count(1) >= 3;
李四
张三
Time taken: 1.263 seconds, Fetched 2 row(s)
6.2、 row_number() over() 另一种实现
select distinct name
from
( -- 1、求单个用户每一次连续登录的天数
select name,diff-rank as value,count(1) as activedays
from
(
select name,active_date,row_number() over(partition by name order by active_date) as rank,datediff(active_date,install_date) as diff
from user_active
) group by 1,2
) tb
where activedays >= 3
李四
张三
Time taken: 12.273 seconds, Fetched 2 row(s)
6.3、 lead() over()实现
select distinct b.name
from
(
select name,active_date,lead(active_date,2) over(partition by name order by active_date desc) as date_3
from user_active
) b
where b.active_date is not null
and date_add(date(b.active_date),-2) = date(b.date_3);
李四
张三
Time taken: 7.199 seconds, Fetched 2 row(s)
7、选出空闲且连续座位的编号
原数据表:user_seat表
表字段:座位号编号、座位是否可选
select distinct a.seatid
from
user_seat as a
join user_seat as b
on abs(a.seatid-b.seatid) = 1 and a.free =1 and b.free =1
order by a.seatid
8、另一种思路,求连续登陆天数和连续未登录天数
测试和思路:其实主要解决思路就是要让连续的两条记录之间产生联系。
spark-sql> select date_add('2020-01-10',-3);
2020-01-07
Time taken: 0.02 seconds, Fetched 1 row(s)spark-sql> select date_add('2020-01-10',3);
2020-01-13
Time taken: 0.052 seconds, Fetched 1 row(s)
8.1、求连续登录天数
create table login_rank as
select user_id,login_date,row_number() over (partition by user_id order by login_date) day_rank
from login_log
;
--求连续登录天数
drop table login_rank_num ;
create table login_rank_num as
select t1.user_id
,t1.login_date
,t1.day_rank
from
login_rank t1
left join
login_rank t2
on t1.user_id = t2.user_id
and date_add(t1.login_date,-t1.day_rank) = date_add(t2.login_date,-t2.day_rank)
and t1.login_date <= t2.login_date
;
select user_id,login_date,count(login_date) as logdays
from login_rank_num
group by user_id,login_date;
8.2、求连续未登录天数
--求连续未登录天数
drop table login_rank_diff ;
create table login_rank_diff as
select t1.user_id
,t1.login_date
,datediff(t2.login_date,t1.login_date )-1 as daysdiff
from login_rank t1
left join
login_rank t2
on t1.user_id = t2.user_id and t1.day_rank = t2.day_rank-1;
select *
from login_rank_diff
9、SQL复杂场景实现【连续发单天数】
9.1、背景和准备数据
-
背景
数据运营人员常常会需要查找活跃用户名单,而活跃用户很多情况下被定义为连续在线或发单n天及以上的用户。一方面我们可以根据n的值直接进行筛选;更具一般性地,就要求我们去求取每个用户某段时间内的最大连续在线或者发单天数了。
SQL求连续在线天数是一个非常经典的问题,该问题在不考虑计算成本下有非常多的解法。该问题也是我在面试实习生时最喜欢深入问的一个问题,在引导一个候选人去完成这个问题的过程中可以看出其对sql的理解深度以及其思维是否灵敏。
该问题的最大难点在于如何判断日期与日期间是否连续,那这就要涉及到处理行与行之间的关系了。说到这对SQL比较熟悉的同学应该就会反应出使用join或者窗口函数来处理了。
- 数据
假设我们有19年一月份每日用户发单数据存储于订单表order_base:
user_id order_id create_time
234520012 1231512416323 2019-01-02 12:21:11
123149908 2412298719221 2019-01-04 01:11:34
… …
9.2、解法1(通过与特定日期的日期差判定连续):
本方法比较tricky。连续的时间以为着这些时间点与某一个特定时间点的时间差也是连续的,从下表可以直观理解这一点:
日期 特定日期 日期差d
2019-01-01 2019-01-01 0
2019-01-02 2019-01-01 1
2019-01-04 2019-01-01 3
2019-01-05 2019-01-01 4
2019-01-06 2019-01-01 5
那么我们对该日期差d进行个排序,如果连续的话,d与序号的差值应该是相同的,如下表:
日期 特定日期 日期差d 序号r 日期差d与序号r的差值
2019-01-01 2019-01-01 0 0 0
2019-01-02 2019-01-01 1 1 0
2019-01-04 2019-01-01 3 2 1
2019-01-05 2019-01-01 4 3 1
2019-01-06 2019-01-01 5 4 1
这样答案就显而易见了,只需要对上面这个子查询的最后一列进行分组统计行数,变得到了每次连续的天数,再取连续天数的最大值,便是我们想要的答案。
select
user_id,
max(date_cnt) as max_continuation_date_cnt
from
(
select
user_id,
d-d_ranking as d_group, -- 连续日期的组标记
count(1) as date_cnt
from
(
select
user_id,
d, --与标记日期的日期差
row_number() over(partition by user_id order by d) as d_ranking --与标记日期的日期差的排序
from
(
select
user_id,
datediff(create_date,'2019-01-01') as d --与标记日期的日期差
from
(
select
user_id,
to_date(create_time) as create_date
from
order_base
group by
user_id,
date(create_time)
)a -- 在这一层获取用户的发单日期并去重
)b --这一层获取与标记日期的日期差
)c --获取连续日期的排序
group by
user_id,
d-d_ranking
)d -- 获取每一个连续日期组的连续天数
group by
user_id
9.3、解法2(left join进行笛卡尔积):
select
user_id
from
(
select
user_id,
to_date(create_time) as create_date
from
order_base
group by
user_id,
date(create_time)
)a -- 在这一层获取用户的发单日期并去重
left join
(
select
user_id,
to_date(create_time) as create_date
from
order_base
group by
user_id,
to_date(create_time)
)b -- 与a完全相同的逻辑,为了得到日期与日期间的关联
on
a.user_id = b.user_id --仅使用user_id进行关联,获取同一个用户发单日期间的笛卡尔积
where
a.create_date <= b.create_date
and date_add(a.create_date,3) > b.create_date --以a的日期为基准,保留从a.create_date开始的3天内发单日期
group by
user_id,
a.create_date
having
count(1) = 3 --如果从a.create_date开始的3天内都有发单,则应该有3条记录
假设我们不需要知道用户最大的连续天数,只需要知道某个用户是否出现连续n天(假设n为3)登录的行为。那这里首先给出一种完全不考虑计算复杂度的解法,使用纯join关联去实现该问题。
整体思路是去获得同一个用户的发单日期对,看每一个发单日期的n天内是否有n个发单日期。
9.4、解法3 (lead或lag):
最后介绍一个最为直观,也是计算成本最小的方法。假设我们需要求连续登陆n天(假设n为7)及以上的用户,那么对于一个存在该行为的用户,他去重和排序后的发单日期信息中,必存在某一天,往前回溯(往后推)6条记录的日期,等于该日期减6(加6)。这么说可能不太好理解,但相信你看了以下代码便能很快明白我在说什么:
select
max(max_index)
from
(
select
sum(index) over(order by `timestamp`) as max_index --排序后第一行到本行的和
from
(
select
order_id,
unix_timestamp(login_time) as `timestamp`,
1 as index
from
connection_detail
where
dt = '20190101'
and is_td_finish = 1
union all
select
order_id,
unix_timestamp(logout_time) as `timestamp`,
-1 as index
from
connection_detail
where
dt = '20190101'
)a --将登录时间和登出时间多列成多行
)b
10、总结:
如我在介绍问题背景的时候所说,处理日期间的连续性就需要将行与行之间进行关联,而sql提供的解决方案是join和窗口函数。恰恰sql的优势便在于刻画这种行数据间的关系,该问题场景能够帮助我们更深入地理解SQL的这一特性。
参考:https://blog.csdn.net/weixin_38617657/article/details/114090011
https://blog.csdn.net/logao2012/article/details/104630231
https://blog.csdn.net/Adrian_Wang/article/details/89791948