uid login_date
001,2017-02-05 12:00:00
001,2017-02-05 14:00:00
001,2017-02-06 13:00:00
001,2017-02-07 12:00:00
001,2017-02-08 12:00:00
001,2017-02-10 14:00:00
002,2017-02-05 13:00:00
002,2017-02-06 12:00:00
002,2017-02-06 14:00:00
002,2017-02-08 12:00:00
002,2017-02-09 16:00:00
002,2017-02-10 12:00:00
003,2017-01-31 13:00:00
003,2017-01-31 12:00:00
003,2017-02-01 12:00:00
004,2017-02-02 12:00:00
004,2017-02-03 12:00:00
004,2017-02-10 12:00:00
004,2017-03-01 12:00:00
create table t_login_user(
uid string,
login_date string
)row format delimited fields terminated by ",";
load data local inpath "/root/login_user.txt" overwrite into table t_login_user;
select * from t_login_user;
计算连续登陆2天的用户
第一种方式
-- 去掉用户重复登陆的记录
select distinct uid,date_format(login_date,'yyyy-MM-dd') from t_login_user;
-- 在去掉用户重复登录的基础上,对用户分组,对登陆日期排序 计算如果连续登陆 那么下一次登陆的日期
with t1 as ( select distinct uid,date_format(login_date,'yyyy-MM-dd') as login_date from t_login_user )
select * ,
date_add(login_date,1) as next_date,
lead(login_date,1,0) over (partition by uid order by login_date) as next_login
from t1;
-- 查询下一次登陆日期 和 下一行记录相等的用户 就是连续登陆2天的用户
with t1 as ( select distinct uid,date_format(login_date,'yyyy-MM-dd') as login_date from t_login_user ),
t2 as (select *,
date_add(login_date,1) as next_date,
lead(login_date,1,0) over (partition by uid order by login_date) as next_login
from t1 )
select distinct uid from t2 where t2.next_date == t2.next_login;
-- 查询连续3天登陆的用户
with t1 as ( select distinct uid,date_format(login_date,'yyyy-MM-dd') as login_date from t_login_user ),
t2 as (select *,
date_add(login_date,2) as next_date,
lead(login_date,2,0) over (partition by uid order by login_date) as next_login
from t1 )
select distinct uid from t2 where t2.next_date == t2.next_login;
-- 查询连续N天登陆的用户
select *,
--本次登陆日期的第N天
date_add(登陆日期,N-1) as next_date,
--按照用户id分区,按照登陆日期排序 取对应N-1行的数据
lead(登陆日期,N-1,0) over (partition by 用户 order by 登陆日期) as next_login
from t
-- 查询连续登陆大于4天的用户
with t1 as (select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date from t_login_user),
t2 as (
select uid,
date_format(login_date, 'yyyy-MM-dd') as interval_date,
row_number() over (partition by uid order by login_date ) as rn
from t1
),
t3 as (
select *, date_sub(interval_date, rn) as login_date
from t2
)
select uid, count(1)
from t3
group by uid, login_date
having count(1) >= 4;
第二种方式
-- 去掉当天重复登陆信息
select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date
from t_login_user;
-- 窗口函数用户分组,登陆日期排序,行号
with t1 as (select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date from t_login_user)
select *,
row_number() over (partition by uid order by login_date) as rn
from t1
-- 登陆日期-编号 = 间隙日期
with t1 as (select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date from t_login_user),
t2 as (select *,
row_number() over (partition by uid order by login_date) as rn
from t1)
select *,
date_sub(login_date, rn) as interval_date
from t2;
-- 用户 间隙日期分组 计数 >=2 为连续两天登陆 >=n 为连续n天记录
with t1 as (select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date from t_login_user),
t2 as (select *, row_number() over (partition by uid order by login_date) as rn from t1),
t3 as (select *, date_sub(login_date, rn) as interval_date from t2)
select uid, count(1) as login_count
from t3
group by uid, interval_date
having count(1) >= 2;
分组topN
查询每个用户最高连续登陆天数
-- 查询每个用户连续登陆的天数
with t1 as (select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date from t_login_user),
t2 as (select *, row_number() over (partition by uid order by login_date) as rn from t1),
t3 as (select *, date_sub(login_date, rn) as interval_date from t2)
select uid, count(1) as login_count
from t3
group by uid, interval_date
-- 分组 设置编号
with t1 as (select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date from t_login_user),
t2 as (select *, row_number() over (partition by uid order by login_date) as rn from t1),
t3 as (select *, date_sub(login_date, rn) as interval_date from t2),
t4 as (select uid, count(1) as login_count from t3 group by uid, interval_date)
select *, row_number() over (partition by uid order by login_count desc) as rn
from t4;
-- topn
with t1 as (select distinct uid, date_format(login_date, 'yyyy-MM-dd') as login_date from t_login_user),
t2 as (select *, row_number() over (partition by uid order by login_date) as rn from t1),
t3 as (select *, date_sub(login_date, rn) as interval_date from t2),
t4 as (select uid, count(1) as login_count from t3 group by uid, interval_date),
t5 as (select *, row_number() over (partition by uid order by login_count desc) as rn from t4)
select *
from t5
where rn <= 1;