方案一实现如下:
-- 连续登录统计
/*
login.txt
A,2021-03-22
B,2021-03-22
C,2021-03-22
A,2021-03-23
C,2021-03-23
A,2021-03-24
B,2021-03-24
*/
create table login
(
user_id string,
login_time string
) row format delimited fields terminated by ',';
load data local inpath '/root/hivedata/login.txt' overwrite into table login;
select *
from login;
-- 解决方案一:笛卡尔积过滤
create table login_tmp as
select a.user_id as a_user_id,
a.login_time as a_login_time,
b.user_id as b_user_id,
b.login_time as b_login_time
from login a,
login b;
-- 过滤数据,用户id相同且登录日期相差1天
select a_user_id,
a_login_time,
b_user_id,
b_login_time
from login_tmp
where a_user_id = b_user_id
and cast(substr(a_login_time, 9, 2) as int) - 1 = cast(substr(b_login_time, 9, 2) as int);
-- 去重
select distinct a_user_id
from login_tmp
where a_user_id = b_user_id
and cast(substr(a_login_time, 9, 2) as int) - 1 = cast(substr(b_login_time, 9, 2) as int);
方案二实现如下:
-- 连续登录统计
/*
login.txt
A,2021-03-22
B,2021-03-22
C,2021-03-22
A,2021-03-23
C,2021-03-23
A,2021-03-24
B,2021-03-24
*/
create table login
(
user_id string,
login_time string
) row format delimited fields terminated by ',';
load data local inpath '/root/hivedata/login.txt' overwrite into table login;
select *
from login;
-- 方案二:窗口函数实现
select user_id,
login_time,
-- 本次登录日期的第二天
date_add(login_time, 1) as nextday,
-- 当前行上移1行
lead(login_time, 1, 0) over (partition by user_id order by login_time) as nextlogin
from login;
-- 连续两天登录
with t as (
select user_id,
login_time,
-- 本次登录日期的第二天
date_add(login_time, 1) as nextday,
-- 当前行上移1行
lead(login_time, 1, 0) over (partition by user_id order by login_time) as nextlogin
from login
)
select distinct user_id
from t
where t.nextday = t.nextlogin;
-- 连续三天登录
with t as (
select user_id,
login_time,
-- 本次登录日期的第二天
date_add(login_time, 2) as nextday,
-- 当前行上移1行
lead(login_time, 2, 0) over (partition by user_id order by login_time) as nextlogin
from login
)
select distinct user_id
from t
where t.nextday = t.nextlogin;
连续N天登录公式: