Hive窗口函数应用案例1：连续N天登录用户统计

最新推荐文章于 2023-04-05 23:02:17 发布

绝牛雷犁热刀

最新推荐文章于 2023-04-05 23:02:17 发布

阅读量613

点赞数

文章标签： hive hadoop big data

本文链接：https://blog.csdn.net/m0_58535137/article/details/125042065

版权

在这里插入图片描述

方案一实现如下：

-- 连续登录统计
/*
login.txt

A,2021-03-22
B,2021-03-22
C,2021-03-22
A,2021-03-23
C,2021-03-23
A,2021-03-24
B,2021-03-24 
*/
create table login
(
    user_id    string,
    login_time string
) row format delimited fields terminated by ',';

load data local inpath '/root/hivedata/login.txt' overwrite into table login;

select *
from login;

-- 解决方案一：笛卡尔积过滤
create table login_tmp as
select a.user_id    as a_user_id,
       a.login_time as a_login_time,
       b.user_id    as b_user_id,
       b.login_time as b_login_time
from login a,
     login b;

-- 过滤数据，用户id相同且登录日期相差1天
select a_user_id,
       a_login_time,
       b_user_id,
       b_login_time
from login_tmp
where a_user_id = b_user_id
  and cast(substr(a_login_time, 9, 2) as int) - 1 = cast(substr(b_login_time, 9, 2) as int);

-- 去重
select distinct a_user_id
from login_tmp
where a_user_id = b_user_id
  and cast(substr(a_login_time, 9, 2) as int) - 1 = cast(substr(b_login_time, 9, 2) as int);

在这里插入图片描述
方案二实现如下：

-- 连续登录统计
/*
login.txt

A,2021-03-22
B,2021-03-22
C,2021-03-22
A,2021-03-23
C,2021-03-23
A,2021-03-24
B,2021-03-24
*/
create table login
(
    user_id    string,
    login_time string
) row format delimited fields terminated by ',';

load data local inpath '/root/hivedata/login.txt' overwrite into table login;

select *
from login;

-- 方案二：窗口函数实现
select user_id,
       login_time,
       -- 本次登录日期的第二天
       date_add(login_time, 1)                                                as nextday,
       -- 当前行上移1行
       lead(login_time, 1, 0) over (partition by user_id order by login_time) as nextlogin
from login;

-- 连续两天登录
with t as (
    select user_id,
           login_time,
           -- 本次登录日期的第二天
           date_add(login_time, 1)                                                as nextday,
           -- 当前行上移1行
           lead(login_time, 1, 0) over (partition by user_id order by login_time) as nextlogin
    from login
)
select distinct user_id
from t
where t.nextday = t.nextlogin;

-- 连续三天登录
with t as (
    select user_id,
           login_time,
           -- 本次登录日期的第二天
           date_add(login_time, 2)                                                as nextday,
           -- 当前行上移1行
           lead(login_time, 2, 0) over (partition by user_id order by login_time) as nextlogin
    from login
)
select distinct user_id
from t
where t.nextday = t.nextlogin;