hive连续登录问题总结

需求: 统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间、间隔天数等

数据如下:

id    datestr      amount
1,2019-02-08,6214.23 
1,2019-02-08,6247.32 
1,2019-02-09,85.63 
1,2019-02-09,967.36 
1,2019-02-10,85.69 
1,2019-02-12,769.85 
1,2019-02-13,943.86 
1,2019-02-14,538.42
1,2019-02-15,369.76
1,2019-02-16,369.76
1,2019-02-18,795.15
1,2019-02-19,715.65
1,2019-02-21,537.71
2,2019-02-08,6214.23 
2,2019-02-08,6247.32 
2,2019-02-09,85.63 
2,2019-02-09,967.36 
2,2019-02-10,85.69 
2,2019-02-12,769.85 
2,2019-02-13,943.86 
2,2019-02-14,943.18
2,2019-02-15,369.76
2,2019-02-18,795.15
2,2019-02-19,715.65
2,2019-02-21,537.71
3,2019-02-08,6214.23 
3,2019-02-08,6247.32 
3,2019-02-09,85.63 
3,2019-02-09,967.36 
3,2019-02-10,85.69 
3,2019-02-12,769.85 
3,2019-02 -14,276.81
3,2019-02-15,369.76
3,2019-02-16,369.76
3,2019-02-18,795.15
3,2019-02-19,715.65
3,2019-02-21,537.71

建表语句:

create table deal_tb(
    id string
    ,datestr string
    ,amount string
)row format delimited fields terminated by ',';

load一下数据

分析:

步骤一:将用户一天多次消费的数据进行整合

        select id,datestr,sum(amount) from deal_tb group by id,datestr;

        步骤一结果:

步骤二:以用户id开窗,以日期升序,开始编号

hive执行:select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) from deal_tb group by id,datestr)t1

排版后(排版后在xshell里不一定能直接执行):

select 
        t1.*
        ,row_number() over(partition by t1.id order by t1.datestr) as rn 
from (select 
                id
                ,datestr
                ,sum(amount) 
           from deal_tb 
           group by id,datestr)t1;

  步骤三:将日期和分组序号相减得到连续登陆的开始日期,如果开始日期相同说明连续登陆

第一次使用date_sub函数:

语法: date_sub (string startdate, int days)

返回值: string

说明:返回开始日期startdate减少days天后的日期。

举例:

hive> select date_sub('2024-6-13',15);
OK
2024-05-29

=============================================================

select t2.*,date_sub(t2.datestr,rn) as grp from(select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) from deal_tb group by id,datestr)t1)t2--hive执行(无排版)

    select
        t2.*
        ,date_sub(t2.datestr,rn) as grp
    from(select 
            t1.*
            ,row_number() over(partition by t1.id order by t1.datestr) as rn 
            from (select 
                                id
                                ,datestr
                                ,sum(amount) 
                      from deal_tb 
                      group by id,datestr)t1
           )t2

步骤四:统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间
select t3.id as id,min(t3.datestr) as start_time,max(t3.datestr) as end_time,sum(t3.amount) as sum_amount,count(1) as days from(select t2.*,date_sub(t2.datestr,rn) as grp from(select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) as amount from deal_tb group by id,datestr)t1)t2)t3 group by t3.id,t3.grp;--hive执行(无排版)

select
    --连续登陆开始
        t3.id as id
        ,min(t3.datestr) as start_time --连续时间开始
        ,max(t3.datestr) as end_time --连续时间结束
        ,sum(t3.amount) as sum_amount --用户连续交易总和
        ,count(1) as days --连续登录天数    
from(select
            t2.*
            ,date_sub(t2.datestr,rn) as grp
        from(select 
                        t1.*
                        ,row_number() over(partition by t1.id order by t1.datestr) as rn 
               from (select 
                            id
                            ,datestr
                            ,sum(amount) as amount
                         from  deal_tb 
                         group by id,datestr)t1
         )t2        
    )t3 group by t3.id,t3.grp

步骤五:间隔天数
select t4.*,nvl(datediff(start_time,lag(end_time,1) over(partition by id order by start_time))-1,0) as jiange from(select t3.id as id,min(t3.datestr) as start_time,max(t3.datestr) as end_time,sum(t3.amount) as sum_amount,count(1) as days from(select t2.*,date_sub(t2.datestr,rn) as grp from(select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) as amount from deal_tb group by id,datestr)t1)t2)t3 group by t3.id,t3.grp)t4--hive执行(无排版)

    select
        t4.*
        ,nvl(datediff(start_time,lag(end_time,1) over(partition by id order by start_time))-1,0) as jiange
    from(select
            t3.id as id
            ,min(t3.datestr) as start_time
            ,max(t3.datestr) as end_time
            ,sum(t3.amount) as sum_amount
            ,count(1) as days    
        from(select
                t2.*
                ,date_sub(t2.datestr,rn) as grp
            from(select 
                    t1.*
                    ,row_number() over(partition by t1.id order by t1.datestr) as rn 
                 from (select 
                        id
                        ,datestr
                        ,sum(amount) as amount
                      from 
                        deal_tb 
                      group by id,datestr)t1
            )t2
           )t3 group by t3.id,t3.grp)t4

  • 18
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值