需求: 统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间、间隔天数等
数据如下:
id datestr amount
1,2019-02-08,6214.23
1,2019-02-08,6247.32
1,2019-02-09,85.63
1,2019-02-09,967.36
1,2019-02-10,85.69
1,2019-02-12,769.85
1,2019-02-13,943.86
1,2019-02-14,538.42
1,2019-02-15,369.76
1,2019-02-16,369.76
1,2019-02-18,795.15
1,2019-02-19,715.65
1,2019-02-21,537.71
2,2019-02-08,6214.23
2,2019-02-08,6247.32
2,2019-02-09,85.63
2,2019-02-09,967.36
2,2019-02-10,85.69
2,2019-02-12,769.85
2,2019-02-13,943.86
2,2019-02-14,943.18
2,2019-02-15,369.76
2,2019-02-18,795.15
2,2019-02-19,715.65
2,2019-02-21,537.71
3,2019-02-08,6214.23
3,2019-02-08,6247.32
3,2019-02-09,85.63
3,2019-02-09,967.36
3,2019-02-10,85.69
3,2019-02-12,769.85
3,2019-02 -14,276.81
3,2019-02-15,369.76
3,2019-02-16,369.76
3,2019-02-18,795.15
3,2019-02-19,715.65
3,2019-02-21,537.71
建表语句:
create table deal_tb(
id string
,datestr string
,amount string
)row format delimited fields terminated by ',';
load一下数据
分析:
步骤一:将用户一天多次消费的数据进行整合
select id,datestr,sum(amount) from deal_tb group by id,datestr;
步骤一结果:
步骤二:以用户id开窗,以日期升序,开始编号
hive执行:select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) from deal_tb group by id,datestr)t1
排版后(排版后在xshell里不一定能直接执行):
select
t1.*
,row_number() over(partition by t1.id order by t1.datestr) as rn
from (select
id
,datestr
,sum(amount)
from deal_tb
group by id,datestr)t1;
步骤三:将日期和分组序号相减得到连续登陆的开始日期,如果开始日期相同说明连续登陆
第一次使用date_sub函数:
语法: date_sub (string startdate, int days)
返回值: string
说明:返回开始日期startdate减少days天后的日期。
举例:
hive> select date_sub('2024-6-13',15);
OK
2024-05-29
=============================================================
select t2.*,date_sub(t2.datestr,rn) as grp from(select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) from deal_tb group by id,datestr)t1)t2--hive执行(无排版)
select
t2.*
,date_sub(t2.datestr,rn) as grp
from(select
t1.*
,row_number() over(partition by t1.id order by t1.datestr) as rn
from (select
id
,datestr
,sum(amount)
from deal_tb
group by id,datestr)t1
)t2
步骤四:统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间
select t3.id as id,min(t3.datestr) as start_time,max(t3.datestr) as end_time,sum(t3.amount) as sum_amount,count(1) as days from(select t2.*,date_sub(t2.datestr,rn) as grp from(select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) as amount from deal_tb group by id,datestr)t1)t2)t3 group by t3.id,t3.grp;--hive执行(无排版)
select
--连续登陆开始
t3.id as id
,min(t3.datestr) as start_time --连续时间开始
,max(t3.datestr) as end_time --连续时间结束
,sum(t3.amount) as sum_amount --用户连续交易总和
,count(1) as days --连续登录天数
from(select
t2.*
,date_sub(t2.datestr,rn) as grp
from(select
t1.*
,row_number() over(partition by t1.id order by t1.datestr) as rn
from (select
id
,datestr
,sum(amount) as amount
from deal_tb
group by id,datestr)t1
)t2
)t3 group by t3.id,t3.grp
步骤五:间隔天数
select t4.*,nvl(datediff(start_time,lag(end_time,1) over(partition by id order by start_time))-1,0) as jiange from(select t3.id as id,min(t3.datestr) as start_time,max(t3.datestr) as end_time,sum(t3.amount) as sum_amount,count(1) as days from(select t2.*,date_sub(t2.datestr,rn) as grp from(select t1.*,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) as amount from deal_tb group by id,datestr)t1)t2)t3 group by t3.id,t3.grp)t4--hive执行(无排版)
select
t4.*
,nvl(datediff(start_time,lag(end_time,1) over(partition by id order by start_time))-1,0) as jiange
from(select
t3.id as id
,min(t3.datestr) as start_time
,max(t3.datestr) as end_time
,sum(t3.amount) as sum_amount
,count(1) as days
from(select
t2.*
,date_sub(t2.datestr,rn) as grp
from(select
t1.*
,row_number() over(partition by t1.id order by t1.datestr) as rn
from (select
id
,datestr
,sum(amount) as amount
from
deal_tb
group by id,datestr)t1
)t2
)t3 group by t3.id,t3.grp)t4