【hive高级】利用hive统计连续登陆的用户数

源数据,文件中是以,号隔开的

id,date
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06

设置本地模式

set hive.exec.mode.local.auto=true;

在hive中创建表

drop table tb_use

create table tb_use(
id string,
date string
)
-- partitioned by (daystr string)  -- 指定分区,必须在最前面(先指定存入的分区才能指定其他)
row format delimited fields terminated by ','  -- 指定行的分隔符
lines terminated by '\n'  -- 指定列的分隔符,默认为'\n'
stored as textfile  -- 指定存储文件的类型,hive中默认类型为textfile,SequenceFile,RCFile,ORCFile
-- location 'hdfs_path'; -- 存储到HDFS路径

从本地加载数据到表中

load data local inpath '/opt/datas/test.txt' overwrite into table tb_use;

将日期格式的字符串转为日期格式

select cast('2018-09-05' as date) ;
-- 结果:
2018-09-05

日期增加函数:date_add(string startdate, int days)

select date_add('2016-12-08',10)
-- 结果:
2016-12-18

日期减少函数:date_sub (string startdate, int days)

select date_sub('2016-12-08',10)
-- 结果:
2016-11-28

展现连续登陆两天的用户信息(用join方法)

错误示例:hive中join…on后不能跟or

select 
    b.id,b.date ,d.id,d.date  
from  -- 只取源表用户每天一条数据
    (select a.id id,max(a.date) date from tb_use a group by a.id,a.date) as b
join   -- 只取源表用户每天一条数据
    (select c.id id,max(c.date) date from tb_use c group by c.id,c.date) as d
on   
    b.id=d.id 
and 
(   -- 表左边的时间加1等于右边的时间
    date_add(cast(b.date as date),1)=cast(d.date as date)
or  -- 表左边的时间减1等于右边的时间
    date_sub(cast(b.date as date),1)=cast(d.date as date)
);
-- 结果:
FAILED: SemanticException [Error 10019]: Line 11:4 OR not supported in JOIN currently 'date'

正确示例:hive中join…on后可以用where代替

select 
    b.id,b.date ,d.id,d.date  
from  -- 只取源表用户每天一条数据
    (select a.id id,max(a.date) date from tb_use a group by a.id,a.date) as b
join   -- 只取源表用户每天一条数据
    (select c.id id,max(c.date) date from tb_use c group by c.id,c.date) as d
on   
    b.id=d.id 
where 
(   -- 表左边的时间加1等于右边的时间
    date_add(cast(b.date as date),1)=cast(d.date as date)
or  -- 表左边的时间减1等于右边的时间
    date_sub(cast(b.date as date),1)=cast(d.date as date)
);


结果:
b.id	b.date	     d.id	    d.date
A	    2018-09-04	 A	    2018-09-05
A	    2018-09-05	 A	    2018-09-04
A	    2018-09-05	 A	    2018-09-06
A	    2018-09-06	 A	    2018-09-05
C	    2018-09-04	 C	    2018-09-05
C	    2018-09-05	 C	    2018-09-04
C	    2018-09-05	 C	    2018-09-06
C	    2018-09-06	 C	    2018-09-05

统计连续登陆两天的用户个数

select 
    count(distinct b.id)  c1  -- 对id去重统计
from  -- 只取源表用户每天一条数据
    (select a.id id,max(a.date) date from tb_use a group by a.id,a.date) as b
join   -- 只取源表用户每天一条数据
    (select c.id id,max(c.date) date from tb_use c group by c.id,c.date) as d
on   
    b.id=d.id 
where 
(   -- 表左边的时间加1等于右边的时间
    date_add(cast(b.date as date),1)=cast(d.date as date)
or  -- 表左边的时间减1等于右边的时间
    date_sub(cast(b.date as date),1)=cast(d.date as date)
);


结果:
c1
2

求连续登陆n天的用户个数

-- 使用lag函数(向前取):lag(col,偏移量,默认值) over (partition by order by  )
select id ,date, lag(date,1,-1) over(partition by id order by date desc )  as date1 from tb_use group by id,date;


 结果:
id	 date   	    date1      
A	 2018-09-06    -1
A	 2018-09-05    2018-09-06
A	 2018-09-04    2018-09-05
B	 2018-09-06    -1
B	 2018-09-04    2018-09-06
C	 2018-09-06    -1
C	 2018-09-05    018-09-06
C	 2018-09-04    2018-09-05
-- 使用lead函数(向前取):lead(col,偏移量,默认值) over (partition by order by  )
select id ,date, lead(date,1,-1) over(partition by id order by date desc )  as date1 from tb_use group by id,date;


 结果:
id	 date	     date1      
A	 2018-09-06  2018-09-05
A	 2018-09-05  2018-09-04
A 	 2018-09-04  -1
B 	 2018-09-06  2018-09-04
B 	 2018-09-04  -1
C 	 2018-09-06  2018-09-05
C 	 2018-09-05  2018-09-04
C    2018-09-04  -1

展现连续登陆两天的用户信息(用窗口函数分析方法)

select  
    *
from 
(
    select id ,date, lead(date,1,-1) over(partition by id order by date desc )  as date1 from tb_use a group by id,date 
) as b
where 
    date_sub(cast(b.date as date),1)=cast(b.date1 as date);


结果:
b.id  b.date	    b.date1
A 	  2018-09-06    2018-09-05
A	  2018-09-05	2018-09-04
C	  2018-09-06	2018-09-05
C 	  2018-09-05	2018-09-04

统计连续登陆两天的用户个数(n天就只需要把lead(date,2,-1)中的2改成n-1并且把date_sub(cast(b.date as date),2)中的2改成n-1)

select  
    count(distinct b.id) as c1
from 
(
    select id ,date, lead(date,1,-1) over(partition by id order by date desc )  as date1 from tb_use a group by id,date 
) as b
where 
    date_sub(cast(b.date as date),1)=cast(b.date1 as date);


结果:
c1
2

统计连续登陆三天的用户个数(n天就只需要把lead(date,2,-1)中的2改成n-1并且把date_sub(cast(b.date as date),2)中的2改成n-1)

select  
    count(distinct b.id) as c1
from 
(
    select 
	    id ,date, lead(date,2,-1) over(partition by id order by date desc )  as date1 
	from 
	    tb_use a 
	group by 
	    id,date 
) as b
where 
    date_sub(cast(b.date as date),2)=cast(b.date1 as date);


结果:
c1 
2 
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值