源数据,文件中是以,号隔开的
id,date
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06
设置本地模式
set hive.exec.mode.local.auto=true;
在hive中创建表
drop table tb_use
create table tb_use(
id string,
date string
)
-- partitioned by (daystr string) -- 指定分区,必须在最前面(先指定存入的分区才能指定其他)
row format delimited fields terminated by ',' -- 指定行的分隔符
lines terminated by '\n' -- 指定列的分隔符,默认为'\n'
stored as textfile -- 指定存储文件的类型,hive中默认类型为textfile,SequenceFile,RCFile,ORCFile
-- location 'hdfs_path'; -- 存储到HDFS路径
从本地加载数据到表中
load data local inpath '/opt/datas/test.txt' overwrite into table tb_use;
将日期格式的字符串转为日期格式
select cast('2018-09-05' as date) ;
-- 结果:
2018-09-05
日期增加函数:date_add(string startdate, int days)
select date_add('2016-12-08',10)
-- 结果:
2016-12-18
日期减少函数:date_sub (string startdate, int days)
select date_sub('2016-12-08',10)
-- 结果:
2016-11-28
展现连续登陆两天的用户信息(用join方法)
错误示例:hive中join…on后不能跟or
select
b.id,b.date ,d.id,d.date
from -- 只取源表用户每天一条数据
(select a.id id,max(a.date) date from tb_use a group by a.id,a.date) as b
join -- 只取源表用户每天一条数据
(select c.id id,max(c.date) date from tb_use c group by c.id,c.date) as d
on
b.id=d.id
and
( -- 表左边的时间加1等于右边的时间
date_add(cast(b.date as date),1)=cast(d.date as date)
or -- 表左边的时间减1等于右边的时间
date_sub(cast(b.date as date),1)=cast(d.date as date)
);
-- 结果:
FAILED: SemanticException [Error 10019]: Line 11:4 OR not supported in JOIN currently 'date'
正确示例:hive中join…on后可以用where代替
select
b.id,b.date ,d.id,d.date
from -- 只取源表用户每天一条数据
(select a.id id,max(a.date) date from tb_use a group by a.id,a.date) as b
join -- 只取源表用户每天一条数据
(select c.id id,max(c.date) date from tb_use c group by c.id,c.date) as d
on
b.id=d.id
where
( -- 表左边的时间加1等于右边的时间
date_add(cast(b.date as date),1)=cast(d.date as date)
or -- 表左边的时间减1等于右边的时间
date_sub(cast(b.date as date),1)=cast(d.date as date)
);
结果:
b.id b.date d.id d.date
A 2018-09-04 A 2018-09-05
A 2018-09-05 A 2018-09-04
A 2018-09-05 A 2018-09-06
A 2018-09-06 A 2018-09-05
C 2018-09-04 C 2018-09-05
C 2018-09-05 C 2018-09-04
C 2018-09-05 C 2018-09-06
C 2018-09-06 C 2018-09-05
统计连续登陆两天的用户个数
select
count(distinct b.id) c1 -- 对id去重统计
from -- 只取源表用户每天一条数据
(select a.id id,max(a.date) date from tb_use a group by a.id,a.date) as b
join -- 只取源表用户每天一条数据
(select c.id id,max(c.date) date from tb_use c group by c.id,c.date) as d
on
b.id=d.id
where
( -- 表左边的时间加1等于右边的时间
date_add(cast(b.date as date),1)=cast(d.date as date)
or -- 表左边的时间减1等于右边的时间
date_sub(cast(b.date as date),1)=cast(d.date as date)
);
结果:
c1
2
求连续登陆n天的用户个数
-- 使用lag函数(向前取):lag(col,偏移量,默认值) over (partition by order by )
select id ,date, lag(date,1,-1) over(partition by id order by date desc ) as date1 from tb_use group by id,date;
结果:
id date date1
A 2018-09-06 -1
A 2018-09-05 2018-09-06
A 2018-09-04 2018-09-05
B 2018-09-06 -1
B 2018-09-04 2018-09-06
C 2018-09-06 -1
C 2018-09-05 018-09-06
C 2018-09-04 2018-09-05
-- 使用lead函数(向前取):lead(col,偏移量,默认值) over (partition by order by )
select id ,date, lead(date,1,-1) over(partition by id order by date desc ) as date1 from tb_use group by id,date;
结果:
id date date1
A 2018-09-06 2018-09-05
A 2018-09-05 2018-09-04
A 2018-09-04 -1
B 2018-09-06 2018-09-04
B 2018-09-04 -1
C 2018-09-06 2018-09-05
C 2018-09-05 2018-09-04
C 2018-09-04 -1
展现连续登陆两天的用户信息(用窗口函数分析方法)
select
*
from
(
select id ,date, lead(date,1,-1) over(partition by id order by date desc ) as date1 from tb_use a group by id,date
) as b
where
date_sub(cast(b.date as date),1)=cast(b.date1 as date);
结果:
b.id b.date b.date1
A 2018-09-06 2018-09-05
A 2018-09-05 2018-09-04
C 2018-09-06 2018-09-05
C 2018-09-05 2018-09-04
统计连续登陆两天的用户个数(n天就只需要把lead(date,2,-1)中的2改成n-1并且把date_sub(cast(b.date as date),2)中的2改成n-1)
select
count(distinct b.id) as c1
from
(
select id ,date, lead(date,1,-1) over(partition by id order by date desc ) as date1 from tb_use a group by id,date
) as b
where
date_sub(cast(b.date as date),1)=cast(b.date1 as date);
结果:
c1
2
统计连续登陆三天的用户个数(n天就只需要把lead(date,2,-1)中的2改成n-1并且把date_sub(cast(b.date as date),2)中的2改成n-1)
select
count(distinct b.id) as c1
from
(
select
id ,date, lead(date,2,-1) over(partition by id order by date desc ) as date1
from
tb_use a
group by
id,date
) as b
where
date_sub(cast(b.date as date),2)=cast(b.date1 as date);
结果:
c1
2