简介
本文主要介绍hive中的窗口函数.hive中的窗口函数和sql中的窗口函数相类似,都是用来做一些数据分析类的工作,一般用于olap分析(在线分析处理)。
概念
我们都知道在sql中有一类函数叫做聚合函数,例如sum()、avg()、max()等等,这类函数可以将多行数据按照规则聚集为一行,一般来讲聚集后的行数是要少于聚集前的行数的.但是有时我们想要既显示聚集前的数据,又要显示聚集后的数据,这时我们便引入了窗口函数.
在深入研究Over字句之前,一定要注意:在SQL处理中,窗口函数都是最后一步执行,而且仅位于Order by字句之前。
数据准备
我们准备一张order表,字段分别为name,orderdate,cost.数据内容如下:
jack,2015-01-01,10
tony,2015-01-02,15
jack,2015-02-03,23
tony,2015-01-04,29
jack,2015-01-05,46
jack,2015-04-06,42
tony,2015-01-07,50
jack,2015-01-08,55
mart,2015-04-08,62
mart,2015-04-09,68
neil,2015-05-10,12
mart,2015-04-11,75
neil,2015-06-12,80
mart,2015-04-13,94
在hive中建立一张表t_window,将数据插入进去.
实例
聚合函数+over
假如说我们想要查询在2015年4月份购买过的顾客及总人数,我们便可以使用窗口函数去去实现
select name,count(*) over ()
from t_window
where substring(orderdate,1,7) = '2015-04'
详情参考:Hive:窗口函数 ——花和尚也有春天
本人实例
create table if not exists t_window(
name string,
orderdate date,
cost int
)
row format delimited fields terminated by ',';
load data inpath '/order.csv' into table t_window;
select * from t_window;
select count(*) from t_window; --14
select count(*) from t_window where substring(orderdate,1,7) = '2015-04'; --5
select name,count(*) from t_window where substring(orderdate,1,7) = '2015-04' group by name ;
select name,count(*) over() from t_window where substring(orderdate,1,7) = '2015-04';
select name,count(*) over() from t_window where substring(orderdate,1,7) = '2015-04' group by name;
select name,orderdate,cost,sum(cost) over(partition by name)from t_window;
select name,orderdate,cost,sum(cost) over(partition by month(orderdate))from t_window order by orderdate;
select name,orderdate,cost,sum(cost) over(partition by name)from t_window order by orderdate;
select name,orderdate,cost,sum(cost) over(partition by name order by orderdate)from t_window;
select name,orderdate,cost
,sum(cost) over () as row1 -- 全表数据cost的总和
,sum(cost) over (partition by name) as row2 -- 个人累计消费总和,
,sum(cost) over (partition by name order by orderdate rows between unbounded preceding and unbounded following) as row22
,sum(cost) over (partition by name order by orderdate) as row3 --个人截止到当前日期的消费总和
,sum(cost) over (partition by name order by orderdate rows between unbounded preceding and current row ) as row4 --效果同row3
,sum(cost) over (partition by name order by orderdate rows between 1 preceding and current row ) as row5 --当前消费额与上一次消费额的总和
,sum(cost) over (partition by name order by orderdate rows between 1 preceding and 1 following ) as row6 --当前消费额,上一次消费,下一次消费的总和
,sum(cost) over (partition by name order by orderdate rows between current row and unbounded following) as row7 -- 当前消费到最后一次消费的总金额
,sum(cost) over (partition by name order by orderdate rows between current row and 3 following ) as row8 -- 当前消费与后三次的消费总和
--
from t_window;
--切片
with t as
(select name,orderdate,cost
,ntile(3) over () as row1
,ntile(3) over (partition by name) as row2
-- ,ntile(3) over (partition by orderdate) as row3
,ntile(3) over (order by name) as row3
,ntile(3) over (partition by name order by cost desc) as row4
from t_window)
select * from t where t.row4=1;
select name, orderdate,cost
, row_number() over ()
from t_window;
insert into t_window values ('jack', '2015-02-04',42);
select * from t_window;
select name, orderdate,cost
, row_number() over (partition by name order by cost desc) as row1
, rank() over (partition by name order by cost desc) as row2
, dense_rank() over (partition by name order by cost desc) as row3
from t_window where name = 'jack';
select name, orderdate,cost
,lag(orderdate,1,'1990-01-01') over(partition by name order by orderdate) as row1
,lead(orderdate,1,'1990-01-01') over(partition by name order by orderdate) as row2
from t_window;