14 hive开窗函数

一、窗口函数一

row_number    rank     dense_rank    ntile

 

数据准备

itcast_t2.txt

cookie1,2018-04-10,1 cookie1,2018-04-11,5 cookie1,2018-04-12,7 cookie1,2018-04-13,3 cookie1,2018-04-14,2 cookie1,2018-04-15,4 cookie1,2018-04-16,4 cookie2,2018-04-10,2 cookie2,2018-04-11,3 cookie2,2018-04-12,5 cookie2,2018-04-13,6 cookie2,2018-04-14,3 cookie2,2018-04-15,9 cookie2,2018-04-16,7

 

创建表

CREATE TABLE itcast_t2 ( cookieid string, createtime string,  pv INT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as textfile;

 

加载数据

load data local inpath '/export/server/hivedatas/itcast_t2.txt' into table itcast_t2;

 

 

 

1.1 row_number

 

ROW_NUMBER()  从1开始,按照顺序,生成分组内记录的序列

SELECT    cookieid,   createtime,   pv,   ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn   FROM itcast_t2;

 

 

 

1.2 rank和dense_rank

RANK() 生成数据项在分组中的排名,排名相等会在名次中留下空位

DENSE_RANK() 生成数据项在分组中的排名,排名相等会在名次中不会留下空位

 

 

SELECT cookieid, createtime, pv, RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn1, DENSE_RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn2 FROM itcast_t2 WHERE cookieid = 'cookie1';

 

 


 

1.3 ntile(N)

主要解决 求平均份数 ,平均分成几份,之间相差最多为1。

 

SELECT cookieid, createtime, pv, ntile(4) over(PARTITION BY cookieid ORDER BY pv desc) as ntile1 FROM itcast_t2 WHERE cookieid = 'cookie1';

 

 



 

二、Hive的分析函数之 sum avg min max

 

数据准备

cookie1,2018-04-10,1 cookie1,2018-04-11,5 cookie1,2018-04-12,7 cookie1,2018-04-13,3 cookie1,2018-04-14,2 cookie1,2018-04-15,4 cookie1,2018-04-16,4

 

--建表语句:

create table itcast_t1( cookieid string, createtime string,  pv int ) row format delimited fields terminated by ',';

 

--加载数据:

load data local inpath'/export/server/hivedatas/itcast_t1' into table itcast_t1;

 

--开启智能本地模式

SET hive.exec.mode.local.auto=true;

 

 

2.1 sum

 

select  cookieid, createtime, pv, sum(pv) over(partition by cookieid order by createtime) as pv1 from itcast_t1;

 

 

其他关键字相关用法:

特殊的关键词:

preceding : 往前

following : 往后

current row: 当前的行

unbounded:起点

unbounded preceding :从前面的起点开始

unbounded following :到后面的终点

格式:

分析函数 over(partition by 字段 order by 字段 [desc|asc]) rows between unbounded preceding and current row)

 

select cookieid,createtime,pv,

sum(pv) over(partition by cookieid order by createtime) as pv1

from itcast_t1;

 

select cookieid,createtime,pv,

sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row) as pv2

from itcast_t1;

 

select cookieid,createtime,pv,

sum(pv) over(partition by cookieid) as pv3

from itcast_t1;  --如果每天order  by排序语句  默认把分组内的所有数据进行sum操作

 

select cookieid,createtime,pv,

sum(pv) over(partition by cookieid order by createtime rows between 3 preceding and current row) as pv4

from itcast_t1;

 

select cookieid,createtime,pv,

sum(pv) over(partition by cookieid order by createtime rows between 3 preceding and 1 following) as pv5

from itcast_t1;

 

select cookieid,createtime,pv,

sum(pv) over(partition by cookieid order by createtime rows between current row and unbounded following) as pv6

from itcast_t1;

 

select cookieid,createtime,pv,

min(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row) as pv2

from itcast_t1;

 

# 当前时间往前推移2天的pv总和

select cookieid,createtime,pv,

sum(pv) over(partition by cookieid order by createtime rows between 2 preceding and

current row) as pv_2

from itcast_t1;

 

 

2.2 avg

 

select  cookieid, createtime, pv, avg(pv) over(partition by cookieid order by createtime) as pv1 from itcast_t1;

 

 

2.3 min

 

用法同sum和avg一样

 

2.4 max

用法同sum和avg一样

 

 

 

 

 


 

三、 hive的分析函数之LAG,LEAD,FIRST_VALUE,LAST_VALUE

 

应用业务场景

  • 当前的行和之前、之后的行中某个字段值进行比较或者计算操作。

  • 当前行和之前的行相关的计算业务场景用比较多,当前行和之后的行相关的计算业务场景用的比较少。

 

# 导入数据 准备工作

cookie1,2018-04-10 10:00:02,url2

cookie1,2018-04-10 10:00:00,url1

cookie1,2018-04-10 10:03:04,1url3

cookie1,2018-04-10 10:50:05,url6

cookie1,2018-04-10 11:00:00,url7

cookie1,2018-04-10 10:10:00,url4

cookie1,2018-04-10 10:50:01,url5

cookie2,2018-04-10 10:00:02,url22

cookie2,2018-04-10 10:00:00,url11

cookie2,2018-04-10 10:03:04,1url33

cookie2,2018-04-10 10:50:05,url66

cookie2,2018-04-10 11:00:00,url77

cookie2,2018-04-10 10:10:00,url44

cookie2,2018-04-10 10:50:01,url55

 

创建表

 

createtime string, --页面访问时间

url STRING --被访问页面

 

 

CREATE TABLE itcast_t4 ( cookieid string, createtime string,  url STRING       ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as textfile;

 

 

--加载数据:

load data local inpath '/export/server/hivedatas/itcast4' into table itcast_t4;

 

3.1 lag

 

# 指定与上一行和上两行的数据 Lag

SELECT cookieid,   createtime,   url,   ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,   LAG(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS last_1_time,   LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS last_2_time   FROM itcast_t4;

 

 

 

3.2 lead

 

# 指定与下一行和下两行的数据 lead

SELECT cookieid, createtime, url, ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn, LEAD(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS next_1_time, LEAD(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS next_2_time FROM itcast_t4;

 

 

3.3 first_value

 

# first_value 取分组内排序后,截止到当前行,第一个值

SELECT cookieid, createtime, url, ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn, FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS first1 FROM itcast_t4;

 

 

3.4 last_value

 

取分组内排序后,截止到当前行,最后一个值

SELECT cookieid,   createtime,   url,   ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,   FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime DESC) AS last2   FROM itcast_t4   ORDER BY cookieid,createtime;

 

必须制定order by

如果不指定ORDER BY,则进行排序混乱,会出现错误的结果


 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值