一、窗口函数一
row_number rank dense_rank ntile
数据准备
itcast_t2.txt
cookie1,2018-04-10,1 cookie1,2018-04-11,5 cookie1,2018-04-12,7 cookie1,2018-04-13,3 cookie1,2018-04-14,2 cookie1,2018-04-15,4 cookie1,2018-04-16,4 cookie2,2018-04-10,2 cookie2,2018-04-11,3 cookie2,2018-04-12,5 cookie2,2018-04-13,6 cookie2,2018-04-14,3 cookie2,2018-04-15,9 cookie2,2018-04-16,7
创建表
CREATE TABLE itcast_t2 ( cookieid string, createtime string, pv INT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as textfile;
加载数据
load data local inpath '/export/server/hivedatas/itcast_t2.txt' into table itcast_t2;
1.1 row_number
ROW_NUMBER() 从1开始,按照顺序,生成分组内记录的序列
SELECT cookieid, createtime, pv, ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn FROM itcast_t2;
1.2 rank和dense_rank
RANK() 生成数据项在分组中的排名,排名相等会在名次中留下空位
DENSE_RANK() 生成数据项在分组中的排名,排名相等会在名次中不会留下空位
SELECT cookieid, createtime, pv, RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn1, DENSE_RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn2 FROM itcast_t2 WHERE cookieid = 'cookie1';
1.3 ntile(N)
主要解决 求平均份数 ,平均分成几份,之间相差最多为1。
SELECT cookieid, createtime, pv, ntile(4) over(PARTITION BY cookieid ORDER BY pv desc) as ntile1 FROM itcast_t2 WHERE cookieid = 'cookie1';
二、Hive的分析函数之 sum avg min max
数据准备
cookie1,2018-04-10,1 cookie1,2018-04-11,5 cookie1,2018-04-12,7 cookie1,2018-04-13,3 cookie1,2018-04-14,2 cookie1,2018-04-15,4 cookie1,2018-04-16,4
--建表语句:
create table itcast_t1( cookieid string, createtime string, pv int ) row format delimited fields terminated by ',';
--加载数据:
load data local inpath'/export/server/hivedatas/itcast_t1' into table itcast_t1;
--开启智能本地模式
SET hive.exec.mode.local.auto=true;
2.1 sum
select cookieid, createtime, pv, sum(pv) over(partition by cookieid order by createtime) as pv1 from itcast_t1;
其他关键字相关用法:
特殊的关键词:
preceding : 往前
following : 往后
current row: 当前的行
unbounded:起点
unbounded preceding :从前面的起点开始
unbounded following :到后面的终点
格式:
分析函数 over(partition by 字段 order by 字段 [desc|asc]) rows between unbounded preceding and current row)
select cookieid,createtime,pv,
sum(pv) over(partition by cookieid order by createtime) as pv1
from itcast_t1;
select cookieid,createtime,pv,
sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row) as pv2
from itcast_t1;
select cookieid,createtime,pv,
sum(pv) over(partition by cookieid) as pv3
from itcast_t1; --如果每天order by排序语句 默认把分组内的所有数据进行sum操作
select cookieid,createtime,pv,
sum(pv) over(partition by cookieid order by createtime rows between 3 preceding and current row) as pv4
from itcast_t1;
select cookieid,createtime,pv,
sum(pv) over(partition by cookieid order by createtime rows between 3 preceding and 1 following) as pv5
from itcast_t1;
select cookieid,createtime,pv,
sum(pv) over(partition by cookieid order by createtime rows between current row and unbounded following) as pv6
from itcast_t1;
select cookieid,createtime,pv,
min(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row) as pv2
from itcast_t1;
# 当前时间往前推移2天的pv总和
select cookieid,createtime,pv,
sum(pv) over(partition by cookieid order by createtime rows between 2 preceding and
current row) as pv_2
from itcast_t1;
2.2 avg
select cookieid, createtime, pv, avg(pv) over(partition by cookieid order by createtime) as pv1 from itcast_t1;
2.3 min
用法同sum和avg一样
2.4 max
用法同sum和avg一样
三、 hive的分析函数之LAG,LEAD,FIRST_VALUE,LAST_VALUE
应用业务场景
-
当前的行和之前、之后的行中某个字段值进行比较或者计算操作。
-
当前行和之前的行相关的计算业务场景用比较多,当前行和之后的行相关的计算业务场景用的比较少。
# 导入数据 准备工作
cookie1,2018-04-10 10:00:02,url2
cookie1,2018-04-10 10:00:00,url1
cookie1,2018-04-10 10:03:04,1url3
cookie1,2018-04-10 10:50:05,url6
cookie1,2018-04-10 11:00:00,url7
cookie1,2018-04-10 10:10:00,url4
cookie1,2018-04-10 10:50:01,url5
cookie2,2018-04-10 10:00:02,url22
cookie2,2018-04-10 10:00:00,url11
cookie2,2018-04-10 10:03:04,1url33
cookie2,2018-04-10 10:50:05,url66
cookie2,2018-04-10 11:00:00,url77
cookie2,2018-04-10 10:10:00,url44
cookie2,2018-04-10 10:50:01,url55
创建表
createtime string, --页面访问时间
url STRING --被访问页面
CREATE TABLE itcast_t4 ( cookieid string, createtime string, url STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as textfile;
--加载数据:
load data local inpath '/export/server/hivedatas/itcast4' into table itcast_t4;
3.1 lag
# 指定与上一行和上两行的数据 Lag
SELECT cookieid, createtime, url, ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn, LAG(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS last_1_time, LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS last_2_time FROM itcast_t4;
3.2 lead
# 指定与下一行和下两行的数据 lead
SELECT cookieid, createtime, url, ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn, LEAD(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS next_1_time, LEAD(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS next_2_time FROM itcast_t4;
3.3 first_value
# first_value 取分组内排序后,截止到当前行,第一个值
SELECT cookieid, createtime, url, ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn, FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS first1 FROM itcast_t4;
3.4 last_value
取分组内排序后,截止到当前行,最后一个值
SELECT cookieid, createtime, url, ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn, FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime DESC) AS last2 FROM itcast_t4 ORDER BY cookieid,createtime;
必须制定order by
如果不指定ORDER BY,则进行排序混乱,会出现错误的结果