Hive_Day04
聚合开窗函数和排序开窗函数
聚合开窗函数
-- 创建表
create table t_fraction(
name string,
subject string,
score int)
row format delimited fields terminated by ","
lines terminated by '\n';
-- 测试数据 fraction.txt
孙悟空,语文,10
孙悟空,数学,73
孙悟空,英语,15
猪八戒,语文,10
猪八戒,数学,73
猪八戒,英语,11
沙悟净,语文,22
沙悟净,数学,70
沙悟净,英语,31
唐玄奘,语文,21
唐玄奘,数学,81
唐玄奘,英语,23
-- 上传数据
load data local inpath '本地路径' into table t_fraction;
select name,subject,score,
sum(score) over() as sum1,
sum(score) over(partition by subject) as sum2,
sum(score) over(partition by subject order by score) as sum3,
-- 由起点到当前行的窗口聚合,和sum3一样
sum(score) over(partition by subject order by score rows between unbounded preceding and current row) as sum4,
-- 当前行和前面一行的窗口聚合
sum(score) over(partition by subject order by score rows between 1 preceding and current row) as sum5,
-- 当前行的前面一行到后面一行的窗口聚合 前一行+当前行+后一行
sum(score) over(partition by subject order by score rows between 1 preceding and 1 following) as sum6,
-- 当前行与后一行之和
sum(score) over(partition by subject order by score rows between current row and 1 following) as sum6,
-- 当前和后面所有的行
sum(score) over(partition by subject order by score rows between current row and unbounded following) as sum7
from t_fraction;
rows:行
unbounded preceding:起点
unbounded following:终点
n preceding:前 n 行
n following:后 n 行
current row:当前行
查询结果如下
聚合开窗函数练习
name,orderdate,cost
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
建表加载数据
vim business.txt
create table business
(
name string,
orderdate string,
cost int
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
load data local inpath "本地文件路径" into table business;
需求1:查询在2017年4月份购买过的顾客及总人数
需求2:查询顾客的购买明细及月购买总额
需求3:上述的场景,要将cost按照日期进行累加
需求4:查询顾客上次的购买时间
需求5:查询前20%时间的订单信息
按照日期过滤、分组count求总人数
select name,orderdate,cost,count(*) over() total_people from business where date_format(orderdate,'yyyy-MM')='2017-04';
按照顾客分组、sum购买金额
select name,orderdate,cost,sum(cost) over(partition by name) total_amount from business;
按照顾客分组、日期升序排序、组内每条数据将之前的金额累加
select name,orderdate,cost,sum(cost) over(partition by name order by orderdate rows between unbounded preceding and current row) cumulative_amountfrom business;
查询出明细数据同时获取上一条数据的购买时间(肯定需要按照顾客分组、时间升序排序)
select name,orderdate,cost,lag(orderdate,1) over(partition by name order by orderdate) last_date from business;
按照日期升序排序、取前20%的数据
select * from (select name,orderdate,cost,ntile(5) over(order by orderdate) sortgroup_num from business) t where t.sortgroup_num=1;
排序开窗函数
你知道开窗函数吗?请说一说有哪些排序开窗函数?RANK() ,DENSE_RANK() ,ROW_NUMBER()有什么区别?
- RANK() 排序相同时会重复,总数不会变
- DENSE_RANK() 排序相同时会重复,总数会减少
- ROW_NUMBER() 会根据顺序计算
- PERCENT_RANK()计算给定行的百分比排名。可以用来计算超过了百分之多少的人(当前行的rank值-1)/(分组内的总行数-1)
select name,subject,
score,
rank() over(partition by subject order by score desc) rp,
dense_rank() over(partition by subject order by score desc) drp,
row_number() over(partition by subject order by score desc) rnp,
percent_rank() over(partition by subject order by score) as percent_rank
from t_fraction;
select name,subject,score,
rank() over(order by score) as rank,
row_number() over(order by score) as row_number,
percent_rank() over(partition by subject order by score) as percent_rank
from t_fraction;
select name,subject,score,
rank() over() as rank,
row_number() over() as row_number,
percent_rank() over(partition by subject order by score) as percent_rank
from t_fraction;
排序开窗函数实战
name subject score
李毅 语文 87
李毅 数学 95
李毅 英语 68
黄仙 语文 94
黄仙 数学 56
黄仙 英语 84
小虎 语文 64
小虎 数学 86
小虎 英语 84
许文客 语文 65
许文客 数学 85
许文客 英语 78
建表加载数据
vim score.txt
create table score
(
name string,
subject string,
score int
) row format delimited fields terminated by "\t";
load data local inpath '本地文件路径' into table score;
需求1:每门学科学生成绩排名(是否并列排名、空位排名三种实现)
分析:学科分组、成绩降序排序、按照成绩排名
select name,subject,score,
rank() over(partition by subject order by score desc) rp,
dense_rank() over(partition by subject order by score desc) drp,
row_number() over(partition by subject order by score desc) rmp
from
score;
需求2:每门学科成绩排名top n的学生
select * from ( select name,subject,score,row_number() over(partition by subject order by score desc) rmp from score2) t
where t.rmp<=3;
Hive的行转列和列转行
一行转多行
create table testArray2(
name string,
weight array<string>
)row format delimited
fields terminated by '\t'
COLLECTION ITEMS terminated by ',';
xlj "150","170","180"
dashuju "150","180","190"
select name,col1 from testarray2 lateral view explode(weight) t1 as col1;
+----------+--------+
| name | col1 |
+----------+--------+
| xlj | "150" |
| xlj | "170" |
| xlj | "180" |
| dashuju | "150" |
| dashuju | "180" |
| dashuju | "190" |
+----------+--------+
select key from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
key1
key2
key3
select name,col1,col2 from testarray2 lateral view explode(map('key1',1,'key2',2,'key3',3)) t1 as col1,col2;
+----------+-------+-------+
| name | col1 | col2 |
+----------+-------+-------+
| xlj | key1 | 1 |
| xlj | key2 | 2 |
| xlj | key3 | 3 |
| dashuju | key1 | 1 |
| dashuju | key2 | 2 |
| dashuju | key3 | 3 |
+----------+-------+-------+
select name,pos,col1 from testarray2 lateral view posexplode(weight) t1 as pos,col1;
+----------+------+--------+
| name | pos | col1 |
+----------+------+--------+
| xlj | 0 | "150" |
| xlj | 1 | "170" |
| xlj | 2 | "180" |
| dashuju | 0 | "150" |
| dashuju | 1 | "180" |
| dashuju | 2 | "190" |
+----------+------+--------+
多行转一行
// testLieToLine
name col1
xiaohu 150
xiaohu 170
xiaohu 180
huohuo 150
huohuo 180
huohuo 190
create table testLieToLine(
name string,
col1 int
)row format delimited
fields terminated by '\t';
select name,collect_list(col1) from testLieToLine group by name;
// 结果
xiaohu ["150","180","190"]
huohuo ["150","170","180"]
select t1.name
,collect_list(t1.col1)
from (
select name
,col1
from testarray2
lateral view explode(weight) t1 as col1
) t1 group by t1.name;