Hive(四)

最新推荐文章于 2024-11-09 22:26:44 发布

究极章鱼

最新推荐文章于 2024-11-09 22:26:44 发布

阅读量34

点赞数

文章标签： hive

本文链接：https://blog.csdn.net/qq_50379483/article/details/133712062

版权

本文详细介绍了Hive中的聚合开窗函数（如SUM()、RANK()等）和排序开窗函数（如RANK(),DENSE_RANK(),ROW_NUMBER()和percent_rank()），以及如何使用行转列和列转行操作处理数据。通过实例展示了如何在Hive中进行数据聚合、分组、排序和转换格式。

摘要由CSDN通过智能技术生成

Hive_Day04

聚合开窗函数和排序开窗函数

聚合开窗函数

-- 创建表
create table t_fraction(
name string,
subject string, 
score int)
row format delimited fields terminated by ","
lines terminated by '\n';

-- 测试数据 fraction.txt
孙悟空,语文,10
孙悟空,数学,73
孙悟空,英语,15
猪八戒,语文,10
猪八戒,数学,73
猪八戒,英语,11
沙悟净,语文,22
沙悟净,数学,70
沙悟净,英语,31
唐玄奘,语文,21
唐玄奘,数学,81
唐玄奘,英语,23

-- 上传数据
load data local inpath '本地路径' into table t_fraction;

select name,subject,score,
sum(score) over() as sum1,
sum(score) over(partition by subject) as sum2,
sum(score) over(partition by subject order by score) as sum3,

-- 由起点到当前行的窗口聚合，和sum3一样
sum(score) over(partition by subject order by score rows between unbounded preceding and current row) as sum4, 

-- 当前行和前面一行的窗口聚合
sum(score) over(partition by subject order by score rows between 1 preceding and current row) as sum5,

-- 当前行的前面一行到后面一行的窗口聚合  前一行+当前行+后一行
sum(score) over(partition by subject order by score rows between 1 preceding and 1 following) as sum6,

-- 当前行与后一行之和
sum(score) over(partition by subject order by score rows between current row and 1 following) as sum6,

-- 当前和后面所有的行
sum(score) over(partition by subject order by score rows between current row and unbounded following) as sum7
from t_fraction;

rows：行
unbounded preceding：起点
unbounded following：终点
n preceding：前 n 行
n following：后 n 行
current row：当前行

查询结果如下

在这里插入图片描述

聚合开窗函数练习

name，orderdate，cost
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94


建表加载数据
vim business.txt

create table business
(
name string, 
orderdate string,
cost int
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

load data local inpath "本地文件路径" into table business;

需求1：查询在2017年4月份购买过的顾客及总人数
需求2：查询顾客的购买明细及月购买总额
需求3：上述的场景,要将cost按照日期进行累加
需求4：查询顾客上次的购买时间
需求5：查询前20%时间的订单信息

按照日期过滤、分组count求总人数
select name,orderdate,cost,count(*) over() total_people from business where date_format(orderdate,'yyyy-MM')='2017-04';

按照顾客分组、sum购买金额
select name,orderdate,cost,sum(cost) over(partition by name) total_amount from business;

按照顾客分组、日期升序排序、组内每条数据将之前的金额累加
select name,orderdate,cost,sum(cost) over(partition by name order by orderdate rows between unbounded preceding and current row) cumulative_amountfrom business;

查询出明细数据同时获取上一条数据的购买时间(肯定需要按照顾客分组、时间升序排序)
select name,orderdate,cost,lag(orderdate,1) over(partition by name order by orderdate) last_date from business;

按照日期升序排序、取前20%的数据
select * from (select name,orderdate,cost,ntile(5) over(order by orderdate) sortgroup_num from business) t where t.sortgroup_num=1;

排序开窗函数

你知道开窗函数吗？请说一说有哪些排序开窗函数？RANK() ，DENSE_RANK() ，ROW_NUMBER()有什么区别？

RANK() 排序相同时会重复，总数不会变
DENSE_RANK() 排序相同时会重复，总数会减少
ROW_NUMBER() 会根据顺序计算
PERCENT_RANK()计算给定行的百分比排名。可以用来计算超过了百分之多少的人(当前行的rank值-1)/(分组内的总行数-1)

select name,subject,
score,
rank() over(partition by subject order by score desc) rp,
dense_rank() over(partition by subject order by score desc) drp,
row_number() over(partition by subject order by score desc) rnp,
percent_rank() over(partition by subject order by score) as percent_rank 
from t_fraction;

select name,subject,score,
rank() over(order by score) as rank,
row_number() over(order by score) as row_number,
percent_rank() over(partition by subject order by score) as percent_rank
from t_fraction;

select name,subject,score,
rank() over() as rank,
row_number() over() as row_number,
percent_rank() over(partition by subject order by score) as percent_rank
from t_fraction;

排序开窗函数实战

name	subject	score
李毅	语文	87
李毅	数学	95
李毅	英语	68
黄仙	语文	94
黄仙	数学	56
黄仙	英语	84
小虎	语文	64
小虎	数学	86
小虎	英语	84
许文客	语文	65
许文客	数学	85
许文客	英语	78

建表加载数据
vim score.txt

create table score
(
name string,
subject string, 
score int
) row format delimited fields terminated by "\t";

load data local inpath '本地文件路径' into table score;

需求1：每门学科学生成绩排名(是否并列排名、空位排名三种实现)
分析：学科分组、成绩降序排序、按照成绩排名
select name,subject,score,
rank() over(partition by subject order by score desc) rp,
dense_rank() over(partition by subject order by score desc) drp,
row_number() over(partition by subject order by score desc) rmp
from 
score;

需求2：每门学科成绩排名top n的学生
select * from ( select name,subject,score,row_number() over(partition by subject order by score desc) rmp from score2) t
where t.rmp<=3;

Hive的行转列和列转行

一行转多行

create table testArray2(
    name string,
    weight array<string>
)row format delimited 
fields terminated by '\t'
COLLECTION ITEMS terminated by ',';

xlj	"150","170","180"
dashuju	"150","180","190"

select name,col1  from testarray2 lateral view explode(weight) t1 as col1;

+----------+--------+
|   name   |  col1  |
+----------+--------+
| xlj      | "150"  |
| xlj      | "170"  |
| xlj      | "180"  |
| dashuju  | "150"  |
| dashuju  | "180"  |
| dashuju  | "190"  |
+----------+--------+

select key from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;

key1
key2
key3

select name,col1,col2  from testarray2 lateral view explode(map('key1',1,'key2',2,'key3',3)) t1 as col1,col2;
+----------+-------+-------+
|   name   | col1  | col2  |
+----------+-------+-------+
| xlj      | key1  | 1     |
| xlj      | key2  | 2     |
| xlj      | key3  | 3     |
| dashuju  | key1  | 1     |
| dashuju  | key2  | 2     |
| dashuju  | key3  | 3     |
+----------+-------+-------+

select name,pos,col1  from testarray2 lateral view posexplode(weight) t1 as pos,col1;

+----------+------+--------+
|   name   | pos  |  col1  |
+----------+------+--------+
| xlj      | 0    | "150"  |
| xlj      | 1    | "170"  |
| xlj      | 2    | "180"  |
| dashuju  | 0    | "150"  |
| dashuju  | 1    | "180"  |
| dashuju  | 2    | "190"  |
+----------+------+--------+

多行转一行

// testLieToLine
name col1
xiaohu	150
xiaohu	170
xiaohu	180
huohuo	150
huohuo	180
huohuo	190

create table testLieToLine(
    name string,
    col1 int
)row format delimited 
fields terminated by '\t';

select name,collect_list(col1) from testLieToLine group by name;

// 结果
xiaohu	["150","180","190"]
huohuo	["150","170","180"]

select  t1.name
        ,collect_list(t1.col1) 
from (
    select  name
            ,col1 
    from testarray2 
    lateral view explode(weight) t1 as col1
) t1 group by t1.name;