Hive(三)

最新推荐文章于 2024-10-03 18:36:45 发布

究极章鱼

最新推荐文章于 2024-10-03 18:36:45 发布

阅读量66

点赞数

文章标签： hive

本文链接：https://blog.csdn.net/qq_50379483/article/details/133688154

版权

Hive_Day03

文章目录

Hive_Day03

静态分区和动态分区

单个静态分区案例

CREATE TABLE IF NOT EXISTS t_student (
sno int,
sname string
) partitioned by(grade int)
row format delimited fields terminated by ',';
--  分区的字段不要和表的字段相同。相同会报错error10035

1,xiaohu01,1
2,xiaohu02,1
3,xiaohu03,1
4,xiaohu04,1
5,xiaohu05,1

6,xiaohu06,2
7,xiaohu07,2
8,xiaohu08,2

9,xiaohu09,3
10,xiaohu10,3
11,xiaohu11,3
12,xiaohu12,3
13,xiaohu13,3
14,xiaohu14,3
15,xiaohu15,3

16,xiaohu16,4
17,xiaohu17,4
18,xiaohu18,4
19,xiaohu19,4
20,xiaohu20,4
21,xiaohu21,4

多个静态分区案例

CREATE TABLE IF NOT EXISTS t_teacher (
tno int,
tname string
) partitioned by(grade int,clazz int)
row format delimited fields terminated by ','
location '/xlj/hive_static_multi';

--注意：前后两个分区的关系为父子关系，也就是grade文件夹下面有多个clazz子文件夹。
1,xiaoge01,1,1
2,xiaoge02,1,1

3,xiaoge03,1,2
4,xiaoge04,1,2

5,xiaoge05,1,3
6,xiaoge06,1,3

7,xiaoge07,2,1
8,xiaoge08,2,1

9,xiaoge09,2,2

--载入数据
load data local inpath '/usr/local/soft/bigdata/t11.txt' into table t_teacher partition(grade=1,clazz=1);

分区表查询

// 查看分区
show partitions t_student;

select * from t_student where grade = 1;

// 全表扫描，不推荐，效率低
select count(*) from students_pt1;

// 使用where条件进行分区裁剪，避免了全表扫描，效率高
select count(*) from students_pt1 where grade = 1;

// 也可以在where条件中使用非等值判断
select count(*) from students_pt1 where grade<3 and grade>=1;

// 添加分区
alter table t_student add partition (grade=5);

alter table t_student add partition (grade=5) location '指定数据文件的路径';
这里的文件路径指的是要上传数据的文件路径，不是已经有数据的文件路径，这个文件夹必须是事先不存在的，如果存在就报错

// 删除分区
alter table t_student drop partition (grade=5);

动态分区

--创建分区表
CREATE TABLE IF NOT EXISTS t_student_d (
sno int,
sname string
) partitioned by (grade int,clazz int)
row format delimited fields terminated by ',';

--创建外部表
CREATE EXTERNAL TABLE IF NOT EXISTS t_student_e (
sno int,
sname string,
grade int,
clazz int
) 
row format delimited fields terminated by ',';

数据：

1,xiaohu01,1,1
2,xiaohu02,1,1
3,xiaohu03,1,1
4,xiaohu04,1,2
5,xiaohu05,1,2
6,xiaohu06,2,3
7,xiaohu07,2,3
8,xiaohu08,2,3
9,xiaohu09,3,3
10,xiaohu10,3,3
11,xiaohu11,3,3
12,xiaohu12,3,4
13,xiaohu13,3,4
14,xiaohu14,3,4
15,xiaohu15,3,4
16,xiaohu16,4,4
17,xiaohu17,4,4
18,xiaohu18,4,5
19,xiaohu19,4,5
20,xiaohu20,4,5
21,xiaohu21,4,5

insert overwrite table t_student_d partition (grade,clazz) select * from t_student_e;

分桶

1,tom,11
2,cat,22
3,dog,33
4,hive,44
5,hbase,55
6,mr,66
7,alice,77
8,scala,88

create table psn31
(
id int,
name string,
age int
)
row format delimited
fields terminated by ',';

create table psn_bucket
(
id int,
name string,
age int
)
clustered by(age) into 4 buckets
row format delimited
fields terminated by ',';

insert into psn_bucket select * from psn31;

分区和分桶的区别

Hive分区：是指按照数据表的某列或者某些列分为多个区，区从形式上可以理解为⽂件夹，⽐如我们要收集某个⼤型⽹站的⽇志数据，⼀个⽹站每天的⽇志数据存在同⼀张表上，由于每天会⽣成⼤量的⽇志，导致数据表的内容巨⼤，在查询时进⾏全表扫描耗费的资源⾮常多。那其实这个情况下，我们可以按照⽇期对数据表进⾏分区，不同⽇期的数据存放在不同的分区，在查询时只要指定分区字段的值就可以直接从该分区查找。
Hive分桶：分桶是相对分区进⾏更细粒度的划分。分桶将整个数据内容安装某列属性值得hash值进⾏区分，如要安装name属性分为3个桶，就是对name属性值的hash值对3取摸，按照取模结果对数据分桶。如取模结果为0的数据记录存放到⼀个⽂件，取模为1的数据存放到⼀个⽂件，取模为2的数据存放到⼀个⽂件。
分桶对数据的处理⽐分区更加的细化，分区针对的是数据的存储路径，分桶针对的是数据⽂件
分桶是按照hash值进⾏切分的，相对来说⽐较公平，分区是按照列的值划分，容易造成数据倾斜
分桶、分区不⼲扰，分区表可以划分为分桶表

4个By的区别

在⽣产环境中order by⽤的较少，容易导致OOM 在⽣产环境中sort by+distribute by⽤的多

order by：全局排序，只有⼀个reduce
sort by：分区内有序
distribute by：类似于MR中partition，进⾏分区，结合sort by使⽤
cluster by：当distribute by和sort by字段相同的时候，可以使⽤cluster by⽅式。cluster by除了具有distribute by的功能外还兼具了sort by的功能。但是排序只能是升序排序，不能指定排序规则为ASC或者DESC

一行转换为多行，和多行转换为一行

UDTF 进一出多

-- 创建数据库表
create table t_movie1(
id int,
name string,
types string
)
row format delimited fields terminated by ','
lines terminated by '\n';

-- 电影数据  movie1.txt
-- 加载数据到数据库 load data inpath '/shujia/movie1.txt' into table t_movie1;
1,余罪,剧情-动作-犯罪
2,木乃伊,动作-冒险-剧情
3,勇敢的心,动作-传记-剧情-历史-战争
4,大话西游,剧情-动作-爱情-武侠-古装
5,霍比特人,动作-奇幻-冒险

-- explode  可以将一组数组的数据变成一列表
select  explode(split(types,"-")) from t_movie1;

-- lateral view 表生成函数，可以将explode的数据生成一个列表
select id,name,type from t_movie1 lateral view explode(split(types,"-")) typetable as type;

-- 创建数据库表
create table t_movie2(
id int,
name string,
type string
)
row format delimited fields terminated by ','
lines terminated by '\n';

-- 电影数据 movie2.txt
-- 加载数据到数据库 load data inpath '/shujia/movie2.txt' into table t_movie2;
1,余罪,剧情
1,余罪,动作
1,余罪,犯罪
2,木乃伊,动作
2,木乃伊,冒险
2,木乃伊,剧情
3,勇敢的心,动作
3,勇敢的心,传记
3,勇敢的心,剧情
3,勇敢的心,历史
3,勇敢的心,战争
4,大话西游,剧情
4,大话西游,动作
4,大话西游,爱情
4,大话西游,武侠
4,大话西游,古装
5,霍比特人,动作
5,霍比特人,奇幻
5,霍比特人,冒险

1,余罪,剧情
1,余罪,动作
1,余罪,犯罪

id,name,["剧情","动作","犯罪"]
1,余罪,"剧情-动作-犯罪"


-- collect_set()和collect_list()都是对列转成行，区别就是list里面可重复而set里面是去重的
-- concat_ws(':',collect_set(type))   ':' 表示你合并后用什么分隔，collect_set(stage)表示要合并表中的那一列数据
select id,name,concat_ws('-',collect_set(type)) as types from t_movie2 group by id,name;

词频统计案例

hello-world
hello-bigdata-spark
like-life
bigdata-good-hive-spark

create table wc
(
line string
)
row format delimited fields terminated by ',';

load data local inpath '/usr/local/soft/hivedata/wc.txt' into table wc;

select w.word,count(*) from (select explode(split(line,',')) as word from wc) w group by w.word;