hive-3

最新推荐文章于 2022-05-09 19:38:17 发布

leezsj

最新推荐文章于 2022-05-09 19:38:17 发布

阅读量544

点赞数

分类专栏： hive 文章标签： hive

本文链接：https://blog.csdn.net/leezsj/article/details/118708509

版权

⼆级分区

⼆级分区指的是在⼀张表中有两个分区,创建和插⼊时都要指定两个分区名,最常⻅的就
是下⾯案例的年和⽉,创建的语法和流程都是和⼀级分区⼀样,只是多⼀个分区⽽已.

create table if not exists part2(
id int,
name string
)
partitioned by (year string,month string)
row format delimited fields terminated by ',';
load data local inpath '/opt/soft/data/user.txt' into table part2
partition(year='2018',month='03');
load data local inpath '/opt/soft/data/user.txt' into table part2
partition(year='2018',month='02');

三级分区

三级分区,以此类推,就是当前表中有三个分区,⼀般情况下要数据量⾮常⼤的情况下才会⽤到.

-- 创建⼀个三级分区
create table if not exists part3(
id int,
name string
)
partitioned by (year string,month string,day string)
row format delimited fields terminated by ',';
-- 给三级分区加载数据
load data local inpath '/opt/soft/data/user.txt' into table
part3 partition(year='2018',month='03',day='21');
load data local inpath '/opt/soft/data/user.txt' into table
part3 partition(year='2018',month='02',day='20');

-- 创建⼀个三级分区
create table if not exists part3(
id int,
name string
)
partitioned by (year string,month string,day string)
row format delimited fields terminated by ',';
-- 加载数据
insert into part3 partition(year='2021',month='07',day='08')
slelect
id,
name
from part1
where dt='20210708';
-- 行为数据中一定有时间 操作表
-- 维度数据一般没有时间 地区表，类别表
-- 分区表一般都有时间
-- show partitions part3;查看分区表
-- 主机地址：50070查看
select * from part3;查看part3

-- load 方式加载
load data local inpath '/home/part2' into table part3 partition(year='2021',month='07',day='08');
select * from part3;
-- 追加会重复数据
select * fom part3 where day='08';
-- 分区查询
-- 独立的按照三个条件过滤就用三级分区，否则没必要

测试是否区分⼤⼩写

在hive中默认分区是不区分⼤⼩写的,通过下⾯的案例可以来测试

select * fom part3 where DAY='08';
-- 测试是否区分大小写

创建⼀个测试区分⼤⼩写的分区

create table if not exists part4(
id int,
name string
)
partitioned by (year string,month string,DAY string)
row format delimited fields terminated by ',';

进⾏分区⼤⼩写的测试

load data local inpath '/opt/soft/data/user.txt' into table part4
partition(year='2018',month='03',day='AA');

查看分区

show partitions part1;

新增分区

有时候在创建表的时候没有创建具体的静态分区, 可以在后期通过添加分区来实现添加静态分区

-- 创建分区表
create table if not exists part5
(
 id int,
 name string
)
 partitioned by (dt string) row format delimited fields
terminated by ",";
-- 增加分区：
alter table part5 add partition(dt='2018-03-21');
alter table part5 add partition(dt='2018-03-20')
partition(dt='2018-03-17');


alter table part1 add partition(dt='20210709');
--新增单个分区
alter table part1 add partition(dt='20210709') partition(dt='20210709');
--新增多个分区

增加分区并设置数据

alter table part5 add partition(dt='2018-11-11') location
'/user/hive/warehouse/part1/dt=2019-08-08';
alter table part5 add partition(dt='2018-03-26') location
'/user/hive/warehouse/part2/dt=2018-03-20'
 partition(dt='2018-03-24') location
'/user/hive/warehouse/part3/dt=2018-03-21';


-- 增加分区并指定数据
alter table part1 add partition(dt='20210711') local '/input';
-- 显示NULL的原因是因为内容不匹配分隔符不匹配
-- 第一列是因为类型不匹配，后面是因为类型不匹配，分隔符不匹配
-- 所以分区列表里面没有
-- 因为是本地添加的所以他的目录是本地的hdfs：主机名：9000 /input
-- 其他的目录是hdfs：主机名：9000 /user/hive/warehouse/test.db/part1/dt=20210710

修改分区的hdfs的路径

（注意：location后接的hdfs路径需要写成完全路径）

alter table part5 partition(dt='2018-03-21') set location
'/user/hive/warehouse/qf1704.db/part1/dt=2018-03-21'; --错误使
⽤
alter table part5 partition(dt='2018-03-21') set location
'hdfs://leetom01:8020/user/hive/warehouse/qf1704.db/part1/dt=20
19-09-11';

-- 修改分区的hdfs路径（直接修改元数据）
alter table part1 partition(dt='20210711') set location '/out/01';
--也行
alter table part1 partition(dt='20210711') set location 'hdfs://lee01:9000/out/01';

删除分区

alter table part5 drop partition(dt='2018-03-21');


-- 修改为外部表
alter table part1 set tblproperties('EXTERNAL'='TRUE');

-- 删除分区（删除元数据，文件）
alter table part1 drop partition(d='20210709');-- 一个
alter table part1 drop partition(dt='20210709'),partition(dt='20210710'); -- 多个
-- 外部表在删除时只会删除元数据，不会删除hdfs中的分区数据，但是内部表会删除数据

删除多个分区

alter table part5 drop partition(dt='2018-03-
24'),partition(dt='2018-03-26');

分区类型详解

静态分区

加载数据到指定分区的值,新增分区或者加载分区时指定分区名.
加载数据到分区时明确指定分区值

动态分区

数据未知，根据分区的值来确定需要创建的分区。

加载分区时未指定分区值

历史数据，或者一段时间的数据，存放到一起后期需要将其分区

混合分区

静态和动态都有。

动态分区属性设置及示例

动态分区的属性：
hive.exec.dynamic.partition=true
-- 默认允许动态分区
hive.exec.dynamic.partition.mode=nonstrict
-- 默认为严格模式，用户必须指定一个静态分区
-- 非严格模式 所有分区均可为动态
hive.exec.max.dynamic.partitions=1000
-- 最大动态分区数量
hive.exec.max.dynamic.partitions.pernode=100
-- 单台服务器最大100个分区

创建动态分区表

-- 创建动态分区表
create table dy_part1(
id int,
name string
)
partitioned by (dt string)
row format delimited fields terminated by ',';

-- 创建表
create table dy_part(
id int,
name string
)
partitioned by (year string,month string,day string)
row format delimited fields terminated by ',';

create table dy_part3(
id int,
name string
)
partitioned by (year string,month string,day string)
row format delimited fields terminated by ',';

-- 动态加载
set hive.exec.dynamic.patition.mode=nonstrict;
insert into dy_part2 partition(year,month,day)
select 
id,
name,
year,
month,
day
from part3
;
set hive.exec.dynamic.patition.mode=nonstrict;
insert into dy_part3 partition(year,month,day)
select 
id,
name,
month,
day
from part3
where year='2021'
;

动态分区加载数据不能使⽤load⽅式加载

load data local inpath '/hivedata/user.csv' into table dy_part1
partition(dt);

-- load 不会产生作业

动态分区使⽤insert into的⽅式加载数据

先创建临时表：
create table temp_part(
id int,
name string,
dt string
)
row format delimited fields terminated by ',';
拷⻉data/student.txt到服务器上
导⼊数据：
load data local inpath '/hivedata/student.txt' into table
temp_part;
# 如果是严格模式,则不能导⼊,需要执⾏下⾯代码
set hive.exec.dynamic.partition.mode=nonstrict
insert into dy_part1 partition(dt)
select id,name,dt from temp_part;

混合分区示例

在⼀个分区表中同时有静态分区,同时也有动态分区,那么这个表的分区表可以称之为混合分区,下⾯演示下混合分区的流程. (注意列的个数匹配)

创建⼀个混合分区表

create table dy_part2(
id int,
name string
)
partitioned by (year string,month string,day string)
row format delimited fields terminated by ',';

创建临时表并加载数据

-- 创建分区表
create table temp_part2(
id int,
name string,
year string,
month string,
day string
)
row format delimited fields terminated by ',';
#加载数据 数据参考(data/student.txt⽂件)
load data local inpath '/opt/data/student.txt' into table
temp_part2;

导⼊数据到分区表

(注意：字段的对应)

错误⽤法：
insert into dy_part2 partition (year='2018',month,day)
select * from temp_part2;
正确⽤法：
insert into dy_part2 partition (year='2018',month,day)
select id,name,month,day from temp_part2;

设置hive为严格模式执⾏

#严格模式⾄少要有⼀个静态分区,⾮严格模式可以全是动态分区
set hive.mapred.mode=nonstrict/strict;
<name>hive.mapred.mode</name>
<value>nonstrict</value>
<description>
 The mode in which the Hive operations are being performed.
 In strict mode, some risky queries are not allowed to run.
They include:
 Cartesian Product.
 No partition being picked up for a query.
 Comparing bigints and strings.
 Comparing bigints and doubles.
 Orderby without limit.
</description>

set hive.mapred.mode=nonstrict/strict
-- 设置非严格/严格模式   
-- 默认非严格
select * from part where dt='20210707';
-- 分区过滤
select * from part3 where day ='08'
-- 严格分区使用分区字段进行查询
-- 有风险的查询不允许执行
-- 如：
    1笛卡尔集
    2分区表不用分区字段过滤
    3bigint和string比较查询
    4bitint和double比较查询
    5orderby语句不带limit查询
    select * from flow2 order by up desc limit 2;
    如下语句不允许查询
    select * from flow2 order by up desc;
    select * from part3;

分区表注意事项

1. hive的分区使⽤的是表外字段，分区字段是⼀个==伪==列，但是分区字段是可以
做查询过滤。
2. 分区字段不建议使⽤中⽂
3. ⼀般不建议使⽤动态分区，因为动态分区会使⽤mapreduce来进⾏查询数据，
如果分区数据过多，导致 namenode 和 resourcemanager 的性能瓶颈。所以建议
在使⽤动态分区前尽可能预知分区数量。
4. 分区属性的修改都可以修改元数据和hdfs数据内容。
Hive分区和Mysql分区的区别
mysql分区字段⽤的是表内字段；⽽hive分区字段采⽤表外字段。
-- 分区表一般适用于行为数据

分桶

为什么要分桶

当单个的分区或者表的数据量过⼤，分区不能更细粒度的划分数据，就需要使⽤分桶技术将数据划分成更细的粒度

-- 分区没有办法把数据分的很细的情况用分桶

-- 分区的数据量仍然过大，使用分桶，分到更细的文件

-- 在区或表中创建更多文件

分桶技术

[CLUSTERED BY (COLUMNNAME COLUMNTYPE [COMMENT 'COLUMN
COMMENT'],...)
[SORTED BY (COLUMNNAME [ASC|DESC])...] INTO NUM_BUCKETS BUCKETS]
-- 有多少个桶就有多少个文件
clustered by (columname columntpye [comment 'column comment'],...)[sorted by (columnname[asc|desc])...] into num_buckets buckets

关键字及其原理

==bucket==
分桶的原理:跟MR中的HashPartitioner原理⼀样:都是key的hash值取模reduce的数
量
MR中：按照key的hash值除以reduceTask取余
Hive中：按照分桶字段的hash值取模除以分桶的个数

分桶的意义

为了保存分桶查询的分桶结构（数据按照分桶字段进⾏保存hash散列）
分桶表的数据进⾏抽样和JOIN时可以提⾼查询效率,⼀般是⽤来抽样查询

分桶表的创建示例

分桶表创建流程:
1. ⾸先要创建带分桶定义的表（分桶表）
2. 然后创建⼀个临时表（普通表）
3. 从临时表中使⽤分桶查询将查询到的数据插⼊到分桶表

对普通表进⾏分桶查询

create table t_stu(
Sno int,
Sname string,
Sex string,
Sage int,
Sdept string)
row format delimited fields terminated by ','
stored as textfile;
load data local inpath '/root/hivedata/students.txt' into table
t_stu;
select * from t_stu;

--操作
-- 文件
vi student.txt
1，cuihua1,1,23,1
-- 建表
create table t_stu(
sno int,
sname string,
sex string,
sage int,
sdept string
)
row format delimited fields terminated by ','
stored as testfile;
load data local inpath '/root/hivedata/students.txt' into table
t_stu;
select * from t_stu;

简单查询测试：----按照分桶查询
select * from t_stu cluster by (sno); 分桶的同时还会进⾏排序
select * from t_stu distribute by (sno) sort by (sage desc); distribute只进⾏分桶
sort by进⾏排序
注意:
默认只有⼀个reduce,所以默认的分桶也只有⼀个,看不到效果,这时我们可以将
reduce的个数设置成4个,再进⾏查询,分析结果.这⾥的分桶就类似于mr中的分区
设置reduce task的个数： set mapreduce.job.reduces=4;
查询时分开设置分桶和排序的⽬的是为了满⾜⽤户的需求

创建分桶表

create table if not exists buc13(
id int,
name string,
age int
)
clustered by (id) into 4 buckets
row format delimited fields terminated by ',';
数据：从(data/buc1.csv加载如下格式数据)
id,name,age
-- 分桶的字段一定是表内字段不能用表外字段

创建临时表

create table if not exists temp_buc1(
id int,
name string,
age int
)
row format delimited fields terminated by ',';

分桶使⽤load⽅式加载数据不能体现分桶

load data local inpath '/opt/data/buc1.txt' into table buc1;
-- 能加载，但是不能体现分桶（使用load装载看不到分桶的效果，使用insertinto方式装载）

加载数据到临时表

load data local inpath '/hivedata/buc1.txt' into table temp_buc1;

使⽤分桶查询将数据导⼊到分桶表

insert overwrite table buc13
select id,name,age from temp_buc1
cluster by (id);

设置强制分桶的属性

<!-- 如果要分桶,就要打开分桶的强制模式 -->
set hive.enforce.bucketing=false/true
<name>hive.enforce.bucketing</name>
<value>false</value>
<description>
 Whether bucketing is enforced. If true, while inserting into
the table, bucketing is enforced.</description>

如果设置了reduces的个数和总桶数不⼀样，请⼿动设置

set mapreduce.job.reduces=-1 #-1表示可以根据实际需要来定制reduces的数
量

创建指定排序字段的分桶表

create table if not exists buc8(
id int,
name string,
age int
)
clustered by (id)
sorted by (id) into 4 buckets
row format delimited fields terminated by ',';

导⼊数据

insert overwrite table buc6
select id,name,age from temp_buc1
distribute by (id) sort by (id asc);
和下⾯的语句效果⼀样
insert overwrite table buc8
select id,name,age from temp_buc1
cluster by (id);

-- insert into 
set hive.enforce.bucketing=ture;
insert overwrite table buc13
select sno,sname,sage from t_stu
cluster by (sno);
-----------如下等价于如上-------------------
set hive.enforce.bucketing=ture;
insert overwrite table buc13
select sno,sname,sage from t_stu
distributed by (sno) sort by sno asc;

--查询数据
select * from buc13;

分桶数据是：
1%4  1
2%4  2
3%4  3
4%4  0
5%4  1
6%4  2
7%4  3
8%4  0
9%4  1

分桶表查询案例

select * from buc3;
select * from buc3 tablesample(bucket 1 out of 1 on id);
-- 查询分桶全部数据
select * from buc13 tablesample(bucket 1 out of 1 on id);
-- 查询指定分桶数据--第一桶x%y  x查询的第几桶，y总共桶数
select * from buc13 tablesample(bucket 1 out of 4 on id);
-- 查询指定分桶数据--第二桶
select * from buc13 tablesample(bucket 2 out of 4 on id);
-- 查询指定分桶数据--把四桶压缩成两桶，第一桶是1，3桶第二桶是2，4桶
select * from buc13 tablesample(bucket 1 out of 2 on id);
-- 查询指定分桶数据--把四桶放大成八桶，1分二再查询
select * from buc13 tablesample(bucket 1 out of 8 on id);
-- 查询和分桶配合
select * from buc13 tablesample(bucket 1 out of 2 on id) where id>5;

查询第1桶的数据

默认有4桶
查询第⼀桶
select * from buc3 tablesample(bucket 1 out of 4 on sno); 
查询第⼀桶和第三桶
select * from buc3 tablesample(bucket 1 out of 2 on sno);
查询第⼀桶的前半部分
select * from buc3 tablesample(bucket 1 out of 8 on sno);
tablesample(bucket x out of y on id)
语法：
tablesample(bucket x out of y on sno) 注意：tablesample⼀定是紧
跟在表名之后
x:代表从第⼏桶开始查询
y:查询的总桶数，y可以是总桶数的倍数或者因⼦，x不能⼤于y
x表示从哪个bucket开始抽取。例如，table总bucket数为32，
tablesample(bucket 3 out of 16)，表示总共抽取（32/16=）2个bucket的数
据，分别为第3个bucket和第（3+16=）19个bucket的数据。
表⼀,表⼆,表三说明:
表中的含义是当x取对应的值时,m对应的就是所对应的桶
只记住表的结果即可.
当前的桶数 4 
当y=桶数 实例 (bucket 1 out of 4 on sno) 
m=sno%y m是余数
解释:当x=1,桶数=4,y=4, 只要sno对y取余=0我们就认为这条数据在第1桶
这⾥当x=1时,m=0,所以取的是第⼀桶
表⼀:
第⼏桶 1 2 3 4
x=1 m=0
x=2 	m=1
x=3 		m=2
x=4 			m=3
当y<桶数 我们称y为因⼦ 实例 (bucket 1 out of 2 on sno) 取的
是第⼀桶和第三桶
解释:
⾸先使⽤n=sno%桶数+1 
再使⽤m=n判断,这⾥的m就是下⾯的m1,m2,m3....
这⾥当x=1时,m=m1和m3,所以取的是第⼀桶和第三桶
这⾥当x=2时,m=m2和m4,所以取的是第⼆桶和第四桶
表⼆:
第⼏桶 1 2 3 4
x=1 m1=x=1(x的数值) m3=1+4/2=3(m1+桶数/y值) 
m5=3+4/2=5(m2+桶数/y值)>4舍掉
x=2 m2=2 m4=m2+4/y值=4 
m6=m4+4/y值=6>4 舍掉
当y>桶数 我们称y为倍数 实例 (bucket 1 out of 8 on sno) 
m=sno%y m是余数
解释:当x=1,桶数=4,y=8, 只要sno对y取余=0我们就认为这条数据在第1桶
表三:
第⼏桶 1 2 3 4
x=1 m=0
x=2 m=1
x=3 m=2
x=4 m=3
x=5 m=0+4
x=6 m=1+4
x=7 m=2+4
x=8 m=3+4

查询

注意y可以不是总桶数的倍数,但是他会重新分桶,重新查询.

select * from buc3 tablesample(bucket 1 out of 5 on sno);

查询sno为奇数的数据
select * from buc3 tablesample(bucket 2 out of 2 on sno);
查询sno为偶数且age⼤于30的⼈
select * from buc3 tablesample(bucket 1 out of 2 on sno) where
age>30;
注意:这⾥会报错,talesample⼀定是紧跟在表名之后
select * from buc3 where age>30 tablesample(bucket 1 out of 2 on
sno);
注意:由于有编码问题:当我们写中⽂时要注意.编码不对查不出结果
其他查询知识:
select * from buc3 limit 3; 查出三⾏
select * from buc3 tablesample(3 rows); 查出三⾏
select * from buc3 tablesample(13 percent); 查出13%的内容,如果百分
⽐不够现实⼀⾏,⾄少会显示⼀⾏,如果百分⽐为0,显示第⼀桶
select * from buc3 tablesample(68B);k KB M G T P 查出68B包含的数
据,如果是0B,默认显示第⼀桶
要求随机抽取3⾏数据：
select * from t_stu order by rand() limit 3; 随机显示3条数据 
https://www.aboutyun.com/thread-27093-1-1.html

select * from buc3 tablesample(3 rows); 查出三⾏
select * from buc3 tablesample(10 percent); 查出10%
select * from buc3 tablesample(1k); 查出1k
--随机查出三行
select * from t_stu order by rand() limit 3;

分区分桶联合案例

案例查下

需求：按照性别分区（1男2⼥），在分区中按照id的奇偶进⾏分桶：
id,name,sex
数据请参考data/user.txt
建表：
create table if not exists stu(
id int,
name string
)
partitioned by (sex string)
clustered by (id) into 2 buckets
row format delimited fields terminated by ',';
建临时表：
create table if not exists stu_temp(
id int,
name string,
sex string
)
row format delimited fields terminated by ',';
-- 主要要把动态分区设为⾮严格模式,如下:
set hive.exec.dynamic.partition.mode=nonstrict;
-- 加载临时表的数据
load data local inpath '/opt/data/user.txt' into table stu_temp;
将数据导⼊到分区分桶表
insert overwrite table stu partition(sex)
select id,name,sex from stu_temp cluster by id ;
查询性别为⼥，学号为奇数的学⽣
select * from stu tablesample(bucket 2 out of 2 on id)
where sex = '2';

--分区分桶
create table if not exists buc13(
id int,
name string,
agt int
)
partitioned by(dt string)
clustered by(id) into 2 buckets
row format delimited fields terminated by ',';

分桶表总结

1、定义
 clustered by (id) ---指定分桶的字段
 sorted by (id asc|desc) ---指定数据的排序规则，表示咱们预期的数
据是以这种规则进⾏的排序
 
2、导⼊数据
 cluster by (id) ---指定getPartition以哪个字段来进⾏hash，并且
排序字段也是指定的字段，排序是以asc排列
 distribute by (id) ---- 指定getPartition以哪个字段来进⾏hash
 sort by (name asc | desc) ---指定排序字段
导数据时：
 insert overwrite table buc3
 select id,name,age from temp_buc1
 distribute by (id) sort by (id asc);
 和下⾯的语句效果⼀样
 insert overwrite table buc4
 select id,name,age from temp_buc1
 cluster by (id) ;

注意事项

分区使⽤的是表外字段，分桶使⽤的是表内字段
分桶更加⽤于细粒度的管理数据，更多的是使⽤来做抽样、join

查询语句基本语法

Select查询结构基本语法

下⾯是⼀个SQL查询语句的基本结构

select selection_list # 查询的列
from table # 要查询的表
join on # 连接的表
where # 查询条件
group by # 分组查询
having # 分组条件过滤
order by # 字段排序
sort by # 结果排序
limit # 限制结果数
union/union all # 合并表

sql语句的执⾏顺序

FROM
<left_table>
ON
<join_condition>
<join_type>
JOIN
&

最低0.47元/天解锁文章

leezsj

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
hive-3

⼆级分区⼆级分区指的是在⼀张表中有两个分区,创建和插⼊时都要指定两个分区名,最常⻅的就是下⾯案例的年和⽉,创建的语法和流程都是和⼀级分区⼀样,只是多⼀个分区⽽已.create table if not exists part2(id int,name string)partitioned by (year string,month string)row format delimited fields terminated by ',';load data local inpath
复制链接

扫一扫