Hive 静态分区与动态分区，分桶与抽样

最新推荐文章于 2024-07-14 12:46:09 发布

房石阳明i

最新推荐文章于 2024-07-14 12:46:09 发布

阅读量515

点赞数 1

文章标签： hive mysql 数据库

本文链接：https://blog.csdn.net/Mogeko1/article/details/127512064

版权

数据来源Mysql

mysql 建表：

CREATE TABLE IF NOT EXISTS test1 (
  emp_no BIGINT,
  name VARCHAR (32),
  date DATE,  
  PRIMARY KEY (emp_no)
);

mysql 插入数据:

INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (1, 'zhangsan', '2020-10-23');
INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (2, 'zhangsan2', '2021-10-23');
INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (3, 'zhangsan3', '2022-10-23');
INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (4, 'zhangsan4', '2019-10-23');

mysql导出数据:

SELECT * FROM test1 INTO OUTFILE '/var/lib/mysql-files/test1.txt' FIELDS TERMINATED BY ',';

#将导出的文件放入hdfs上

hdfs dfs -put  /var/lib/mysql-files/test1.txt   /hive/data/

Hive操作：#不分区

create table test1
(
emp_no bigint,
name string,
date date
)
row format delimited
fields terminated by ',';

load data inpath '/hive/data/test1.txt' into table test1; #加载hdfs上的数据

#表内提前有数据增加字段yaer作为分区导出到本地/root/data/emp_xxx上

FROM test1
INSERT OVERWRITE LOCAL DIRECTORY '/root/data/emp_2020' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select *  where year(date)=2020
INSERT OVERWRITE LOCAL DIRECTORY '/root/data/emp_2021' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select *  where year(date)=2021
INSERT OVERWRITE LOCAL DIRECTORY '/root/data/emp_2022' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select *  where year(date)=2022

#创建静态分区表

create table test_partition 
(
emp_no bigint,
name string,
date date
)
partitioned by (year string)
row format delimited
fields terminated by ','
hive> show partitions test_partition; #查看分区 应是无

#加载上步分好的数据到静态分区里面

load data local inpath '/root/data/emp_2020/000000_0' into table test_partition  partition(year="2020");
load data local inpath '/root/data/emp_2021/000000_0' into table test_partition partition(year="2021");
#hive> show partitions test_partition; #查看分区 应是有
#hdfs dfs -ls /user/hive/warehouse/test_partition/  可在这里面查看
#alter table 表名 drop partition(year='xxxx');    #删除分区

或者
在hdfs上

hdfs dfs -mkdir -p /user/hive/warehouse/test.db/emp_partition/year=2020  #创建目录
hdfs dfs -put /root/data/emp_20/0000000_0   /user/hive/warehouse/test.db/emp_partition/year=2020 #上传目录
hive> show partitions test_partitoin; #再查看分区应是无 
hive> msck repair table emp_partition --关联mysql的元数据 修复分区
hive> show partitions test_partitoin; #再查看分区应是有

或者

hive>alter table emp_partition add partition(year=2022)    --建立分区目录，关联元数据
hive> show partitions test_partitoin; #再查看分区
hdfs  dfs -put /root/data/emp_2022/000000_0 /user/hive/warehouse/test.db/emp_partition/year=2022
hive> show partitions test_partitoin; #再查看分区

#共以上三种方式

#动态分区

create table test_ac 
(
emp_no bigint,
name string,
date date
)
partitioned by (year string)
row format delimited
fields terminated by ',';

set hive.exec.dynamic.partition=true;    //开启动态分区功能（默认true，开启）
set hive.exec.dynamic.partition.mode=nonstrict;    //设置为非严格模式（动态分区的模式，默认strict，表示必须指定至少一个分区为静态分区，nonstrict模式表示允许所有的分区字段都可以使用动态分区。

insert into test_ac  partition (year) select emp_no,name,date,year(date) as year from test1;
hive> show partitions test_partitoin; #查看分区

#动态分区扩展（可不加）：

#在所有执行MR的节点上，最大一共可以创建多少个动态分区。
hive.exec.max.dynamic.partitions=1000
#在每个执行MR的节点上，最大可以创建多少个动态分区。该参数需要根据实际的数据来设定。比如：源数据中包含了一年的数据，即day字段有365个值，那么该参数就需要设置成大于365，如果使用默认值100，则会报错。
hive.exec.max.dynamic.partitions.pernode=100
#整个MR Job中，最大可以创建多少个HDFS文件。
hive.exec.max.created.files=100000
#当有空分区生成时，是否抛出异常。一般不需要设置。
hive.error.on.empty.partition=false

#Hive分桶

mysql 追加数据:

INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (5, 'zhangsan5', '2022-10-23');
INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (6, 'zhangsan6', '2023-10-23');
INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (7, 'zhangsan7', '2024-10-23');
INSERT INTO `test`.`test1` (`emp_no`, `name`, `date`) VALUES (8, 'zhangsan8', '2025-10-23');

#导出数据

SELECT * FROM test1 INTO OUTFILE '/var/lib/mysql-files/test1.txt' FIELDS TERMINATED BY ',';

#Hive操作：#创建数据表

CREATE TABLE test_bs(
emp_no bigint,
name string,
date date
)
row format delimited
fields terminated by ',';

load data local inpath '/hive/data/test1.txt' into table test_bs; #加载本地
load data inpath '/hive/data/test1.txt' into table test1; #加载hdfs

SET hive.enforce.bucketing=true;  #启用分块

#创建分桶表

CREATE TABLE test_bucket(
emp_no bigint,
name string,
date date
)
CLUSTERED BY(date) INTO 4 BUCKETS  //按（日期）分类为4个块
row format delimited
fields terminated by ',';

#指定数据来源

from test_bs
insert into test_bucket select emp_no,name,date;

#查看hdfs上分的桶

hdfs dfs -cat /user/hive/warehouse/test_bucket/000000_0

#抽样

SELECT * FROM test_bucket TABLESAMPLE(BUCKET 1 OUT OF 4 )
SELECT * FROM test_bucket TABLESAMPLE(BUCKET 2 OUT OF 8 )
SELECT * FROM test_bucket TABLESAMPLE(BUCKET 2 OUT OF 2 )
SELECT * FROM test_bucket TABLESAMPLE(BUCKET 3 OUT OF 4 )

#规则

tablesample是抽样语句，
语法：tablesample(bucket x out of y)，
y必须是table总共bucket数的倍数或者因子。
Hive根据y的大小，决定抽样的比例。
例如：table总共分了32份，
当y=16时，抽取2(32/16)个bucket的数据，
当y=64时，抽取1/2(32/64)个bucket的数据。
x表示从哪个bucket开始抽取。
例如：table总共bucket数为32，tablesample(bucket 3 out of 16)
表示总共抽取2(32/16)个bucket的数据，
分别为第三个bucket和第19(3+16)个bucket的数据。