一、文件存储格式
File Formats and Compression: RCFile, Avro, ORC, Parquet; Compression, LZO
1.1 textfile
1、textfile是hive默认的数据文件存储格式
2、textfile是普通的文件文本存储
3、不压缩
4、可以配合压缩配置属性进行压缩
-- TEXTFILE demo: Hive's default storage format (plain text, no built-in
-- compression). Compression can still be obtained by turning on the output
-- compression properties before writing.
CREATE TABLE `u4` (
    `id`   int,
    `name` string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Enable compressed output so the files produced by the INSERT are compressed.
SET mapreduce.output.fileoutputformat.compress=true;
SET hive.exec.compress.output=true;

INSERT INTO TABLE u4
SELECT * FROM u2;
1.2 sequencefile
1、sequencefile是hive为用户提供的二进制存储
2、sequencefile不能使用load方式直接加载数据
3、本身压缩
-- SEQUENCEFILE demo: Hive's binary storage format (compressed by itself).
-- A sequencefile table cannot be populated with LOAD DATA on a plain text
-- file; use INSERT ... SELECT instead.
-- NOTE: renamed from `u4` — that name is already used by the TEXTFILE demo
-- table in section 1.1, so re-running CREATE TABLE `u4` here would fail
-- with "table already exists".
CREATE TABLE `u4_seq` (
    `id`   int,
    `name` string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS SEQUENCEFILE;
1.3 rcfile
1、rcfile是hive为用户提供的行列混合存储
2、rcfile格式下,会尽量把相邻的行和列的块存储到一起
3、本身压缩,且查询效率较高
-- RCFILE demo: row-columnar hybrid storage (row groups stored column-wise);
-- compressed by itself, with comparatively fast reads.
CREATE TABLE `u5` (
    `id`   int,
    `name` string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS RCFILE;
1.4 orc
1、orc是优化后的rcfile
-- ORC demo: the optimized successor of RCFile.
CREATE TABLE `u6` (
    `id`   int,
    `name` string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS ORC;
1.5 parquet
1、parquet是典型列式存储,自带压缩,查询较快(按列查询)
-- PARQUET demo: columnar storage with built-in compression; column-oriented
-- queries are fast because only the referenced columns are read.
CREATE TABLE `u7` (
    `id`   int,
    `name` string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS PARQUET;

INSERT INTO TABLE u7
SELECT * FROM u2;
1.6 自定义存储格式
数据:
seq_yd原始数据文件(内容为Base64编码):
aGVsbG8gemhhbmdoYW8=
aGVsbG8gZmVpZmVpLGdvb2QgZ29vZCBzdHVkeSxkYXkgZGF5IHVw
seq_yd文件为base64编码后的内容,decode后数据为:
hello zhanghao
hello feifei,good good study,day day up
-- Custom storage format demo: the contrib Base64 input/output formats make
-- Hive transparently decode/encode Base64 text, so each stored line decodes
-- to a plain string when queried.
CREATE TABLE cus (str STRING)
STORED AS
    INPUTFORMAT  'org.apache.hadoop.hive.contrib.fileformat.base64.Base64TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.contrib.fileformat.base64.Base64TextOutputFormat';

-- LOAD DATA only moves the file into the table's directory; the Base64
-- decoding happens at read time through the custom InputFormat.
LOAD DATA LOCAL INPATH '/home/hivedata/cus' INTO TABLE cus;
配置文件相关内容
hive-default.xml.template
<property>
<name>hive.default.fileformat</name>
<value>TextFile</value>
<description>
Expects one of [textfile, sequencefile, rcfile, orc].
Default file format for CREATE TABLE statement. Users can explicitly override it by CREATE TABLE ... STORED AS [FORMAT]
</description>
</property>