-- textfile 数据格式(行式存储),是 Hive 的默认存储方式。
-- Plain-text source table: two string columns, fields separated by a tab.
CREATE TABLE log_text (
    track_time STRING,
    url        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Load the local file /root/log.data into log_text (INTO without OVERWRITE,
-- so rows already in the table are kept).
LOAD DATA LOCAL INPATH '/root/log.data'
INTO TABLE log_text;
-- orc 数据格式(行列混合存储)
-- ORC 同时具备行式存储和列式存储的优点,且压缩速度快,能实现高效的存和取。
-- ORC-backed table with the same two string columns as log_text.
-- NOTE(review): ROW FORMAT DELIMITED is kept from the original DDL; ORC is not
-- tab-delimited text, so the clause presumably has no effect on the stored
-- data -- confirm before removing it.
CREATE TABLE log_orc (
    track_time STRING,
    url        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS ORC;

-- Copy all rows from log_text. Columns are named explicitly so the load does
-- not silently depend on the column order or count of log_text.
INSERT INTO TABLE log_orc
SELECT
    track_time,
    url
FROM log_text;
-- parquet 数据格式(列式存储)
-- Parquet-backed table with the same two string columns as log_text.
-- NOTE(review): ROW FORMAT DELIMITED is kept from the original DDL; Parquet is
-- not tab-delimited text, so the clause presumably has no effect on the stored
-- data -- confirm before removing it.
CREATE TABLE log_parquet (
    track_time STRING,
    url        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS PARQUET;

-- Copy all rows from log_text. Columns are named explicitly so the load does
-- not silently depend on the column order or count of log_text.
INSERT INTO TABLE log_parquet
SELECT
    track_time,
    url
FROM log_text;
-- orc 数据格式 + 无压缩格式
-- ORC-backed table with compression disabled via the orc.compress table
-- property (NONE).
-- NOTE(review): ROW FORMAT DELIMITED is kept from the original DDL; ORC is not
-- tab-delimited text, so the clause presumably has no effect on the stored
-- data -- confirm before removing it.
CREATE TABLE log_orc_none (
    track_time STRING,
    url        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS ORC
TBLPROPERTIES ("orc.compress"="NONE");

-- Copy all rows from log_text. Columns are named explicitly so the load does
-- not silently depend on the column order or count of log_text.
INSERT INTO TABLE log_orc_none
SELECT
    track_time,
    url
FROM log_text;
-- orc 数据格式 + snappy 压缩格式
-- Snappy 压缩速度快、压缩率合理,配合 ORC 能够达到最优的性能。
-- Make compression take effect when data is written: set the ORC writer's
-- compression strategy to COMPRESSION for the statements that follow.
SET hive.exec.orc.compression.strategy=COMPRESSION;

-- ORC-backed table compressed with SNAPPY via the orc.compress table property.
-- NOTE(review): ROW FORMAT DELIMITED is kept from the original DDL; ORC is not
-- tab-delimited text, so the clause presumably has no effect on the stored
-- data -- confirm before removing it.
CREATE TABLE log_orc_snappy (
    track_time STRING,
    url        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");

-- Copy all rows from log_text. Columns are named explicitly so the load does
-- not silently depend on the column order or count of log_text.
INSERT INTO TABLE log_orc_snappy
SELECT
    track_time,
    url
FROM log_text;
-- orc 数据格式 + zlib 压缩格式
-- Zlib 压缩率很高,对于一些使用率很低、且数据量庞大的数据,可以使用 Zlib 节省磁盘空间。
-- Make compression take effect when data is written: set the ORC writer's
-- compression strategy to COMPRESSION for the statements that follow.
SET hive.exec.orc.compression.strategy=COMPRESSION;

-- ORC-backed table compressed with ZLIB via the orc.compress table property.
-- NOTE(review): ROW FORMAT DELIMITED is kept from the original DDL; ORC is not
-- tab-delimited text, so the clause presumably has no effect on the stored
-- data -- confirm before removing it.
CREATE TABLE log_orc_zlib (
    track_time STRING,
    url        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS ORC
TBLPROPERTIES ("orc.compress"="ZLIB");

-- Copy all rows from log_text into the ZLIB table. The original statement
-- targeted log_orc_snappy, which left log_orc_zlib empty and appended the
-- rows to the SNAPPY table a second time.
INSERT INTO TABLE log_orc_zlib
SELECT
    track_time,
    url
FROM log_text;