hive的压缩一般分为三类
(1)从hive输出层面的压缩
-- (1) Hive-level switches:
-- intermediate: compress data passed between the MR stages of a multi-stage query;
-- output: compress the final output written by a query / INSERT.
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
(2)从mapreduce层面
-- (2) MapReduce-level switches: compress map output (shuffle) and the job's final output.
-- Both flags are set to true here so the example actually demonstrates enabling
-- MR-level compression (the original had the final-output flag at false, which
-- contradicted the example's intent and section 四 below).
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=true;
-- NOTE: the LZO codec shipped by hadoop-lzo is com.hadoop.compression.lzo.LzoCodec;
-- org.apache.hadoop.io.compress.LzoCodec (used in the original) is not a stock Hadoop class.
set mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzoCodec;
(3)从hive表结构层面
比如说hive表的格式:text,rcfile,orc等。
建表(所有测试表的数据统一存放在 HDFS 目录 /tmp/lgh/compression 下)
-- Baseline table: plain TEXTFILE, no compression. Used as the uncompressed reference
-- (experiment 一). IF NOT EXISTS makes the script safe to re-run.
create table if not exists c_text
(
    `t`    string,
    `cip`  string,
    `u`    string,
    `ur`   string,
    `ar`   string,
    `ua`   string,
    `pvid` string,
    `ut`   string,
    `tt`   string,
    `tp`   string,
    `tu`   string,
    `cp`   string
)
PARTITIONED BY (`dt` string)
stored as textfile
location '/tmp/lgh/compression/c_text';
-- TEXTFILE table that receives the hive- / MR-level compression experiments
-- (三, 四, 五); each experiment writes its own dt partition.
create table if not exists c_text_compress
(
    `t`    string,
    `cip`  string,
    `u`    string,
    `ur`   string,
    `ar`   string,
    `ua`   string,
    `pvid` string,
    `ut`   string,
    `tt`   string,
    `tp`   string,
    `tu`   string,
    `cp`   string
)
PARTITIONED BY (`dt` string)
stored as textfile
location '/tmp/lgh/compression/c_text_compress';
-- ORC table with default settings (ORC compresses with ZLIB by default);
-- used as the ORC reference in experiment 二.
create table if not exists c_orc
(
    `t`    string,
    `cip`  string,
    `u`    string,
    `ur`   string,
    `ar`   string,
    `ua`   string,
    `pvid` string,
    `ut`   string,
    `tt`   string,
    `tp`   string,
    `tu`   string,
    `cp`   string
)
PARTITIONED BY (`dt` string)
stored as orc
location '/tmp/lgh/compression/c_orc';
-- ORC table receiving the ORC codec experiments (六 ZLIB, 七 NONE, 八 SNAPPY);
-- each experiment writes its own dt partition.
create table if not exists c_orc_compress
(
    `t`    string,
    `cip`  string,
    `u`    string,
    `ur`   string,
    `ar`   string,
    `ua`   string,
    `pvid` string,
    `ut`   string,
    `tt`   string,
    `tp`   string,
    `tu`   string,
    `cp`   string
)
PARTITIONED BY (`dt` string)
stored as orc
location '/tmp/lgh/compression/c_orc_compress';
-- RCFILE table for experiments 九 (hive compression on) and 十 (off);
-- each experiment writes its own dt partition.
create table if not exists c_rcfile_compress
(
    `t`    string,
    `cip`  string,
    `u`    string,
    `ur`   string,
    `ar`   string,
    `ua`   string,
    `pvid` string,
    `ut`   string,
    `tt`   string,
    `tp`   string,
    `tu`   string,
    `cp`   string
)
PARTITIONED BY (`dt` string)
stored as rcfile
location '/tmp/lgh/compression/c_rcfile_compress';
一.完全不使用任何压缩,textfile格式
-- Experiment 一: all compression off, TEXTFILE destination (uncompressed baseline).
set hive.exec.compress.intermediate=false;
set hive.exec.compress.output=false;
set mapreduce.map.output.compress=false;
-- The two codec settings below are no-ops while their compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
insert overwrite table c_text partition(dt='20170505') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
二.完全不使用任何压缩,orc格式
-- Experiment 二: all compression flags off, ORC destination
-- (ORC still applies its own default ZLIB compression internally).
set hive.exec.compress.intermediate=false;
set hive.exec.compress.output=false;
set mapreduce.map.output.compress=false;
-- Codec settings are no-ops while the compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
insert overwrite table c_orc partition(dt='20170505') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
三.使用hive压缩,textfile
-- Experiment 三: hive-level compression on, MR-level off, TEXTFILE destination.
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapreduce.map.output.compress=false;
-- Codec settings are no-ops while the MR compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
insert overwrite table c_text_compress partition(dt='20170505_hive') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
四.使用mapreduce压缩,textfile
-- Experiment 四: hive-level compression off, MR-level compression on, TEXTFILE destination.
set hive.exec.compress.intermediate=false;
set hive.exec.compress.output=false;
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=true;
-- NOTE(review): org.apache.hadoop.io.compress.LzoCodec is not a stock Hadoop class
-- (hadoop-lzo's class is com.hadoop.compression.lzo.LzoCodec) — this may itself
-- explain the uncompressed 7.3G result for this partition; verify before re-running.
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
insert overwrite table c_text_compress partition(dt='20170505_mr') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
五.同时使用hive和mapreduce压缩,textfile
-- Experiment 五: hive-level AND MR-level compression both on, TEXTFILE destination.
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=true;
-- NOTE(review): org.apache.hadoop.io.compress.LzoCodec is likely the wrong class name
-- (hadoop-lzo provides com.hadoop.compression.lzo.LzoCodec) — confirm before re-running.
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
insert overwrite table c_text_compress partition(dt='20170505_mr_hive') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
六.使用hive压缩,orc格式(默认的zlib压缩,hive设置的压缩不生效)
-- Experiment 六: hive-level compression on, ORC destination with explicit ZLIB.
-- Result matched the default-ORC table, showing hive.exec.compress.* has no
-- effect on ORC output.
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapreduce.map.output.compress=false;
-- Codec settings are no-ops while the MR compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
-- NOTE(review): orc.compress is normally a table property (TBLPROPERTIES) or set via
-- hive.exec.orc.default.compress; confirm a bare `set orc.compress` takes effect here.
set orc.compress=ZLIB;
insert overwrite table c_orc_compress partition(dt='20170505_hive') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
七.orc格式+NONE
-- Experiment 七: ORC destination with ORC compression disabled (orc.compress=NONE).
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapreduce.map.output.compress=false;
-- Codec settings are no-ops while the MR compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
set orc.compress=NONE;
insert overwrite table c_orc_compress partition(dt='20170505_none') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
八.orc格式+SNAPPY
-- Experiment 八: ORC destination with SNAPPY compression.
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapreduce.map.output.compress=false;
-- Codec settings are no-ops while the MR compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
set orc.compress=SNAPPY;
insert overwrite table c_orc_compress partition(dt='20170505_snappy') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
九.rcfile使用hive压缩
-- Experiment 九: hive-level compression on, RCFILE destination.
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapreduce.map.output.compress=false;
-- Codec settings are no-ops while the MR compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
insert overwrite table c_rcfile_compress partition(dt='20170505_hive') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
十.rcfile不使用hive压缩
-- Experiment 十: all compression off, RCFILE destination (uncompressed RCFILE baseline).
set hive.exec.compress.intermediate=false;
set hive.exec.compress.output=false;
set mapreduce.map.output.compress=false;
-- Codec settings are no-ops while the MR compress flags are false.
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec;
set mapreduce.output.fileoutputformat.compress=false;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.LzoCodec;
insert overwrite table c_rcfile_compress partition(dt='20170505_none') select t,cip,u,ur,ar,ua,pvid,ut,tt,tp,tu,cp from logs.logservice where dt='20170505';
结果:
hadoop fs -du -h /tmp/lgh/compression/c_text/
7.3 G /tmp/lgh/compression/c_text/dt=20170505
hadoop fs -du -h /tmp/lgh/compression/c_text_compress/
2.8 G /tmp/lgh/compression/c_text_compress/dt=20170505_hive
7.3 G /tmp/lgh/compression/c_text_compress/dt=20170505_mr
2.8 G /tmp/lgh/compression/c_text_compress/dt=20170505_mr_hive
hadoop fs -du -h /tmp/lgh/compression/c_orc/
688.3 M /tmp/lgh/compression/c_orc/dt=20170505
hadoop fs -du -h /tmp/lgh/compression/c_orc_compress/
688.3 M /tmp/lgh/compression/c_orc_compress/dt=20170505_hive
2.4 G /tmp/lgh/compression/c_orc_compress/dt=20170505_none
974.5 M /tmp/lgh/compression/c_orc_compress/dt=20170505_snappy
hadoop fs -du -h /tmp/lgh/compression/c_rcfile_compress/
2.1 G /tmp/lgh/compression/c_rcfile_compress/dt=20170505_hive
7.2 G /tmp/lgh/compression/c_rcfile_compress/dt=20170505_none
结论:
1.hive的压缩参数对orc格式没有效果,对text,rcfile格式起作用。原因:ORC 文件的压缩由其自身的 orc.compress 属性控制(见实验六/七/八),写 ORC 时 hive.exec.compress.output 设置的外层压缩不会生效。
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
2.mapreduce的压缩参数对hive任务没有起作用,以下参数只在单纯的mapreduce作业中生效
set mapreduce.map.output.compress=true;
set mapreduce.output.fileoutputformat.compress=true;
3.hive对于orc的压缩格式,可以设置orc.compress参数或者hive.exec.orc.default.compress来实现。(可选值有NONE, ZLIB, SNAPPY)
测试结果
格式 | 大小 |
---|---|
text | 7.3GB |
lzo | 2.8GB |
orc+none | 2.4GB |
orc+snappy | 974.5MB |
orc+zlib | 688.3MB |
rcfile+none | 7.2GB |
rcfile+lzo | 2.1GB |
gzip | 1.8GB |