1、orcfile的使用
创建orcfile表,不压缩
-- Creates d_op_behavior_host_orc_none: an ORC-backed table whose
-- "orc.compress" table property is NONE (ORC data written uncompressed).
-- The trailing semicolon is required so the statement is terminated and
-- does not run into whatever text follows it in the script.
CREATE TABLE d_op_behavior_host_orc_none (
    thedate   STRING,
    id        STRING,
    cookie_id STRING,
    ip        STRING,
    siteid    STRING,
    last_time STRING,
    pv        INT
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="NONE");
创建orcfile表,压缩
-- Creates d_op_behavior_host_orc_snappy: an ORC-backed table whose
-- "orc.compress" table property is SNAPPY.
-- The trailing semicolon is required so the statement is terminated and
-- does not run into whatever text follows it in the script.
CREATE TABLE d_op_behavior_host_orc_snappy (
    thedate   STRING,
    id        STRING,
    cookie_id STRING,
    ip        STRING,
    siteid    STRING,
    last_time STRING,
    pv        INT
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
ORC 表通过 TBLPROPERTIES 指定压缩方式:("orc.compress"="ZLIB")、("orc.compress"="SNAPPY") 或 ("orc.compress"="NONE"),也可以在同一个 TBLPROPERTIES 中设置其他 ORC 属性
2、parquet的使用
创建parquet表
-- Creates d_op_behavior_host_parq: a Parquet-backed table declared through
-- an explicit SerDe plus input/output format classes.
-- NOTE(review): the parquet.hive.* class names look like the legacy
-- parquet-hive bundle; newer Hive releases also accept STORED AS PARQUET.
-- Confirm the target Hive version before changing this.
CREATE TABLE d_op_behavior_host_parq (
    thedate   STRING,
    id        STRING,
    cookie_id STRING,
    ip        STRING,
    siteid    STRING,
    last_time STRING,
    pv        INT
)
ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'
STORED AS
    INPUTFORMAT  "parquet.hive.DeprecatedParquetInputFormat"
    OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat";
建表时不需要指定压缩方式;在插入数据之前执行 set parquet.compression=SNAPPY; 或 set parquet.compression=GZIP; 即可
3、sequencefile的使用
-- Creates d_op_behavior_host_seq: a table stored as SequenceFile.
-- No compression is declared in the DDL itself.
CREATE TABLE d_op_behavior_host_seq (
    thedate   STRING,
    id        STRING,
    cookie_id STRING,
    ip        STRING,
    siteid    STRING,
    last_time STRING,
    pv        INT
)
STORED AS SEQUENCEFILE;
-- Session settings: enable compressed output, select the Snappy codec,
-- and set the output compression type to BLOCK.
SET hive.exec.compress.output=true;
SET mapred.output.compress=true;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- NOTE(review): this replaces the session's io.compression.codecs list with
-- Snappy only; presumably other codecs are not needed in this session. Verify.
SET io.compression.codecs=org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.output.compression.type=BLOCK;