1、TEXTFILE
create table if not exists xt_format_text(
source string,
loginv string,
uv string,
dt string)
row format delimited
stored as textfile;
插入数据开启压缩:
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
insert overwrite table xt_format_text select * from xt_test3;
2、SEQUENCEFILE
SequenceFile是Hadoop API提供的一种二进制文件支持,其具有使用方便、可分割、可压缩的特点。
SequenceFile支持三种压缩选择:NONE,RECORD,BLOCK。Record压缩率低,一般建议使用BLOCK压缩
create table if not exists xt_format_sequencefile(
source string,
loginv string,
uv string,
dt string)
row format delimited
stored as sequencefile;
插入数据开启压缩:
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
SET mapred.output.compression.type=BLOCK;
insert overwrite table xt_format_sequencefile select * from xt_format_text;
3、RCFILE
RCFILE是一种行列存储相结合的存储方式。首先,其将数据按行分块,保证同一个record在一个块上,避免读一个记录需要读取多个block。其次,块数据列式存储,有利于数据压缩和快速的列存取
create table if not exists xt_format_rcfile(
source string,
loginv string,
uv string,
dt string)
row format delimited #rcfile时 会自动忽略这个,而使用 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe' 这个SerDe.
stored as rcfile;
插入数据开启压缩:
set hive.exec.compress.output=true;
set mapred.output.compress=true;#注意:hive不依赖这个设置,只依赖于hive.exec.compress.output的设置值。
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
insert overwrite table xt_format_rcfile select * from xt_format_sequencefile ;
4、AVRO
CREATE TABLE xt_format_avro
COMMENT "just drop the schema right into the HQL"
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.literal'='{
"namespace": "com.letv.bigdata.dataplatform",
"name": "xt_avro_format_test",
"type": "record",
"fields": [ { "name":"source","type":"string"},
{ "name":"loginv","type":"string"},
{ "name":"uv","type":"string"},
{ "name":"dt", "type":"string"}
]
}');
5、ORC 文件格式(The Optimized Row Columnar (ORC))
create table if not exists xt_format_orcfile(
source string,
loginv string,
uv string,
dt string)
stored as orc;
实际例子:
CREATE TABLE xt_tds_did_user_targ_day(
dvc_id string,
user_id string,
p1 string,
p2 string,
p3 string,
prod_code string,
login_ip string,
cntry_name string,
area_name string,
prov_name string,
city_name string,
chnl_type string,
chnl_type_name string,
chnl_code string,
chnl_name string,
login_ref string,
net_type string,
oper_sys string,
oper_sys_ver string,
dvc_brand string,
dvc_model string,
dvc_type string,
dvc_dpi string,
brows_name string,
login_ts bigint,
first_login_date string,
first_app_ver string,
last_login_date string,
last_app_ver string,
evil_ip bigint,
pv bigint,
input_pv bigint,
ins_pv bigint,
qry_pv bigint,
outs_pv bigint,
coop_pv bigint,
vv bigint,
cv bigint,
pt bigint,
vod_vv bigint,
vod_cv bigint,
vod_pt bigint,
live_vv bigint,
live_cv bigint,
live_pt bigint,
ca_vv bigint,
ca_cv bigint,
ca_pt bigint,
try_vv bigint,
try_cv bigint,
try_pt bigint,
pay_vv bigint,
pay_cv bigint,
pay_pt bigint,
off_vv bigint,
off_cv bigint,
off_pt bigint,
block_ts bigint,
drag_ts bigint,
drag_ahd_ts bigint,
drag_bwd_ts bigint,
click_ts bigint,
instl_ts bigint,
stup_ts bigint,
movie_vv bigint,
movie_cv bigint,
movie_pt bigint,
tvp_vv bigint,
tvp_cv bigint,
tvp_pt bigint,
cartn_vv bigint,
cartn_cv bigint,
cartn_pt bigint,
var_vv bigint,
var_cv bigint,
var_pt bigint,
amuse_vv bigint,
amuse_cv bigint,
amuse_pt bigint,
sport_vv bigint,
sport_cv bigint,
sport_pt bigint,
music_vv bigint,
music_cv bigint,
music_pt bigint,
fin_vv bigint,
fin_cv bigint,
fin_pt bigint,
hot_vv bigint,
hot_cv bigint,
hot_pt bigint)
PARTITIONED BY (
dt string,
pf string)
STORED AS RCFILE;
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
insert overwrite table xt_tds_did_user_targ_day partition(dt='20150425',pf='tv')
select
dvc_id,
user_id,
p1,
p2,
p3,
prod_code,
login_ip,
cntry_name,
area_name,
prov_name,
city_name,
chnl_type,
chnl_type_name,
chnl_code,
chnl_name,
login_ref,
net_type,
oper_sys,
oper_sys_ver,
dvc_brand,
dvc_model,
dvc_type,
dvc_dpi,
brows_name,
login_ts,
first_login_date,
first_app_ver,
last_login_date,
last_app_ver,
evil_ip,
pv,
input_pv,
ins_pv,
qry_pv,
outs_pv,
coop_pv,
vv,
cv,
pt,
vod_vv,
vod_cv,
vod_pt,
live_vv,
live_cv,
live_pt,
ca_vv,
ca_cv,
ca_pt,
try_vv,
try_cv,
try_pt,
pay_vv,
pay_cv,
pay_pt,
off_vv,
off_cv,
off_pt,
block_ts,
drag_ts,
drag_ahd_ts,
drag_bwd_ts,
click_ts,
instl_ts,
stup_ts,
movie_vv,
movie_cv,
movie_pt,
tvp_vv,
tvp_cv,
tvp_pt,
cartn_vv,
cartn_cv,
cartn_pt,
var_vv,
var_cv,
var_pt,
amuse_vv,
amuse_cv,
amuse_pt,
sport_vv,
sport_cv,
sport_pt,
music_vv,
music_cv,
music_pt,
fin_vv,
fin_cv,
fin_pt,
hot_vv,
hot_cv,
hot_pt from data_tds.tds_did_user_targ_day where dt='20150425' and pf='tv';