一、存储和压缩对比
1. 比较各种存储格式
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as textfile;
load data local inpath '/opt/module/datas/log.data' into table log_text;
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties( "orc.compress" = "NONE" ) ;
insert into log_orc select * from log_text;
create table log_par(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as parquet;
insert into log_par select * from log_text;
2. 比较各种压缩格式
create table log_orc_zlib(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties( "orc.compress" = "ZLIB" ) ;
insert into log_orc_zlib select * from log_text;
create table log_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties( "orc.compress" = "SNAPPY" ) ;
insert into log_orc_snappy select * from log_text;
create table log_par_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as parquet
tblproperties( "parquet.compression" = "SNAPPY" ) ;
insert into log_par_snappy select * from log_text;
二、表的优化
1. 小表,大表Join
create table bigtable( id bigint , t bigint , uid string, keyword string, url_rank int , click_num int , click_url string) row format delimited fields terminated by '\t' ;
load data local inpath '/opt/module/datas/bigtable' into table bigtable;
create table smalltable( id bigint , t bigint , uid string, keyword string, url_rank int , click_num int , click_url string) row format delimited fields terminated by '\t' ;
load data local inpath '/opt/module/datas/smalltable' into table smalltable;
create table jointable( id bigint , t bigint , uid string, keyword string, url_rank int , click_num int , click_url string) row format delimited fields terminated by '\t' ;
insert overwrite table jointable
select b. id, b. t, b. uid, b. keyword, b. url_rank, b. click_num, b. click_url
from smalltable s
join bigtable b
on b. id = s. id;
insert overwrite table jointable
select b. id, b. t, b. uid, b. keyword, b. url_rank, b. click_num, b. click_url
from bigtable b
join smalltable s
on s. id = b. id;
2. 大表,大表Join
create table ori( id bigint , t bigint , uid string, keyword string, url_rank int , click_num int , click_url string) row format delimited fields terminated by '\t' ;
load data local inpath '/opt/module/datas/ori' into table ori;
create table nullidtable( id bigint , t bigint , uid string, keyword string, url_rank int , click_num int , click_url string) row format delimited fields terminated by '\t' ;
load data local inpath '/opt/module/datas/nullid' into table nullidtable;
insert overwrite table jointable select n. * from ( select * from nullidtable where id is not null ) n left join ori o on n. id = o. id;
insert overwrite table jointable
select n. * from nullidtable n full join ori o on
nvl( n. id, rand( ) ) = o. id;
3. 动态分区
set hive. exec . dynamic. partition . mode = nonstrict;
create table dept_partition( id int , name string) partitioned
by ( location int ) row format delimited fields terminated by '\t' ;
insert into table dept_partition partition ( location)
select deptno, dname, loc from dept;