hive存储压缩与优化

一、存储和压缩对比

1. 比较各种存储格式

--建立文本表格
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as textfile;

--插入数据
load data local inpath '/opt/module/datas/log.data' into table log_text;
--建立非压缩的orc格式
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties("orc.compress"="NONE");

--插入数据
insert into log_orc select * from log_text;
--建立parquet格式
create table log_par(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as parquet;

--插入数据
insert into log_par select * from log_text;

2. 比较各种压缩格式

--zlib压缩的orc格式
create table log_orc_zlib(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties("orc.compress"="ZLIB");

--插入数据
insert into log_orc_zlib select * from log_text;
--snappy压缩的orc格式
create table log_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties("orc.compress"="SNAPPY");

--插入数据
insert into log_orc_snappy select * from log_text;
--snappy压缩的parquet格式
create table log_par_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as parquet
tblproperties("parquet.compression"="SNAPPY");

--插入数据
insert into log_par_snappy select * from log_text;

二、表的优化

1. 小表,大表Join

--建立大表
create table bigtable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';

load data local inpath '/opt/module/datas/bigtable' into table bigtable;
--建立小表
create table smalltable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';

load data local inpath '/opt/module/datas/smalltable' into table smalltable;
--建立结果表格
create table jointable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
--连接操作
insert overwrite table jointable
select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url
from smalltable s
join bigtable  b
on b.id = s.id;

insert overwrite table jointable
select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url
from bigtable  b
join smalltable  s
on s.id = b.id;

2. 大表,大表Join

--建没有Null的表
create table ori(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';

load data local inpath '/opt/module/datas/ori' into table ori;

--建立有Null的表
create table nullidtable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';

load data local inpath '/opt/module/datas/nullid' into table nullidtable;
--空key过滤
insert overwrite table jointable select n.* from (select * from nullidtable where id is not null ) n  left join ori o on n.id = o.id;

--空key转换
insert overwrite table jointable
select n.* from nullidtable n full join ori o on 
nvl(n.id,rand()) = o.id;

3. 动态分区

--首先设置非严格模式
set hive.exec.dynamic.partition.mode=nonstrict;
--创建分区表
create table dept_partition(id int, name string) partitioned
by (location int) row format delimited fields terminated by '\t';
--从原表中向分区表插入数据
insert into table dept_partition partition(location)
select deptno, dname, loc from dept;
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值