Hive数据倾斜

 Hive数据倾斜

hive> use test6;
OK
Time taken: 0.011 seconds
hive> create table data_skew(
    > key string
    > ,col string
    > )row format delimited fields terminated by ',';
OK
Time taken: 0.166 seconds
hive> load data local inpath '/usr/local/soft/data/data_skew.txt' into table data_skew;
Loading data to table test6.data_skew
Table test6.data_skew stats: [numFiles=1, totalSize=8983126]
OK
Time taken: 0.397 seconds
hive> set mapred.reduce.tasks=6;
hive> select key,count(*) from data_skew group by key;
OK
84402	400
84403	200
84404	300
84405	100
84401	982128
null	16872
Time taken: 31.281 seconds, Fetched: 6 row(s)
hive> select * from data_skew tablesample(10 percent);
hive> select key,count(*) from data_skew tablesample(10 percent)group by key;
OK
84402	400
84403	200
84404	300
84405	100
84401	982128
null	16872
Time taken: 31.729 seconds, Fetched: 6 row(s)
hive> select key,count(*) from (select * from data_skew tablesample(10 percent))t1 group by t1.key;
OK
84402	400
84403	200
84404	300
84405	100
84401	982128
null	16872
Time taken: 30.129 seconds, Fetched: 6 row(s)
hive> create table data_skew_sample as select * from data_skew tablesample(10 percent);
OK
Time taken: 15.541 seconds
hive> select count(*) from data_skew_sample;
OK
1000000
Time taken: 20.311 seconds, Fetched: 1 row(s)
hive> select * from data_skew distribute by rand() sort by rand() limit 10000;
hive> select if(key='84401',rand(),-1)from data_skew limit 10;
OK
0.4715909925737496
0.6333747805176559
0.07481627614769659
0.9834538485039709
0.3812408770264697
0.5047202313471298
0.29126524706224566
0.2839361660201293
0.8005911653315098
0.7408824800692296
Time taken: 0.264 seconds, Fetched: 10 row(s)
hive> select key
    > ,if(key='84401',rand(),-1)
    > ,count(*)
    > from data_skew
    > group by key,if(key='84401',rand(),-1);
hive> select key
    > ,if(key='84401',floor(rand()*6),-1)
    > ,count(*)
    > from data_skew
    > group by key,if(key='84401',floor(rand()*6),-1);
OK
84401	0	163532
84401	1	163528
84401	2	163677
84401	3	163595
84401	4	163429
84401	5	164367
84402	-1	400
84403	-1	200
84404	-1	300
84405	-1	100
null	-1	16872
Time taken: 22.223 seconds, Fetched: 11 row(s)
hive> select key
    > ,if(key='84401'or key='null',floor(rand()*6),-1)
    > ,count(*)
    > from data_skew
    > group by key,if(key='84401'or key='null',floor(rand()*6),-1);
OK
84401	0	163547
84401	1	163864
84401	2	163813
84401	3	164480
84401	4	163658
84401	5	162766
84402	-1	400
84403	-1	200
84404	-1	300
84405	-1	100
null	0	2692
null	1	2849
null	2	2858
null	3	2774
null	4	2765
null	5	2934
Time taken: 22.134 seconds, Fetched: 16 row(s)
hive> select t1.key
    > ,sum(t1.cnt) as total
    > from(
    > select key
    > ,if(key='84401'or key='null',floor(rand()*6),-1)
    > ,count(*) as cnt
    > from data_skew
    > group by key,if(key='84401'or key='null',floor(rand()*6),-1)
    > )t1 group by key;
OK
84401	982128
84402	400
84403	200
84404	300
84405	100
null	16872
Time taken: 23.583 seconds, Fetched: 6 row(s)

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值