Hive数据倾斜
hive> use test6;
OK
Time taken: 0.011 seconds
hive> create table data_skew(
> key string
> ,col string
> )row format delimited fields terminated by ',';
OK
Time taken: 0.166 seconds
hive> load data local inpath '/usr/local/soft/data/data_skew.txt' into table data_skew;
Loading data to table test6.data_skew
Table test6.data_skew stats: [numFiles=1, totalSize=8983126]
OK
Time taken: 0.397 seconds
hive> set mapred.reduce.tasks=6;
hive> select key,count(*) from data_skew group by key;
OK
84402 400
84403 200
84404 300
84405 100
84401 982128
null 16872
Time taken: 31.281 seconds, Fetched: 6 row(s)
hive> select * from data_skew tablesample(10 percent);
hive> select key,count(*) from data_skew tablesample(10 percent)group by key;
OK
84402 400
84403 200
84404 300
84405 100
84401 982128
null 16872
Time taken: 31.729 seconds, Fetched: 6 row(s)
hive> select key,count(*) from (select * from data_skew tablesample(10 percent))t1 group by t1.key;
OK
84402 400
84403 200
84404 300
84405 100
84401 982128
null 16872
Time taken: 30.129 seconds, Fetched: 6 row(s)
hive> create table data_skew_sample as select * from data_skew tablesample(10 percent);
OK
Time taken: 15.541 seconds
hive> select count(*) from data_skew_sample;
OK
1000000
Time taken: 20.311 seconds, Fetched: 1 row(s)
hive> select * from data_skew distribute by rand() sort by rand() limit 10000;
hive> select if(key='84401',rand(),-1)from data_skew limit 10;
OK
0.4715909925737496
0.6333747805176559
0.07481627614769659
0.9834538485039709
0.3812408770264697
0.5047202313471298
0.29126524706224566
0.2839361660201293
0.8005911653315098
0.7408824800692296
Time taken: 0.264 seconds, Fetched: 10 row(s)
hive> select key
> ,if(key='84401',rand(),-1)
> ,count(*)
> from data_skew
> group by key,if(key='84401',rand(),-1);
hive> select key
> ,if(key='84401',floor(rand()*6),-1)
> ,count(*)
> from data_skew
> group by key,if(key='84401',floor(rand()*6),-1);
OK
84401 0 163532
84401 1 163528
84401 2 163677
84401 3 163595
84401 4 163429
84401 5 164367
84402 -1 400
84403 -1 200
84404 -1 300
84405 -1 100
null -1 16872
Time taken: 22.223 seconds, Fetched: 11 row(s)
hive> select key
> ,if(key='84401'or key='null',floor(rand()*6),-1)
> ,count(*)
> from data_skew
> group by key,if(key='84401'or key='null',floor(rand()*6),-1);
OK
84401 0 163547
84401 1 163864
84401 2 163813
84401 3 164480
84401 4 163658
84401 5 162766
84402 -1 400
84403 -1 200
84404 -1 300
84405 -1 100
null 0 2692
null 1 2849
null 2 2858
null 3 2774
null 4 2765
null 5 2934
Time taken: 22.134 seconds, Fetched: 16 row(s)
hive> select t1.key
> ,sum(t1.cnt) as total
> from(
> select key
> ,if(key='84401'or key='null',floor(rand()*6),-1)
> ,count(*) as cnt
> from data_skew
> group by key,if(key='84401'or key='null',floor(rand()*6),-1)
> )t1 group by key;
OK
84401 982128
84402 400
84403 200
84404 300
84405 100
null 16872
Time taken: 23.583 seconds, Fetched: 6 row(s)