1.抽样的结构
桶表bucket
为了抽样而设计的结构(为了让抽样更快)
桶表与分区表非常类似:都是把一份数据拆分成多块,因此也能提升查询速度;不同之处在于,桶表的分块是为抽样服务的,用来提升抽样速度。
(分区表按分区字段的取值划分数据)
强行分桶
-- Make inserts into bucketed tables honour the declared bucket layout.
-- NOTE(review): this property was removed in Hive 2.x, where bucketing is
-- always enforced — confirm the target Hive version.
set hive.enforce.bucketing=true;
2
-- Bucketed table used by the sampling examples: rows are clustered by id
-- into 8 buckets, and each bucket is declared as sorted by id.
create table t1 (
    id   int,
    name string,
    age  int
)
clustered by (id)
sorted by (id)
into 8 buckets
row format delimited
    fields terminated by ',';
sorted by(id):每个桶内的数据按 id 排序
导入数据
-- Plain (non-bucketed) staging table with the same three columns as t1.
-- The statement is now terminated so it no longer runs into the next one.
-- NOTE(review): no row format is declared, so Hive's default field delimiter
-- applies — confirm it matches the file that will be loaded.
create table t2 (
    id   int,
    name string,
    age  int
);
-- Load the raw file into the staging table.
-- '...' is a placeholder: replace it with the real HDFS path of the data file.
load data inpath '...' into table t2;

-- Populate the bucketed table through a query rather than a direct load, so
-- that Hive assigns each row to its bucket. Columns are listed explicitly so
-- the mapping to t1 (id, name, age) does not depend on t2's column order.
insert into table t1
select
    id,
    name,
    age
from t2;
分桶规则:hash(id) % 8(8 为桶数),计算结果决定该行写入哪一个桶
2.抽样的函数
tablesample()
-- Row-count sampling: ask for 10 rows.
-- NOTE(review): Hive applies the row count per input split, so the total
-- returned can exceed 10 when the table has several splits — confirm.
select * from t1 tablesample(10 rows);

-- Bucket sampling: take bucket 2 out of 4 sampling groups. No ON clause is
-- given, so the sample relies on the table's own clustering column (id).
select * from t1 tablesample(bucket 2 out of 4);
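语法:select * from t1 tablesample(bucket x out of y)
x:从第几个桶开始抽,取值范围 1 到 y,不能超过 y
y:应为总桶数的因数或倍数(本例总桶数为 8)
t1 的 8 个桶:1 2 3 4 5 6 7 8
bucket 2 out of 4:每 4 个桶抽 1 个,从第 2 个桶开始,共抽 8/4 = 2 个桶,即抽到第 2 和第 6 个桶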
/***
* ,%%%%%%%%,
* ,%%/\%%%%/\%%
* ,%%%\c "" J/%%%
* %. %%%%/ o o \%%%
* `%%. %%%% _ |%%%
* `%% `%%%%(__Y__)%%'
* // ;%%%%`\-/%%%'
* (( / `%%%%%%%'
* \\ .' |
* \\ / \ | |
* \\/ ) | |
* \ /_ | |__
* (___________))))))) 攻城湿
*/