show databases ;
describe formatted student2;--查看表的类型
desc database kb23hivedb;
show create database kb23hivedb;
show create table student2;
--分桶(Bucket)***********************************************************************
(1)分桶对应于HDFS中的文件
更高的查询处理效率
使抽样(sampling)更高效
一般根据”桶列“的哈希函数将数据进行分桶
(2)分桶只有动态分桶
set hive.enforce.bucketing=true;
(3)定义分桶(分桶的列是表中已有的列,分桶数最好是2的N次方)
clustered by (emplyee_id) into 2 BUCKETS
(4)必须使用insert方式加载数据
(5)进行分桶,将一个文件分成两个文件,并且一个文件夹是奇数数据,一个文件是偶数数据
--------------------------------------------------建表--------------------------------------------------
create table emplyee_id_buckets(
name string,
emplyee_id int,
work_place array<string>,
gender_age struct<gender:string,age:int>,
skills_score map<string,int>,
depart_title map<string,array<string>>
)
clustered by (emplyee_id) into 2 BUCKETS --分为2个cluster,分桶为2的n次方/倍数
row format delimited fields terminated by '|'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n';
----------------------------------------------------------------------------------------------------------
select * from emplyee_id_buckets; --选择分桶所有数据
set map.reduce.tasks=2; --分桶2
set hive.enforce.bucketing=true; --分桶只有动态分桶
insert overwrite table emplyee_id_buckets select * from emplyee_id; --分桶加载数据
----------------------------------------------------------------------------------------------------------
(6)分桶随机抽样
(1)查看一兆的数据
select * from emplyee_id_buckets tablesample ( 1M )s;
(2)查看十行的数据
select * from emplyee_id_buckets tablesample ( 10 rows )s;
(3)查看10%数据
select * from emplyee_id_buckets tablesample ( 10 percent )s;
(4)查看30%数据
select * from emplyee_id_buckets tablesample ( 30 percent )s;
(5)抽取随机数据随机抽样:将2个cluster分进16个桶,2/16=八分之一,每个桶有八分之一cluster组成,抽取第3个桶中的数据即八分之三cluster数据
select * from emplyee_id_buckets tablesample ( bucket 3 out of 16 on rand())s;
(6)抽取随机数据指定emplyee_id抽样:将2个cluster分进