分区、分桶练习

最新推荐文章于 2022-11-09 20:38:54 发布

雨中禁火

最新推荐文章于 2022-11-09 20:38:54 发布

阅读量110

点赞数

分类专栏：大数据 | 文章标签：大数据 hive

本文链接：https://blog.csdn.net/weixin_57551874/article/details/116241776

版权

大数据专栏收录该内容

23 篇文章 0 订阅

订阅专栏

本文介绍了如何将数据分区存储在Hive表ip1中，根据IP首字进行分区，并将数据按照fadd字段进行分桶排序。随后，详细步骤演示了如何查询特定桶的数据，如第2和5桶。涉及的技术包括数据加载、分区表、分桶排序和SQL查询。

摘要由CSDN通过智能技术生成

1.将数据放入分区表ip1(fadd,sadd,tadd,fouadd)中并进行分区,(区号:firstadd),即:ip第一位
2.将数据放入分桶排序(正序)表ip2,分8个桶
3.按照fadd查出第2,5桶
4.按照fadd查找第2,4,6,8桶

1.将数据放入分区表ip1(fadd,sadd,tadd,fouadd)中并进行分区,(区号:firstadd),即:ip第一位
-- Partitioned table holding the four dot-separated octets of each IP address.
-- Every input line (e.g. 192.178.10.128) is split on '.' into four int columns.
-- The partition column firstadd is not read from the file; it is supplied by
-- the partition(...) clause of each load statement.
-- NOTE(review): the exercise text names the second column "sadd", but every
-- table in this script spells it "asdd"; the spelling is kept for consistency.
-- NOTE(review): the table name is unqualified while the loads target ip.ip1 --
-- presumably the session has already switched to database ip; confirm.
create external table if not exists ip1(
    fadd int,
    asdd int,
    tadd int,
    fouadd int
)
partitioned by (firstadd int)
row format delimited fields terminated by '.'
lines terminated by '\n';

-- Load each sample file into the partition named after the first octet of the
-- addresses it contains. Each statement is terminated with ';' so the script
-- can be run as a whole instead of statement by statement.
load data local inpath '/root/ip/ip1.txt' into table ip.ip1
partition(firstadd=192);

load data local inpath '/root/ip/ip2.txt' into table ip.ip1
partition(firstadd=132);

load data local inpath '/root/ip/ip3.txt' into table ip.ip1
partition(firstadd=122);

load data local inpath '/root/ip/ip4.txt' into table ip.ip1
partition(firstadd=182);

2.将数据放入分桶排序(正序)表ip2,分8个桶
分桶
第一种：
-- Bucketed table: rows are clustered on fadd into 8 buckets and kept in
-- ascending fadd order inside each bucket (asc is the default; it is spelled
-- out because the exercise asks for ascending order).
create external table if not exists ip2(
    fadd int,
    asdd int,
    tadd int,
    fouadd int
)
clustered by (fadd)
sorted by (fadd asc) into 8 buckets
row format delimited fields terminated by '.'
lines terminated by '\n';

-- Request 8 reducers, matching the 8 buckets declared on ip2.
set mapreduce.job.reduces=8;

-- Copy the four sample files straight into the bucketed table.
-- NOTE(review): per the Hive DML manual, LOAD DATA was a pure file copy/move
-- before Hive 3.0, so these statements may not hash rows into the 8 buckets;
-- the insert overwrite ... select route from t_ip2 is the one that runs a job
-- to write the buckets. Verify on the Hive version in use.
load data local inpath '/root/ip/ip1.txt' into table ip.ip2;
load data local inpath '/root/ip/ip2.txt' into table ip.ip2;
load data local inpath '/root/ip/ip3.txt' into table ip.ip2;
load data local inpath '/root/ip/ip4.txt' into table ip.ip2;


第二种：
-- Plain (unpartitioned, unbucketed) staging table with the same four octet
-- columns; it is the source for the insert ... select statements that fill
-- the bucketed tables.
create external table if not exists t_ip2(
    fadd int,
    asdd int,
    tadd int,
    fouadd int
)
row format delimited fields terminated by '.'
lines terminated by '\n';

-- Stage all four sample files in ip.t_ip2, one load per file.
load data local inpath '/root/ip/ip1.txt'
    into table ip.t_ip2;

load data local inpath '/root/ip/ip2.txt'
    into table ip.t_ip2;

load data local inpath '/root/ip/ip3.txt'
    into table ip.t_ip2;

load data local inpath '/root/ip/ip4.txt'
    into table ip.t_ip2;

-- Replace the contents of ip2 with the staged rows from t_ip2. Going through
-- a select lets Hive write the rows according to ip2's bucket definition.
insert overwrite table ip2
select
    fadd,
    asdd,
    tadd,
    fouadd
from t_ip2;


-- Let Hive decide automatically whether a job can run in local mode.
set hive.exec.mode.local.auto=true;

-- Bucketed table like ip2, but clustered and sorted on the second octet
-- (asdd) instead of the first; 8 buckets.
create external table if not exists ip3(
    fadd int,
    asdd int,
    tadd int,
    fouadd int
)
clustered by (asdd)
sorted by (asdd) into 8 buckets
row format delimited fields terminated by '.'
lines terminated by '\n';

-- Replace the contents of ip3 with the staged rows from t_ip2 so they are
-- written according to ip3's bucket definition on asdd.
insert overwrite table ip3
select
    fadd,
    asdd,
    tadd,
    fouadd
from t_ip2;


3.按照fadd查出第2,5桶（下面的示例查询使用按asdd分桶的ip3表）
-- Return buckets 2 and 5 of the 8-bucket table ip3.
-- "bucket 2 out of 4" keeps rows whose bucket hash modulo 4 is 1, which on an
-- 8-bucket table is buckets 2 and 6 -- not 2 and 5. Buckets 2 and 5 are not
-- evenly spaced, so each one is sampled on its own with "out of 8" and the
-- two results are concatenated.
select * from ip3 tablesample(bucket 2 out of 8 on asdd)
union all
select * from ip3 tablesample(bucket 5 out of 8 on asdd);


4.按照fadd查找第2,4,6,8桶（下面的示例查询使用按asdd分桶的ip3表）
-- Sample every second bucket of ip3, starting at bucket 2 (the exercise asks
-- for buckets 2, 4, 6 and 8 of the 8-bucket table).
select
    *
from ip3
    tablesample(bucket 2 out of 2 on asdd);

ip1.txt
192.178.10.128
192.168.10.138
192.168.10.118
192.168.10.28
192.158.10.128
192.158.10.138
192.138.0.118
ip2.txt
132.168.11.28
132.128.19.128
132.128.10.138
132.158.10.118
132.168.12.28
ip3.txt
122.168.10.38
122.128.14.138
122.178.10.118
122.168.12.29
ip4.txt
182.168.10.129
182.168.10.108
182.168.10.118
182.168.10.280
182.168.10.120
182.168.10.108
182.168.10.118
182.168.10.256