分区
hive> create database test3;
OK
Time taken: 0.051 seconds
hive> use test3;
OK
Time taken: 0.011 seconds
hive> create external table students_pt1
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> PARTITIONED BY(pt string)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
> LOCATION '/student/input1';
OK
Time taken: 0.058 seconds
hive> alter table students_pt1 add partition(pt='20220405');
OK
Time taken: 0.086 seconds
hive> load data local inpath '/usr/local/soft/data/students.txt' into table students_pt1;
FAILED: SemanticException [Error 10062]: Need to specify partition columns because the destination table is partitioned
hive> load data local inpath '/usr/local/soft/data/students.txt' into table students_pt1 partition(pt='20220405');
Loading data to table test3.students_pt1 partition (pt=20220405)
Partition test3.students_pt1{pt=20220405} stats: [numFiles=1, totalSize=41998]
OK
Time taken: 0.653 seconds
hive> select * from students_pt1 limit 10;
OK
1500100001 施笑槐 22 女 文科六班 20220405
1500100002 吕金鹏 24 男 文科六班 20220405
1500100003 单乐蕊 22 女 理科六班 20220405
1500100004 葛德曜 24 男 理科三班 20220405
1500100005 宣谷芹 22 女 理科五班 20220405
1500100006 边昂雄 21 男 理科二班 20220405
1500100007 尚孤风 23 女 文科六班 20220405
1500100008 符半双 22 女 理科六班 20220405
1500100009 沈德昌 21 男 理科一班 20220405
1500100010 羿彦昌 23 男 理科六班 20220405
Time taken: 0.15 seconds, Fetched: 10 row(s)
hive> select * from students_pt1 where pt='20220405' limit 10;
OK
1500100001 施笑槐 22 女 文科六班 20220405
1500100002 吕金鹏 24 男 文科六班 20220405
1500100003 单乐蕊 22 女 理科六班 20220405
1500100004 葛德曜 24 男 理科三班 20220405
1500100005 宣谷芹 22 女 理科五班 20220405
1500100006 边昂雄 21 男 理科二班 20220405
1500100007 尚孤风 23 女 文科六班 20220405
1500100008 符半双 22 女 理科六班 20220405
1500100009 沈德昌 21 男 理科一班 20220405
1500100010 羿彦昌 23 男 理科六班 20220405
Time taken: 0.16 seconds, Fetched: 10 row(s)
hive> alter table students_pt1 add partition(pt='20220404');
OK
Time taken: 0.048 seconds
hive> alter table students_pt1 add partition(pt='20220403');
OK
Time taken: 0.049 seconds
hive> alter table students_pt1 add partition(pt='20220402');
OK
Time taken: 0.047 seconds
hive> alter table students_pt1 add partition(pt='20220401');
OK
Time taken: 0.048 seconds
hive> alter table students_pt1 drop partition(pt='20220401');
Dropped the partition pt=20220401
OK
Time taken: 0.107 seconds
hive> show partitions students_pt1;
OK
pt=20220402
pt=20220403
pt=20220404
pt=20220405
Time taken: 0.035 seconds, Fetched: 4 row(s)
hive> alter table students_pt1 add partition(pt='20220401');
OK
Time taken: 0.046 seconds
hive> show partitions students_pt1;
OK
pt=20220401
pt=20220402
pt=20220403
pt=20220404
pt=20220405
Time taken: 0.04 seconds, Fetched: 5 row(s)
hive> desc students_pt1;
OK
id bigint
name string
age int
gender string
clazz string
pt string
# Partition Information
# col_name data_type comment
pt string
Time taken: 0.031 seconds, Fetched: 11 row(s)
动态分区
# 表示开启动态分区
hive> set hive.exec.dynamic.partition=true;
# 表示动态分区模式:strict(需要配合静态分区一起使用)、nonstrict
# 注意:正确的配置值是 nonstrict(不是 nostrict)。Hive 只对字符串 "strict" 做特殊校验,
# 所以写成 nostrict 时实际上等同于 nonstrict,本教程才能正常运行,但应使用规范写法 nonstrict。
# strict: insert into table students_pt partition(dt='anhui',pt) select ......,pt from students;
hive> set hive.exec.dynamic.partition.mode=nostrict;
# 表示支持的最大的分区数量为1000,可以根据业务自己调整
hive> set hive.exec.max.dynamic.partitions.pernode=1000;
hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nostrict;
hive> set hive.exec.max.dynamic.partitions.pernode=1000;
hive> create table students_dt
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string,
> dt string
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.039 seconds
hive> load data local inpath '/usr/local/soft/data/students_dt.txt' into table students_dt;
Loading data to table test3.students_dt
Table test3.students_dt stats: [numFiles=1, totalSize=50998]
OK
Time taken: 0.511 seconds
hive> show tables;
OK
students_dt
students_pt1
Time taken: 0.013 seconds, Fetched: 2 row(s)
hive> desc students_dt;
OK
id bigint
name string
age int
gender string
clazz string
dt string
Time taken: 0.035 seconds, Fetched: 6 row(s)
hive> select * from students_dt limit 10;
OK
1500100001 施笑槐 22 女 文科六班 20220101
1500100002 吕金鹏 24 男 文科六班 20220101
1500100003 单乐蕊 22 女 理科六班 20220101
1500100004 葛德曜 24 男 理科三班 20220101
1500100005 宣谷芹 22 女 理科五班 20220101
1500100006 边昂雄 21 男 理科二班 20220101
1500100007 尚孤风 23 女 文科六班 20220101
1500100008 符半双 22 女 理科六班 20220101
1500100009 沈德昌 21 男 理科一班 20220101
1500100010 羿彦昌 23 男 理科六班 20220101
Time taken: 0.034 seconds, Fetched: 10 row(s)
hive> create table students_dt_p
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> PARTITIONED BY(dt string)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.033 seconds
// 分区字段需要放在 select 的最后,如果有多个分区字段 同理,它是按位置匹配,不是按名字匹配
insert into table students_dt_p partition(dt) select id,name,age,gender,clazz,dt from students_dt;
// 比如下面这条语句会使用age作为分区字段,而不会使用student_dt中的dt作为分区字段
insert into table students_dt_p partition(dt) select id,name,age,gender,dt,age from students_dt;
hive> insert into table students_dt_p partition(dt) select id,name,age,gender,clazz,dt from students_dt;
多级分区
hive> create table students_year_month
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string,
> year string,
> month string
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.032 seconds
hive> create table students_year_month_pt
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> PARTITIONED BY(year string,month string)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.03 seconds
hive> load data local inpath '/usr/local/soft/data/students_year_month.txt' into table students_year_month;
Loading data to table test3.students_year_month
Table test3.students_year_month stats: [numFiles=1, totalSize=49998]
OK
Time taken: 0.496 seconds
hive> select * from students_year_month limit 10;
OK
1500100001 施笑槐 22 女 文科六班 2022 01
1500100002 吕金鹏 24 男 文科六班 2022 01
1500100003 单乐蕊 22 女 理科六班 2022 01
1500100004 葛德曜 24 男 理科三班 2022 01
1500100005 宣谷芹 22 女 理科五班 2022 01
1500100006 边昂雄 21 男 理科二班 2022 01
1500100007 尚孤风 23 女 文科六班 2022 01
1500100008 符半双 22 女 理科六班 2022 01
1500100009 沈德昌 21 男 理科一班 2022 01
1500100010 羿彦昌 23 男 理科六班 2022 01
Time taken: 0.031 seconds, Fetched: 10 row(s)
hive> insert into table students_year_month_pt partition(year,month) select id,name,age,gender,clazz,year,month from students_year_month;
Query ID = root_20220406203304_f82525b2-c127-47fa-a0be-0c43cc56ae86
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1649211406017_0010, Tracking URL = http://master:8088/proxy/application_1649211406017_0010/
Kill Command = /usr/local/soft/hadoop-2.7.6/bin/hadoop job -kill job_1649211406017_0010
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2022-04-06 20:33:12,324 Stage-1 map = 0%, reduce = 0%
2022-04-06 20:33:18,566 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.49 sec
MapReduce Total cumulative CPU time: 1 seconds 490 msec
Ended Job = job_1649211406017_0010
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to: hdfs://master:9000/user/hive/warehouse/test3.db/students_year_month_pt/.hive-staging_hive_2022-04-06_20-33-04_845_6271486721060924938-1/-ext-10000
Loading data to table test3.students_year_month_pt partition (year=null, month=null)
Time taken for load dynamic partitions : 392
Loading partition {year=2022, month=07}
Loading partition {year=2022, month=04}
Loading partition {year=2022, month=06}
Loading partition {year=2022, month=03}
Loading partition {year=2022, month=01}
Loading partition {year=2022, month=02}
Loading partition {year=2022, month=08}
Loading partition {year=2022, month=05}
Loading partition {year=2021, month=10}
Loading partition {year=2022, month=09}
Loading partition {year=2021, month=09}
Time taken for adding to write entity : 1
Partition test3.students_year_month_pt{year=2021, month=09} stats: [numFiles=1, numRows=12, totalSize=492, rawDataSize=480]
Partition test3.students_year_month_pt{year=2021, month=10} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=01} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=02} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=03} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=04} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=05} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=06} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=07} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=08} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=09} stats: [numFiles=1, numRows=88, totalSize=3608, rawDataSize=3520]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 1.49 sec HDFS Read: 54609 HDFS Write: 41793 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 490 msec
OK
Time taken: 16.637 seconds
分桶
hive> set hive.enforce.bucketing=true;
hive> create table students_buks
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> CLUSTERED BY (clazz) into 12 BUCKETS
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.039 seconds
// 直接使用 load data 并不能将数据打散(分桶不会生效,数据仍是原文件的一个整体)
load data local inpath '/usr/local/soft/data/students.txt' into table students_buks;
// 需要使用下面这种 insert ... select 的方式插入数据,才能触发 MapReduce 按分桶列哈希打散,使分桶表真正发挥作用
insert into students_buks select * from students;
hive> create table students
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.031 seconds
hive> load data local inpath '/usr/local/soft/data/students.txt' into table students;
Loading data to table test3.students
Table test3.students stats: [numFiles=1, totalSize=41998]
OK
Time taken: 0.484 seconds
hive> show tables;
OK
students
students_buks
students_dt
students_dt_p
students_pt1
students_year_month
students_year_month_pt
Time taken: 0.01 seconds, Fetched: 7 row(s)
hive> select * from students limit 10;
OK
1500100001 施笑槐 22 女 文科六班
1500100002 吕金鹏 24 男 文科六班
1500100003 单乐蕊 22 女 理科六班
1500100004 葛德曜 24 男 理科三班
1500100005 宣谷芹 22 女 理科五班
1500100006 边昂雄 21 男 理科二班
1500100007 尚孤风 23 女 文科六班
1500100008 符半双 22 女 理科六班
1500100009 沈德昌 21 男 理科一班
1500100010 羿彦昌 23 男 理科六班
Time taken: 0.02 seconds, Fetched: 10 row(s)
hive> insert into students_buks select * from students;
Query ID = root_20220406204908_39ca2ae4-22d1-4733-915a-44d7de857372
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 12
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1649211406017_0012, Tracking URL = http://master:8088/proxy/application_1649211406017_0012/
Kill Command = /usr/local/soft/hadoop-2.7.6/bin/hadoop job -kill job_1649211406017_0012
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 12
2022-04-06 20:49:15,919 Stage-1 map = 0%, reduce = 0%
2022-04-06 20:49:22,171 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.87 sec
2022-04-06 20:49:37,757 Stage-1 map = 100%, reduce = 6%, Cumulative CPU 1.64 sec
2022-04-06 20:49:38,797 Stage-1 map = 100%, reduce = 14%, Cumulative CPU 2.91 sec
2022-04-06 20:49:41,948 Stage-1 map = 100%, reduce = 19%, Cumulative CPU 4.58 sec
2022-04-06 20:49:45,213 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 6.09 sec
2022-04-06 20:49:46,240 Stage-1 map = 100%, reduce = 31%, Cumulative CPU 7.25 sec
2022-04-06 20:49:48,369 Stage-1 map = 100%, reduce = 42%, Cumulative CPU 8.75 sec
2022-04-06 20:49:50,491 Stage-1 map = 100%, reduce = 58%, Cumulative CPU 11.51 sec
2022-04-06 20:49:51,577 Stage-1 map = 100%, reduce = 72%, Cumulative CPU 14.0 sec
2022-04-06 20:49:52,626 Stage-1 map = 100%, reduce = 75%, Cumulative CPU 14.26 sec
2022-04-06 20:49:53,671 Stage-1 map = 100%, reduce = 92%, Cumulative CPU 16.93 sec
2022-04-06 20:49:54,743 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 18.09 sec
MapReduce Total cumulative CPU time: 18 seconds 90 msec
Ended Job = job_1649211406017_0012
Loading data to table test3.students_buks
Table test3.students_buks stats: [numFiles=12, numRows=1000, totalSize=41000, rawDataSize=40000]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 12 Cumulative CPU: 18.09 sec HDFS Read: 88657 HDFS Write: 41863 SUCCESS
Total MapReduce CPU Time Spent: 18 seconds 90 msec
OK
Time taken: 48.349 seconds
JDBC
hive --service hiveserver2 &
或者
hiveserver2 &
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>1.2.1</version>
</dependency>
[root@master ~]# ps -aux | grep hive
root 63493 7.5 9.1 2320328 354592 pts/3 Sl+ 21:10 0:09 /usr/local/soft/jdk1.8.0_171/bin/java -Xmx256m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/usr/local/soft/hadoop-2.7.6/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/usr/local/soft/hadoop-2.7.6 -Dhadoop.id.str=root -Dhadoop.root.logger=INFO,console -Djava.library.path=/usr/local/soft/hadoop-2.7.6/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Xmx512m -Dhadoop.security.logger=INFO,NullAppender org.apache.hadoop.util.RunJar /usr/local/soft/hive-1.2.1/lib/hive-service-1.2.1.jar org.apache.hive.service.server.HiveServer2
root 63630 0.0 0.0 112824 988 pts/2 S+ 21:12 0:00 grep --color=auto hive
[root@master ~]# netstat -tnlp | grep 63493
tcp 0 0 0.0.0.0:10000 0.0.0.0:* LISTEN 63493/java
package com.shujia.hive;
import java.sql.*;
public class Demo01HiveJDBC {
    /**
     * Connects to HiveServer2 over JDBC and prints the first 10 rows of the
     * {@code students} table in database {@code test3} as comma-separated lines.
     *
     * @param args command-line arguments (unused)
     * @throws ClassNotFoundException if the Hive JDBC driver class is not on the classpath
     * @throws SQLException           if the connection, statement, or query fails
     */
    public static void main(String[] args) throws ClassNotFoundException, SQLException {
        // 1. Load the HiveServer2 driver. Required for pre-JDBC4 drivers such as
        //    hive-jdbc 1.2.1 (used here); harmless with newer auto-registering drivers.
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // 2./3. try-with-resources guarantees the ResultSet, Statement and Connection
        // are all closed (in reverse order) even if an SQLException is thrown during
        // the query or while iterating — the original manual close() calls leaked
        // all three resources on any mid-flight failure.
        try (Connection conn = DriverManager.getConnection("jdbc:hive2://master:10000/test3");
             Statement stat = conn.createStatement();
             ResultSet rs = stat.executeQuery("select * from students limit 10")) {
            while (rs.next()) {
                // Columns are read by 1-based position, matching the table layout:
                // id bigint, name string, age int, gender string, clazz string.
                int id = rs.getInt(1);
                String name = rs.getString(2);
                int age = rs.getInt(3);
                String gender = rs.getString(4);
                String clazz = rs.getString(5);
                System.out.println(id + "," + name + "," + age + "," + gender + "," + clazz);
            }
        }
    }
}