分区
hive> create database test3;
OK
Time taken: 0.051 seconds
hive> use test3;
OK
Time taken: 0.011 seconds
hive> create external table students_pt1
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> PARTITIONED BY(pt string)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
> LOCATION '/student/input1';
OK
Time taken: 0.058 seconds
hive> alter table students_pt1 add partition(pt='20220405');
OK
Time taken: 0.086 seconds
hive> load data local inpath '/usr/local/soft/data/students.txt' into table students_pt1;
FAILED: SemanticException [Error 10062]: Need to specify partition columns because the destination table is partitioned
hive> load data local inpath '/usr/local/soft/data/students.txt' into table students_pt1 partition(pt='20220405');
Loading data to table test3.students_pt1 partition (pt=20220405)
Partition test3.students_pt1{pt=20220405} stats: [numFiles=1, totalSize=41998]
OK
Time taken: 0.653 seconds
hive> select * from students_pt1 limit 10;
OK
1500100001 施笑槐 22 女 文科六班 20220405
1500100002 吕金鹏 24 男 文科六班 20220405
1500100003 单乐蕊 22 女 理科六班 20220405
1500100004 葛德曜 24 男 理科三班 20220405
1500100005 宣谷芹 22 女 理科五班 20220405
1500100006 边昂雄 21 男 理科二班 20220405
1500100007 尚孤风 23 女 文科六班 20220405
1500100008 符半双 22 女 理科六班 20220405
1500100009 沈德昌 21 男 理科一班 20220405
1500100010 羿彦昌 23 男 理科六班 20220405
Time taken: 0.15 seconds, Fetched: 10 row(s)
hive> select * from students_pt1 where pt='20220405' limit 10;
OK
1500100001 施笑槐 22 女 文科六班 20220405
1500100002 吕金鹏 24 男 文科六班 20220405
1500100003 单乐蕊 22 女 理科六班 20220405
1500100004 葛德曜 24 男 理科三班 20220405
1500100005 宣谷芹 22 女 理科五班 20220405
1500100006 边昂雄 21 男 理科二班 20220405
1500100007 尚孤风 23 女 文科六班 20220405
1500100008 符半双 22 女 理科六班 20220405
1500100009 沈德昌 21 男 理科一班 20220405
1500100010 羿彦昌 23 男 理科六班 20220405
Time taken: 0.16 seconds, Fetched: 10 row(s)
hive> alter table students_pt1 add partition(pt='20220404');
OK
Time taken: 0.048 seconds
hive> alter table students_pt1 add partition(pt='20220403');
OK
Time taken: 0.049 seconds
hive> alter table students_pt1 add partition(pt='20220402');
OK
Time taken: 0.047 seconds
hive> alter table students_pt1 add partition(pt='20220401');
OK
Time taken: 0.048 seconds
hive> alter table students_pt1 drop partition(pt='20220401');
Dropped the partition pt=20220401
OK
Time taken: 0.107 seconds
hive> show partitions students_pt1;
OK
pt=20220402
pt=20220403
pt=20220404
pt=20220405
Time taken: 0.035 seconds, Fetched: 4 row(s)
hive> alter table students_pt1 add partition(pt='20220401');
OK
Time taken: 0.046 seconds
hive> show partitions students_pt1;
OK
pt=20220401
pt=20220402
pt=20220403
pt=20220404
pt=20220405
Time taken: 0.04 seconds, Fetched: 5 row(s)
hive> desc students_pt1;
OK
id bigint
name string
age int
gender string
clazz string
pt string
# Partition Information
# col_name data_type comment
pt string
Time taken: 0.031 seconds, Fetched: 11 row(s)
动态分区
# 表示开启动态分区
hive> set hive.exec.dynamic.partition=true;
# 表示动态分区模式:strict(需要配合静态分区一起使用)、nonstrict
# 注意:正确的配置值是 nonstrict(不是 nostrict)。Hive 只对字符串 "strict" 做特殊校验,
# 所以写成 nostrict 时实际上等同于 nonstrict,本教程才能正常运行,但应使用规范写法 nonstrict。
# strict: insert into table students_pt partition(dt='anhui',pt) select ......,pt from students;
hive> set hive.exec.dynamic.partition.mode=nostrict;
# 表示支持的最大的分区数量为1000,可以根据业务自己调整
hive> set hive.exec.max.dynamic.partitions.pernode=1000;
hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nostrict;
hive> set hive.exec.max.dynamic.partitions.pernode=1000;
hive> create table students_dt
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string,
> dt string
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.039 seconds
hive> load data local inpath '/usr/local/soft/data/students_dt.txt' into table students_dt;
Loading data to table test3.students_dt
Table test3.students_dt stats: [numFiles=1, totalSize=50998]
OK
Time taken: 0.511 seconds
hive> show tables;
OK
students_dt
students_pt1
Time taken: 0.013 seconds, Fetched: 2 row(s)
hive> desc students_dt;
OK
id bigint
name string
age int
gender string
clazz string
dt string
Time taken: 0.035 seconds, Fetched: 6 row(s)
hive> select * from students_dt limit 10;
OK
1500100001 施笑槐 22 女 文科六班 20220101
1500100002 吕金鹏 24 男 文科六班 20220101
1500100003 单乐蕊 22 女 理科六班 20220101
1500100004 葛德曜 24 男 理科三班 20220101
1500100005 宣谷芹 22 女 理科五班 20220101
1500100006 边昂雄 21 男 理科二班 20220101
1500100007 尚孤风 23 女 文科六班 20220101
1500100008 符半双 22 女 理科六班 20220101
1500100009 沈德昌 21 男 理科一班 20220101
1500100010 羿彦昌 23 男 理科六班 20220101
Time taken: 0.034 seconds, Fetched: 10 row(s)
hive> create table students_dt_p
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> PARTITIONED BY(dt string)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.033 seconds
// 分区字段需要放在 select 的最后,如果有多个分区字段 同理,它是按位置匹配,不是按名字匹配
insert into table students_dt_p partition(dt) select id,name,age,gender,clazz,dt from students_dt;
// 比如下面这条语句会使用age作为分区字段,而不会使用student_dt中的dt作为分区字段
insert into table students_dt_p partition(dt) select id,name,age,gender,dt,age from students_dt;
hive> insert into table students_dt_p partition(dt) select id,name,age,gender,clazz,dt from students_dt;
多级分区
hive> create table students_year_month
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string,
> year string,
> month string
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.032 seconds
hive> create table students_year_month_pt
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> PARTITIONED BY(year string,month string)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.03 seconds
hive> load data local inpath '/usr/local/soft/data/students_year_month.txt' into table students_year_month;
Loading data to table test3.students_year_month
Table test3.students_year_month stats: [numFiles=1, totalSize=49998]
OK
Time taken: 0.496 seconds
hive> select * from students_year_month limit 10;
OK
1500100001 施笑槐 22 女 文科六班 2022 01
1500100002 吕金鹏 24 男 文科六班 2022 01
1500100003 单乐蕊 22 女 理科六班 2022 01
1500100004 葛德曜 24 男 理科三班 2022 01
1500100005 宣谷芹 22 女 理科五班 2022 01
1500100006 边昂雄 21 男 理科二班 2022 01
1500100007 尚孤风 23 女 文科六班 2022 01
1500100008 符半双 22 女 理科六班 2022 01
1500100009 沈德昌 21 男 理科一班 2022 01
1500100010 羿彦昌 23 男 理科六班 2022 01
Time taken: 0.031 seconds, Fetched: 10 row(s)
hive> insert into table students_year_month_pt partition(year,month) select id,name,age,gender,clazz,year,month from students_year_month;
Query ID = root_20220406203304_f82525b2-c127-47fa-a0be-0c43cc56ae86
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1649211406017_0010, Tracking URL = http://master:8088/proxy/application_1649211406017_0010/
Kill Command = /usr/local/soft/hadoop-2.7.6/bin/hadoop job -kill job_1649211406017_0010
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2022-04-06 20:33:12,324 Stage-1 map = 0%, reduce = 0%
2022-04-06 20:33:18,566 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.49 sec
MapReduce Total cumulative CPU time: 1 seconds 490 msec
Ended Job = job_1649211406017_0010
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to: hdfs://master:9000/user/hive/warehouse/test3.db/students_year_month_pt/.hive-staging_hive_2022-04-06_20-33-04_845_6271486721060924938-1/-ext-10000
Loading data to table test3.students_year_month_pt partition (year=null, month=null)
Time taken for load dynamic partitions : 392
Loading partition {year=2022, month=07}
Loading partition {year=2022, month=04}
Loading partition {year=2022, month=06}
Loading partition {year=2022, month=03}
Loading partition {year=2022, month=01}
Loading partition {year=2022, month=02}
Loading partition {year=2022, month=08}
Loading partition {year=2022, month=05}
Loading partition {year=2021, month=10}
Loading partition {year=2022, month=09}
Loading partition {year=2021, month=09}
Time taken for adding to write entity : 1
Partition test3.students_year_month_pt{year=2021, month=09} stats: [numFiles=1, numRows=12, totalSize=492, rawDataSize=480]
Partition test3.students_year_month_pt{year=2021, month=10} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=01} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=02} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=03} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=04} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=05} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=06} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=07} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=08} stats: [numFiles=1, numRows=100, totalSize=4100, rawDataSize=4000]
Partition test3.students_year_month_pt{year=2022, month=09} stats: [numFiles=1, numRows=88, totalSize=3608, rawDataSize=3520]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 1.49 sec HDFS Read: 54609 HDFS Write: 41793 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 490 msec
OK
Time taken: 16.637 seconds
分桶
hive> set hive.enforce.bucketing=true;
hive> create table students_buks
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> CLUSTERED BY (clazz) into 12 BUCKETS
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.039 seconds
// 直接使用 load data 并不能将数据打散(分桶不会生效,数据仍是原文件的一个整体)
load data local inpath '/usr/local/soft/data/students.txt' into table students_buks;
// 需要使用下面这种 insert ... select 的方式插入数据,才能触发 MapReduce 按分桶列哈希打散,使分桶表真正发挥作用
insert into students_buks select * from students;
hive> create table students
> (
> id bigint,
> name string,
> age int,
> gender string,
> clazz string
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.031 seconds
hive> load data local inpath '/usr/local/soft/data/students.txt' into table students;
Loading data to table test3.students
Table test3.students stats: [numFiles=1, totalSize=41998]
OK
Time taken: 0.484 seconds
hive> show tables;
OK
students
students_buks
students_dt
students_dt_p
students_pt1
students_year_month
students_year_month_pt
Time taken: 0.01 seconds, Fetched: 7 row(s)
hive> select * from students limit 10;
OK
1500100001 施笑槐 22 女 文科六班
1500100002 吕金鹏 24 男 文科六班
1500100003 单乐蕊 22 女 理科六班
1500100004 葛德曜 24 男 理科三班
1500100005 宣谷芹 22 女 理科五班
1500100006 边昂雄 21 男 理科二班
1500100007 尚孤风 23 女 文科六班
1500100008 符半双 22 女 理科六班
1500100009 沈德昌 21 男 理科一班
1500100010 羿彦昌 23 男 理科六班
Time taken: 0.02 seconds, Fetched: 10 row(s)
hive> insert into students_buks select * from students;
Query ID = root_20220406204908_39ca2ae4-22d1-4733-915a-44d7de857372
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 12
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1649211406017_0012, Tracking URL = http://master:8088/proxy/application_1649211406017_0012/
Kill Command = /usr/local/soft/hadoop-2.7.6/bin/hadoop job -kill job_1649211406017_0012
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 12
2022-04-06 20:49:15,919 Stage-1 map = 0%, reduce = 0%
2022-04-06 20:49:22,171 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.87 sec
2022-04-06 20:49:37,757 Stage-1 map = 100%, reduce = 6%, Cumulative CPU 1.64 sec
2022-04-06 20:49:38,797 Stage-1 map = 100%, reduce = 14%, Cumulative CPU 2.91 sec
2022-04-06 20:49:41,948 Stage-1 map = 100%, reduce = 19%, Cumulative CPU 4.58 sec
2022-04-06 20:49:45,213 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 6.09 sec
2022-04-06 20:49:46,240 Stage-1 map = 100%, reduce = 31%, Cumulative CPU 7.25 sec
2022-04-06 20:49:48,369 Stage-1 map = 100%, reduce = 42%, Cumulative CPU 8.75 sec
2022-04-06 20:49:50,491 Stage-1 map = 100%, reduce = 58%, Cumulative CPU 11.51 sec
2022-04-06 20:49:51,577 Stage-1 map = 100%, reduce = 72%, Cumulative CPU 14.0 sec
2022-04-06 20:49:52,626 Stage-1 map = 100%, reduce = 75%, Cumulative CPU 14.26 sec
2022-04-06 20:49:53,671 Stage-1 map = 100%, reduce = 92%, Cumulative CPU 16.93 sec
2022-04-06 20:49:54,743 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 18.09 sec
MapReduce Total cumulative CPU time: 18 seconds 90 msec
Ended Job = job_1649211406017_0012
Loading data to table test3.students_buks
Table test3.students_buks stats: [numFiles=12, numRows=1000, totalSize=41000, rawDataSize=40000]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 12 Cumulative CPU: 18.09 sec HDFS Read: 88657 HDFS Write: 41863 SUCCESS
Total MapReduce CPU Time Spent: 18 seconds 90 msec
OK
Time taken: 48.349 seconds
JDBC
hive --service hiveserver2 &
或者
hiveserver2 &
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>1.2.1</version>
</dependency>
[root@master ~]# ps -aux | grep hive
root 63493 7.5 9.1 2320328 354592 pts/3 Sl+ 21:10 0:09 /usr/local/soft/jdk1.8.0_171/bin/java -Xmx256m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/usr/local/soft/hadoop-2.7.6/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/usr/local/soft/hadoop-2.7.6 -Dhadoop.id.str=root -Dhadoop.root.logger=INFO,console -Djava.library.path=/usr/local/soft/hadoop-2.7.6/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Xmx512m -Dhadoop.security.logger=INFO,NullAppender org.apache.hadoop.util.RunJar /usr/local/soft/hive-1.2.1/lib/hive-service-1.2.1.jar org.apache.hive.service.server.HiveServer2
root 63630 0.0 0.0 112824 988 pts/2 S+ 21:12 0:00 grep --color=auto hive
[root@master ~]# netstat -tnlp | grep 63493
tcp 0 0 0.0.0.0:10000 0.0.0.0:* LISTEN 63493/java
package com.shujia.hive;
import java.sql.*;
public class Demo01HiveJDBC {
    /**
     * Connects to HiveServer2 over JDBC and prints the first 10 rows of the
     * {@code students} table in database {@code test3} as comma-separated lines.
     *
     * @param args command-line arguments (unused)
     * @throws ClassNotFoundException if the Hive JDBC driver class is not on the classpath
     * @throws SQLException           if the connection, statement, or query fails
     */
    public static void main(String[] args) throws ClassNotFoundException, SQLException {
        // 1. Load the HiveServer2 driver. Required for pre-JDBC4 drivers such as
        //    hive-jdbc 1.2.1 (used here); harmless with newer auto-registering drivers.
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // 2./3. try-with-resources guarantees the ResultSet, Statement and Connection
        // are all closed (in reverse order) even if an SQLException is thrown during
        // the query or while iterating — the original manual close() calls leaked
        // all three resources on any mid-flight failure.
        try (Connection conn = DriverManager.getConnection("jdbc:hive2://master:10000/test3");
             Statement stat = conn.createStatement();
             ResultSet rs = stat.executeQuery("select * from students limit 10")) {
            while (rs.next()) {
                // Columns are read by 1-based position, matching the table layout:
                // id bigint, name string, age int, gender string, clazz string.
                int id = rs.getInt(1);
                String name = rs.getString(2);
                int age = rs.getInt(3);
                String gender = rs.getString(4);
                String clazz = rs.getString(5);
                System.out.println(id + "," + name + "," + age + "," + gender + "," + clazz);
            }
        }
    }
}