目录
一、开启HIVE中分区表支持中文字段
1.根据HIVE开启分区支持中文.txt中的操作步骤去MySQL中执行,修改HIVE元数据库中的编码格式
2. 插入数据
INSERT INTO TABLE filetest.partition_student PARTITION(gender="女生") SELECT "1500100002","吕金鹏",24,"文科六班"
二、分区表操作
1、 特点:
分区表与其他表不同点在于,分区字段的值为表目录下的子目录格式为: 分区字段=值
2、建表语句
create database learn2;
-- Partitioned student table: each distinct clazz value becomes a
-- sub-directory named clazz=<value> under the table directory on HDFS.
create table if not exists learn2.partition_student(
    id     string comment "学生ID",
    name   string comment "学生姓名",
    age    int    comment "年龄",
    gender string comment "性别"
)
partitioned by (clazz string comment "班级")
row format delimited fields terminated by ","
stored as textfile;
3、分区表插入数据
load data local inpath "本地路径" into table 表名 PARTITION(分区字段 = 值)
load data local inpath "/usr/local/soft/hive-3.1.2/data/文科一班.txt" into table learn2.partition_student PARTITION(clazz="文科一班");
load data local inpath "/usr/local/soft/hive-3.1.2/data/文科二班.txt" into table learn2.partition_student PARTITION(clazz="文科二班");
插入多级分区:
load data local inpath "/usr/local/soft/hive-3.1.2/data/文科一班女.txt" into table learn2.partition_student4 PARTITION(clazz="文科一班",gender="女");
load data local inpath "/usr/local/soft/hive-3.1.2/data/文科二班男.txt" into table learn2.partition_student4 PARTITION(clazz="文科二班",gender="男");
3.1 覆盖原先分区中的数据
load data local inpath "本地路径" overwrite into table 表名 PARTITION(分区字段 = 值)
load data local inpath "/usr/local/soft/hive-3.1.2/data/新文科一班.txt" overwrite into table learn2.partition_student PARTITION(clazz="文科一班");
3.2 put 方法上传数据
dfs -put /usr/local/soft/hive-3.1.2/data/理科一班.txt /user/hive/warehouse/learn2.db/partition_student2/clazz=理科一班/
3.3 增加动态分区操作
-- Session settings required before a dynamic-partition INSERT.
set hive.exec.dynamic.partition=true;              -- enable dynamic partitioning
-- FIX: the original value "nostrict" is invalid; the correct value is
-- "nonstrict" (exactly what the SemanticException quoted below suggests).
set hive.exec.dynamic.partition.mode=nonstrict;    -- allow all partition columns to be dynamic
set hive.exec.max.dynamic.partitions.pernode=1000; -- max dynamic partitions each mapper/reducer node may create
插入格式:
INSERT INTO TABLE 表名 PARTITION(分区字段) SELECT查询语句
分区的规则:默认是查询语句中后几列
-- Target table for the dynamic-partition INSERT below; clazz is the
-- partition column and is therefore NOT part of the regular column list.
CREATE TABLE IF NOT EXISTS learn2.partition_student3(
id STRING COMMENT "学生ID",
name STRING COMMENT "学生姓名",
age int COMMENT "年龄",
gender STRING COMMENT "性别"
) PARTITIONED BY (clazz STRING COMMENT "班级")
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
STORED AS TEXTFILE;
-- Dynamic-partition insert: the LAST column of the SELECT (clazz) supplies
-- the partition value for each row; requires the dynamic-partition session
-- settings to be enabled first, otherwise Error 10096 is raised.
INSERT INTO TABLE learn2.partition_student3 PARTITION(clazz) SELECT id,name,age,gender,clazz FROM learn2.partition_student2
注意:
错误:FAILED: SemanticException [Error 10096]: Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict
解决方式: 执行开启动态分区设置
4、查询分区
1. show partitions 表名;
show partitions learn2.partition_student;
5、删除分区
1. alter table 表名 drop PARTITION(分区字段=值)
alter table learn2.partition_student drop PARTITION(clazz="文科二班");
注意: 如果分区表是外部表,那么删除分区操作只能删除HIVE中的元数据 数据依然存在
-- EXTERNAL partitioned table: dropping a partition removes only the
-- metastore entry; the data files remain on HDFS (demonstrated below).
CREATE EXTERNAL TABLE IF NOT EXISTS learn2.partition_student2(
id STRING COMMENT "学生ID",
name STRING COMMENT "学生姓名",
age int COMMENT "年龄",
gender STRING COMMENT "性别"
) PARTITIONED BY (clazz STRING COMMENT "班级")
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
STORED AS TEXTFILE; -- FIX: statement previously ended without a STORED AS clause or terminating semicolon
load data local inpath "/usr/local/soft/hive-3.1.2/data/文科一班.txt" into table learn2.partition_student2 PARTITION(clazz="文科一班");
load data local inpath "/usr/local/soft/hive-3.1.2/data/文科二班.txt" into table learn2.partition_student2 PARTITION(clazz="文科二班");
alter table learn2.partition_student2 drop PARTITION(clazz="文科二班");
5.1 强制删除分区
dfs -rmr /user/hive/warehouse/learn2.db/partition_student2/clazz=文科二班
alter table learn2.partition_student2 drop PARTITION(clazz="文科二班");
6、恢复被删除分区
msck repair table 表名;
7、创建多级分区
特点:
1.查询和操作分区时 格式为: clazz=文科一班/gender=女
2.在HDFS上多级分区表现为多层级目录
3.根据具体的数据情况进行区分,尽量将大一级划分放在前
4.多级分区常用来将数据按天进行存储
-- Multi-level partitioned table: partitions nest on HDFS as
-- clazz=<value>/gender=<value>; put the coarser column (clazz) first.
create table if not exists learn2.partition_student4(
    id   string comment "学生ID",
    name string comment "学生姓名",
    age  int    comment "年龄"
)
partitioned by (clazz string comment "班级", gender string comment "性别")
row format delimited fields terminated by ","
stored as textfile;
插入数据与单级分区类似
三、数据加载
1、通过load加载数据
从HDFS上加载数据至表中
dfs -mkdir /data;
dfs -put "/usr/local/soft/hive-3.1.2/data/理科一班.txt" /data
-- Plain (non-partitioned) staging table: clazz and gender are ordinary
-- data columns here, unlike the partitioned variants above.
CREATE TABLE IF NOT EXISTS learn2.partition_student5(
id STRING COMMENT "学生ID",
name STRING COMMENT "学生姓名",
age int COMMENT "年龄",
clazz STRING COMMENT "班级",
gender STRING COMMENT "性别"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
STORED AS TEXTFILE;
load data inpath "/data/理科一班.txt" into table learn2.partition_student5;
注意: 通过HDFS中的数据加载至表中时,源路径中的数据会移动至表目录下。
从本地文件系统加载数据至表中
load data local inpath "本地路径" into table 表名
load data local inpath "本地路径" overwrite into table 表名
-- FIX: the original example stopped after the inpath and was missing the
-- "into table ..." target clause; completed against partition_student5.
load data local inpath "/usr/local/soft/hive-3.1.2/data/理科一班.txt" into table learn2.partition_student5;
注意:通过本地数据加载至表中时,源数据不变。
2、HDFS复制本地数据至表目录下(注意,创建表时,数据存储格式需要与数据的格式对应)
1.创建表,获取表所在的路径
2.通过dfs -put "路径" 表所在路径
3、INSERT 方式:
INSERT INTO:
INSERT INTO TABLE 表名 SELECT 查询语句
-- Target table for the INSERT ... SELECT example below.
CREATE TABLE IF NOT EXISTS learn2.partition_student6(
id STRING COMMENT "学生ID",
name STRING COMMENT "学生姓名",
age int COMMENT "年龄",
gender STRING COMMENT "性别"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
STORED AS TEXTFILE;
-- NOTE(review): "clazz as gender" writes class names into the gender
-- column — apparently a deliberate demonstration of column misalignment
-- (see the note that follows); confirm before copying this pattern.
INSERT INTO TABLE learn2.partition_student6 SELECT id,name,age,clazz as gender FROM learn2.partition_student5;
注意:查询语句中的字段需要与建表语句中的字段顺序一致,不然会导致数据错位。
INSERT OVERWRITE:
INSERT OVERWRITE TABLE 表名 SELECT 查询语句
通过查询的数据覆盖表中原有数据
4、AS SELECT
通过查询结果创建表,并输入数据
CREATE TABLE IF NOT EXISTS learn2.partition_student7 AS SELECT id,name,age,clazz as gender FROM learn2.partition_student5;
如果只想创建表而不需要加载数据,那么可以使用 LIKE
CREATE TABLE IF NOT EXISTS learn2.partition_student8 LIKE learn2.partition_student5;
5、LOCATION
通过创建表并指定数据所在位置,然后加载数据(数据已存在于HDFS目录中)
6、import table
需要搭配export使用,导入表时,可以指定新表名
import table 表名 FROM "路径"
IMPORT TABLE learn2.partition_student6 FROM "/data/export_data";
IMPORT TABLE learn2.partition_student9 FROM "/data/export_data";
四、数据导出
1、INSERT OVERWRITE LOCAL DIRECTORY “路径” SELECT 查询语句
INSERT OVERWRITE LOCAL DIRECTORY "/usr/local/soft/hive-3.1.2/data/output" SELECT * FROM learn2.partition_student6;
导出数据时,通过执行MapReduce任务导出到本地文件系统,查看导出数据,发现数据没有分隔符
INSERT OVERWRITE LOCAL DIRECTORY "/usr/local/soft/hive-3.1.2/data/output" ROW FORMAT DELIMITED FIELDS TERMINATED BY ":" SELECT * FROM learn2.partition_student6;
2、INSERT INTO LOCAL DIRECTORY “路径” SELECT 查询语句
3、HDFS get 方法
dfs -get 表所在路径 本地路径
dfs -get hdfs://master:9000/user/hive/warehouse/learn2.db/partition_student6 /本地路径
4、hive -e “SQL语句” > 本地文件系统
分析:
hive -e "SQL语句" 表示通过-e 将SQL语句传入CLI并执行,最后退出CLI
> 表示Linux中的重定向
hive -e "SELECT * FROM learn2.partition_student6" > /usr/local/soft/hive-3.1.2/data/output/partition_student6.txt
5、export方法
注:数据导出包括数据及其表的元数据信息
export table 库.表名 to HDFS
export table learn2.partition_student6 to "/data/export_data"
五、数据类型
复杂数据类型
1、ARRAY 使用
-- ARRAY column demo: work_locations holds a variable-length list of strings.
create table learn2.person(
name string,
work_locations array<string>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','; -- elements inside the array are separated by ","
biansutao beijing,shanghai,tianjin,hangzhou
linan changchu,chengdu,wuhan
上传数据:
dfs -put /usr/local/soft/hive-3.1.2/data/ArrayTest.txt hdfs://master:9000/user/hive/warehouse/learn2.db/person
查询:
SELECT work_locations[0], work_locations[1] FROM learn2.person;
2、MAP使用
-- MAP column demo. NOTE(review): this CREATE TABLE appears twice in the
-- notes (first with inline comments, then repeated without); running both
-- will fail on the second attempt unless IF NOT EXISTS is added.
create table learn2.score(name string, score map<string,int> )
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' -- column separator
COLLECTION ITEMS TERMINATED BY ',' -- separator between map entries
MAP KEYS TERMINATED BY ':'; -- separator between key and value within an entry
-- Duplicate of the statement above.
create table learn2.score(name string, score map<string,int> )
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'; -- separator between key and value within an entry
biansutao '数学':80,'语文':89,'英语':95
jobs '语文':60,'数学':80,'英语':99
DESC FORMATTED learn2.score;
dfs -put /usr/local/soft/hive-3.1.2/data/score.txt hdfs://master:9000/user/hive/warehouse/learn2.db/score
SELECT score["'数学'"] FROM learn2.score;
3、Struct的使用
创建数据表
-- STRUCT column demo; fields are accessed as course.course / course.score.
-- NOTE(review): statement is duplicated, and the table is created without a
-- database prefix while queried later as learn2.test — confirm the current
-- database is learn2 when running this.
CREATE TABLE test(id int,course struct<course:string,score:int> )
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' -- column separator
COLLECTION ITEMS TERMINATED BY ','; -- separator between struct fields
CREATE TABLE test(id int,course struct<course:string,score:int> )
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','; -- separator between struct fields
数据
1 english,80
2 math,89
3 chinese,95
dfs -put /usr/local/soft/hive-3.1.2/data/struct.txt hdfs://master:9000/user/hive/warehouse/learn2.db/test
SELECT course.score FROM learn2.test;
注意:struct就相当于一个对象通过 列名.属性 获取里面的数据
数据类型转换
1.强制转换小数型为int
select cast("1.1" as int);
2.小数转字符串
select cast(1.1 as STRING);
六、简单数据查询
1、where
WHERE + 过滤条件
between/ in / is NULL / IS NOT NULL / > < = ! ...
如果多个存在多个过滤条件 可以用 AND OR 进行条件关联 或者是用NOT 进行条件结果取反
2、JOIN
JOIN 内连接 左外连接 右外连接 自连接 满连接
创建员工表:
-- Employee table (EMP): one row per employee; DEPTNO references learn2.dept.
create table learn2.emp(
EMPNO int
,ENAME string
,JOB string
,MGR int
,HIREDATE string
,SAL int
,BONUS int
,DEPTNO int
)
row format delimited
fields terminated by ',';
-- Load the sample employee data from the local file system.
load data local inpath "/usr/local/soft/hive-3.1.2/data/emp.txt" into table learn2.emp;
创建部门表:
-- Department table (DEPT): one row per department.
create table learn2.dept(
DEPTNO int
,DEPTNAME string
,DEPTADDR string
)
row format delimited
fields terminated by ',';
-- Load the sample department data from the local file system.
load data local inpath "/usr/local/soft/hive-3.1.2/data/dept.txt" into table learn2.dept;
内连接:
注意:关联时一定要跟上关联条件
SELECT
T1.DEPTNAME
,T2.ENAME
FROM learn2.dept T1 JOIN learn2.emp T2 ON T1.DEPTNO = T2.DEPTNO
INSERT INTO TABLE learn2.dept (DEPTNO,DEPTNAME,DEPTADDR) VALUES (50,"bigdata","hefei");
INSERT INTO TABLE learn2.emp (EMPNO ,ENAME ,JOB ,MGR ,HIREDATE ,SAL ,BONUS ,DEPTNO )
VALUES (7934,"yanglaoban","CLERK",7782,"2018-10-1",300,10000,60);
左外连接:
-- LEFT JOIN: keep every department; ENAME is NULL for departments
-- that have no matching employees.
SELECT
    d.DEPTNAME
    ,d.DEPTNO
    ,e.ENAME
FROM learn2.dept d LEFT JOIN learn2.emp e ON d.DEPTNO = e.DEPTNO
右外连接:
SELECT
T1.DEPTNAME
,T1.DEPTNO
,T2.ENAME
FROM learn2.dept T1 RIGHT JOIN learn2.emp T2 ON T1.DEPTNO = T2.DEPTNO
3、GROUP BY 分组聚合
-- 统计每个部门下的员工数
SELECT
T1.DEPTNO
,count(1) as num
FROM learn2.emp T1 GROUP BY T1.DEPTNO
4、排序
4.1 ORDER BY 使用
-- 对薪资进行倒排
SELECT
T1.ENAME
,T1.SAL
FROM learn2.emp T1 ORDER BY T1.SAL desc
通过查看MAPREDUCE日志 可以看到 Reduce: 1 ORDER BY 中默认的reduce数量只能为1
那么为什么Reduce数量只有一个
在实际处理数据过程中,要尽量避免使用全局排序
**** set mapreduce.job.reduces; 表示查看当前reduce数量
**** set mapreduce.job.reduces = 3;表示设置当前reduce数量为3
4.2 sort by 使用
SELECT
T1.ENAME
,T1.SAL
FROM learn2.emp T1 SORT BY T1.SAL desc
通过设置reduce数量为3 查询出的结果为乱序
INSERT OVERWRITE LOCAL DIRECTORY "/usr/local/soft/hive-3.1.2/data/emp_sort"
SELECT
T1.ENAME
,T1.SAL
FROM learn2.emp T1 SORT BY T1.SAL desc;
通过输出的结果中三个文件可以看出 sort by 是分区内有序
多字段排序
-- 以部门做升序,薪资做倒序排列
SELECT
T1.DEPTNO
,T1.SAL
FROM learn2.emp T1 ORDER BY T1.DEPTNO, T1.SAL desc
SELECT
T1.DEPTNO
,T1.SAL
FROM learn2.emp T1 SORT BY T1.DEPTNO, T1.SAL desc;
-- 结果也是分区间无序,分区内有序
4.3 distribute by + sort by 搭配使用进行排序
distribute by:指定按某列数据进行分区操作,和之前MR学习中的自定义分区类似
SELECT
T1.DEPTNO
,T1.SAL
FROM learn2.emp T1 distribute by T1.DEPTNO SORT BY T1.SAL desc;
4.4 cluster by 也表示排序操作 但是不能对数据做倒序, cluster by 表示对一个列数据进行分区排序
SELECT
T1.SAL
FROM learn2.emp T1 cluster by T1.SAL ;
+---------+
| t1.sal |
+---------+
| 300 |
| 1500 |
| 2850 |
| 3000 |
| 3000 |
| 1300 |
| 1600 |
| 800 |
| 950 |
| 1100 |
| 1250 |
| 1250 |
| 2450 |
| 2975 |
| 5000 |
SELECT
T1.SAL
FROM learn2.emp T1 distribute by T1.SAL SORT BY T1.SAL;
+---------+
| t1.sal |
+---------+
| 300 |
| 1500 |
| 2850 |
| 3000 |
| 3000 |
| 1300 |
| 1600 |
| 800 |
| 950 |
| 1100 |
| 1250 |
| 1250 |
| 2450 |
| 2975 |
| 5000 |
+---------+
4.5 distinct 去重操作
--查看人员表中一共有多少部门
SELECT
DISTINCT T1.DEPTNO
FROM learn2.emp T1
5、分桶操作
5.1 创建分桶表
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS] --根据 给定的列进行 分桶排序 指定存入 N 个桶中
-- 需求:
将bucket.txt中的数据分成4个桶进行存储
-- Bucketed table: rows are hashed into 4 files by hash(id) % 4, which
-- enables bucket pruning for id-based filters (demonstrated below).
CREATE TABLE IF NOT EXISTS learn2.bucket_table(
id int,name STRING
)
CLUSTERED BY(id) INTO 4 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
-- Load the sample data; see the note below if the local file path fails.
load data local inpath "/usr/local/soft/hive-3.1.2/data/bucket.txt" into table learn2.bucket_table;
如果加载数据的时候提示文件不存在,那么可以将数据先上传至HDFS,之后再去加载数据至表中,同时需要开启桶表的支持,对桶表的load操作
dfs -put /usr/local/soft/hive-3.1.2/data/bucket.txt /data/
load data inpath "/data/bucket.txt" into table learn2.bucket_table;
--启用桶表
set hive.enforce.bucketing=true;
--限制对桶表进行load操作
set hive.strict.checks.bucketing = false;
通过查看表对应HDFS上的路径 可以看到数据分为4份,这样有什么好处?
通过查看000000_0文件数据可以看到,ID列中对4取余都为0
通过查看000001_0文件数据可以看到,ID列中对4取余都为1
这样我们后面去表中取数据,如果对id进行过滤,如下SQL语句
SELECT * FROM learn2.bucket_table WHERE id in ('4','1');
那么它的执行逻辑为: 对需要查找的 id ('4','1') 进行取余,
如果余数为0那么就去000000_0文件中加载数据
如果余数为1那么就去000001_0文件中加载数据
这样就可以避免加载不必要的数据,提高执行效率
5.2 查看桶的数量
DESC FORMATTED learn2.bucket_table; 中的 Num Buckets: 4 参数查看具体对应的桶数
6、函数的基本使用
6.1 如何查看函数
1)SHOW FUNCTIONS 查看所有支持的函数 共289个
2)SHOW FUNCTIONS LIKE "**" 模糊查询函数名
3)DESC FUNCTION + 函数名 可以查看函数的具体使用方法
6.2 IF 函数
SELECT
T1.ENAME
,IF(T1.SAL > 1000,"高收入","低收入") as if_sal
FROM learn2.emp T1
6.3 CASE WHEN 函数:CASE WHEN 。。。END
CASE WHEN 判断条件1 THEN 如果为TRUE返回THEN1后的值,
WHEN 判断条件2 THEN 如果为TRUE返回THEN2后的值,
ELSE 以上都不满足,则返回ELSE后的值
END
-- 需求:将收入分成三等分
SELECT
T1.ENAME
,CASE WHEN T1.SAL > 3000 THEN "高收入"
WHEN T1.SAL <= 3000 AND T1.SAL > 1000 THEN "中收入"
ELSE "低收入" END case_when_sal
FROM learn2.emp T1
注意:如果需求中只有一个判断,那么使用IF函数更为方便,否则使用 CASE WHEN
6.4 日期函数(请查阅HIVE日期函数使用.pdf)
select unix_timestamp(from_unixtime(unix_timestamp()));
6.5 字符串函数的使用(请查阅字符串函数.pdf)
字符串拼接:
SELECT CONCAT(T1.EMPNO,"/",T1.ENAME) FROM learn2.emp T1
SELECT CONCAT_WS("/",CAST(T1.EMPNO AS STRING),T1.ENAME) FROM learn2.emp T1
注:CONCAT_WS需要提供STRING类型数据,如果不满足可以通过CAST将数据类型进行转换
字符串截取函数:
SELECT SUBSTRING(CURRENT_DATE,1,7);
切分函数:
select split('abtcdtef','t') 返回的数据类型为复杂数据类型Array