Hive-HQL操作

捡头发的小灰灰

已于 2022-02-10 18:52:23 修改

阅读量1.3k

点赞数 1

分类专栏：大数据文章标签： hive hadoop big data

于 2022-02-09 21:01:33 首次发布

本文链接：https://blog.csdn.net/l_zerol/article/details/122849533

版权

大数据专栏收录该内容

4 篇文章 0 订阅

订阅专栏

HQL

一、DDL

1.1 数据库操作

-- 创建数据库
create database if not exists mydb2 
[comment 'this is mydb'] 
[location '/user/hive/mydb.db'];

-- 查看数据库
-- 查看所有数据库 
show database; 
-- 查看数据库信息 
desc database mydb2; 
desc database extended mydb2; 
describe database extended mydb2;

-- 删除数据库
-- 删除一个空数据库 
drop database databasename; 
-- 如果数据库不为空，使用 cascade 强制删除 
drop database databasename cascade;

1.2 内部表

-- 创建内部表   
-- 数据[2;zhangsan;book,TV,code;beijing:chaoyang,shagnhai:pudong ]

create table t1( 
    id int, 
    name string,
    hobby array<string>, 
    addr map<string, string> 
)
row format delimited 
-- 每列之间的分隔符
fields terminated by ";" 
-- 集合数据分隔符
collection items terminated by "," 
-- map数据分隔符
map keys terminated by ":";

-- 显示表的定义，显示的信息较少 
desc t1; 

-- 显示表的定义，显示的信息多，格式友好 
desc formatted t1; 

-- 加载数据 
load data local inpath '/home/hadoop/data/t1.dat' into table t1; 

-- 查询数据 
select * from t1; 

-- 查询数据文件 
dfs -ls /user/hive/warehouse/mydb.db/t1; 

-- 删除表。表和数据同时被删除 
drop table t1;

-- 再次查询数据文件，已经被删除
select * from t1;

1.3 外部表

-- 创建外部表 external
create external table t2( 
    id int, 
    name string, 
    hobby array<string>, 
    addr map<string, string> 
)
row format delimited 
fields terminated by ";" 
collection items terminated by "," 
map keys terminated by ":";

-- 显示表的定义 
desc formatted t2; 

-- 加载数据 
load data local inpath '/home/hadoop/data/t1.dat' into table t2;

-- 查询数据 
select * from t2; 

-- 删除表。表删除了，数据仍然存在 
drop table t2; 

-- 再次查询数据文件，仍然存在
select * from t2;

1.4 内外部表的转化

-- 内部表转外部表 
alter table t1 set tblproperties('EXTERNAL'='TRUE'); 

-- 外部表转内部表。EXTERNAL 大写，false 不区分大小 
alter table t1 set tblproperties('EXTERNAL'='FALSE');

1.5 分区表

-- 创建分区表
create table if not exists t3( 
    id int ,
    name string ,
    hobby array<string> ,
    addr map<String,string> 
)
partitioned by (dt string) 
row format delimited 
fields terminated by ';' 
collection items terminated by ',' 
map keys terminated by ':';

-- 加载数据到分区表
load data local inpath "/home/hadoop/data/t1.dat" into table t3 partition(dt="2020-06-01"); 

load data local inpath "/home/hadoop/data/t1.dat" into table t3 partition(dt="2020-06-02");

-- ### 备注：分区字段不是表中已经存在的数据，可以将分区字段看成伪列

-- 查看分区
show partitions t3;

-- 增加一个分区，不加载数据 
alter table t3 add partition(dt='2020-06-03');

-- 增加多个分区，不加载数据 
alter table t3 add partition(dt='2020-06-05') partition(dt='2020-06-06');

-- 增加多个分区。准备数据
hdfs dfs -cp /user/hive/warehouse/mydb.db/t3/dt=2020-06-01 /user/hive/warehouse/mydb.db/t3/dt=2020-06-07 

hdfs dfs -cp /user/hive/warehouse/mydb.db/t3/dt=2020-06-01 /user/hive/warehouse/mydb.db/t3/dt=2020-06-08

-- 增加多个分区。加载数据 
alter table t3 add 
partition(dt='2020-06-07') location '/user/hive/warehouse/mydb.db/t3/dt=2020-06-07' 
partition(dt='2020-06-08') location '/user/hive/warehouse/mydb.db/t3/dt=2020-06-08';

-- 查询数据 
select * from t3;

-- 修改分区的hdfs的路径
alter table t3 partition(dt='2020-06-01') 
set location '/user/hive/warehouse/t3/dt=2020-06-03';

-- 删除分区   可以删除一个或多个分区，用逗号隔开
alter table t3 drop partition(dt='2020-06-03'), partition(dt='2020-06-04');

1.6 分桶表

-- Hive中：分桶字段.hashCode % 分桶个数

-- 测试数据
1 java 90 
1 c 78 
1 python 91 
1 hadoop 80 
2 java 75 
2 c 76 
2 python 80 
2 hadoop 93 
3 java 98 
3 c 74 
3 python 89 
3 hadoop 91 
5 java 93 
6 c 76 
7 python 87 
8 hadoop 88


-- 创建分桶表
create table course( 
    id int, 
    name string,
    score int 
)
clustered by (id) into 3 buckets 
row format delimited 
fields terminated by "\t";

-- 创建普通表 
create table course_common( 
    id int, 
    name string, 
    score int 
)
row format delimited 
fields terminated by "\t";

-- 普通表加载数据 
load data local inpath '/home/hadoop/data/course.dat' into table course_common; 

-- 通过 insert ... select ... 给桶表加载数据 
insert into table course select * from course_common; 

-- 查询分区表数据
select * from course;

-- 观察分桶数据。数据按照：(分区字段.hashCode) % (分桶数) 进行分区

1.7 修改表，删除表

-- 修改表名。rename 
alter table course_common rename to course_common1;

-- 修改列名。change column 
alter table course_common1 change column id cid int;

-- 修改字段类型。change column
alter table course_common1 change column cid cid string; 
-- The following columns have types incompatible with the existing columns in their respective positions 
-- 修改字段数据类型时，要满足数据类型转换的要求。如int可以转为string，但是 string不能转为int

-- 增加字段。add columns
alter table course_common1 add columns (common string); 

-- 删除字段：replace columns 
-- 这里仅仅只是在元数据中删除了字段，并没有改动hdfs上的数据文件 
alter table course_common1 replace columns( id string, cname string, score int); 

-- 删除表 
drop table course_common1;

二、数据导入导出

2.1 数据导入

-- load装载
## 基本语法：
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]

### LOAD DATA LOCAL ... 从本地文件系统加载数据到Hive表中。本地文件会拷贝到Hive表指定的位置
### LOAD DATA ... 从HDFS加载数据到Hive表中。HDFS文件移动到Hive表指定的位置
### INPATH：加载数据的路径
### OVERWRITE：覆盖表中已有数据；否则表示追加数据
### PARTITION：将数据加载到指定的分区

2.2 数据插入

-- 插入数据
insert into table xx partition(dt='20211012') values (),(),(),...;

-- 插入查询的结果数据
insert into table xx partition(dt='20211012') 
select id, name, area from xx where dt='20211011';

-- 多表（多分区）插入模式 
from tabC 
insert overwrite table tabC partition(month='202003') 
select id, name, area where month='202002' 
insert overwrite table tabC partition(month='202004') 
select id, name, area where month='202002'; 

-- 创建表并插入数据 根据查询结果创建表
create table if not exists tabD as select * from tabC;

-- import导入数据
import table student2 partition(month='201709') from '/user/hive/warehouse/export/student';

2.3 数据导出

-- 将查询结果导出到本地 
insert overwrite local directory '/home/hadoop/data/tabC' 
select * from tabC; 

-- 将查询结果格式化输出到本地 
insert overwrite local directory '/home/hadoop/data/tabC2' 
row format delimited fields terminated by ' ' 
select * from tabC; 

-- 将查询结果导出到HDFS 
insert overwrite directory '/user/hadoop/data/tabC3' 
row format delimited fields terminated by ' ' 
select * from tabC; 

-- dfs 命令导出数据到本地。本质是执行数据文件的拷贝 
dfs -get /user/hive/warehouse/mydb.db/tabc/month=202001 /home/hadoop/data/tabC4

-- hive 命令导出数据到本地。执行查询将查询结果重定向到文件 
hive -e "select * from tabC" > a.log 

-- export 导出数据到HDFS。使用export导出数据时，不仅有数还有表的元数据信息
export table tabC to '/user/hadoop/data/tabC4'; 

-- export 导出的数据，可以使用 import 命令导入到 Hive 表中 
-- 使用 like tname创建的表结构与原表一致。create ... as select ... 结构可能不一致 
create table tabE like tabc; import table tabE from ''/user/hadoop/data/tabC4'; 

-- 截断表，清空数据。(注意：仅能操作内部表) 外部表不能执行 truncate 操作
truncate table tabE;

三、DQL - 排序

3.1 每个MR内部排序 [sort by]

-- 对于大规模数据而言order by效率低；
-- 在很多业务场景，我们并不需要全局有序的数据，此时可以使用sort by；
-- sort by为每个reduce产生一个排序文件，在reduce内部进行排序，得到局部有序的结果；sort by的数据只能保证在同一个reduce中的数据可以按指定字段排序；


-- 设置reduce个数 
set mapreduce.job.reduces=2; 

-- 按照工资降序查看员工信息 
select * from emp sort by sal desc; 

-- 将查询结果导入到文件中（按照工资降序）。生成两个输出文件，每个文件内部数据 按工资降序排列 
insert overwrite local directory '/home/hadoop/output/sortsal' 
select * from emp sort by sal desc;

3.2 分区排序 [distribute by]

-- distribute by 将特定的行发送到特定的reducer中，便于后继的聚合与排序操作；
-- distribute by 类似于MR中的分区操作，可以结合sort by操作，使分区数据有序；
-- distribute by 要写在sort by之前；


-- 启动2个reducer task；先按 deptno 分区，在分区内按 sal+comm 排序 
set mapreduce.job.reduces=2; 

-- 将结果输出到文件，观察输出结果 
insert overwrite local directory '/home/hadoop/output/distBy' 
select empno, ename, job, deptno, sal + nvl(comm, 0) salcomm 
from emp distribute by deptno sort by salcomm desc; 

-- 上例中，数据被分到了统一区，看不出分区的结果

-- 将数据分到3个区中，每个分区都有数据
set mapreduce.job.reduces=3; 

insert overwrite local directory '/home/hadoop/output/distBy1' 
select empno, ename, job, deptno, sal + nvl(comm, 0) salcomm 
from emp distribute by deptno sort by salcomm desc;

3.3 cluster by

-- 当distribute by 与 sort by是同一个字段时，可使用cluster by简化语法；

-- 语法上是等价的 
select * from emp distribute by deptno sort by deptno; 
select * from emp cluster by deptno;

四、函数*

4.1 查看系统内置函数

-- 查看系统自带函数 
show functions; 

-- 显示自带函数的用法 
desc function upper; 
desc function extended upper;

4.2 日期函数*

-- 当前前日期 
select current_date; 
select unix_timestamp(); 

-- 建议使用current_timestamp，有没有括号都可以 
select current_timestamp(); 

-- 时间戳转日期 
select from_unixtime(1505456567); 
select from_unixtime(1505456567, 'yyyyMMdd'); 
select from_unixtime(1505456567, 'yyyy-MM-dd HH:mm:ss'); 

-- 日期转时间戳 
select unix_timestamp('2019-09-15 14:23:00'); 

-- 计算时间差 
select datediff('2020-04-18','2019-11-21'); 
select datediff('2019-11-21', '2020-04-18'); 

-- 查询当月第几天 
select dayofmonth(current_date); 

-- 计算月末: 
select last_day(current_date); 

-- 当月第1天: 
select date_sub(current_date, dayofmonth(current_date)-1) 

-- 下个月第1天: 
select add_months(date_sub(current_date, dayofmonth(current_date)-1), 1) 

-- 字符串转时间（字符串必须为：yyyy-MM-dd格式） 
select to_date('2020-01-01'); select to_date('2020-01-01 12:12:12'); 

-- 日期、时间戳、字符串类型格式化输出标准时间格式 
select date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss'); 
select date_format(current_date(), 'yyyyMMdd'); 
select date_format('2020-06-01', 'yyyy-MM-dd HH:mm:ss');

-- 计算emp表中，每个人的工龄 
select *, round(datediff(current_date, hiredate)/365,1) workingyears from emp;

4.3 字符串函数

-- 转小写。lower 
select lower("HELLO WORLD"); 

-- 转大写。upper 
select lower(ename), ename from emp; 

-- 求字符串长度。length 
select length(ename), ename from emp; 

-- 字符串拼接。 concat / || 
select empno || " " ||ename idname from emp; 
select concat(empno, " " ,ename) idname from emp; 

-- 指定分隔符。concat_ws(separator, [string | array(string)]+) 
SELECT concat_ws('.', 'www', array('lagou', 'com')); 
www.lagou.com

select concat_ws(" ", ename, job) from emp; 

-- 求子串。substr 
SELECT substr('www.lagou.com', 5); 
lagou.com

SELECT substr('www.lagou.com', -5); 
u.com

SELECT substr('www.lagou.com', 5, 5);
lagou 

-- 字符串切分。split，注意 '.' 要转义 
select split("www.lagou.com", "\\.");
["www","lagou","com"]

4.4 数学函数

-- 四舍五入。round 
select round(314.15926); 
select round(314.15926, 2); 
select round(314.15926, -2);

-- 向上取整。ceil 
select ceil(3.1415926); 

-- 向下取整。floor 
select floor(3.1415926); 

-- 其他数学函数包括：绝对值、平方、开方、对数运算、三角运算等

4.5 条件函数*

-- if (boolean testCondition, T valueTrue, T valueFalseOrNull) 
select sal, if (sal<1500, 1, if (sal < 3000, 2, 3)) from emp;
=>if (sal < 1500) then 1 else if sal < 3000 then 2 else 3

-- CASE WHEN a THEN b [WHEN c THEN d]* [ELSE e] END
select sal, case when sal<=1500 then 1 
				 when sal<=3000 then 2 
				 else 3 
		    end sallevel 
from emp;

-- 下面两个等价：
select ename, deptno, 
		case deptno when 10 then 'accounting' 
					when 20 then 'research' 
					when 30 then 'sales' 
					else 'unknown' 
		end deptname 
from emp; 

select ename, deptno, 
	case when deptno=10 then 'accounting' 
		 when deptno=20 then 'research' 
		 when deptno=30 then 'sales' 
		 else 'unknown' 
	end deptname 
from emp;

4.5.1 coalesce函数

-- coalesce函数：返回参数中的第一个非空值；如果所有值都为 NULL，那么返回NULL；
select sal, coalesce(comm, 0) from emp;
-- 输入类型是整数型且允许为空coalesce(comm, 0)，表示把空值转换成0

4.5.2 isnull & isnotnull

select * from emp where isnull(comm); 
select * from emp where isnotnull(comm);

4.5.3 nvl

-- nvl(T value, T default_value)  comm是空值返回0
select empno, ename, job, mgr, hiredate, deptno, sal + nvl(comm,0) sumsal from emp;

4.5.4 nullif

-- nullif(x, y) 
-- 比较 expr1 和 expr2 的值，若二者相等，则返回 NULL,否则返回 expr1 的值。其中 expr1 不能为 NULL
SELECT nullif("b", "b"), nullif("b", "a");

4.6 UDTF函数*

参考：
https://blog.csdn.net/weixin_43230682/article/details/108096441?spm=1001.2101.3001.6650.5&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7Edefault-5.essearch_pc_relevant&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7Edefault-5.essearch_pc_relevant

-- explode 炸裂函数  就是将一行中复杂的 array 或者 map 结构拆分成多行
select explode(array('A','B','C')) as col; 
col
A
B
C

select explode(map('a', 8, 'b', 88, 'c', 888));
key	value
a	8
b	88
c	888

-- lateral view用于和split、explode等UDTF一起使用的，
-- 能将一行数据拆分成多行数据，在此基础上可以对拆分的数据进行聚合，
-- lateral view首先为原始表的每行调用UDTF，UDTF会把一行拆分成一行或者多行，
-- lateral view再把结果组合，产生一个支持别名表的虚拟表。

-- lateral view的基本使用：
with t1 as ( select 'OK' cola, split('www.lagou.com', '\\.') colb )
select cola, colc from t1 lateral view explode(colb) t2 as colc;
cola	colc
OK	www
OK	lagou
OK	com

-- UDTF函数的用例：	
-- 用例1：
1 1,2,3 
2 2,3 
3 1,2
==》
1 1
1 2
1 3 
2 2 
2 3 
3 1 
3 2
-- sql：
select uid, tag from t1lateral view explode(split(tags,",")) t2 as tag;

-- 用例2：
lisi|Chinese:90,Math:80,English:70 
wangwu|Chinese:88,Math:90,English:96
maliu|Chinese:99,Math:65,English:60

-- 创建表 
create table studscore( 
	name string ,
	score map<String,string>
) 
row format delimited 
fields terminated by '|' 
collection items terminated by ',' 
map keys terminated by ':';

-- 需求：找到每个学员的最好成绩 
-- 第一步，使用 explode 函数将map结构拆分为多行 
select explode(score) as (subject, socre) from studscore; 

-- 但是这里缺少了学员姓名，加上学员姓名后出错。下面的语句有是错的 
select name, explode(score) as (subject, socre) from studscore; 

-- 第二步：explode常与 lateral view 函数联用，这两个函数结合在一起能关 联其他字段 
select name, subject, score1 as score 
from studscore lateral view explode(score) t1 as subject, score1; 

-- 第三步：找到每个学员的最好成绩 
select name, max(mark) maxscore 
from (select name, subject, mark 
		from studscore lateral view explode(score) t1 as subject, mark
	 ) t1 
group by name; 

with tmp as ( 
	select name, subject, mark 
	from studscore 
	lateral view explode(score) t1 as subject, mark 
)
select name, max(mark) maxscore from tmp group by name;

4.7 窗口函数*

窗口函数用于计算基于组的某种聚合值，它和聚合函数的不同之处是：对于每个组返回多行，而聚合函数对于每个组只返回一行。窗口函数指定了分析函数工作的数据窗口大小，这个数据窗口大小可能会随着行的变化而变化。
一般要通过 over() 进行开窗，窗口函数是针对每一行数据的，如果over中没有参数，则默认的是全部结果集；

4.7.1 over() 函数

-- 查询emp表工资总和 
select sum(sal) from emp; 

-- 不使用窗口函数，有语法错误 
select ename, sal, sum(sal) salsum from emp; 

-- 使用窗口函数，查询员工姓名、薪水、薪水总和 
select ename, sal, 
	sum(sal) over() salsum, concat(round(sal / sum(sal) over()*100, 1) || '%') ratiosal 
from emp;

4.7.2 partition by 子句

-- 在over窗口中进行分区，对某一列进行分区统计，窗口的大小就是分区的大小;
-- 查询员工姓名、薪水、部门薪水总和 
select ename, sal, sum(sal) over(partition by deptno) salsum from emp;

4.7.3 order by 子句

-- order by 子句对输入的数据进行排序
-- 增加了order by子句；sum：从分组的第一行到当前行求和 
select ename, sal, deptno, 
	sum(sal) over(partition by deptno order by sal) salsum 
from emp;

4.7.4 Window 子句

rows between ... and ...

-- 对窗口的结果做更细粒度的划分，使用window子句，有如下的几个选项：
-- unbounded preceding。组内第一行数据
-- n preceding。组内当前行的前n行数据
-- current row。当前行数据
-- n following。组内当前行的后n行数据
-- unbounded following。组内最后一行数据

在这里插入图片描述

-- 用例
-- rows between ... and ... 子句 
-- 等价。组内，第一行到当前行的和 
select ename, sal, deptno, 
	sum(sal) over(partition by deptno order by ename) 
from emp; 

select ename, sal, deptno, 
	sum(sal) over(partition by deptno order by ename rows between unbounded preceding and current row ) 
from emp; 

-- 组内，第一行到最后一行的和
select ename, sal, deptno, 
	sum(sal) over(partition by deptno order by ename rows between unbounded preceding and unbounded following ) 
from emp; 

-- 组内，前一行、当前行、后一行的和 
select ename, sal, deptno, 
	sum(sal) over(partition by deptno order by ename rows between 1 preceding and 1 following ) 
from emp;

4.7.5 排名函数

## 都是从1开始，生成数据项在分组中的排名。
-- row_number()。排名顺序增加不会重复；如1、2、3、4、... ...
-- RANK()。 排名相等会在名次中留下空位；如1、2、2、4、5、... ...
-- DENSE_RANK()。 排名相等会在名次中不会留下空位 ；如1、2、2、3、4、... ...

### 排序方式的区别
-- row_number / rank / dense_rank 
100 	1	 	1 		1 
100 	2 		1 		1 
100 	3 		1 		1 
99 		4 		4 		2 
98 		5 		5 		3 
98		6		5 		3 
97 		7 		7 		4

### 用例
-- 数据准备 
class1 s01 100 
class1 s03 100 
class1 s05 100 
class1 s07 99 
class1 s09 98 
class1 s02 98 
class1 s04 97 
class2 s21 100 
class2 s24 99 
class2 s27 99 
class2 s22 98
class2 s25 98 
class2 s28 97
class2 s26 96

-- 创建表加载数据 
create table t2( 
	cname string, 
	sname string, 
	score int 
)
row format delimited fields terminated by '\t'; 
load data local inpath '/home/hadoop/data/t2.dat' into table t2;

-- 按照班级，使用3种方式对成绩进行排名 
select cname, sname, score, 
	row_number() over (partition by cname order by score desc) rank1, 
	rank() over (partition by cname order by score desc) rank2, 
	dense_rank() over (partition by cname order by score desc) rank3 
from t2; 

-- 求每个班级前3名的学员--前3名的定义是什么--假设使用dense_rank 
select cname, sname, score, rank from (
	select cname, sname, score, 
		dense_rank() over (partition by cname order by score desc) rank 
	from t2
) tmp 
where rank <= 3;

4.7.6 序列函数

-- lag。返回当前数据行的上一行数据
-- lead。返回当前数据行的下一行数据
-- first_value。取分组内排序后，截止到当前行，第一个值
-- last_value。分组内排序后，截止到当前行，最后一个值
-- ntile。将分组的数据按照顺序切分成n片，返回当前切片值

### 测试数据 userpv.dat。cid ctime pv
cookie1,2019-04-10,1 
cookie1,2019-04-11,5 
cookie1,2019-04-12,7
cookie1,2019-04-13,3 
cookie1,2019-04-14,2 
cookie1,2019-04-15,4
cookie1,2019-04-16,4 
cookie2,2019-04-10,2 
cookie2,2019-04-11,3
cookie2,2019-04-12,5 
cookie2,2019-04-13,6
cookie2,2019-04-14,3 
cookie2,2019-04-15,9
cookie2,2019-04-16,7

-- 建表语句 
create table userpv( 
	cid string, 
	ctime date, 
	pv int 
)
row format delimited fields terminated by ",";

-- 加载数据 
Load data local inpath '/home/hadoop/data/userpv.dat' into table userpv; 

-- lag。返回当前数据行的上一行数据 
-- lead。功能上与lag类似 
select cid, ctime, pv, 
	lag(pv) over(partition by cid order by ctime) lagpv, 
	lead(pv) over(partition by cid order by ctime) leadpv 
from userpv; 

-- first_value / last_value 
select cid, ctime, pv, 
	first_value(pv) over (partition by cid order by ctime rows between unbounded preceding and unbounded following) as firstpv,
	last_value(pv) over (partition by cid order by ctime rows between unbounded preceding and unbounded following) as lastpv 
from userpv; 

-- ntile。按照cid进行分组，每组数据分成2份 
select cid, ctime, pv, ntile(2) over(partition by cid order by ctime) ntile 
from userpv;