一、HQL操作之--DQL命令【重点】
~~~ [Hive_DQL之简单查询]
~~~ [Hive_DQL之where子句]
~~~ [Hive_DQL之grouply子句]
~~~ [Hive_DQL之表连接]
~~~ [Hive_DQL之order by]
~~~ [Hive_DQL之sort by]
~~~ [Hive_DQL之简单查询]
~~~ [Hive_DQL之distribute]
~~~ [Hive_DQL之cluster by]
### --- DQL -- Data Query Language 数据查询语言
~~~ # select语法:
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY
col_list]]
[LIMIT [offset,] rows]
### --- SQL语句书写注意事项:
~~~ SQL语句对大小写不敏感
~~~ SQL语句可以写一行(简单SQL)也可以写多行(复杂SQL)
~~~ 关键字不能缩写,也不能分行
~~~ 各子句一般要分行
~~~ 使用缩进格式,提高SQL语句的可读性(重要)
### --- 创建表,加载数据
~~~ # 测试数据 /home/hadoop/data/emp.dat
[root@linux123 ~]# vim /home/hadoop/data/emp.dat
7369,SMITH,CLERK,7902,2010-12-17,800,,20
7499,ALLEN,SALESMAN,7698,2011-02-20,1600,300,30
7521,WARD,SALESMAN,7698,2011-02-22,1250,500,30
7566,JONES,MANAGER,7839,2011-04-02,2975,,20
7654,MARTIN,SALESMAN,7698,2011-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,2011-05-01,2850,,30
7782,CLARK,MANAGER,7839,2011-06-09,2450,,10
7788,SCOTT,ANALYST,7566,2017-07-13,3000,,20
7839,KING,PRESIDENT,,2011-11-07,5000,,10
7844,TURNER,SALESMAN,7698,2011-09-08,1500,0,30
7876,ADAMS,CLERK,7788,2017-07-13,1100,,20
7900,JAMES,CLERK,7698,2011-12-03,950,,30
7902,FORD,ANALYST,7566,2011-12-03,3000,,20
7934,MILLER,CLERK,7782,2012-01-23,1300,,10
~~~ # 建表并加载数据
hive (mydb)> CREATE TABLE emp (
empno int,
ename string,
job string,
mgr int,
hiredate DATE,
sal int,
comm int,
deptno int
)row format delimited fields terminated by ",";
~~~ # 加载数据
hive (mydb)> LOAD DATA LOCAL INPATH '/home/hadoop/data/emp.dat' INTO TABLE emp;
二、基本查询
### --- 查询语句
~~~ # 省略from子句的查询
hive (mydb)> select 8*888 ;
7104
hive (mydb)> select current_date ;
2021-08-23
~~~ # 使用列别名
hive (mydb)> select 8*888 product;
product
7104
hive (mydb)> select current_date as currdate;
currdate
2021-08-23
~~~ # 全表查询
hive (mydb)> select * from emp;
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
~~~ # 选择特定列查询
hive (mydb)> select ename, sal, comm from emp;
ename sal comm
SMITH 800 NULL
ALLEN 1600 300
WARD 1250 500
JONES 2975 NULL
MARTIN 1250 1400
BLAKE 2850 NULL
CLARK 2450 NULL
SCOTT 3000 NULL
KING 5000 NULL
TURNER 1500 0
ADAMS 1100 NULL
JAMES 950 NULL
FORD 3000 NULL
MILLER 1300 NULL
~~~ # 使用函数
hive (mydb)> select count(*) from emp;
14
~~~ # count(colname) 按字段进行count,不统计NULL
hive (mydb)> select sum(sal) from emp;
29025
hive (mydb)> select max(sal) from emp;
5000
hive (mydb)> select min(sal) from emp;
800
hive (mydb)> select avg(sal) from emp;
2073.214285714286
~~~ # 使用limit子句限制返回的行数
hive (mydb)> select * from emp limit 3;
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
三、where子句
### --- where子句
~~~ WHERE子句紧随FROM子句,使用WHERE子句,过滤不满足条件的数据;
~~~ # where 子句中不能使用列的别名;
### --- where子句中会涉及到较多的比较运算 和 逻辑运算;
hive (mydb)> select * from emp where sal > 2000;
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
### --- 比较运算符
~~~ 官方文档:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
比较运算符 | 描述 |
=、==、<=> | 等于 |
<>、!= | 不等于 |
<、<=、 >、>= | 大于等于、小于等于 |
is [not]null | 如果A等于NULL,则返回TRUE,反之返回FALSE使用NOT关键字结果相反。 |
in(value1,value2, ......) | 匹配列表中的值 |
LIKE | 简单正则表达式,也称通配符模式。'x%' 表示必须以字母 'x' 开头; '%x'表示必须以字母'x'结尾;'%x%'表示包含有字母'x', 可以位于字符串任意位置。使用NOT关键字结果相反。 % 代表匹配零个或多个字符(任意个字符);_ 代表匹配一个字符。 |
[NOT]BETWEEN ... AND ... | 范围的判断,使用NOT关键字结果相反。 |
RLIKE、REGEXP | 基于java的正则表达式,匹配返回TRUE,反之返回FALSE。 匹配使用的是JDK中的正则表达式接口实现的,因为正则也依据其中的规则。 例如,正则表达式必须和整个字符串A相匹配,而不是只需与其字符串匹配。 |
### --- 备注:通常情况下NULL参与运算,返回值为NULL;# NULL<=>NULL的结果为true
~~~ # 逻辑运算符
~~~ 就是我们所熟悉的:and、or、not
~~~ 比较运算符,null参与运算
hive (mydb)> select null=null;
NULL
hive (mydb)> select null==null;
NULL
hive (mydb)> select null<=>null;
true
~~~ # 使用 is null 判空
hive (mydb)> select * from emp where comm is null;
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
~~~ # 使用 in
hive (mydb)> select * from emp where deptno in (20, 30);
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
~~~ # 使用 between ... and ...
hive (mydb)> select * from emp where sal between 1000 and 2000;
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
~~~ # 使用 like
hive (mydb)> select ename, sal from emp where ename like '%L%';
ename sal
ALLEN 1600
BLAKE 2850
CLARK 2450
MILLER 1300
~~~ # 使用 rlike。正则表达式,名字以A或S开头
hive (mydb)> select ename, sal from emp where ename rlike '^(A|S).*';
ename sal
SMITH 800
ALLEN 1600
SCOTT 3000
ADAMS 1100