简单查询
between
-- SQL执行顺序:
from > join > where > group by > having > select >order by
-- 需求:获取薪资水平在 1000-3000
SELECT
*
FROM emp WHERE sal between 1000 and 3000
in
-- 获取emp表中,部门编号为 10 和 20
SELECT
*
FROM emp WHERE deptno in (10,20)
-- 获取emp表中,大于平均薪资水平的员工
select
T2.*
from emp T2 where T2.sal > (select avg(T1.sal) from emp T1)
-- 获取emp表中,薪资水平有5000的部门中所有员工的信息
SELECT
*
FROM emp WHERE deptno in (SELECT deptno from emp T1 where T1.sal = 5000)
关联
JOIN
-- 在数据量较小的情况下,MR执行过程只有Map端,说明join发生在map端,
-- 在hive中对join有优化
SELECT
T1.*
,T2.deptname
FROM emp T1 JOIN dept T2 ON T1.deptno = T2.deptno
WHERE T2.deptno = 20;
LEFT JOIN
SELECT
T1.*
,T2.*
FROM dept T1 LEFT JOIN emp T2 ON T1.deptno = T2.deptno
RIGHT JOIN
-- 左右关联时,保存主表中的所有数据, RIGHT JOIN 以右表为主表,LEFT JOIN 以左表为主表
SELECT
T1.*
,T2.*
FROM dept T1 RIGHT JOIN emp T2 ON T1.deptno = T2.deptno
GROUP BY
-- 根据部门编号取各部门中薪资最高人员
SELECT
deptno
,max(sal) as sal
FROM emp
GROUP BY deptno
-- 根据部门编号取各部门中薪资最高人员的详细人员信息
-- 注意:GROUP BY中不包含的字段不能在SELECT中存在
SELECT
deptno
,max(sal) as sal
FROM emp
GROUP BY deptno
SELECT
T1.*
FROM emp T1 JOIN (
SELECT
deptno
,max(sal) as sal
FROM emp
GROUP BY deptno
)T2 ON T1.deptno = T2.deptno AND T2.sal = T1.sal
排序
ORDER BY
查看及设置Reduce数量
set mapreduce.job.reduces = 2;
set mapreduce.job.reduces;
-- ORDER BY在执行过程中,只能获取一个Reduce,如果数据量较大的情况下,单个reduce处理速度会非常慢,
SELECT
T1.sal
,T1.empno
,T1.deptno
FROM emp T1
ORDER BY T1.sal
sort by
set mapreduce.job.reduces = -1;
SELECT
T1.sal
,T1.empno
,T1.deptno
FROM emp T1
sort by T1.sal
set mapreduce.job.reduces=2;
INSERT OVERWRITE LOCAL DIRECTORY "/usr/local/soft/data/emp_sort2"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT
T1.sal
,T1.empno
,T1.deptno
FROM emp T1
sort by T1.sal
-- 通过查看输出文件,可以了解到,当reduce数量大于1时,会生成多个文件,并在文件中,数据是有序的
distribute by + sort by
-- distribute by是给字段进行做分区,和sort by 搭配使用时,可以对分区内的数据进行做排序
set mapreduce.job.reduces=3;
INSERT OVERWRITE LOCAL DIRECTORY "/usr/local/soft/data/emp_distribute"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT
T1.sal
,T1.empno
,T1.deptno
FROM emp T1
distribute by T1.deptno sort by T1.deptno,T1.sal
-- 如果当排序列只有一个时,distribute by + sort by并且reduce数量大于1可能会产生错误结果
cluster by
set mapreduce.job.reduces=2
SELECT
T1.sal
,T1.empno
,T1.deptno
FROM emp T1
cluster by T1.deptno,T1.sal
去重
distinct
-- 当reduce数量大于1时也可以得到正确结果
set mapreduce.job.reduces=2;
select
distinct job
FROM emp
group by
-- 当reduce数量大于1时也可以得到正确结果
set mapreduce.job.reduces=2
select
job
FROM emp
GROUP BY job
WITH AS
#打印出每个部门工资最高的员工信息
WITH T2 as (
SELECT
deptno
,max(sal) as sal
FROM emp
GROUP BY deptno
)
, T3 AS (
select T1.*
FROM emp T1 JOIN T2 ON T1.deptno = T2.deptno AND T2.sal = T1.sal
)
SELECT
*
FROM T3 JOIN T2 ON T3.deptno = T2.deptno
-- 场景:
-- 通常 WITH AS被应用在某些表被反复利用的场景中,或者是SQL相对比较复杂的业务逻辑中
拼接
UNION 去重
UNION ALL:不去重
函数
查看函数
-- 查询所有函数 show functions -- 模糊查询 show functions like '%sub%'; -- 查看函数的描述信息 desc function 'substr'; -- 查看函数的描述信息及样例 desc function extended 'substr';