Hive: ------ SQL查询、开窗函数、Cube等

分析查询

数据筹备

7369,SMITH,CLERK,7902,1980-12-17 00:00:00,800,\N,20
7499,ALLEN,SALESMAN,7698,1981-02-20 00:00:00,1600,300,30
7521,WARD,SALESMAN,7698,1981-02-22 00:00:00,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02 00:00:00,2975,\N,20
7654,MARTIN,SALESMAN,7698,1981-09-28 00:00:00,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01 00:00:00,2850,\N,30
7782,CLARK,MANAGER,7839,1981-06-09 00:00:00,2450,\N,10
7788,SCOTT,ANALYST,7566,1987-04-19 00:00:00,1500,\N,20
7839,KING,PRESIDENT,\N,1981-11-17 00:00:00,5000,\N,10
7844,TURNER,SALESMAN,7698,1981-09-08 00:00:00,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23 00:00:00,1100,\N,20
7900,JAMES,CLERK,7698,1981-12-03 00:00:00,950,\N,30
7902,FORD,ANALYST,7566,1981-12-03 00:00:00,3000,\N,20
7934,MILLER,CLERK,7782,1982-01-23 00:00:00,1300,\N,10
CREATE TABLE t_employee(
    empno INT,
    ename STRING,
    job STRING,
    mgr INT,
    hiredate TIMESTAMP,
    sal DECIMAL(7,2),
    comm DECIMAL(7,2),
    deptno INT)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile;
10,ACCOUNTING,NEW YORK
20,RESEARCH,DALLAS
30,SALES,CHICAGO
40,OPERATIONS,BOSTON
CREATE TABLE t_dept(
    DEPTNO INT,
    DNAME STRING,
    LOC STRING)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile;
0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno  from t_employee;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7369   | SMITH   | CLERK      | 7902  | 1980-12-17 00:00:00.0  | 800   | NULL  | 20      |
| 7499   | ALLEN   | SALESMAN   | 7698  | 1981-02-20 00:00:00.0  | 1600  | 300   | 30      |
| 7521   | WARD    | SALESMAN   | 7698  | 1981-02-22 00:00:00.0  | 1250  | 500   | 30      |
| 7566   | JONES   | MANAGER    | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7654   | MARTIN  | SALESMAN   | 7698  | 1981-09-28 00:00:00.0  | 1250  | 1400  | 30      |
| 7698   | BLAKE   | MANAGER    | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7782   | CLARK   | MANAGER    | 7839  | 1981-06-09 00:00:00.0  | 2450  | NULL  | 10      |
| 7788   | SCOTT   | ANALYST    | 7566  | 1987-04-19 00:00:00.0  | 1500  | NULL  | 20      |
| 7839   | KING    | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7844   | TURNER  | SALESMAN   | 7698  | 1981-09-08 00:00:00.0  | 1500  | 0     | 30      |
| 7876   | ADAMS   | CLERK      | 7788  | 1987-05-23 00:00:00.0  | 1100  | NULL  | 20      |
| 7900   | JAMES   | CLERK      | 7698  | 1981-12-03 00:00:00.0  | 950   | NULL  | 30      |
| 7902   | FORD    | ANALYST    | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7934   | MILLER  | CLERK      | 7782  | 1982-01-23 00:00:00.0  | 1300  | NULL  | 10      |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
14 rows selected (0.047 seconds)
0: jdbc:hive2://CentOS:10000> select deptno,dname,loc from t_dept;
+---------+-------------+-----------+--+
| deptno  |    dname    |    loc    |
+---------+-------------+-----------+--+
| 10      | ACCOUNTING  | NEW YORK  |
| 20      | RESEARCH    | DALLAS    |
| 30      | SALES       | CHICAGO   |
| 40      | OPERATIONS  | BOSTON    |
+---------+-------------+-----------+--+
4 rows selected (0.046 seconds)
CREATE TABLE t_employee_partition(
    empno INT,
    ename STRING,
    job STRING,
    mgr INT,
    hiredate TIMESTAMP,
    sal DECIMAL(7,2),
    comm DECIMAL(7,2))
PARTITIONED BY(deptno INT) 
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile;
0: jdbc:hive2://CentOS:10000> set hive.exec.dynamic.partition.mode=nonstrict
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee_partition PARTITION (deptno)  SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee;

SQL查询

单表查询
0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno  from t_employee;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7369   | SMITH   | CLERK      | 7902  | 1980-12-17 00:00:00.0  | 800   | NULL  | 20      |
| 7499   | ALLEN   | SALESMAN   | 7698  | 1981-02-20 00:00:00.0  | 1600  | 300   | 30      |
| 7521   | WARD    | SALESMAN   | 7698  | 1981-02-22 00:00:00.0  | 1250  | 500   | 30      |
| 7566   | JONES   | MANAGER    | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7654   | MARTIN  | SALESMAN   | 7698  | 1981-09-28 00:00:00.0  | 1250  | 1400  | 30      |
| 7698   | BLAKE   | MANAGER    | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7782   | CLARK   | MANAGER    | 7839  | 1981-06-09 00:00:00.0  | 2450  | NULL  | 10      |
| 7788   | SCOTT   | ANALYST    | 7566  | 1987-04-19 00:00:00.0  | 1500  | NULL  | 20      |
| 7839   | KING    | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7844   | TURNER  | SALESMAN   | 7698  | 1981-09-08 00:00:00.0  | 1500  | 0     | 30      |
| 7876   | ADAMS   | CLERK      | 7788  | 1987-05-23 00:00:00.0  | 1100  | NULL  | 20      |
| 7900   | JAMES   | CLERK      | 7698  | 1981-12-03 00:00:00.0  | 950   | NULL  | 30      |
| 7902   | FORD    | ANALYST    | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7934   | MILLER  | CLERK      | 7782  | 1982-01-23 00:00:00.0  | 1300  | NULL  | 10      |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
14 rows selected (0.056 seconds)
WHERE查询
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee WHERE empno > 7782 AND deptno = 10;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7839   | KING    | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7934   | MILLER  | CLERK      | 7782  | 1982-01-23 00:00:00.0  | 1300  | NULL  | 10      |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
2 rows selected (0.067 seconds)
DISTINCT查询
0: jdbc:hive2://CentOS:10000> select distinct(job) from t_employee;  去重!!!!!!!!!!
+------------+--+
|    job     |
+------------+--+
| ANALYST    |
| CLERK      |
| MANAGER    |
| PRESIDENT  |
| SALESMAN   |
+------------+--+
分区查询
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee_partition e  WHERE e.deptno >= 20 AND e.deptno <= 40;
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job    |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
| 7369   | SMITH   | CLERK     | 7902  | 1980-12-17 00:00:00.0  | 800   | NULL  | 20      |
| 7566   | JONES   | MANAGER   | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7788   | SCOTT   | ANALYST   | 7566  | 1987-04-19 00:00:00.0  | 1500  | NULL  | 20      |
| 7876   | ADAMS   | CLERK     | 7788  | 1987-05-23 00:00:00.0  | 1100  | NULL  | 20      |
| 7902   | FORD    | ANALYST   | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7499   | ALLEN   | SALESMAN  | 7698  | 1981-02-20 00:00:00.0  | 1600  | 300   | 30      |
| 7521   | WARD    | SALESMAN  | 7698  | 1981-02-22 00:00:00.0  | 1250  | 500   | 30      |
| 7654   | MARTIN  | SALESMAN  | 7698  | 1981-09-28 00:00:00.0  | 1250  | 1400  | 30      |
| 7698   | BLAKE   | MANAGER   | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7844   | TURNER  | SALESMAN  | 7698  | 1981-09-08 00:00:00.0  | 1500  | 0     | 30      |
| 7900   | JAMES   | CLERK     | 7698  | 1981-12-03 00:00:00.0  | 950   | NULL  | 30      |
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
11 rows selected (0.123 seconds)
LIMIT查询
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee  ORDER BY sal DESC LIMIT 5;
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
| empno  | ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
| 7839   | KING   | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7902   | FORD   | ANALYST    | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7566   | JONES  | MANAGER    | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7698   | BLAKE  | MANAGER    | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7782   | CLARK  | MANAGER    | 7839  | 1981-06-09 00:00:00.0  | 2450  | NULL  | 10      |
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
5 rows selected (14.294 seconds)
GROUP BY查询
0: jdbc:hive2://CentOS:10000> set hive.map.aggr=true;
0: jdbc:hive2://CentOS:10000> SELECT deptno,SUM(sal) as total FROM t_employee GROUP BY deptno;
+---------+--------+--+
| deptno  | total  |
+---------+--------+--+
| 10      | 8750   |
| 20      | 9375   |
| 30      | 9400   |
+---------+--------+--+
3 rows selected (12.645 seconds)

hive.map.aggr控制程序如何进行聚合。默认值为false。如果设置为true,Hive会在map阶段就执行一次聚合。这可以提高聚合效率,但需要消耗更多内存。

ORDER AND SORT

可以使用ORDER BY或者Sort BY对查询结果进行排序,排序字段可以是整型也可以是字符串:如果是整型,则按照大小排序;如果是字符串,则按照字典序排序。ORDER BY 和 SORT BY 的区别如下:使用ORDER BY时会有一个Reducer对全部查询结果进行排序,可以保证数据的全局有序性;使用SORT BY时只会在每个Reducer中进行排序,这可以保证每个Reducer的输出数据是有序的,但不能保证全局有序。由于ORDER BY的时间可能很长,如果你设置了严格模式(hive.mapred.mode = strict),则其后面必须再跟一个limit子句。

  • sort by
0: jdbc:hive2://CentOS:10000> set mapreduce.job.reduces=2
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee sort by sal desc;
+--------+---------+-------+--+
| empno  |  ename  |  sal  |
+--------+---------+-------+--+
| 7902   | FORD    | 3000  |
| 7566   | JONES   | 2975  |
| 7844   | TURNER  | 1500  |
| 7788   | SCOTT   | 1500  |
| 7521   | WARD    | 1250  |
| 7654   | MARTIN  | 1250  |
| 7876   | ADAMS   | 1100  |
| 7900   | JAMES   | 950   |
| 7369   | SMITH   | 800   |
| 7839   | KING    | 5000  |
| 7698   | BLAKE   | 2850  |
| 7782   | CLARK   | 2450  |
| 7499   | ALLEN   | 1600  |
| 7934   | MILLER  | 1300  |
+--------+---------+-------+--+
14 rows selected (14.474 seconds)
  • order by
0: jdbc:hive2://CentOS:10000> set mapreduce.job.reduces=3;
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc;
+--------+---------+-------+--+
| empno  |  ename  |  sal  |
+--------+---------+-------+--+
| 7839   | KING    | 5000  |
| 7902   | FORD    | 3000  |
| 7566   | JONES   | 2975  |
| 7698   | BLAKE   | 2850  |
| 7782   | CLARK   | 2450  |
| 7499   | ALLEN   | 1600  |
| 7844   | TURNER  | 1500  |
| 7788   | SCOTT   | 1500  |
| 7934   | MILLER  | 1300  |
| 7654   | MARTIN  | 1250  |
| 7521   | WARD    | 1250  |
| 7876   | ADAMS   | 1100  |
| 7900   | JAMES   | 950   |
| 7369   | SMITH   | 800   |
+--------+---------+-------+--+
14 rows selected (13.049 seconds)
0: jdbc:hive2://CentOS:10000> set hive.mapred.mode = strict;
No rows affected (0.004 seconds)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc;
Error: Error while compiling statement: FAILED: SemanticException 1:48 In strict mode, if ORDER BY is specified, LIMIT must also be specified. Error encountered near token 'sal' (state=42000,code=40000)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc limit 5; 
+--------+--------+-------+--+
| empno  | ename  |  sal  |
+--------+--------+-------+--+
| 7839   | KING   | 5000  |
| 7902   | FORD   | 3000  |
| 7566   | JONES  | 2975  |
| 7698   | BLAKE  | 2850  |
| 7782   | CLARK  | 2450  |
+--------+--------+-------+--+
5 rows selected (12.468 seconds)

8、HAVING过滤

0: jdbc:hive2://CentOS:10000> SELECT deptno,SUM(sal) total FROM t_employee GROUP BY deptno HAVING SUM(sal)>9000;
+---------+--------+--+
| deptno  | total  |
+---------+--------+--+
| 30      | 9400   |
| 20      | 9375   |
+---------+--------+--+
2 rows selected (18.361 seconds)
DISTRIBUTE BY

默认情况下,MapReduce程序会对Map输出结果的Key值进行散列,并均匀分发到所有Reducer上。如果想要把具有相同Key值的数据分发到同一个Reducer进行处理,这就需要使用DISTRIBUTE BY字句。需要注意的是,DISTRIBUTE BY虽然能保证具有相同Key值的数据分发到同一个Reducer,但是不能保证数据在Reducer上是有序的。

0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno  FROM t_employee distribute BY deptno;
+--------+---------+-------+---------+--+
| empno  |  ename  |  sal  | deptno  |
+--------+---------+-------+---------+--+
| 7654   | MARTIN  | 1250  | 30      |
| 7900   | JAMES   | 950   | 30      |
| 7698   | BLAKE   | 2850  | 30      |
| 7521   | WARD    | 1250  | 30      |
| 7844   | TURNER  | 1500  | 30      |
| 7499   | ALLEN   | 1600  | 30      |
| 7934   | MILLER  | 1300  | 10      |
| 7839   | KING    | 5000  | 10      |
| 7782   | CLARK   | 2450  | 10      |
| 7788   | SCOTT   | 1500  | 20      |
| 7566   | JONES   | 2975  | 20      |
| 7876   | ADAMS   | 1100  | 20      |
| 7902   | FORD    | 3000  | 20      |
| 7369   | SMITH   | 800   | 20      |
+--------+---------+-------+---------+--+
14 rows selected (15.504 seconds)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno  FROM t_employee distribute BY deptno sort by sal desc;
+--------+---------+-------+---------+--+
| empno  |  ename  |  sal  | deptno  |
+--------+---------+-------+---------+--+
| 7698   | BLAKE   | 2850  | 30      |
| 7499   | ALLEN   | 1600  | 30      |
| 7844   | TURNER  | 1500  | 30      |
| 7521   | WARD    | 1250  | 30      |
| 7654   | MARTIN  | 1250  | 30      |
| 7900   | JAMES   | 950   | 30      |
| 7839   | KING    | 5000  | 10      |
| 7782   | CLARK   | 2450  | 10      |
| 7934   | MILLER  | 1300  | 10      |
| 7902   | FORD    | 3000  | 20      |
| 7566   | JONES   | 2975  | 20      |
| 7788   | SCOTT   | 1500  | 20      |
| 7876   | ADAMS   | 1100  | 20      |
| 7369   | SMITH   | 800   | 20      |
+--------+---------+-------+---------+--+
14 rows selected (16.528 seconds)
CLUSTER BY

如果SORT BYDISTRIBUTE BY指定的是相同字段,且SORT BY排序规则是ASC,此时可以使用CLUSTER BY进行替换。

0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno  FROM t_employee cluster by deptno;
+--------+---------+-------+---------+--+
| empno  |  ename  |  sal  | deptno  |
+--------+---------+-------+---------+--+
| 7934   | MILLER  | 1300  | 10      |
| 7839   | KING    | 5000  | 10      |
| 7782   | CLARK   | 2450  | 10      |
| 7876   | ADAMS   | 1100  | 20      |
| 7788   | SCOTT   | 1500  | 20      |
| 7369   | SMITH   | 800   | 20      |
| 7566   | JONES   | 2975  | 20      |
| 7902   | FORD    | 3000  | 20      |
| 7844   | TURNER  | 1500  | 30      |
| 7499   | ALLEN   | 1600  | 30      |
| 7698   | BLAKE   | 2850  | 30      |
| 7654   | MARTIN  | 1250  | 30      |
| 7521   | WARD    | 1250  | 30      |
| 7900   | JAMES   | 950   | 30      |
+--------+---------+-------+---------+--+
14 rows selected (25.847 seconds)
表Join查询

Hive支持内连接,外连接,左外连接,右外连接,笛卡尔连接,这和传统数据库中的概念是一致的。需要特别强调:JOIN语句的关联条件必须用ON指定,不能用WHERE指定,否则就会先做笛卡尔积,再过滤,这会导致你得不到预期的结果。

  • 内连接
0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e JOIN t_dept d ON e.deptno = d.deptno WHERE e.empno=7369;
+----------+----------+--------+-----------+-----------+--+
| e.empno  | e.ename  | e.sal  |  d.dname  | d.deptno  |
+----------+----------+--------+-----------+-----------+--+
| 7369     | SMITH    | 800    | RESEARCH  | 20        |
+----------+----------+--------+-----------+-----------+--+
1 row selected (10.419 seconds)
  • 外连接
0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e LEFT OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno  | e.ename  | e.sal  |   d.dname   | d.deptno  |
+----------+----------+--------+-------------+-----------+--+
| 7369     | SMITH    | 800    | RESEARCH    | 20        |
| 7499     | ALLEN    | 1600   | SALES       | 30        |
| 7521     | WARD     | 1250   | SALES       | 30        |
| 7566     | JONES    | 2975   | RESEARCH    | 20        |
| 7654     | MARTIN   | 1250   | SALES       | 30        |
| 7698     | BLAKE    | 2850   | SALES       | 30        |
| 7782     | CLARK    | 2450   | ACCOUNTING  | 10        |
| 7788     | SCOTT    | 1500   | RESEARCH    | 20        |
| 7839     | KING     | 5000   | ACCOUNTING  | 10        |
| 7844     | TURNER   | 1500   | SALES       | 30        |
| 7876     | ADAMS    | 1100   | RESEARCH    | 20        |
| 7900     | JAMES    | 950    | SALES       | 30        |
| 7902     | FORD     | 3000   | RESEARCH    | 20        |
| 7934     | MILLER   | 1300   | ACCOUNTING  | 10        |
+----------+----------+--------+-------------+-----------+--+
14 rows selected (11.424 seconds)
0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e RIGHT OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno  | e.ename  | e.sal  |   d.dname   | d.deptno  |
+----------+----------+--------+-------------+-----------+--+
| 7782     | CLARK    | 2450   | ACCOUNTING  | 10        |
| 7839     | KING     | 5000   | ACCOUNTING  | 10        |
| 7934     | MILLER   | 1300   | ACCOUNTING  | 10        |
| 7369     | SMITH    | 800    | RESEARCH    | 20        |
| 7566     | JONES    | 2975   | RESEARCH    | 20        |
| 7788     | SCOTT    | 1500   | RESEARCH    | 20        |
| 7876     | ADAMS    | 1100   | RESEARCH    | 20        |
| 7902     | FORD     | 3000   | RESEARCH    | 20        |
| 7499     | ALLEN    | 1600   | SALES       | 30        |
| 7521     | WARD     | 1250   | SALES       | 30        |
| 7654     | MARTIN   | 1250   | SALES       | 30        |
| 7698     | BLAKE    | 2850   | SALES       | 30        |
| 7844     | TURNER   | 1500   | SALES       | 30        |
| 7900     | JAMES    | 950    | SALES       | 30        |
| NULL     | NULL     | NULL   | OPERATIONS  | 40        |
+----------+----------+--------+-------------+-----------+--+
15 rows selected (11.063 seconds)
0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e FULL OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno  | e.ename  | e.sal  |   d.dname   | d.deptno  |
+----------+----------+--------+-------------+-----------+--+
| 7934     | MILLER   | 1300   | ACCOUNTING  | 10        |
| 7839     | KING     | 5000   | ACCOUNTING  | 10        |
| 7782     | CLARK    | 2450   | ACCOUNTING  | 10        |
| 7876     | ADAMS    | 1100   | RESEARCH    | 20        |
| 7788     | SCOTT    | 1500   | RESEARCH    | 20        |
| 7369     | SMITH    | 800    | RESEARCH    | 20        |
| 7566     | JONES    | 2975   | RESEARCH    | 20        |
| 7902     | FORD     | 3000   | RESEARCH    | 20        |
| 7844     | TURNER   | 1500   | SALES       | 30        |
| 7499     | ALLEN    | 1600   | SALES       | 30        |
| 7698     | BLAKE    | 2850   | SALES       | 30        |
| 7654     | MARTIN   | 1250   | SALES       | 30        |
| 7521     | WARD     | 1250   | SALES       | 30        |
| 7900     | JAMES    | 950    | SALES       | 30        |
| NULL     | NULL     | NULL   | OPERATIONS  | 40        |
+----------+----------+--------+-------------+-----------+--+
15 rows selected (24.703 seconds)

12、LEFT SEMI JOIN

LEFT SEMI JOIN (左半连接)是 IN/EXISTS 子查询的一种更高效的实现。

  • JOIN 子句中右边的表只能在 ON 子句中设置过滤条件;
  • 查询结果只包含左边表的数据,所以只能SELECT左表中的列。
0: jdbc:hive2://CentOS:10000> SELECT e.empno,e.ename,d.dname FROM t_employee e LEFT SEMI JOIN t_dept d ON e.deptno = d.deptno AND d.loc="NEW YORK";
+----------+----------+-----------+--+
| e.empno  | e.ename  | e.deptno  |
+----------+----------+-----------+--+
| 7782     | CLARK    | 10        |
| 7839     | KING     | 10        |
| 7934     | MILLER   | 10        |
+----------+----------+-----------+--+
3 rows selected (10.119 seconds)
JOIN优化
  • STREAMTABLE

在多表进行join的时候,如果每个ON子句都使用到共同的列,此时Hive会进行优化,将多表JOIN在同一个map / reduce作业上进行。同时假定查询的最后一个表是最大的一个表,在对每行记录进行JOIN操作时,它将尝试将其他的表缓存起来,然后扫描最后那个表进行计算。因此用户需要保证查询的表的大小从左到右是依次增加的。

SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key) JOIN c ON (c.key = b.key)

然而用户并非需要总是把最大的表放在查询语句的最后面,Hive提供了/*+ STREAMTABLE() */标志,使用该标识来指出大表,能避免数据表过大导致占用内存过多而产生的问题。示例如下:

0: jdbc:hive2://CentOS:10000> SELECT /*+ STREAMTABLE(e) */ e.empno,e.ename,d.dname,d.deptno FROM t_employee e JOIN t_dept d ON e.deptno = d.deptno WHERE job='CLERK';
+----------+----------+-------------+-----------+--+
| e.empno  | e.ename  |   d.dname   | d.deptno  |
+----------+----------+-------------+-----------+--+
| 7369     | SMITH    | RESEARCH    | 20        |
| 7876     | ADAMS    | RESEARCH    | 20        |
| 7900     | JAMES    | SALES       | 30        |
| 7934     | MILLER   | ACCOUNTING  | 10        |
+----------+----------+-------------+-----------+--+
4 rows selected (11.645 seconds)
  • MAPJOIN

如果在进行join操作时,有一个表很小,则可以将join操作调整到map阶段执行。这就是典型的极大表和极小表关联问题。有两种解决方式:1.增加**/*+ MAPJOIN(b) */标示;2.设置参数hive.optimize.bucketmapjoin = true**,在

0: jdbc:hive2://CentOS:10000> SELECT /*+ MAPJOIN(d) */ e.empno, e.ename,d.dname FROM t_employee e  JOIN t_dept d ON d.deptno = e.deptno;
+----------+----------+-------------+--+
| e.empno  | e.ename  |   d.dname   |
+----------+----------+-------------+--+
| 7369     | SMITH    | RESEARCH    |
| 7499     | ALLEN    | SALES       |
| 7521     | WARD     | SALES       |
| 7566     | JONES    | RESEARCH    |
| 7654     | MARTIN   | SALES       |
| 7698     | BLAKE    | SALES       |
| 7782     | CLARK    | ACCOUNTING  |
| 7788     | SCOTT    | RESEARCH    |
| 7839     | KING     | ACCOUNTING  |
| 7844     | TURNER   | SALES       |
| 7876     | ADAMS    | RESEARCH    |
| 7900     | JAMES    | SALES       |
| 7902     | FORD     | RESEARCH    |
| 7934     | MILLER   | ACCOUNTING  |
+----------+----------+-------------+--+
14 rows selected (11.416 seconds)
开窗函数
0: jdbc:hive2://CentOS:10000> select e.empno ,e.ename,e.sal,e.deptno,rank() over(partition by e.deptno order by e.sal desc) as rank from t_employee e; 
+----------+----------+--------+-----------+-------+--+
| e.empno  | e.ename  | e.sal  | e.deptno  | rank  |
+----------+----------+--------+-----------+-------+--+
| 7839     | KING     | 5000   | 10        | 1     |
| 7782     | CLARK    | 2450   | 10        | 2     |
| 7934     | MILLER   | 1300   | 10        | 3     |
| 7902     | FORD     | 3000   | 20        | 1     |
| 7566     | JONES    | 2975   | 20        | 2     |
| 7788     | SCOTT    | 1500   | 20        | 3     |
| 7876     | ADAMS    | 1100   | 20        | 4     |
| 7369     | SMITH    | 800    | 20        | 5     |
| 7698     | BLAKE    | 2850   | 30        | 1     |
| 7499     | ALLEN    | 1600   | 30        | 2     |
| 7844     | TURNER   | 1500   | 30        | 3     |
| 7654     | MARTIN   | 1250   | 30        | 4     |
| 7521     | WARD     | 1250   | 30        | 4     |
| 7900     | JAMES    | 950    | 30        | 6     |
+----------+----------+--------+-----------+-------+--+
0: jdbc:hive2://CentOS:10000> select e.empno ,e.ename,e.sal,e.deptno,dense_rank() over(partition by e.deptno order by e.sal desc) as rank from t_employee e; 
+----------+----------+--------+-----------+-------+--+
| e.empno  | e.ename  | e.sal  | e.deptno  | rank  |
+----------+----------+--------+-----------+-------+--+
| 7839     | KING     | 5000   | 10        | 1     |
| 7782     | CLARK    | 2450   | 10        | 2     |
| 7934     | MILLER   | 1300   | 10        | 3     |
| 7902     | FORD     | 3000   | 20        | 1     |
| 7566     | JONES    | 2975   | 20        | 2     |
| 7788     | SCOTT    | 1500   | 20        | 3     |
| 7876     | ADAMS    | 1100   | 20        | 4     |
| 7369     | SMITH    | 800    | 20        | 5     |
| 7698     | BLAKE    | 2850   | 30        | 1     |
| 7499     | ALLEN    | 1600   | 30        | 2     |
| 7844     | TURNER   | 1500   | 30        | 3     |
| 7654     | MARTIN   | 1250   | 30        | 4     |
| 7521     | WARD     | 1250   | 30        | 4     |
| 7900     | JAMES    | 950    | 30        | 5     |
+----------+----------+--------+-----------+-------+--+
14 rows selected (24.262 seconds)

1,查询员工信息,展示该员工所在部门的平均薪资

select e1.empno,e1.ename,e3.avg,e3.dno from t_employee e1 left join (select e2.deptno dno,avg(e2.sal) avg from t_employee e2 group by e2.deptno) e3 on e1.deptno=e3.dno;
开窗等价
select e1.empno,e1.ename, avg(e1.sal) over (partition by e1.deptno) from t_employee e1;

2,查询员工信息,展示该员工所在部门薪资排名?

select e1.empno,e1.ename,e1.sal,e1.deptno,sum(1) over (partition by e1.deptno ) from t_employee e1;
等价写法
select e1.empno,e1.ename,e1.sal,e1.deptno,sum(1) over (partition by e1.deptno order by e1.sal rows between  unbounded preceding and unbounded following ) from t_employee e1;

select e1.empno,e1.ename,e1.sal,e1.deptno,sum(1) over (partition by e1.deptno order by e1.sal desc rows between unbounded preceding and current row ) from t_employee e1 order by e1.deptno;
               无限的     前面           无限的      后面
rows between  unbounded preceding and unbounded following  
               无限的                    当前行
rows between  unbounded preceding and current row 
              当前行             无限的   后续的
rows between  current row  and unbounded following  
               前1个           后一个
rows between  -1 preceding and 1 following  
               前1个           后一个
rows between  -1 preceding and 1 following  
Cube分析

例如下面示例 分别算出(10部门和 CLERK 工作)、(10部门)、(CLERK工作)的avg、max、min 用Cube直接实现

0: jdbc:hive2://CentOS:10000> select e.deptno,e.job,avg(e.sal) avg,max(e.sal) max,min(e.sal) min from t_employee e group by e.deptno,e.job with cube;
+-----------+------------+--------------+-------+-------+--+
| e.deptno  |   e.job    |     avg      |  max  |  min  |
+-----------+------------+--------------+-------+-------+--+
| NULL      | ANALYST    | 2250         | 3000  | 1500  |
| 10        | CLERK      | 1300         | 1300  | 1300  |
| 20        | CLERK      | 950          | 1100  | 800   |
| 30        | CLERK      | 950          | 950   | 950   |
| 20        | ANALYST    | 2250         | 3000  | 1500  |
| NULL      | PRESIDENT  | 5000         | 5000  | 5000  |
| 10        | PRESIDENT  | 5000         | 5000  | 5000  |
| NULL      | SALESMAN   | 1400         | 1600  | 1250  |
| NULL      | MANAGER    | 2758.333333  | 2975  | 2450  |
| 30        | SALESMAN   | 1400         | 1600  | 1250  |
| 10        | MANAGER    | 2450         | 2450  | 2450  |
| 20        | MANAGER    | 2975         | 2975  | 2975  |
| 30        | MANAGER    | 2850         | 2850  | 2850  |
| NULL      | NULL       | 1966.071429  | 5000  | 800   |
| NULL      | CLERK      | 1037.5       | 1300  | 800   |
| 10        | NULL       | 2916.666667  | 5000  | 1300  |
| 20        | NULL       | 1875         | 3000  | 800   |
| 30        | NULL       | 1566.666667  | 2850  | 950   |
+-----------+------------+--------------+-------+-------+--+
18 rows selected (25.037 seconds)
行转列
1,语文,100
1,数学,100
1,英语,100
2,数学,79
2,语文,80
2,英语,100
CREATE TABLE t_student(
    id INT,
    course STRING,
    score double)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile;
0: jdbc:hive2://CentOS:10000> select * from t_student;
+---------------+-------------------+------------------+--+
| t_student.id  | t_student.course  | t_student.score  |
+---------------+-------------------+------------------+--+
| 1             | 语文                | 100.0            |
| 1             | 数学                | 100.0            |
| 1             | 英语                | 100.0            |
| 2             | 数学                | 79.0             |
| 2             | 语文                | 80.0             |
| 2             | 英语                | 100.0            |
+---------------+-------------------+------------------+--+
6 rows selected (0.05 seconds)
0: jdbc:hive2://CentOS:10000> select id,max(case course when '语文' then score else 0 end) as chinese,max(case course when '数学' then score else 0 end ) as math,max(case course when '英语' then score else 0 end ) as english from t_student group by id ;

+-----+----------+--------+----------+--+
| id  | chinese  |  math  | english  |
+-----+----------+--------+----------+--+
| 1   | 100.0    | 100.0  | 100.0    |
| 2   | 80.0     | 79.0   | 100.0    |
+-----+----------+--------+----------+--+
2 rows selected (25.617 seconds)

SELECT id,concat_ws(’,’, collect_set(concat(course, ‘:’, score))) 成绩 FROM t_student GROUP BY id

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值