HiveQL中left join误区和多join的执行顺序

有数的编程笔记

已于 2022-08-21 11:22:52 修改

阅读量3.5k

点赞数 3

分类专栏： Spark/Hive 文章标签：数据仓库 hive

于 2022-08-21 11:21:12 首次发布

本文链接：https://blog.csdn.net/qq_33446500/article/details/126448817

版权

Spark/Hive 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

文章目录

1. join中的on条件和where条件的区别

--建表
create table t1(id int, value int) partitioned by (ds string);
create table t2(id int, value int) partitioned by (ds string);
create table t3(c1 int, c2 int, c3 int);

--数据装载，t1表
insert overwrite table t1 partition(ds='20220120') select '1','2022';
insert overwrite table t1 partition(ds='20220121') select '2','2022';
insert overwrite table t1 partition(ds='20220122') select '2','2022';

--数据装载，t2表
insert overwrite table t2 partition(ds='20220120') select '1','120';
insert overwrite table t2 partition(ds='20220121') select '1','120';
insert into table t2 partition(ds='20220121') select '3','120';

--数据装载，t3表
insert into table t3 select '1','33','33';
insert into table t3 select '1','34','33';
insert into table t3 select '3','33','33';
insert into table t3 select '4','33','33';

1.1. left join

SQL案例

SELECT  *
FROM t1
LEFT JOIN t2
ON t1.id = t2.id 
where t1.ds = '20220120';
-- 执行结果为
+--------+-----------+-----------+--------+-----------+-----------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   |
+--------+-----------+-----------+--------+-----------+-----------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  |
| 1      | 2022      | 20220120  | 1      | 120       | 20220121  |
+--------+-----------+-----------+--------+-----------+-----------+

对于上述执行结果相信并没有任何的问题，实际执行的结果和我们的预期是完全一致的。

SELECT  *
FROM t1
LEFT JOIN t2
ON t1.id = t2.id AND t1.ds = '20220120';
-- 执行结果为，t1.ds = '20220120'条件未生效
+--------+-----------+-----------+--------+-----------+-----------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   |
+--------+-----------+-----------+--------+-----------+-----------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  |
| 1      | 2022      | 20220120  | 1      | 120       | 20220121  |
| 2      | 2022      | 20220121  | NULL   | NULL      | NULL      |
| 2      | 2022      | 20220122  | NULL   | NULL      | NULL      |
+--------+-----------+-----------+--------+-----------+-----------+

执行结果中返回了t1表中的全部数据(是不是和预期的结果并不一样)，从结果看 join 条件中的 AND t1.ds = '20220120' 并没有“生效”，为什么这样？

而如下所示，在on条件中对t2表进行筛选 t2.ds = '20220120'，则返回结果中过滤了指定的数据。

SELECT  *
FROM t1
LEFT JOIN t2
ON t1.id = t2.id AND t2.ds = '20220120';
-- 执行结果为，t2.ds = '20220120'条件生效
+--------+-----------+-----------+--------+-----------+-----------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   |
+--------+-----------+-----------+--------+-----------+-----------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  |
| 2      | 2022      | 20220121  | NULL   | NULL      | NULL      |
| 2      | 2022      | 20220122  | NULL   | NULL      | NULL      |
+--------+-----------+-----------+--------+-----------+-----------+

分别查看上述三个SQL的执行计划

SELECT  *
FROM t1
LEFT JOIN t2
ON t1.id = t2.id 
where t1.ds = '20220120';

== Physical Plan ==
*(2) BroadcastHashJoin [id#24], [id#27], LeftOuter, BuildRight
:- *(2) FileScan orc zhanglei.t1[id#24,value#25,ds#26] Batched: true, Format: ORC, Location: PrunedInMemoryFileIndex[hdfs://bdptest/warehouse/tablespace/managed/hive/zhanglei.db/t1/ds=20220120], PartitionCount: 1, PartitionFilters: [isnotnull(ds#26), (ds#26 = 20220120)], PushedFilters: [], ReadSchema: struct<id:int,value:int>
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
   +- *(1) Project [id#27, value#28, ds#29]
      +- *(1) Filter isnotnull(id#27)
         +- *(1) FileScan orc zhanglei.t2[id#27,value#28,ds#29] Batched: true, Format: ORC, Location: CatalogFileIndex[hdfs://bdptest/warehouse/tablespace/managed/hive/zhanglei.db/t2], PartitionCount: 2, PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:int,value:int>

从上述执行计划中得知where条件中t1.ds = '20220120'生效，体现在对表t1的读取上只读取了ds=20220120分区。

SELECT  *
FROM t1
LEFT JOIN t2
ON t1.id = t2.id AND t1.ds = '20220120';

== Physical Plan ==
*(2) BroadcastHashJoin [id#32], [id#35], LeftOuter, BuildRight, (ds#34 = 20220120)
:- *(2) FileScan orc zhanglei.t1[id#32,value#33,ds#34] Batched: true, Format: ORC, Location: CatalogFileIndex[hdfs://bdptest/warehouse/tablespace/managed/hive/zhanglei.db/t1], PartitionCount: 3, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int,value:int>
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
   +- *(1) Project [id#35, value#36, ds#37]
      +- *(1) Filter isnotnull(id#35)
         +- *(1) FileScan orc zhanglei.t2[id#35,value#36,ds#37] Batched: true, Format: ORC, Location: CatalogFileIndex[hdfs://bdptest/warehouse/tablespace/managed/hive/zhanglei.db/t2], PartitionCount: 2, PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:int,value:int>

从上述执行计划中得知on条件中t1.ds = '20220120'并没有生效，因为并没有过滤条件，仅仅在BroadcastHashJoin中进行了体现。

SELECT  *
FROM t1
LEFT JOIN t2
ON t1.id = t2.id AND t2.ds = '20220120';

== Physical Plan ==
*(2) BroadcastHashJoin [id#40], [id#43], LeftOuter, BuildRight
:- *(2) FileScan orc zhanglei.t1[id#40,value#41,ds#42] Batched: true, Format: ORC, Location: CatalogFileIndex[hdfs://bdptest/warehouse/tablespace/managed/hive/zhanglei.db/t1], PartitionCount: 3, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int,value:int>
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
   +- *(1) Project [id#43, value#44, ds#45]
      +- *(1) Filter isnotnull(id#43)
         +- *(1) FileScan orc zhanglei.t2[id#43,value#44,ds#45] Batched: true, Format: ORC, Location: PrunedInMemoryFileIndex[hdfs://bdptest/warehouse/tablespace/managed/hive/zhanglei.db/t2/ds=20220120], PartitionCount: 1, PartitionFilters: [isnotnull(ds#45), (ds#45 = 20220120)], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:int,value:int>

从上述执行计划中得知where条件中t2.ds = '20220120'生效，体现在对表t2的读取上只读取了ds=20220120分区。

1.2. right join

SELECT  *
FROM t1
RIGHT JOIN t2
ON t1.id = t2.id AND t1.ds = '20220121';
-- 执行结果，t1.ds = '20220121'的条件生效
+--------+-----------+--------+--------+-----------+-----------+
| t1.id  | t1.value  | t1.ds  | t2.id  | t2.value  |   t2.ds   |
+--------+-----------+--------+--------+-----------+-----------+
| NULL   | NULL      | NULL   | 1      | 120       | 20220120  |
| NULL   | NULL      | NULL   | 1      | 120       | 20220121  |
| NULL   | NULL      | NULL   | 3      | 120       | 20220121  |
+--------+-----------+--------+--------+-----------+-----------+

SELECT  *
FROM t1
RIGHT JOIN t2
ON t1.id = t2.id AND t2.ds = '20220120';
-- 执行结果为，t2.ds = '20220120' 的条件未生效
+--------+-----------+-----------+--------+-----------+-----------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   |
+--------+-----------+-----------+--------+-----------+-----------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  |
| NULL   | NULL      | NULL      | 1      | 120       | 20220121  |
| NULL   | NULL      | NULL      | 3      | 120       | 20220121  |
+--------+-----------+-----------+--------+-----------+-----------+

!!! note “”
从上述执行结果中看，left join的on中t1(左表)的条件并不会生效，但是t2(右表)的条件会生效。right join的on中(右表)的条件并不会生效，但是左表的条件会生效。

问题：在left join的on条件中为什么左表的筛选条件不会生效，而右表的筛选条件会生效？？？

1.3. inner join

SELECT  *
FROM t1
JOIN t2
ON t1.id = t2.id 
where t1.ds = '20220120';
--
SELECT  *
FROM t1
JOIN t2
ON t1.id = t2.id AND t1.ds = '20220120';
-- 以上两个SQL执行结果相同
+--------+-----------+-----------+--------+-----------+-----------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   |
+--------+-----------+-----------+--------+-----------+-----------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  |
| 1      | 2022      | 20220120  | 1      | 120       | 20220121  |
+--------+-----------+-----------+--------+-----------+-----------+

在inner join中 on 和 where 条件中的条件都会正常生效。

SELECT  *
FROM t1
JOIN t2
ON t1.id = t2.id AND t2.ds = '20220120';
-- 执行结果如下
+--------+-----------+-----------+--------+-----------+-----------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   |
+--------+-----------+-----------+--------+-----------+-----------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  |
+--------+-----------+-----------+--------+-----------+-----------+

!!! note “”
在inner join中on中的条件无论左右表的条件都会生效。

2. 多个left join执行顺序

继续给t2和t3表中插入测试数据

insert into table t2 partition(ds='20220121') select '3','120';

insert into table t3 select '1','33','33';
insert into table t3 select '1','34','33';
insert into table t3 select '3','33','33';
insert into table t3 select '4','33','33';

案例SQL

SELECT  *
FROM t1
LEFT JOIN t2
ON t1.id = t2.id
LEFT JOIN t3
ON t2.id = c1;
-- 执行结果为
+--------+-----------+-----------+--------+-----------+-----------+--------+--------+--------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   | t3.c1  | t3.c2  | t3.c3  |
+--------+-----------+-----------+--------+-----------+-----------+--------+--------+--------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  | 1      | 33     | 33     |
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  | 1      | 34     | 33     |
| 1      | 2022      | 20220120  | 1      | 120       | 20220121  | 1      | 33     | 33     |
| 1      | 2022      | 20220120  | 1      | 120       | 20220121  | 1      | 34     | 33     |
| 2      | 2022      | 20220122  | NULL   | NULL      | NULL      | NULL   | NULL   | NULL   |
| 2      | 2022      | 20220121  | NULL   | NULL      | NULL      | NULL   | NULL   | NULL   |
+--------+-----------+-----------+--------+-----------+-----------+--------+--------+--------+

从结果得知，多个表进行left join时，是按照join顺序进行的。即先由t1和t2表left join形成一个虚拟表后，再和t3表进行left join成最终结果。

3. 同时有inner join和left join时的执行顺序和结果

SELECT  *
FROM t2
JOIN t3
ON t2.id = t3.c1;
-- 执行结果如下
+--------+-----------+-----------+--------+--------+--------+
| t2.id  | t2.value  |   t2.ds   | t3.c1  | t3.c2  | t3.c3  |
+--------+-----------+-----------+--------+--------+--------+
| 1      | 120       | 20220121  | 1      | 33     | 33     |
| 1      | 120       | 20220120  | 1      | 33     | 33     |
| 1      | 120       | 20220121  | 1      | 34     | 33     |
| 1      | 120       | 20220120  | 1      | 34     | 33     |
| 3      | 120       | 20220121  | 3      | 33     | 33     |
+--------+-----------+-----------+--------+--------+--------+

SELECT  *
FROM t1
left join t2
on t1.id = t2.id
JOIN t3
ON t2.id = t3.c1;
-- 执行结果如下
+--------+-----------+-----------+--------+-----------+-----------+--------+--------+--------+
| t1.id  | t1.value  |   t1.ds   | t2.id  | t2.value  |   t2.ds   | t3.c1  | t3.c2  | t3.c3  |
+--------+-----------+-----------+--------+-----------+-----------+--------+--------+--------+
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  | 1      | 33     | 33     |
| 1      | 2022      | 20220120  | 1      | 120       | 20220121  | 1      | 33     | 33     |
| 1      | 2022      | 20220120  | 1      | 120       | 20220120  | 1      | 34     | 33     |
| 1      | 2022      | 20220120  | 1      | 120       | 20220121  | 1      | 34     | 33     |
+--------+-----------+-----------+--------+-----------+-----------+--------+--------+--------+

SELECT  *
FROM t2
JOIN t3
ON t2.id = t3.c1
left join t1
on t1.id = t2.id;
-- 执行结果如下
+--------+-----------+-----------+--------+--------+--------+--------+-----------+-----------+
| t2.id  | t2.value  |   t2.ds   | t3.c1  | t3.c2  | t3.c3  | t1.id  | t1.value  |   t1.ds   |
+--------+-----------+-----------+--------+--------+--------+--------+-----------+-----------+
| 1      | 120       | 20220120  | 1      | 33     | 33     | 1      | 2022      | 20220120  |
| 1      | 120       | 20220121  | 1      | 33     | 33     | 1      | 2022      | 20220120  |
| 1      | 120       | 20220120  | 1      | 34     | 33     | 1      | 2022      | 20220120  |
| 1      | 120       | 20220121  | 1      | 34     | 33     | 1      | 2022      | 20220120  |
| 3      | 120       | 20220121  | 3      | 33     | 33     | NULL   | NULL      | NULL      |
+--------+-----------+-----------+--------+--------+--------+--------+-----------+-----------+

SELECT  *
FROM t2
JOIN t3
ON t2.id = t3.c1
right join t1
on t1.id = t2.id;
-- 执行结果如下
+--------+-----------+-----------+--------+--------+--------+--------+-----------+-----------+
| t2.id  | t2.value  |   t2.ds   | t3.c1  | t3.c2  | t3.c3  | t1.id  | t1.value  |   t1.ds   |
+--------+-----------+-----------+--------+--------+--------+--------+-----------+-----------+
| 1      | 120       | 20220120  | 1      | 33     | 33     | 1      | 2022      | 20220120  |
| 1      | 120       | 20220120  | 1      | 34     | 33     | 1      | 2022      | 20220120  |
| 1      | 120       | 20220121  | 1      | 33     | 33     | 1      | 2022      | 20220120  |
| 1      | 120       | 20220121  | 1      | 34     | 33     | 1      | 2022      | 20220120  |
| NULL   | NULL      | NULL      | NULL   | NULL   | NULL   | 2      | 2022      | 20220121  |
| NULL   | NULL      | NULL      | NULL   | NULL   | NULL   | 2      | 2022      | 20220122  |
+--------+-----------+-----------+--------+--------+--------+--------+-----------+-----------+