结论:(测试版本hive 1.1.0-cdh5.16.2)
join时如果关联字段值为null,则该行数据放弃,不会和其他表进行关联
join时如果关联字段值为空字符串,视为正常数据,会和其他表关联字段也为空字符串的行数据进行关联
测试如下:
create table tmp.test20200409_1 (id int,name string);
create table tmp.test20200409_2 (id int,name string);
insert into table tmp.test20200409_1 values(1,'zbz'),(2,null),(3,'');
insert into table tmp.test20200409_2 values(4,'zbz'),(5,null),(6,'');
inner join
select a.*,b.* from tmp.test20200409_1 a join tmp.test20200409_2 b on a.name=b.name;
a.id | a.name | b.name | b.id |
1 | zbz | zbz | 4 |
3 | 6 |
left join
select a.*,b.* from tmp.test20200409_1 a left join tmp.test20200409_2 b on a.name=b.name;
a.id | a.name | b.name | b.id |
1 | zbz | zbz | 4 |
2 | null | null | null |
3 | 6 |
right join
select a.*,b.* from tmp.test20200409_1 a right join tmp.test20200409_2 b on a.name=b.name;
a.id | a.name | b.name | b.id |
1 | zbz | zbz | 4 |
null | null | null | 5 |
3 | 6 |
full join
select a.*,b.* from tmp.test20200409_1 a full join tmp.test20200409_2 b on a.name=b.name;
a.id | a.name | b.name | b.id |
1 | zbz | zbz | 4 |
3 | 6 | ||
2 | null | null | null |
null | null | null | 5 |
left semi join
select a.*,b.* from tmp.test20200409_1 a left semi join tmp.test20200409_2 b on a.name=b.name;
a.id | a.name |
1 | zbz |
3 |
cross join
select a.*,b.* from tmp.test20200409_1 a cross join tmp.test20200409_2 b on a.name=b.name;
a.id | a.name | b.name | b.id |
1 | zbz | zbz | 4 |
3 | 6 |
select a.*,b.* from tmp.test20200409_1 a cross join tmp.test20200409_2 b ;
a.id | a.name | b.name | b.id |
1 | zbz | zbz | 4 |
2 | null | zbz | 4 |
3 | zbz | 4 | |
1 | zbz | null | 5 |
2 | null | null | 5 |
3 | null | 5 | |
1 | zbz | 6 | |
2 | null | 6 | |
3 | 6 |