hive的join操作,只支持等值连接
创建customers和orders表,一对多关系
创建customers表
create table customers(id int,name string,age int)
row format delimited
fields TERMINATED by '\t'
lines TERMINATED by '\n'
stored as textfile;
数据准备:
customers.txt
1 tom1 12
2 tom2 13
3 tom3 14
加载数据
load data local inpath '/home/hadoop/customers.txt' into table customers;
创建表orders
create table orders(id int,ordernum string,price float,cid int)
row format delimited
fields TERMINATED by '\t'
lines TERMINATED by '\n'
stored as textfile;
数据准备
orders.txt
1 No001 121.01 1
2 No002 121.02 1
3 No003 121.03 1
4 No004 121.04 2
5 No005 121.05 2
6 No006 121.06 2
7 No007 121.07
加载数据
load data local inpath '/home/hadoop/orders.txt' into table orders;
内连接 join -- on
hive (hive1)> select a.id,a.name,b.id,b.ordernum,b.price from customers a join orders b on a.id = b.cid;
a.id a.name b.id b.ordernum b.price
1 tom1 1 No001 121.01
1 tom1 2 No002 121.02
1 tom1 3 No003 121.03
2 tom2 4 No004 121.04
2 tom2 5 No005 121.05
2 tom2 6 No006 121.06
Time taken: 30.041 seconds, Fetched: 6 row(s)
左外连接
hive (hive1)> select a.id,a.name,b.id,b.ordernum,b.price from customers a left outer join orders b on a.id = b.cid;
a.id a.name b.id b.ordernum b.price
1 tom1 1 No001 121.01
1 tom1 2 No002 121.02
1 tom1 3 No003 121.03
2 tom2 4 No004 121.04
2 tom2 5 No005 121.05
2 tom2 6 No006 121.06
3 tom3 NULL NULL NULL
右外连接
hive (hive1)> select a.id,a.name,b.id,b.ordernum,b.price from customers a right outer join orders b on a.id = b.cid;
a.id a.name b.id b.ordernum b.price
1 tom1 1 No001 121.01
1 tom1 2 No002 121.02
1 tom1 3 No003 121.03
2 tom2 4 No004 121.04
2 tom2 5 No005 121.05
2 tom2 6 No006 121.06
NULL NULL 7 No007 121.07
全外连接
hive (hive1)> select a.id,a.name,b.id,b.ordernum,b.price from customers a full outer join orders b on a.id = b.cid;
a.id a.name b.id b.ordernum b.price
NULL NULL 7 No007 121.07
1 tom1 3 No003 121.03
1 tom1 2 No002 121.02
1 tom1 1 No001 121.01
2 tom2 6 No006 121.06
2 tom2 5 No005 121.05
2 tom2 4 No004 121.04
3 tom3 NULL NULL NULL
左半连接,select 和 where 子句不能引用到右边表字段
左表的记录在右表中一旦找到对应的记录,右侧表既停止扫描 (hive不支持右半连接操作 right semi join xxx)
hive (hive1)> select c.id,c.name from customers c left semi join orders o on c.id=o.cid;
c.id c.name
1 tom1
2 tom2
Time taken: 30.345 seconds, Fetched: 2 row(s)
笛卡尔连接 m x n
hive (hive1)> select c.id,c.name,o.ordernum from customers c join orders o;
c.id c.name o.ordernum
1 tom1 No001
2 tom2 No001
3 tom3 No001
1 tom1 No002
2 tom2 No002
3 tom3 No002
1 tom1 No003
2 tom2 No003
3 tom3 No003
1 tom1 No004
2 tom2 No004
3 tom3 No004
1 tom1 No005
2 tom2 No005
3 tom3 No005
1 tom1 No006
2 tom2 No006
3 tom3 No006
1 tom1 No007
2 tom2 No007
3 tom3 No007
order by 全排序,对所有的数据通过一个reduce进行排序
如果开启了hive.mapred.mode=strict,在全排序时必须结合limit使用
现在推荐使用hive.strict.check.* 属性
hive (hive1)> select * from orders order by cid asc,price desc;
orders.id orders.ordernum orders.price orders.cid
7 No007 121.07 NULL
3 No003 121.03 1
2 No002 121.02 1
1 No001 121.01 1
6 No006 121.06 2
5 No005 121.05 2
4 No004 121.04 2
Time taken: 28.336 seconds, Fetched: 7 row(s)
sort by 每一reduce进行排序(局部排序)
hive (hive1)> select * from orders sort by cid asc,price desc;
orders.id orders.ordernum orders.price orders.cid
7 No007 121.07 NULL
3 No003 121.03 1
2 No002 121.02 1
1 No001 121.01 1
6 No006 121.06 2
5 No005 121.05 2
4 No004 121.04 2
Time taken: 26.294 seconds, Fetched: 7 row(s)
union 联合操作 (union all 联合操作 字段类型和个数需要匹配)
hive (hive1)> select id,name from customers union select id,ordernum from orders;
u2.id u2.name
1 No001
1 tom1
2 No002
2 tom2
3 No003
3 tom3
4 No004
5 No005
6 No006
7 No007
Time taken: 28.261 seconds, Fetched: 10 row(s)