一、导入数据进入Hive表的几种方式
-- Load data into a Hive table.
-- LOCAL: the file is read from the client's local filesystem and copied into the table's directory.
load data local inpath '/opt/datas/emp.txt' into table default.emp ;
-- Without LOCAL the path is on HDFS and the file is moved into the table directory;
-- OVERWRITE replaces the table's existing data files.
load data inpath '/user/beifeng/hive/datas/emp.txt' overwrite into table default.emp ;
-- Without OVERWRITE the new file is appended alongside the table's existing data files.
load data inpath '/user/beifeng/hive/datas/emp.txt' into table default.emp ;
4) 创建表时通过 insert 加载数据
-- Create an empty table with the same schema as emp (LIKE copies structure only, no data),
-- then populate it with INSERT ... SELECT.
create table default.emp_ci like emp ;
insert into table default.emp_ci select * from default.emp ;
==================================================================
二、导出Hive表数据的几种方式
-- Export to a local directory; with no ROW FORMAT the default field separator is \001.
1) insert overwrite local directory '/opt/datas/hive_exp_emp'
select * from default.emp ;
-- Same export but with explicit output delimiters (tab between fields).
2) insert overwrite local directory '/opt/datas/hive_exp_emp2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp ;
-- Without LOCAL the target directory is on HDFS.
4) insert overwrite directory '/user/beifeng/hive/hive_exp_emp'
select * from default.emp ;
5) sqoop
hdfs/hive -> rdbms
rdbms -> hdfs/hive/hbase
三、Hive查询方式
=======================全表(字段)查询=================
-- Full-table query; the second form projects specific columns via the table alias t.
select * from emp ;
select t.empno, t.ename, t.deptno from emp t ;
=======================范围查询======================
= >= <= between and
-- LIMIT caps the number of returned rows.
select * from emp limit 5 ;
-- BETWEEN is inclusive on both ends: sal in [800, 1500].
select t.empno, t.ename, t.deptno from emp t where t.sal between 800 and 1500 ;
=======================限制查询====================
is null / is not null /in /not in
select t.empno, t.ename, t.deptno from emp t where comm is null ;
=======================函数查询====================
max/min/count/sum/avg
-- Whole-table aggregates: each query returns a single row.
select count(*) cnt from emp ;
select max(sal) max_sal from emp ;
select sum(sal) from emp ;
select avg(sal) from emp ;
=======================分组查询====================
group by /having
分组
emp表
* 每个部门的平均工资
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno ;
* 每个部门中每个岗位的最高薪水
select t.deptno, t.job, max(t.sal) avg_sal from emp t group by t.deptno, job ;
>>>having
* where 是针对单条记录进行筛选
* having 是针对分组结果进行筛选
求每个部门的平均薪水大于2000的部门
-- WHERE filters individual rows before grouping; HAVING filters the aggregated groups.
select deptno, avg(sal) from emp group by deptno ;
-- Departments whose average salary exceeds 2000 (Hive allows the select alias in HAVING).
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
========================连接查询===================
join
两个表进行连接
m n
m表中一条记录和n表中的一条记录组成一条记录
等值join
join ... on
select e.empno, e.ename, d.deptno, d.dname from emp e join dept d on e.deptno = d.deptno ;
左连接
left join
select e.empno, e.ename, d.deptno, d.dname from emp e left join dept d on e.deptno = d.deptno ;
右连接
right join
select e.empno, e.ename, e.deptno, d.dname from emp e right join dept d on e.deptno = d.deptno ;
全连接
full join
select e.empno, e.ename, e.deptno, d.dname from emp e full join dept d on e.deptno = d.deptno ;
>> order by
对全局数据的一个排序,仅仅只有一个reduce
select * from emp order by empno desc ;
>> sort by
对每一个reduce内部的数据进行排序,对全局结果集来说不是有序的
-- Run with 3 reduce tasks; SORT BY orders rows within each reducer only,
-- so the combined result is not globally sorted.
set mapreduce.job.reduces= 3;
select * from emp sort by empno asc ;
-- Export the per-reducer sorted output to a local directory (one file per reducer).
insert overwrite local directory '/opt/datas/sortby-res' select * from emp sort by empno asc ;
>> distribute by
分区partition
类似于MapReduce中分区partition,对数据进行分区,结合sort by进行使用
insert overwrite local directory '/opt/datas/distby-res' select * from emp distribute by deptno sort by empno asc ;
注意事项:
distribute by 必须要在sort by 前面。
>> cluster by
当distribute by和sort by 字段相同时,可以使用cluster by ;
insert overwrite local directory '/opt/datas/cluster-res' select * from emp cluster by empno ;