一、HQL:
where 过滤筛选 (map 任务) select ,update ,delete ,insert into (在分组语句group by之前执行)
select * from emp where sal >3000;
limit 限制条件 (无mapreduce)
select * from emp where sal > 3000 limit 10
distinct (map + reduce任务)
select distinct sal from emp;
- between and (map 任务)
- select * from emp where sal between 2000 and 4000;
- is null & is not null
- select * from emp where comm is not null;
二、 聚合函数
count <>
select count(1) from emp;
avg
select avg(sal) from emp;
group by 分组
select deptno avg(sal) from emp group by deptno;
having 和where功能相同,设定条件
(having在分组语句之后执行,where先执行然后having执行)select deptno ,avg(sal) from emp group by deptno having avg(sal) > 3000;
join 两表join join 是耗时的,消耗资源大
5.1 左join left (位于 join 左边的表没有null值)5.2 右join right (join右边的表没有null值)
5.3 全join full (join左右两边的值都)
5.4 等值join (连接两张表共有的字段都显示出来,)
create table dept ( deptno int, dname string, loc string
)
row format delimited fields terminated by ‘\t’;
load data local inpath ‘/opt/tools/dept.txt’ into table dept;dept 和 emp 两表join;
三、hive 中与 MR相关的设置
reduce 相关
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=
In order to set a constant number of reducers:
set mapreduce.job.reduces=
四、排序
order by –全局排序, 只能有一个reduce,生成一个文件,(map + reduce)
insert local directory ‘/opt/emp.haha’ select * from emp order by sal;
insert overwrite local directory ‘/opt/datas/’ select * from emp;
sort by – 局部排序,不能保证所有数据都是有序 的,针对每个reduce的结果进行排序(reduce的一个排序阶段)
set mapreduce.job.reduces=3; set mapreduce.job.reduce; insert overwrite local directory "/opt/datas" row format delimited fields terminated by '\t' select sal,comm , deptno from emp group by deptno,sal,comm; insert overwrite local directory "/opt/datas" row format delimited fields terminated by '\t' select sal, comm , deptno from emp sort by sal;
distribute by –mapreduce的分区,按照指定的值进行分区,控制map的输出给哪个reduce。
(sort by 是对分区后的数据进行排序)insert overwrite local directory '/opt/datas' row format delimited fields terminated by '\t' select * from emp distribute by deptno sort by sal ;
cluster by (distributed by 和 sort by的结合使用)
insert overwrite local directory '/opt/datas' row format delimited fields terminated by '\t' select * from emp cluster by sal;
先用全局 order by 然后再使用其他sort by,会提高效率。