(重点)Hive表的查询-基本查询
普通查询
1、聚合函数对null的态度
create table test11(
id int,
score int
);
insert into test11 values (1,50);
insert into test11 values (2,50);
insert into test11 values (3,null);
insert into test11 values (4,50);
select * from test11;
select sum(score) from test11; // 150
select avg(score) from test11; // 150 / 3
select avg(if(score is null, 0, score)) from test11; // 150 / 4
select avg(coalesce(score,0)) from test11; // 150 / 4
2、limit关键字
select * from student limit 3;
select * from student limit 2,3; --从索引为2(从0开始)显示,显示3行
3、where条件查询
select * from score where sscore not in(80,90); -- 成绩不是80或者90
select * from score where not sscore in(80,90);-- 成绩不是80或者90
4、like关键字
select * from student where sname like '赵%'; -- 姓赵的
select * from student where sname like '%雷'; -- 名字最后一个字是 雷
select * from student where sname like '%雷%'; -- 名字中包含 雷
select * from student where sname like '_雷%'; -- 名字第二个字是 雷\
5、分组-group by
-- 分组之后每一组只剩下一条数据,所以select后边只能跟分组字段和聚合函数
select sid, sum(sscore) from score group by sid;
-- 分组之后的条件筛选是having,不是where
select sid, sum(sscore) as total_score from score group by sid having total_score > 450
join查询
-
内连接
-- 1、内连接:求交集 select * from teacher; insert into teacher values ('04','赵六'); select * from teacher inner join course c on teacher.tid = c.tid; select * from teacher join course c on teacher.tid = c.tid; select * from teacher , course where teacher.tid = course.tid;
-
左外连接
-- 2、左外连接 -- 左外是以左表为主,把左表的数据全部输出,右表有对应的数据就输出,没有对应的数据就输出NULL select * from teacher left join course c on teacher.tid = c.tid;
-
右外连接
-- 3、右外连接 select * from course; insert into course values ('04','政治','05'); -- 右外是以右表为主,把右表的数据全部输出,左表有对应的数据就输出,没有对应的数据就输出NULL select * from teacher right join course c on teacher.tid = c.tid;
-
满外连接
-- 4、满外连接 -- 查询左外连接和右外连接的并集 select * from teacher full join course c on teacher.tid = c.tid;
排序查询
-
order by (重点)
1、order by 用于全局排序,要求只能有一个Reduce 2、如果有多个Reduce,则不能使用order by 3、order by 的使用方法和MySQL是一样的
select * from score order by sscore ; -- 升序排序 select * from score order by sscore desc; -- 降序排序
-
sort by
1、 sort by 会做两件事情:
1)会将表文件拆分成多个文件(默认的分区)
2)保证每一个输出的文件内容都有序
1)设置reduce个数
set mapreduce.job.reduces=3;
2)查询成绩按照成绩降序排列
select * from score sort by sscore;
3)将查询结果导入到文件中(按照成绩降序排列)
insert overwrite local directory '/export/data/exporthive/sort' select * from score sort by sscore;
-
distributed by + sort by(重点)
1、distributed by 会按照某个字段进行分区,sort by 会给每个分区的数据进行排序
1)设置reduce的个数,将我们对应的sid划分到对应的reduce当中去 set mapreduce.job.reduces=7; 2)通过distribute by进行数据的分区 insert overwrite local directory '/export/data/exporthive/distribute' select * from score distribute by sid sort by sscore;
-
cluster by
1、当distributed by 和 sort by字段相同时:cluster by 等价于 distributed by + sort by cluster by id => distributed by id sort by id 2、当reduce个数 < id的个数时,排序有意义 id有100个 rduce 100个
set mapreduce.job.reduces=2; insert overwrite local directory '/export/data/exporthive/cluster_by' select * from score cluster by sid;
(重点)Hive的函数
内置函数
-
数学函数
-- 四舍五入函数 select round(3.1415926,4); -- 四舍五入 保留4位小数 3.1416 -- 获取 [1,100]之间的随机数 select `floor`(rand() * 100) + 1; -- 向下取整 select floor(2.8999); -- 2
-
字符串函数
-- 字符串拼接 select concat(rand(),'-',sid) as sid, sname from student; select concat(rand(),'-',sid) as sid, sname from student; -- 字符串拼接,带分隔符 select concat_ws('-','2022','10','15'); select log10(100) -- 字符串截取 select substr('2022-12-23 10:13:45',1,4); -- 2022 select substr('2022-12-23 10:13:45',6,6); -- 12 -- 字符串替换 select regexp_replace('foobar', 'oo|ar', ''); -- 解析URL select parse_url('http://www.facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'); select parse_url('http://www.facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'); select parse_url('http://www.facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY'); select parse_url('http://www.facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY','k1'); select parse_url('http://www.facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY','k2'); -- 字符串切割 select split('2022-12-23','-');
-
日期函数
-- 获取当前的时间 select `current_date`(); -- 2022-10-25 -- 将日期转为指定的格式 select date_format('2022-1-1 1:1:1','yyyy-MM-dd HH:mm:ss') -- 2022-01-01 01:01:01 select to_date('2022-01-01 01:01:01'); -- 年月日 select year('2022-01-01 01:01:01'); -- 年 select month('2022-01-01 01:01:01'); -- 月 select day('2022-01-01 01:01:01'); -- 日 select hour('2022-01-01 01:01:01'); -- 小时 select minute('2022-01-01 01:01:01'); -- 分钟 select second('2022-01-01 01:01:01'); -- 秒 select quarter('2022-10-15 01:01:01'); -- 季度 select weekofyear('2022-10-15 01:01:01'); -- 获取今年的第几周 select `dayofweek`('2022-10-16 01:01:01'); -- 获取今天是周几(1-7) select date_add('2022-10-15',10); -- 将日期向后推10天 select date_add('2022-10-15',-10);-- 将日期向前推10天 select date_sub('2022-10-15',10); -- 将日期向前推10天 select abs(datediff('2022-02-13','2022-10-15')); -- 求日期之间的差值
条件函数
-
if语句
-- 标记每个学生的及格和不及格的情况 select *, if(sscore >= 60,'及格','不及格') as flag from score; -- 同时计算及格和不及格的平均分 select avg(if(sscore>=60,sscore,null)) as avg1,avg(if(sscore<60,sscore,null)) as avg2 from score
-
case when 语句
-
格式1
/* 订单号 金额 支付方式 order_id amount payment_type payment_name 1001 50 1 支付宝支付 1002 80 2 微信支付 1003 60 3 余额支付 1004 40 4 货到付款 */ select case payment_type when 1 then '支付宝支付' when 2 then '微信支付' when 3 then '余额支付' else '货到付款' end as payment_name;
-
格式2
select *, case when sscore >= 90 and sscore <= 100 then '优秀' when sscore >= 80 then '良好' when sscore >= 60 then '一般' else '及格' end as flag from score;
-
类型降转函数
select cast(12.35 as int);
select cast('123' as int);
select cast('2020-12-05' as date);
行转列
-- 1、准备数据
20 SMITH
30 ALLEN
30 WARD
20 JONES
30 MARTIN
30 BLAKE
10 CLARK
20 SCOTT
10 KING
30 TURNER
20 ADAMS
30 JAMES
20 FORD
10 MILLER
-- 2、创建表
create table emp(
deptno int,
ename string
) row format delimited fields terminated by '\t';
-- 3、加载数据
load data local inpath "/export/data/hivedatas/emp.txt" into table emp;
-- 4、实现功能
-- collect_set(去重)/collect_list(不去重) 可以将每一组的ename存入一个集合中(set集合,list集合)
select deptno, collect_list(ename) from emp group by deptno;
select deptno,concat_ws("|",collect_set(ename)) as ems from emp group by deptno;
select deptno, collect_set(ename)[0] from emp group by deptno; -- 用索引访问