SQL的书写顺序
#HiveSQL、PrestoSQL、SparkSQL、FlinkSQL、DorisSQL、HBaseSQL
SQL的执行顺序
Hive的基本查询
查询表行数
1、在Hive的一些版本中,当使用聚合函数来统计时,发现SQL语句没有返回任何结果
2、因为Hive默认是去MySQL的元数据中获取文件的行数、但是元数据中默认行数都是0
3、我们需要设置一个参数,不让SQL去元数据中获取行数,而是执行这条SQL对文件进行处理,自己来统计行数:set hive.compute.query.using.stats=false;(临时生效)
开启Hive的本地模式,加快查询速度
-- Disable automatic column-statistics gathering.
-- NOTE(review): the prose above recommends hive.compute.query.using.stats=false,
-- but this sets a different flag (column stats autogather) -- confirm intent.
set hive.stats.column.autogather=false;
set hive.exec.mode.local.auto=true;--enable local MapReduce mode to speed up small queries
聚合函数
-- 1) Total row count (count)
select count(*)
from t_covid; -- counts every row in the table, regardless of NULLs in any column
select count(1)
from t_covid; -- equivalent to count(*): counts every row
select count(state)
from t_covid;
-- counts only rows where the state column is not NULL (NULL values are skipped)
--2) Maximum number of cases (max)
select max(cases) max_cases
from t_covid;
-- 3) Minimum number of cases (min)
select min(cases) min_cases
from t_covid;
-- 4) Total number of cases (sum)
select sum(cases) total_cases
from t_covid;
-- 5) Average number of cases, rounded to 2 decimal places (avg)
-- fix: alias the expression (avg_cases) for consistency with max_cases/min_cases/total_cases
select round(avg(cases), 2) avg_cases
from t_covid;
Hive的条件查询
-- all covid records for the state of Alaska
select * from t_covid where state = 'Alaska';
-- between is inclusive on both ends: deaths in [1000, 2000]
select *
from t_covid
where deaths between 1000 and 2000;
-- (fix: the identical query was accidentally written twice in a row; duplicate removed)
-- records for Alaska or New Mexico, written two equivalent ways
select * from t_covid
where state = 'Alaska' or state = 'New Mexico';
-- same result expressed with an IN list
select * from t_covid
where state in ('Alaska', 'New Mexico');
-- covid data for states whose name starts with 'S'
select *
from t_covid
where state like 'S%';
-- covid data for states whose name contains a lowercase 's'
select *
from t_covid
where state like '%s%';
select *
from t_covid
where state rlike '[s]';
-- same effect as the LIKE '%s%' query above: rlike matches anywhere in the string
-- states with at least 50000 confirmed cases AND at least 1000 deaths
-- (the predicates use >=, so the boundary values 50000 and 1000 are included)
select *
from t_covid
where cases >= 50000
and deaths >= 1000;
-- covid data for Alaska and New Mexico
select *
from t_covid
where state in ('Alaska', 'New Mexico');
-- covid data for all states except Alaska and New Mexico
select *
from t_covid
where state not in ('Alaska', 'New Mexico');
-- NOTE(review): rows where state IS NULL are also excluded here, because a NULL
-- comparison yields UNKNOWN -- confirm state is never NULL in this data
select *
from t_covid
where deaths not between 1000 and 2000; -- rows outside the inclusive range [1000, 2000]
Hive的join查询
-- Hive join demo: set up teacher/course tables from tab-separated local files
-- fix: IF NOT EXISTS makes the DDL idempotent, so rerunning the script does not fail
create table if not exists teacher (tid string,tname string) row format delimited fields terminated by '\t';
load data local inpath '/root/hive_data/test/teacher.txt' overwrite into table teacher;
-- course(cid, c_name, tid); tid is the join key back to teacher
create table if not exists course (cid string,c_name string,tid string) row format delimited fields terminated by '\t';
load data local inpath '/root/hive_data/test/course.txt' overwrite into table course;
-- 1) inner join
-- returns only the rows that match in both tables
select * from teacher join course ; -- no ON clause: produces the Cartesian product
select * from course;
select * from teacher inner join course on teacher.tid = course.tid;
select * from teacher join course on teacher.tid = course.tid;
select * from teacher , course where teacher.tid = course.tid; -- same result via implicit comma join (discouraged; prefer the explicit JOIN ... ON forms above)
-- 2) outer joins
-- 2.1 left outer join
-- left table drives the result: every teacher row is output; course columns are NULL when there is no match
insert into teacher values ('04','赵六');
select * from teacher;
select * from teacher left join course on teacher.tid = course.tid;
-- 2.2 right outer join
-- right table drives the result: every course row is output; teacher columns are NULL when there is no match
insert into course values ('04','地理','05');
select * from course;
select * from teacher right join course on teacher.tid = course.tid;
-- 2.3 full outer join: all rows from both sides, NULL-padded where unmatched
select * from teacher full join course on teacher.tid = course.tid;
-- template for chaining multiple left joins
-- (A表/B表/C表/D表 are placeholder table names, not real tables -- do not run as-is)
select
*
from A表 a left join B表 b on a.id = b.id
left join C表 c on b.id = c.id
left join D表 d on c.id = d.id;
Hive的排序查询
-- 8.1 order by
-- global sort: Hive funnels all data through a single reducer
select *
from t_covid
order by cases desc
limit 10;
-- distribute by + sort by
-- distribute by partitions rows across reducers; sort by orders rows within each partition
-- here: partition by state, then sort each state's rows by confirmed cases descending
-- reducer assignment: hash(state) % number of reducers
set mapreduce.job.reduces = 55;
-- NOTE(review): overwrite replaces the contents of the local target directory -- confirm the path is safe
insert overwrite local directory '/root/hive_data/export_hive'
select * from t_covid distribute by state sort by cases desc;
-- cluster by
-- 如果你的distribute by和sort by后边的字段相同时,可以使用 cluster by来简化
-- 只能是升序排序
-- cluster by 字段 等价于 distribute by 字段 + sort by 字段
set mapreduce.job.reduces = 10;
select * from t_covid distribute by state sort by cases desc;
-- NOT equivalent to the query above: cluster by state means
-- distribute by state + sort by state (ascending), not sort by cases desc
select * from t_covid cluster by state ;