一、基本查询(select…from…)
列查询
查看全部列,或只查看指定的某几列
-- Full-table query: `*` returns every column.
select *
from tableA;

-- Full-table query restricted to one value of the date partition column.
-- `date` is a reserved word in Hive (1.2+), so it is backtick-quoted when
-- used as a column name.
select *
from tableA
where `date` = 20210205;

-- Full-table query returning at most 100 rows.
select *
from tableA
limit 100;

-- Query only the listed columns.
select
    uid,
    id_type
from tableA;

-- Query the listed columns and rename them in the result set.
select
    uid as col1,
    id_type as col2
from tableA;

-- DESC (short for DESCRIBE) shows the table's column definitions.
desc tableA;
过滤语句 使用where进行过滤
-- Restrict rows with conditions in the where clause.
-- `date` is a reserved word in Hive (1.2+), hence the backticks; string
-- literals use single quotes, which every SQL dialect treats as strings.
select *
from tableA
where `date` = 20210205
    and uid = '12345';

-- Select a range of dates; both bounds are inclusive.
select *
from tableA
where `date` >= 20210205
    and `date` <= 20210214;

-- Keep only rows whose uid is not null.
select *
from tableA
where uid is not null;

-- Logical operators `and` / `or`: in Hive SQL `and` binds more tightly than
-- `or` (the author covers this in a separate blog post), so wrap `or` groups
-- in parentheses whenever they are combined with `and`.
-- Conjunction: both conditions must hold.
select *
from tableA
where `date` = 20210205
    and uid is not null;

-- Disjunction: either condition may hold.
select *
from tableA
where uid = '12345'
    or uid = '678910';
数据量查询 pv uv
使用distinct 进行去重
-- PV: count every row, duplicates included.
select
    count(*)
from tableA;

-- UV for one column: count the distinct uid values.
select
    count(distinct uid)
from tableA;
二、group by 语句
-- Per uid: the set of distinct id_type values in the group.
select
    uid,
    collect_set(id_type)
from tableA
group by uid;

-- Per uid: how many rows in the group have a non-null id_type.
select
    uid,
    count(id_type)
from tableA
group by uid;

-- Per uid: the largest id_type in the group.
select
    uid,
    max(id_type)
from tableA
group by uid;
使用 collect_set 可以查看某个取值对应另一列的全部取值,也可以使用 collect_list,但是 collect_list 不会去重。
-- For each cate1, collect the distinct cate2 values found on the given date.
-- collect_list can be used instead when duplicates should be kept.
-- `date` is a reserved word in Hive (1.2+), so it is backtick-quoted.
select
    cate1,
    collect_set(cate2)
from tableA
where `date` = 20210119
group by cate1;
三、join 语句
join 默认是内连接
-- Inner join: keep only tableA rows that have a tableB row with the same
-- id and name. A bare `join` means the same thing; `inner` is spelled out
-- here for clarity.
select
    tableA.id,
    tableA.name
from tableA
inner join tableB
    on tableA.id = tableB.id
    and tableA.name = tableB.name;
- inner join 内连接
- left join 左外连接
- right join 右外连接
- full join 满外连接
五、order by 排序语句
order by 默认升序
1). ASC(ascend) 升序(默认); DESC(descend) 降序
2). order by子句在select语句结尾
-- Employees sorted by salary, lowest first (ascending is the default order).
select *
from emp
order by sal asc;

-- Employees sorted by salary, highest first.
select *
from emp
order by sal desc;

-- Sorting on several columns:
-- department first, then salary within each department, both ascending.
select
    ename,
    deptno,
    sal
from emp
order by
    deptno asc,
    sal asc;

-- Sorting on a column alias:
-- order employees by twice their salary.
select
    ename,
    sal * 2 as twosal
from emp
order by twosal asc;
七、case 连续取值列分桶
按照tableA的amount列取值分桶
-- Histogram of tableA.amount: one output column per bucket.
-- Each case yields 1 only for rows inside its half-open range [low, high)
-- and NULL otherwise; count() skips NULLs, so every row is counted in at
-- most one bucket. Rows whose amount is NULL match no bucket.
select
    count(
        case
            when amount < 100 then 1
        end
    ) as `[0,99]`,
    count(
        case
            when amount >= 100 and amount < 200 then 1
        end
    ) as `[100,199]`,
    count(
        case
            when amount >= 200 and amount < 400 then 1
        end
    ) as `[200,399]`,
    count(
        case
            when amount >= 400 and amount < 1000 then 1
        end
    ) as `[400,999]`,
    count(
        case
            when amount >= 1000 then 1
        end
    ) as `[1000,null]`
from tableA;
也可以使用 between and 语句选择在某个区间段内的数据,但是 between and 是左闭右闭的(两端的边界值都包含在内),相邻区间如果共用同一个边界值,该值会被两个区间同时统计,容易造成重复。