Hive相关查询

最新推荐文章于 2024-07-21 00:06:14 发布

早安Naor

最新推荐文章于 2024-07-21 00:06:14 发布

阅读量33

点赞数

分类专栏： Hive学习文章标签： hive hadoop 数据仓库

本文链接：https://blog.csdn.net/m0_73500885/article/details/134436545

版权

Hive学习专栏收录该内容

9 篇文章 0 订阅

订阅专栏

语法结构:

SELECT [ALL | DISTINCT] 字段名, 字段名, ...
FROM 表名 [inner | left outer | right outer | full outer | left semi JOIN 表名 ON 关联条件 ]
[WHERE 非聚合条件]
[GROUP BY 分组字段名]
[HAVING 聚合条件]
[ORDER BY 排序字段名 asc | desc]
[CLUSTER  BY 字段名 | [DISTRIBUTE BY 字段名 SORT BY 字段名]]
[LIMIT x,y];

类sql基本查询

数据准备:

-- 创建订单表
CREATE TABLE orders 
(
    orderId bigint COMMENT '订单id',
    orderNo string COMMENT '订单编号',
    shopId bigint COMMENT '门店id',
    userId bigint COMMENT '用户id',
    orderStatus tinyint COMMENT '订单状态 -3:用户拒收 -2:未付款的订单 -1：用户取消 0:待发货 1:配送中 2:用户确认收货',
    goodsMoney double COMMENT '商品金额',
    deliverMoney double COMMENT '运费',
    totalMoney double COMMENT '订单金额（包括运费）',
    realTotalMoney double COMMENT '实际订单金额（折扣后金额）',
    payType tinyint COMMENT '支付方式,0:未知;1:支付宝，2：微信;3、现金；4、其他',
    isPay tinyint COMMENT '是否支付 0:未支付 1:已支付',
    userName string COMMENT '收件人姓名',
    userAddress string COMMENT '收件人地址',
    userPhone string COMMENT '收件人电话',
    createTime timestamp COMMENT '下单时间',
    payTime timestamp COMMENT '支付时间',
    totalPayFee int COMMENT '总支付金额'
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- 加载数据(因为是普通内部表可以直接上传文件到hfds表路径下)

数仓分层思想:

-- 数仓分层: 本质就是分库分表
-- 构建源数据层
create database xls_ods;
-- 构建数数据仓库
create database xls_dw;
-- 构建数据应用层
create database xls_da;

-- 转换应用场景
-- 注意: 在大数据分析中转换完后为了以后方便使用一般存储起来
create table xls_dw.dw_orders as
select
    orderid,
    orderno,
    shopid,
    userid,
    orderstatus,
    goodsmoney,
    delivermoney,
    totalmoney,
    realtotalmoney,
    case
        when payType=0 then '未知'
        when payType=1 then '支付宝'
        when payType=2 then '微信'
        when payType=3 then '现金'
        when payType=4 then '其他'
    end as payType,
    payType,
    username,
    useraddress,
    userphone,
    createtime,
    paytime,
    totalpayfee
from orders;

-- 修改字段类型
alter table dw_orders change orderstatus orderstatus string;
alter table dw_orders change ispay ispay string;
-- 修改后重新修改了,需要覆盖数据
insert overwrite table xls_dw.dw_orders
select
    orderid,
    orderno,
    shopid,
    userid,
    case
        when orderstatus=-3 then '用户拒收'
        when orderstatus=-2 then '未付款的订单'
        when orderstatus=-1 then '用户取消'
        when orderstatus=0 then '待发货'
        when orderstatus=1 then '配送中'
        when orderstatus=2 then '用户确认收货'
    end
    as orderstatus,
    goodsmoney,
    delivermoney,
    totalmoney,
    realtotalmoney,
    case
        when payType=0 then '未知'
        when payType=1 then '支付宝'
        when payType=2 then '微信'
        when payType=3 then '现金'
        when payType=4 then '其他'
    end as payType,

    case
        when isPay=0 then '未支付'
        when isPay=1 then '已支付'
    end as isPay,
    username,
    useraddress,
    userphone,
    createtime,
    paytime,
    totalpayfee
from orders;

课堂练习:

-- 基础查询格式: select distinct 字段名 from 表名;
--     			注意: *代表所有字段  distinct去重  as给表或者字段起别名且可以省略
-- 指定字段查询
select userName,userPhone from orders where userName='邓力夫';
-- 指定字段并且取别名查询
select distinct userName name,userPhone phone from orders where userName='邓力夫';
-- 当然也可以给表起别名(目前单表即使起了也没有多大意义)
select o.userName ,o.userPhone  from orders as o ;
-- 查询支付类型要求去重
select distinct payType from orders;


-- 2.演示where条件查询
-- 查询广东省订单
drop table if exists da_gd_orders;
create table da_gd_orders as
select * from orders where userAddress like '广东省%';


-- 3.演示聚合查询
-- 查询广东省数据量
select count(*) from orders where userAddress like '广东省%';

-- 4.演示分组查询
-- 注意: select后的字段要么在groupby后出现要么在聚合函数内出现,否则报错
-- 统计已支付和未支付各自多少人
select isPay,count(*) cnt from orders group by isPay;

-- 5.演示条件查询,聚合查询,分组查询综合练习
-- 在已支付订单中,统计每个用户最高的一笔消费金额
select userId, username, max(realTotalMoney)
from orders
where isPay = 1
group by userId, username;
-- 统计每个用户的平均消费金额
select userId, username, avg(realTotalMoney)
from orders
where isPay = 1
group by userId, username;
-- 统计每个用户的平均消费金额并且筛选大于10000的
select userId, username, avg(realTotalMoney) as avg_money
from orders
where isPay = 1
group by userId, username
having avg_money > 10000;
-- 统计每个用户的平均消费金额并且筛选大于10000的,平均值要求保留2位小数
select userId, username,round(avg(realTotalMoney),2)
from orders
where isPay = 1
group by userId, username
having round(avg(realTotalMoney),2) > 10000;

-- 6.演示排序查询
-- asc默认升序  desc 降序
-- 查询广东省订单,要求按照总价降序排序
select * from orders where userAddress like '广东省%' order by realTotalMoney desc;

-- 7.演示分页查询
-- limit x,y  注意: x和y都是整数,x是从0开始起始索引,y是查询的条数
-- 查询广东省订单总价最高的前5个订单
select * from orders where userAddress like '广东省%' order by realTotalMoney desc limit 5;

类sql多表查询

数据准备:

-- 创建用户表
CREATE TABLE users (
    userId int,
    loginName string,
    loginSecret int,
    loginPwd string,
    userSex tinyint,
    userName string,
    trueName string,
    brithday date,
    userPhoto string,
    userQQ string,
    userPhone string,
    userScore int,
    userTotalScore int,
    userFrom tinyint,
    userMoney double,
    lockMoney double,
    createTime timestamp,
    payPwd string,
    rechargeMoney double
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- 加载数据
load data inpath '/source/itheima_users.txt' into table users;
-- 验证数据
select * from users limit 1;

练习:

-- 交叉连接格式: select 字段名 from 左表 cross join 右表;
-- 			注意: 交叉连接产生的结果叫笛卡尔积,此种方式慎用!!!
select * from users cross join orders;
-- 内连接格式: select 字段名 from 左表 inner join 右表 on 左右表关联条件;
-- 			特点: 相当于只取两个表的交集
select * from users u inner join orders o on u.userId=o.userId ;
-- 左外连接格式:  select 字段名 from 左表 left outer join 右表 on 左右表关联条件;
-- 			特点: 以左表为主,左表数据全部保留,右表只保留和左表有交集的部分
select * from users u left outer join orders o on u.userId=o.userId ;
-- 右外连接格式:  select 字段名 from 左表 right outer join 右表 on 左右表关联条件;
-- 			特点: 以右表为主,右表数据全部保留,左表只保留和右表有交集的部分
select * from users u right outer join orders o on u.userId=o.userId ;

-- 自连接: 本质是一个特殊的内外连接,最大特点就是左右表是同一个表
-- 			应用场景: 比较局限,场景1: 存储省市县三级数据的区域表   场景2: 存储上下级信息的员工表
-- 可以运行下基础班的areas.sql脚本,做以下练习
-- 方式1: 建议
select xian.title
from
    (select * from areas  where title = '北京市' and pid is not null) city
join
    areas xian
on city.id = xian.pid;
-- 方式2:
select xian.title
from
    areas city
join
    areas xian
on city.id = xian.pid
where city.title = '北京市' and city.pid is not null;
-- 子查询: 本质是一个select语句作为另外一个select语句的一部分(表或者条件)
-- 			注意: 子查询作为表使用的话必须取别名
;
select title
from areas
where pid = (select id from areas where title = '北京市' and pid is not null);

hive其他join操作

在Hive中除了支持cross join（交叉连接，也叫做笛卡尔积）
inner join（内连接）
left outer join（左外连接）
right outer join（右外连接）
还支持full outer join（全外连接）
left semi join（左半开连接）

示例:

-- hive不同于mysql的join操作
-- 全外连接(左表 full outer join 右表 on 条件) 大白话就是左外和右外结果合并同时去重
select * from users u full outer join orders o on u.userId = o.userId;

-- 左半开连接(左表 left semi join 右表 on 条件) 大白话就是内连接的一半
select * from users u left semi join orders o on u.userId = o.userId;

hive其他排序操作

示例:

-- 演示4个by区别
-- 创建表
create table students(
    id int,
    name string,
    gender string,
    age int,
    cls string
)row format delimited
fields terminated by ',';
-- 加载数据
load data inpath '/source/students.txt' into table students;
-- 验证数据
select * from students limit 1;


-- 查询reduces的数量
set mapreduce.job.reduces; -- -1代表根据任务实时改变
-- 1.cluster by 字段名 查询的时候分桶且排序
-- 注意: 如果是1个reduces那么cluster by全局升序排序
select * from students cluster by id;
-- 修改reduces数量为3
set mapreduce.job.reduces=3;
-- 再次使用cluster by查询,查看效果
-- 效果: 如果多个reduces那么cluster by桶内局部排序
select * from students cluster by age;


-- 2.distribute by + sort by
-- 设置reduces的数量为-1
set mapreduce.job.reduces = -1;
-- 默认1个ruduces数量,使用distribute by + sort by查询观察结果
-- 注意: 如果是1个ruduces那么distribute by + sort by全局排序
select * from students distribute by name sort by age desc;
-- 修改reduces数量
set mapreduce.job.reduces = 2;
-- 再次distribute by + sort by查询
-- 效果: 如果多个redueces,那么distribute by 分reduces数量个桶,sort by桶内局部排序
select * from students distribute by name sort by age desc;


-- 3.order by
-- 注意: order by 永远都是全局排序,不受reduces数量影响,每次只用1个reduces
select * from students order by age desc;

抽样查询

示例:

-- 随机抽样函数 tablesample
-- 参考字段分桶抽样,快且随机
select * from orders tablesample ( bucket 1 out of 10 on orderid);
-- 参考rand()随机数,快且真正达到随机
select * from orders tablesample ( bucket 1 out of 10 on rand());


-- 快速取前面部分数据 : 快但没有随机
-- 前100条
select  * from orders tablesample ( 100 rows );
-- 前10%数据
select  * from orders tablesample ( 10 percent );
-- 取1k或者1m的数据
select  * from orders tablesample ( 16k );
select  * from orders tablesample ( 167k );
select  * from orders tablesample ( 1m );

-- 随机取100条: 随机但是不快
select * from orders distribute by rand() sort by rand() limit 100;

正则模糊查询

sql模糊查询关键字: like      任意0个或者多个:  %     任意1个: _    

正则模糊查询关键字: rlike     任意0个或者多个: .*     任意1个: .     正则语法还有很多...

-- 正则表达式查询
-- 1.查询广东省数据
-- sql模糊查询
select * from orders where userAddress like '广东省%';
-- 正则模糊查询
select * from orders where userAddress rlike '广东省.*';

-- 2. 查询满足'xx省 xx市 xx区'格式的信息
-- sql模糊查询
select * from orders where userAddress like '__省 __市 __区';
-- 正则模糊查询
select * from orders where userAddress rlike '..省 ..市 ..区';

-- 3.查询所有姓张王邓的用户信息
-- sql模糊查询
select * 
from orders 
where username like '张%' or username like '王%' or username like '邓%' ;
-- 正则模糊查询
select * from orders where username rlike '[张王邓].*';
select * from orders where username rlike "[张王邓].+";

-- 4.查找所有188开头的手机号
-- sql模糊查询
select * from orders where userPhone like '188________' ;
-- 正则模糊查询
select * from orders where userPhone rlike '188........' ;
select * from orders where userPhone rlike '188.{8}' ;
select * from orders where userPhone rlike '188\\*{4}[0-9]{4}' ;
select * from orders where userPhone rlike '188\\*{4}\\d{4}' ;

union联合查询

示例:

-- 插入数据
insert into product values('p1','联想','c1'),('p2','小米','c2'),('p3','华为',null);
-- 创建分类表
create table category(
    cid varchar(100),
    cname varchar(100)
);
-- 插入数据
insert into category values('c1','电脑'),('c2','手机'),('c3','服饰');

-- 1.如果在mysql中,使用union实现全外连接
-- 使用union关键字,自动去重
-- 左外 union 右外
select pid,pname,p.cid,cname from product p left join category c on p.cid = c.cid
union
select pid,pname,c.cid,cname from product p right join category c on p.cid = c.cid;


-- 注意: 如果不想去重使用 union all
-- 左外 union all 右外
select pid,pname,p.cid,cname from product p left join category c on p.cid = c.cid
union  all
select pid,pname,c.cid,cname from product p right join category c on p.cid = c.cid;

-- 2.在hive中使用full outer join实现全外连接
select pid,pname,c.cid,cname from product p full join category c on p.cid = c.cid;

CTE表达式

CTE: 公用表表达式（CTE）是一个在查询中定义的临时命名结果集将在from子句中使用它。
注意: 每个CTE仅被定义一次（但在其作用域内可以被引用任意次）,仅适用于当前运行的sql语句
语法如下:
    with 临时结果集的别名1  as (子查询语句),
   		 临时结果集的别名2	as (子查询语句)
   		 ...
    select 字段名 from (子查询语句);

内置虚拟列

示例:

-- 演示内置虚拟列
-- 打开ROW__OFFSET__INSIDE__BLOCK
SET hive.exec.rowoffset=true;
-- 演示查询
SELECT *, 
	INPUT__FILE__NAME, 
	BLOCK__OFFSET__INSIDE__FILE, 
	ROW__OFFSET__INSIDE__BLOCK 
FROM students_bucket;