day05-Hive语法补充

萌神想

于 2024-09-13 18:36:33 发布

阅读量524

点赞数 12

文章标签： hive hadoop 数据仓库大数据 sql

本文链接：https://blog.csdn.net/sxwwudi/article/details/142217557

版权

day05-Hive语法补充

一、表的修改

1-1 alter 关键字

表名和字段的修改

使用 alter 关键字

-- 切换数据库
use ecut;

-- 创建表
create table tb_old(
    id int,
    name string,
    age int,
    gender string
);

修改表名

-- 修改表名
alter table tb_old rename to tb_new;

增加字段

--表中增加字段
alter table tb_new add columns (address string);

字段修改

-- 字段名修改
alter table tb_new change name username string;

字段类型修改

-- 字段类型修改
alter table tb_new change age age string;

字段替换，可以实现删除

-- 字段的替换，可以用来实现删除
alter table tb_new replace columns (
    id int,
    username string,
    gender string,
    address string
);

二、表的数据加载

2-1 方法一：直接插入数据

-- 创建表
create table stu(id int,name string); 

-- 向表中插入数据
insert into stu values(1,'zhangsan’); 
insert into stu values(2,'lisi');

-- 查询数据 
select * from stu;

该方式每次插入都会在表目录中生成对应的数据文件，不推荐使用。

2-2 方法二：load数据加载

Load命令用于将外部数据加载到Hive表中

在Linux虚拟机中创建一个测试文件 stu.txt ，下图为文件内容和文件所在目录

在这里插入图片描述

根据文件内容创建对应的表

create table tb_stu7(
    id int,
    name string,
    gender string,
    age int
)
row format delimited fields terminated by ',';

加载数据

--向表加载数据
load data local inpath '/home/stu.txt' into table tb_stu7;

查看数据是否加载成功

select * from tb_stu7;

在这里插入图片描述

三、表的基本查询

3-1 SELECT查询

使用 select 完成表的查询计算

格式

select 字段 from 表名
join 关联的表 on 关联的字段
where 根据指定的条件过滤表数据
group by 指定分组的字段
having 对分组计算后的数据进行过滤
order by 指定排序的字段 默认是升序 从小到大排序
limit 指定返回多少条数据进行展示

select 使用

查询指定字段

-- 指定查询字段
select id,name,age,gender from tb_stu8;

对字段修改别名

-- 对字段修改别名
select id,name as username,gender from tb_stu8;

在这里插入图片描述

指定一个常量字段数据

-- 指定一个常量字段数据
select id,name as username,666 as data from tb_stu8;

在这里插入图片描述

数据去重，内部执行计算（耗时较长）

-- 数据的去重展示
select distinct gender from tb_stu8;

在这里插入图片描述

3-2 过滤查询（where）

对表进行过滤

-- 等值判断过滤
select * from tb_stu8 where gender = '男';

-- 大小判断
-- 数据类型数据镜像大小判断
select * from tb_stu8 where age<20;
select * from tb_stu8 where age>20;
select * from tb_stu8 where age <= 20;

-- 空值判断 hive中null代表空值
select * from tb_stu8 where name is not null;
select * from tb_stu8 where name is null;

-- 模糊查询
-- 对字符串数据，按照指定字符内容查询
-- % 表示省略多个字符
-- _ 表示一个字符
select * from tb_stu8 where name like '王%';
select * from tb_stu8 where name like '%丽';
select * from tb_stu8 where name like '%小%';
select * from tb_stu8 where name like '王_';

3-3 范围条件过滤（between、in以及or）

-- 范围条件过滤
-- 对数据类型 int float 的数据进行过滤
-- 将18岁~20岁学生数据查询出来
select * from tb_stu8 where age between 18 and 20;

-- 根据指定的数据内容查询
-- 不限制数据类型
select * from tb_stu8 where age in (18,20);  -- 只查询age为18和20的
-- 不在限定范围内
select * from tb_stu8 where age not in (18,19);
select * from tb_stu8 where cls in ('CS','MA');  -- 只查询cls为CS和MA的

-- 多条件过滤
-- and的多条件过滤 多个条件必须都符合，返回对应的数据
-- 查询性别为男性，而且年龄为19岁的学生数据
select * from tb_stu8 where gender = '男' and age=19;
-- or的多条件
select * from tb_stu8 where gender = '男' or age=19;

3-4 group by使用

对数据进行分组计算

不同性别的年龄平均值

先按照性别进行分组，将相同性别的数据放在一起，然后在对相同分组内的年龄进行计算

分组需要配合聚合方法一起使用

聚合方法
sum（计算的字段）-- 求和
avg() – 求平均
max() – 最大值
min() – 最小值
count() – 求一组内数据的数量

使用聚合计算的时候，最好只写 group by后面有的列，不然容易报错

-- 不同性别的年龄平均值
-- hive进行计算时，时间较长
-- 1. 需要转化为MapReduce
-- 2. 需要申请计算资源
select gender,avg(age) from tb_stu8 group by gender;

在这里插入图片描述

-- 多字段分组
-- 不同性别不同年级的年龄平均值
-- 分组时，先按照性别分组，然后在性别内，按照年级分组
select gender,cls,avg(age) from tb_stu8 group by gender, cls;

在这里插入图片描述

聚合方法也可以单独使用

不进行分组，对整个表进行计算

-- 求所有学生的平均年龄
select avg(age) from tb_stu8;

在这里插入图片描述

3-5 having 使用

对分组后的数据进行过滤

having中使用的过滤条件和where的过滤条件一样

HAVING语句的存在弥补了WHERE关键字不能与聚合函数联合使用的不足

-- 查询以cls分组时，平均年龄大于19岁的信息
select gender,cls,avg(age) as ave_age from tb_stu8
group by gender, cls
having ave_age>19;

在这里插入图片描述

3-6 order by排序

指定排序字段

默认升序，从小到大

-- 默认升序 asc
select * from tb_stu8 order by age;

-- 按照年龄从大到小排序(下图为降序) desc
select * from tb_stu8 order by age desc;

在这里插入图片描述

3-7 limit使用

返回指定条数据

-- 返回前三条数据
select * from tb_stu8 limit 3;

分页展示

select * from tb_stu8 limit 页数,每页展示的数量;

-- 页数 m 表示
-- 每页数量 n 表示
-- 分页展示公式
limit (m-1)*n,n 

每页展示3条数据
第1页展示 
limit 0,3
第2页展示 
limit 3,3
第3页展示 
limit 6,3

-- 每页展示3条数据
-- 第1页展示
select * from tb_stu8 limit 0,3;
-- 第2页展示
select * from tb_stu8 limit 3,3;
-- 第3页展示
select * from tb_stu8 limit 6,3;

在这里插入图片描述

四、表的关联（join）

多张表数据合并的方式

列合并
要求两张表至少有一列相同的字段，可以使用相同字段作为关联的依据
关键字 join

4-1 数据准备

-- 2024.9.13 
-- 直接复制粘贴到DataGrip里，运行之后就会生成四个表和基本数据了
USE `ecut`;

-- 表格如果存在则删除
DROP TABLE IF EXISTS `course`;
-- 创建course表
CREATE TABLE `course` (
  `c_id` string ,
  `c_name` string ,
  `t_id` string
) ;
-- 向course中插入数据
insert  into `course`(`c_id`,`c_name`,`t_id`) values ('01','语文','02'),('02','数学','01'),('03','英语','03');

-- 如果存在则删除score表
DROP TABLE IF EXISTS `score`;
-- 创建score表
CREATE TABLE `score` (
  `s_id` string ,
  `c_id` string ,
  `s_score` int
) ;
-- 向score表中插入数据
insert  into `score`(`s_id`,`c_id`,`s_score`) values ('01','01',80),('01','02',90),('01','03',99),('02','01',70),('02','02',60),('02','03',80),('03','01',80),('03','02',80),('03','03',80),('04','01',50),('04','02',30),('04','03',20),('05','01',76),('05','02',87),('06','01',31),('03','03',34),('07','02',89),('07','03',98);

-- 如果存在student表则删除
DROP TABLE IF EXISTS `student`;
-- 创建student表
CREATE TABLE `student` (
  `s_id` string,
  `s_name` string ,
  `s_birth` string ,
  `s_sex` string
);
-- 向student表中插入数据
insert  into `student`(`s_id`,`s_name`,`s_birth`,`s_sex`) values ('01','赵雷','1990-01-01','男'),('02','钱电','1990-12-21','男'),('03','孙风','1990-05-20','男'),('04','李云','1990-08-06','男'),('05','周梅','1991-12-01','女'),('06','吴兰','1992-03-01','女'),('07','郑竹','1989-07-01','女'),('08','王菊','1990-01-20','女');

-- 如果存在teacher表则删除
DROP TABLE IF EXISTS `teacher`;
-- 创建teacher表
CREATE TABLE `teacher` (
  `t_id` string ,
  `t_name` string
) ;
-- 向techer表中插入数据
insert  into `teacher`(`t_id`,`t_name`) values ('01','张三'),('02','李四'),('03','王五');

4-2 四张表的关系

查看表结构

在这里插入图片描述

用DataGrip查看，表的关系

在这里插入图片描述

4-3 表关联

内关联 join

找两表相同的字段数据进行展示

学生表和成绩表采用内联表，关联的字段是s_id

学生表中的s_id是 1,2,3,4,5,6,7,8

成绩表中的s_id是 1,2,3,4,5,6,7

内关联之后，只会显示相同字段数据内容，即 1,2,3,4,5,6,7

-- 内关联
select * from student stu join score s on stu.s_id=s.s_id;

在这里插入图片描述

左关联 left join

以 join 左边的表为主表

进行数据查询时，左表数据全显示，右表字段中不存在左表有的字段，则会以 null 代替

左表学生表中的s_id是 1,2,3,4,5,6,7,8

右表成绩表中的s_id是 1,2,3,4,5,6,7

左关联后，左表的学生信息全显示，右表 1~7 的数据会显示，但是8的数据会被 null 替换（也就是显示为空）

-- 左关联
select * from student stu left join score s on stu.s_id=s.s_id;

在这里插入图片描述

右关联 right join

与左关联同理，只是主表变为右表，其它规则依旧遵循

-- 右关联
select * from student stu right join score s on stu.s_id = s.s_id;

在这里插入图片描述

4-4 on 实现数据过滤

在关联中 on 可以当作 where 进行使用，对关联的数据进行过滤

-- on实现数据过滤（功能与where一样）
select * from student t1 join score t2 on t1.s_id=t2.s_id and t1.s_sex='女';

在这里插入图片描述

4-5 多表关联

四张表通过相同字段 s_id 进行内关联

-- 四张表关联
select * from student t1
join score t2 on t1.s_id=t2.s_id
join course t3 on t3.c_id = t2.c_id
join teacher t4 on t4.t_id = t3.t_id;

在这里插入图片描述

五、表的合并（union）

行合并

要求两张表字段内容完全一样
关键字 union

表tb_stu7中的数据

在这里插入图片描述

表tb_stu8中的数据

在这里插入图片描述

-- union合并  分别查询两张表的数据，然后进行合并
select * from tb_stu7
union
select * from tb_stu8;

在这里插入图片描述

六、表的子查询

将一个查询后的结果（计算过）作为一个新的表数据再次进行查询计算

将一个查询计算的结果作为另一个查询的条件判断

6-1 查询结果作为新的表数据使用

格式

select * from (select * from 子查询) 子查询的表名（自己起一个）

-- 查询数学平均成绩（子查询）
select t2.c_id,c_name,avg(s_score) as avg_data from student t1
    join score t2 on t1.s_id = t2.s_id
    join course t3 on t2.c_id = t3.c_id and t3.c_name='数学'
group by c_name,t2.c_id) tb_avg

-- 筛选数学成绩大于数学平均成绩的学生信息
select * from 
-- 将刚刚的子查询作为一张表进行使用
(
select t2.c_id,c_name,avg(s_score) as avg_data from student t1
    join score t2 on t1.s_id = t2.s_id
    join course t3 on t2.c_id = t3.c_id and t3.c_name='数学'
group by c_name,t2.c_id) tb_avg
	-- 重新关联学生和成绩表，然后筛选数据
    join score t2 on t2.c_id=tb_avg.c_id
    join student t_stu on t_stu.s_id = t2.s_id
-- 筛选成绩大于平均成绩的学生
where t2.s_score > avg_data;

在这里插入图片描述

6-2 将查询结果作为过滤条件

-- 筛选数学成绩大于平均数学的成绩的学生信息
select * from student
join score s on student.s_id = s.s_id and s.c_id='02'
where s.s_score > (
    -- 计算数学平均成绩
    select avg(s_score) as avg_data from student t1
    join score t2 on t1.s_id = t2.s_id
    join course t3 on t2.c_id = t3.c_id and t3.c_name='数学'
group by c_name
);