Hive 数仓工具(3) 插入加载导入导出数据分区表操作表的修改操作查询对null处理基础语法关联查询二级分区 where group by having order limit

本文链接：https://blog.csdn.net/weixin_43139004/article/details/106891220

本文详细介绍了Hive的数据操作，包括插入数据的多种方式，如插入新数据、覆盖表数据以及通过SELECT语句插入；加载和导出数据，包括使用HDFS命令和Hive的export功能；分区表的概念和操作，以及如何进行表的修改，如重命名、更新列等；查询语句的使用，包括基础查询、分区查询、关联查询、分组和排序。此外，还讨论了如何处理NULL值以及数据导出到本地和HDFS的选项。

摘要由CSDN通过智能技术生成

编程 -->数据 + 运算(JAVA SQL)
hive:数据查看工具,不是数据库服务(HQL -->MR)
核心组件
数据(结构,位置) -->hdfs
表(结构数据类型) 元数据 -->mysql
插入数据 insert方式
1建表
2插入数据
每次insert会在hdfs中生成一个小文件
-- Single-row insert, then a multi-row insert in one statement.
insert into tb_xxxx values (1, "zss");
insert into tb_xxx values (1, "lss"), (2, "ls");
将后面的select运算结果保存到某个表中
-- Append the result of a select (id shifted by 10) to tb_xxxx1.
insert into tb_xxxx1 select id+10, name from tb_xxxx;
插入覆盖表
-- Replace the contents of tb_xxxx1 with the result of the select.
insert overwrite table tb_xxxx1 select id+10, name from tb_xxxx;
插入数据 as select 方式
将结果数据直接保存到一个新表中全量
create table if not exists tb_name as select id, name from tb_xxxx;
create table if not exists tb_new
as select
cate,
avg(price) as avg_price
from tb_product
group by cate;
加载数据 import(不常用)
对数据有要求,必须是先用export导出的数据才能导入
导出数据到HDFS中
-- Export tb_product to the given HDFS directory.
export table tb_product to "/user/hive/warehouse/export/name";
导入数据方式
-- Import from the same directory the export statement wrote to.
import table tb_name from "/user/hive/warehouse/export/name";
数据导出
1)数据导出到本地
将查询的结果导出到本地
-- Write the query result to a local directory (no explicit field delimiter).
insert overwrite local directory "/opt/xxxxxx" select * from tb_name;
将查询的结果格式化导出到本地
-- Write the query result to a local directory as tab-separated text.
insert overwrite local directory "/opt/xxxxx"
row format delimited fields terminated by "\t" select * from tb_name;
2)数据导出到HDFS
没有指定分隔符
-- Write the query result to an HDFS directory (no explicit field delimiter).
insert overwrite directory "/opt/xxxxxx" select * from tb_name;
指定分隔符格式化
-- Write the query result to an HDFS directory as tab-separated text.
insert overwrite directory "/opt/xxxxx"
row format delimited fields terminated by "\t" select * from tb_name;
3)使用HDFS提供的命令
1)hdfs dfs -get xxxxx
2)hive命令行中 dfs -get /user/hive/warehouse/db_name/tb_name/0000_0 /hive/phone
4)shell hive命令导出
# Run an inline HQL script and append its output to a local text file.
hive -e "use db_name; select * from tb_name;" >> /opt/xxxx.txt;
hive -f hive.sql(将sql命令写到文件中)
使用hive shell 脚本调用sql任务时使用
5)export 导出数据(default hive)> export table tb_name to "/user/hive/xxxx";
6)sqoop数据迁移工具将mysql中的数据导入到HDFS中(hive表的目录下)
将HDFS中的结构化数据导入到mysql的表中
用来结构化数据的数据迁移
命令底层 MR程序

分区表
hive出来的数据都在HDFS中
select *from tb_name;
查询表中的数据是加载HDFS文件夹下的数据
文件夹下的数据很多
select * from tb_name where dt = "2020-06-17"; 查询昨天数据
select * from tb_name where dt = "2020-06-18"; 查询今天数据
按照日期维度查询全局检索数据过滤慢
分区表将数据分文件夹管理静态分区
tb_name/
2020-06-17/
文件1
文件2
2020-06-18/
文件3
2020-06-19/
文件4

-- Order table partitioned by the string column `dy`.
-- The partition column is declared only in `partitioned by`, not in the
-- regular column list; data files are comma-separated text.
create table if not exists tb_p_order(
oid int,
dt string,
cost double
)
partitioned by (dy string)
row format delimited fields terminated by "," ;
加载数据到分区表
load data local inpath "/hive/data/06-18.txt" into table tb_p_order partition(dy="06-18");
load data local inpath "/hive/data/06-19.txt" into table tb_p_order partition(dy="06-19");
查询分区表中数据
select * from tb_name where dy="06-18";
增加分区
-- Add two distinct partitions in one statement (specs are space-separated).
alter table tb_name add partition(dy="06-16") partition(dy="06-17");
删除分区
alter table tb_name drop partition (dy="06-17");
-- Drop two distinct partitions in one statement (specs are comma-separated);
-- `if exists` keeps the statement from failing when one is already gone.
alter table tb_name drop if exists partition (dy="06-16"), partition (dy="06-17");
查看分区
show partitions tb_p_order; 
查看分区表结构
desc formatted tb_name;

表的修改操作
修改表名
alter table tb_name rename to new_tb_name;
更新列
alter table tb_name change [column] col_old_name col_new_name column_type
增加和替换列
alter table tb_name add|replace columns (col_name data_type,…)
删除表
drop table tb_name;
查询表语句
show functions; 查看表可以使用的函数
select
expr[字段, 常量, 运算, 函数]
id, name, price,
"hello",
price*2 as double_price,
pnum+10 as n_num,
current_date(),
sum(),
max(),
自定义函数()
from
tb_name;

原始数据在hive文档中
-- Employee table backed by tab-separated text, loaded from a local file.
-- NOTE(review): `mrg` looks like a typo for `mgr` — confirm before renaming.
create table if not exists tb_emp(
empno int,
ename string,
job string,
mrg int,
dt string,
sal double,
comm double,
deptno int
)
row format delimited fields terminated by "\t";
load data local inpath "/hive/data/emp.txt" into table tb_emp;

-- Department table (external) backed by tab-separated text, loaded from a local file.
-- NOTE(review): `loc` is declared int; if dept.txt holds location names they
-- would load as NULL — confirm against the data file.
create external table if not exists tb_dept(
deptno int,
dname string,
loc int
)
row format delimited fields terminated by "\t";
load data local inpath "/hive/data/dept.txt" into table tb_dept;
1查询每个人的姓名和工作以及工资
select
ename,
job,
sal
from
tb_emp;
2查询奖金大于0的人
select
empno,
ename,
comm
from
tb_emp
where comm is not null
and comm >0;
3查询每个人的姓名、工作和工资(包括奖金)
任何数字和null运算结果都是null
select
ename,
job,
(sal+nvl(comm,0)) as total_sal
from
tb_emp;

select current_date(); -- 查询当前日期
select nvl(xxxx, 0) from tb_name
对null的处理
nvl(要判断的字段, 字段为null时使用的默认值) 如果xxxx是null,则为0;
4求每个部门的平均工资
4.1每个人的工资和部门名称
select
tb_emp.ename,
tb_emp.sal,
tb_emp.deptno,
tb_dept.dname
from
tb_emp
join
tb_dept
on tb_dept.deptno=tb_emp.deptno;
4.2按照部门名称分组求平均工资
-- Average salary per department name.
with emp_dept as (
    -- Attach the department name to every employee row.
    select
        e.ename,
        e.sal,
        e.deptno,
        d.dname
    from tb_emp as e
    inner join tb_dept as d
        on e.deptno = d.deptno
)
select
    dname,
    avg(sal) as avg_dpt_sal
from emp_dept
group by dname;
5求每个部门下每个岗位的平均工资
5.1 姓名工作工资部门
-- Average salary per (department name, job), sorted by department name.
with emp_dept as (
    -- Attach the department name to every employee row.
    select
        e.ename,
        e.job,
        e.sal,
        e.deptno,
        d.dname
    from tb_emp as e
    inner join tb_dept as d
        on e.deptno = d.deptno
)
select
    dname,
    job,
    avg(sal) as avg_dep_job_sal
from emp_dept
group by dname, job
order by dname;
6求每个部门下每个岗位的平均工资且每个部门每个岗位平均薪资在2000以上的
-- Average salary per (department name, job), keeping only groups whose
-- average exceeds 2000, sorted by department name.
with emp_dept as (
    -- Attach the department name to every employee row.
    select
        e.ename,
        e.job,
        e.sal,
        e.deptno,
        d.dname
    from tb_emp as e
    inner join tb_dept as d
        on e.deptno = d.deptno
)
select
    dname,
    job,
    avg(sal) as avg_dep_job_sal
from emp_dept
group by dname, job
having avg_dep_job_sal > 2000
order by dname;

where和having的区别
where是对原始数据(分组前的行)的过滤
having是对分组group by后结果的过滤

select
from
tbl
where
group by 字段1,字段
having
order by 字段1,字段2
limit
join
on

left join
right join
full join
union
union all

关联查询
1 join 语法: 在hive中使用join进行内关联时一定要避免笛卡尔积
on 条件拼接等值连接, 不允许不等值的连接
select
*
from
tb_emp
join
tb_dept
on
tb_dept.deptno=tb_emp.deptno;
-- order by tb_emp.deptno

left join
right join
full join

select
*
from
tb_emp
left join
tb_dept
on
tb_dept.deptno=tb_emp.deptno;

select
*
from
tb_emp
right join
tb_dept
on
tb_dept.deptno=tb_emp.deptno;

select
*
from
tb_emp
full join
tb_dept
on
tb_dept.deptno=tb_emp.deptno;

使用union时, union两边查询字段个数和数据类型要一样
union 去重
union all 不去重

select * from tb_product
union
select * from tb_product;

基本语法
select * from emp;
select empno, ename from emp;
select ename AS name, deptno dn from emp;
select sal + 1 from emp;
求总行数（count）
求工资的最大值（max）
求工资的最小值（min）
求工资的总和（sum）
求工资的平均值（avg）
select * from emp limit 2, 5;
参数一起始的行数 0开始计数参数2 每页显示的条数
select * from emp where sal >1000;
select * from emp where sal between 500 and 1000;
select * from emp where comm is null;
select * from emp where sal IN (1500, 5000);
-- Rows whose salary, compared as text, starts with the character 2.
select * from emp where sal LIKE '2%';
select * from emp where sal>1000 and deptno=30;
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;
select deptno, avg(sal) avg_sal from emp group by deptno having
avg_sal > 2000;
select * from emp order by sal desc;

分区表(二级分区)
将数据分文件夹管理静态分区
-- Two-level partitioned table: first level `y`, second level `m`.
-- Data files are comma-separated text.
create table tb_partition22(
id int,
name string,
gender string,
birthday string
)
partitioned by (y string, m string)
row format delimited fields terminated by ",";
-- Each load targets one static (y, m) partition; every statement is terminated
-- so the block can be run as a script.
load data local inpath "/hive/data/a.txt" into table tb_partition22 partition(y="90",m="01");
load data local inpath "/hive/data/b.txt" into table tb_partition22 partition(y="90",m="02");
load data local inpath "/hive/data/c.txt" into table tb_partition22 partition(y="95",m="01");
load data local inpath "/hive/data/d.txt" into table tb_partition22 partition(y="95",m="02");