Hive 核心知识点

最新推荐文章于 2025-02-27 16:45:41 发布
未来世界～未来之星
最新推荐文章于 2025-02-27 16:45:41 发布
阅读量822
点赞数
分类专栏： hadoop Hive 大数据文章标签： hive hadoop mysql
本文链接：https://blog.csdn.net/qq_43256584/article/details/121010161
版权
大数据同时被 3 个专栏收录
9 篇文章 0 订阅
订阅专栏
hadoop
8 篇文章 0 订阅
订阅专栏
Hive
2 篇文章 0 订阅
订阅专栏
本文介绍了如何初始化Hive Metastore，启动Hive及Hadoop，进行数据加载、表结构操作，创建分区表、视图，并演示了Flume日志处理与外部分区表的应用。涵盖了SQL查询、数据安全与数据分析等关键步骤。
摘要由CSDN通过智能技术生成
--初始化Hive的Metastore命令
bin/ schematool -dbType mysql -initSchema 

--hive启动
--1 bin/hive
--启动hive需要先启动hadoop
---先hiveserver2
--后 bin/beeline -u jdbc:hive2://localhost:10000
--关闭防火墙
systemctl stop firewalld.service
--hdfs文件授权
hadoop fs -chmod 777 /tmp/hadoop-yarn

--设置全部权限hdfs dfs -chmod -R 777 /tmp

--#查看历史命令
more ~/.hivehistory

SELECT  *
FROM    t2_bak;



--从本地导数据到hive
load data local inpath '/data/soft/hivedata/t2.data' into table t2_bak;
--查询数据
select * from t2_bak;
--添加一个字段
alter table t2_bak add columns (name string);
--创建一个表
create table t2(age int comment '年龄')comment '测试';
--显示表的详细信息
show create table t2;
--查看表的信息
desc t2;
--登录数据库
mysql -uroot -pAdmin@123
--创建数据库并且添加注释
create table t2(
    age int comment '年龄'
)comment '测试'
--直接这样是会产生中文乱码的，接下来我们解决乱码问题
--1、登录mysql数据库mysql -uroot -pAdmin@123
--2、use hive;
--3、show create table COLUMNS_V2; 
--4、show create table TABLE_PARAMS;
--5、alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
--6、alter table TABLE_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
--7、alter table PARTITION_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
--8、lter table PARTITION_KEYS modify column PKEY_COMMENT varchar(4000) character set utf8;
--这样就修改了hive的编码了，修改为utf8


--创建一个多字段的的表，接下来我们就创建一个多字段的表t3

create table t3(
    id int comment 'ID',
    stu_name string comment 'name',
    stu_birthday date comment 'birthday',
    online boolean comment 'is online'
);

-- 创建一个表，表中的字段数据类型为,array的使用

create table stu(
    id int,
    name string,
    favors array<string>
)row format delimited
fields terminated by '\t'
collection items terminated by ','
lines terminated by '\n';

--Map的使用
--建表语句如下：指定scores字段类型为map格式 
--通过collection items terminated by ','指定了map中元素之间的分隔符 
--通过map keys terminated by ':'指定了key和value之间的分隔符

create table stu2(
    id int,
    name string,
    scores map<string,int>
)row format delimited
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n';

-- 查询所有学生的语文和数学成绩
select id,name,scores['chinese'],scores['math'] from stu2;

select id,name,scores['chinese'] as chinese,scores['math'] as math,scores['english'] as english from stu2;
-- 再来介绍最后一种复合类型struct，有点像java中的对象，举个例子说明一下， 
--某学校有2个实习生，zhangsan、lisi，每个实习生都有地址信息，一个是户籍地所在的城市，一个是公
-- 司所在的城市，
-- 我们来组织一下数据

create table stu3(
    id int,
    name string,
    address struct<home_addr:string,office_addr:string>
)row format delimited
fields terminated by '\t'
collection items terminated by ','
lines terminated by '\n';

-- 案例：
-- 复合数据类型的使用
create table student(
    id int comment 'id',
    name string comment 'name',
    favors array<string>,
    scores map<string,int>,
    address struct<home_addr:string,office_addr:string>
)row format delimited
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n';



--创建一个外部表
create external table external_table(
    key string
)location '/data/external';
--按照上面的分析，我们来创建一个分区表，使用partitioned by指定区分字段，分区字段的名称为dt，类
--型为string 
create table partition_1 ( 
id int, 
name string  
) partitioned by (dt string) 
row format delimited 
fields terminated by '\t'; 


--向分区表中加载数据
load data local inpath '/data/soft/hivedata/partition_1.data' overwrite into table partition_1 partition(dt='2021-10-27');
--当然我也可以手动在表中只创建分区
alter table partition_1 add partition (dt='2021-89-56')

--查看分区表有那些 
show partitions partition_1;
--删除分区（删除分区以后呢，表中的数据也会被删除，因为是内部表）
alter table partition_1 drop partition(dt='2020-01-02'); 
alter table partition_1 drop partition(dt='2021-10-27'); 
--需要创建多个分区
create table partition_2(
    id  int,
    name string
)partitioned by (year int,school string)
row format delimited
fields terminated by '\t';

--加载数据的时候数据只需要有创建表的时候的字段就行了，具体的分区的数据我们需要在加载数据时进行指定
load data local inpath '/data/soft/hivedata/partition_2.data' 
into table partition_2 partition (year='2021',school='english');

--查询分区的数据
-- hive中的分区相当于一个查询条件，直接在后面加上一个where即可
select * from partition_2;--全表扫描
select * from partition_2 where year=2020;--用到了一个分区
select * from partition_2 where year=2020 and school='xk';--用到了两个字段进行过滤

--外部分区表
--创建一个外部分区表
create external table ex_par(
    id int,
    name string
)partitioned by (dt string)
row format delimited
fields terminated by '\t'
location '/data/ex_par';

--加载数据
load data local inpath '/data/soft/hivedata/ex_par.data' into table ex_par partition (dt='2020-12-12');
--删除分区
alter table ex_par drop partition(dt='2020-12-12');

-- 如果数据已经上传上去了，但是查不到是因为没有在表中添加分区信息，（使用前面的alter table ex_par add partition 命令），这里需要通过location指定分区目录
alter table ex_par add partition(dt='2020-12-12') location '/data/ex_par/dt=2020-12-12';
--添加分区(绑定数据和分区之间的关系)


--桶表 对数据进行哈希取值，然后放到不同文件中存储，物理上，每个桶就是一个表里的一个文件
--创建 一个桶表，这个表是按照id进行分桶的，分成4个桶
create table bucket_tb(
    id int
)clustered by (id) into 4 buckets;

insert into table … select … from …;

--在插入数据之前需要开启设置桶的操作，不然数据是无法分到不同的桶里面的，其实这里的分桶就是设置reduce任务的数量，最终结果中文件的数量和reduce任务数量是挂钩的
--设置 
set hive.enforce.bucketing =true--自动控制reduce的数据从而适配bucket的个数
--初始化一个表，用于向桶表中加载数据
create table b_source(id int); 

--向桶表中加载数据
insert into table bucket_tb select id from b_source;
insert into table bucket_tb select id from b_source where id =1;


--创建视图
create view v1 as select t3_new.id,t3_new.stu_name from t3_new;
--注意：视图在/user/hive/warehouse中是不存在的。因为它只是一个虚拟的表。 视图只是一张虚表


--综合案例
-- 需求：Flume按天把日志数据采集到HDFS中的对应目录中，使用SQL按天统计每天数据的相关指标 
--分析一下：
--Flume按天把日志数据保存到HDFS中的对应目录中 
--针对Flume的source可以使用execsource、channel可以使用基于文件的或者内存的，sink使用
--hdfssink，在hdfssink的path路径中需要使用%Y%m%d获取日期，将每天的日志数据采集到指定的hdfs
--目录中
--这个是我们在前面学习Flume的时候已经讲过的了，这个倒不难
--后面就是需要对按天采集的日志数据建表，由于这份数据可能会被多种计算引擎使用，所以建议使用外部
--表，这样就算我们不小心把表删了，数据也还是在的，不影响其他人使用，还有就是这份数据是按天分目
--录存储的，在实际工作中，离线计算的需求大部分都是按天计算的，所以在这里最好在表中增加日期这个
--分区字段，所以最终决定使用外部分区表。
--前面FLume采集数据的流程我们就不再演示了，在这我就直接使用之前我们使用hdfs落盘的数据了。 
--我们之前有一个案例是分类型，分目录，把多种类型的数据存储到不同的目录下
--目录结构是这样的，首先是按天，然后是按照类型
--创建表
create external table ex_par_more_type(
    log string
)partitioned by(dt string,d_type string)
row format delimited
fields terminated by '\t'
location '/moreType';
--添加分区
alter table ex_par_more_type add partition(dt='20211022',d_type='giftRecord') location '/moreType/20211022/giftRecord';
alter table ex_par_more_type add partition(dt='20211022',d_type='userInfo') location '/moreType/20211022/userInfo';
alter table ex_par_more_type add partition(dt='20211022',d_type='videoInfo') location '/moreType/20211022/videoInfo';


select * from ex_par_more_type where dt='20211022' and d_type='giftRecord';
select * from ex_par_more_type where dt='20211022' and d_type='userInfo';
select * from ex_par_more_type where dt='20211022' and d_type='giftRecord';

--创建视图
create view gift_record_view as select get_json_object(log,'$.send_id') send_id,
get_json_object(log,'$video_id') as video_id,get_json_object(log,'$.gold') as gold,
dt from ex_par_more_type where d_type='giftRecord';


create view user_info_view as select get_json_object(log,'$.uid') as uid,
get_json_object(log,'$.nickname') as nickname,get_json_object(log,'$.usign') as usign,
get_json_object(log,'$.sex') as sex,
dt from ex_par_more_type where d_type='userInfo';


create view video_info_view as select get_json_object(log,'$.id') as id,
get_json_object(log,'$.uid') as uid,
get_json_object(log,'$.lat') as lat,
get_json_object(log,'$.lnt') as lnt, 
dt from ex_par_more_type where d_type = 'videoInfo'; 


--查询视图
select *  from user_info_view where dt='20211022';
select *  from gift_record_view where dt='20211022';
select *  from video_info_view where dt='20211022';


--创建一个定时器进程调度
#!/bin/bash 
# 每天凌晨1点定时添加当天日期的分区 
if [ "a$1" = "a" ] 
then 
  dt=`date +%Y%m%d` 
else 
  dt=$1 
fi 
# 指定添加分区操作 
hive -e "
alter table ex_par_more_type add if not exists partition(dt='${dt}',d_type='giftRecord') location '/moreType/${dt}/giftRecord';
alter table ex_par_more_type add if not exists partition(dt='${dt}',d_type='userInfo') location '/moreType/${dt}/userInfo';
alter table ex_par_more_type add if not exists partition(dt='${dt}',d_type='videoInfo') location '/moreType/${dt}/videoInfo';
"
--显示分区
show partitions ex_par_more_type;
--删除分区
alter table ex_par_more_type drop partition(dt='20211022',d_type='giftRecord');
alter table ex_par_more_type drop partition(dt='20211022',d_type='userInfo');
alter table ex_par_more_type drop partition(dt='20211022',d_type='videoInfo');


alter table ex_par_more_type add if not exists partition(dt='20211022',d_type='giftRecord') location '/moreType/${dt}/giftRecord';

--这个脚本需要配置一个定时任务，每天凌晨1点执行，可以使用crontab
00 01 * * * root /bin/bash /data/soft/hivedata/addPartition.sh >> /data/soft/hivedata/addPartition.log



--我们的需求是这样，有一份学生的考试分数信息，语文、数学、英语这三门，需要计算出班级中单科排名
-- 前三名学生的姓名

create external table student_score(
    id int,
    name string,
    sub string,
    scores int
)row format delimited
fields terminated by '\t'
location '/data/student_score';

row_number,row_number
不能单独使用，在这里需要加上over()

--查询数据
select *,row_number() over() from student_score;
--在这里相当于给表里面的所有数据编了一个号，从1开始,但是对这些数据进行先分组，再对组内数据进行排序，再编号
-- 就需要在over函数内部添加partion by进行分组，添加order by进行排序，最终给生成的编号起了别名num
select *,row_number() over (partition by sub order by scores desc) as num from student_score;
--取前3名
select * from (select *,row_number() over (partition by sub order by scores desc) as num from student_score)
s WHERE s.num<=3;

--rank()
select *,rank() over (partition by sub order by scores desc) as num from student_score;

--dense_rank()
select *,dense_rank() over (partition by sub order by scores desc) as num from student_score;

--总结一下：
--row_number() over() 是正常排序 
--rank() over()是跳跃排序，有两个第一名时接下来就是第三名（在各个分组内） 
--dense_rank() over()是连续排序，有两个第一名时仍然跟着第二名（在各个分组内）
--列转行
create external table student_favors( 
name string, 
favor string 
)row format delimited 
fields terminated by '\t' 
location '/data/student_favors'; 


select * FROM  student_favors;
--对字段name进行分组，把favor转化为一个数组
SELECT name,collect_list(favor) as favor_list FROM student_favors GROUP BY name;

--然后再使用concat_ws把数组中的元素按照指定分隔符转成字符串 
-- 这样就实现了多行数据转为一列数据了
-- 执行行转列操作

SELECT name,concat_ws(',',collect_list(favor)) as favor_list FROM student_favors GROUP BY name;

SELECT name,concat_ws(',',collect_set(favor)) as favor_list FROM student_favors GROUP BY name;

--行转列
create external table student_favors_2( 
name string,   
favorlist string 
)row format delimited  
fields terminated by '\t' 
location '/data/student_favors_2'; 

--先使用split对favorlist字段进行切割
select split(favorlist,',') FROM student_favors_2;
-- 再使用explode对数据进行操作
select explode(split(favorlist,',')) from student_favors_2;


select name,favor_new from student_favors_2 lateral view
 explode(split(favorlist, ',')) table1 as favor_new ;

 select id,from t2_bak sort by id;

 --设置reduce任务数为2
 set mapreduce.job.reduces = 2; 

 --两者结合使用的时候distribute by必须要写在sort by之前


 select id from t2_bak distribute by id sort by id;