【博学谷学习记录】超强总结，用心分享|狂野大数据课程【Hive框架三】的总结分析

ZLWQ

已于 2022-10-25 10:45:46 修改

阅读量112

点赞数

分类专栏： java 博学谷IT技术支持文章标签： hive 大数据学习

于 2022-10-25 10:08:25 首次发布

本文链接：https://blog.csdn.net/qq_42198232/article/details/127507904

版权

博学谷IT技术支持同时被 2 个专栏收录

40 篇文章 1 订阅

订阅专栏

java

26 篇文章 0 订阅

订阅专栏

Hive的复杂类型

array类型

-- 1、准备数据
zhangsan    beijing,shanghai,tianjin,hangzhou
wangwu  changchun,chengdu,wuhan,beijing

-- 2. Create the table
-- External table: each row is "<name> TAB <comma-separated locations>";
-- the second field is parsed into an array<string>.
CREATE EXTERNAL TABLE hive_array
(
    name           STRING,
    work_locations ARRAY<STRING>
)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t'
        COLLECTION ITEMS TERMINATED BY ',';

-- 3. Load the data from the local file system
LOAD DATA LOCAL INPATH '/export/data/hivedatas/work_locations.txt'
    INTO TABLE hive_array;

SELECT * FROM hive_array;

-- 4. Query the data
-- All rows
SELECT * FROM hive_array;
-- First element of the work_locations array
SELECT name,
       work_locations[0] AS location
FROM hive_array;
-- Number of elements in the work_locations array
SELECT name,
       size(work_locations) AS location_size
FROM hive_array;
-- Rows whose work_locations array contains 'tianjin'
SELECT *
FROM hive_array
WHERE array_contains(work_locations, 'tianjin');

map类型

-- 1、准备数据

1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26

-- 2、建表
-- Each row is "id,name,members,age". The members field holds key:value pairs
-- separated by '#', e.g. father:xiaoming#mother:xiaohuang.
-- Fix: the map type must declare both key and value types; the original
-- "map<string,>" is missing the value type and does not parse.
CREATE TABLE hive_map
(
    id      INT,
    name    STRING,
    members MAP<STRING, STRING>,
    age     INT
)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        COLLECTION ITEMS TERMINATED BY '#'
        MAP KEYS TERMINATED BY ':';

-- 3. Load the data

LOAD DATA LOCAL INPATH '/export/data/hivedatas/hive_map.txt'
    INTO TABLE hive_map;
SELECT * FROM hive_map;

-- 4. Queries
SELECT * FROM hive_map;
-- Look up values by key
SELECT id,
       name,
       members['father'] AS father,
       members['mother'] AS mother,
       age
FROM hive_map;

-- All keys of the map
SELECT id,
       name,
       map_keys(members) AS relation
FROM hive_map;

-- All values of the map
SELECT id,
       name,
       map_values(members) AS relation
FROM hive_map;

-- Number of key/value pairs
SELECT id,
       name,
       size(members) AS num
FROM hive_map;

-- Rows that have a given key:
-- which rows list a 'brother' among their relatives
SELECT *
FROM hive_map
WHERE array_contains(map_keys(members), 'brother');

-- Rows that have the 'brother' key, together with the value stored under it
SELECT id,
       name,
       members['brother'] AS brother
FROM hive_map
WHERE array_contains(map_keys(members), 'brother');

struct类型

class  类名{
   String name,
   int  age ,
   double score
}

-- 1、准备数据
192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
192.168.1.3#wangwu:60
192.168.1.4#zhaoliu:70

-- 2. Create the table
-- Each row is "<ip>#<name>:<age>"; the part after '#' is parsed into the
-- struct, with ':' separating the struct members.
CREATE TABLE hive_struct
(
    ip   STRING,
    info STRUCT<name:STRING, age:INT>
)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '#'
        COLLECTION ITEMS TERMINATED BY ':';

-- 3. Load data into the table

LOAD DATA LOCAL INPATH '/export/data/hivedatas/hive_struct.txt'
    INTO TABLE hive_struct;

SELECT * FROM hive_struct;


-- 4. Query the table
-- Individual struct members are addressed with dot notation
SELECT ip,
       info.name,
       info.age
FROM hive_struct;
-- The whole struct as one column
SELECT ip,
       info
FROM hive_struct;

（重点) Hive表操作2-分区表

介绍

1、分区表就是对一个表的文件数据进行分类管理，表现形式就是有很多的文件夹(dt=2019-02-27)
2、分区表的作用是以后查询时，我们可以手动指定对应分区的数据，避免全表扫描，提高查询效率
3、专业的介绍
所谓的分区表，指的就是将数据按照表中的某一个字段进行统一归类，并存储在表中的不同的位置，也就是说，一个分区就是一类，这一类的数据对应到hdfs存储上就是对应一个目录。当我们需要进行处理的时候，可以通过分区进行过滤，从而只取部分数据，而没必要取全部数据进行过滤，从而提升数据的处理效率。且分区表是可以分层级创建。
select * from 表 where dt = '2019-03-13'

4、分区表的关键字是Partition，这里的分区和MR中的分区没有关系
5、分区表可以有内部分区表，也可以有外部分区表
6、什么时候表数据不用分区：  
   1）几乎在实际应用中所有的表数据都要分区
   2）如果你的数据量很小，而且数据很单一，此时可以不用分区

静态分区

----------------------单级分区----------------------------------
-- 1. Create a table with a single partition level
CREATE TABLE score
(
    sid    STRING,
    cid    STRING,
    sscore INT
)
    -- dt is the partition column; it is independent of the data columns above
    -- and, in principle, can be given any name
    PARTITIONED BY (dt STRING)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';

-- 2. Load data into a partition
-- The load does two things: it creates the folder dt=2022-10-13 under the
-- table's HDFS directory, then copies score.txt into that folder.
LOAD DATA LOCAL INPATH '/export/data/hivedatas/score.txt'
    INTO TABLE score
    PARTITION (dt = '2022-10-13');

SELECT * FROM score;

-- Add a second partition
LOAD DATA LOCAL INPATH '/export/data/hivedatas/score2.txt'
    INTO TABLE score
    PARTITION (dt = '2022-10-14');
SELECT * FROM score;


-- 3. Query the data

-- Rows in partition dt=2022-10-13
SELECT *
FROM score
WHERE dt = '2022-10-13';

-- Rows in partition dt=2022-10-14
SELECT *
FROM score
WHERE dt = '2022-10-14';

DESC score; -- shows which column is the partition column


----------------------多级分区----------------------------------
-- 1. Create a table with multiple partition levels
CREATE TABLE score2
(
    sid    STRING,
    cid    STRING,
    sscore INT
)
    -- year / month / dt are partition columns; they are independent of the
    -- data columns above and, in principle, can be given any names
    PARTITIONED BY (year STRING, month STRING, dt STRING)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';



-- 2. Load data into a partition
-- The load does two things: it creates the three-level folder
-- year=2022/month=10/dt=13 under the table's HDFS directory, then copies
-- score.txt into that folder.
LOAD DATA LOCAL INPATH '/export/data/hivedatas/score.txt'
    INTO TABLE score2
    PARTITION (year = '2022', month = '10', dt = '13');

SELECT * FROM score2;

-- Add another partition
LOAD DATA LOCAL INPATH '/export/data/hivedatas/score2.txt'
    INTO TABLE score2
    PARTITION (year = '2022', month = '11', dt = '13');
-- Add another partition
LOAD DATA LOCAL INPATH '/export/data/hivedatas/score2.txt'
    INTO TABLE score2
    PARTITION (year = '2023', month = '11', dt = '13');

SELECT * FROM score2;


-- 3. Query one partition: the data for 2022-10-13
SELECT *
FROM score2
WHERE year = '2022'
  AND month = '10'
  AND dt = '13';


----------------------分区相关的SQL----------------------------------

-- List every partition of the table
SHOW PARTITIONS score;
-- Add a partition by hand
ALTER TABLE score ADD PARTITION (dt = '2022-01-01');
-- Drop a partition by hand
ALTER TABLE score DROP PARTITION (dt = '2022-01-01');

动态分区

在这里插入图片描述

单级分区

-- -----------------------单级分区：按照日进行分区---------------------------------
-- 1. Enable dynamic partitioning
-- Turn dynamic partitioning on
SET hive.exec.dynamic.partition=true;
-- Use non-strict mode
SET hive.exec.dynamic.partition.mode=nonstrict;


-- 2、模拟数据
/*
1	2022-01-01	zhangsan	80
2	2022-01-01	lisi	70
3	2022-01-01	wangwu	90
1	2022-01-02	zhangsan	90
2	2022-01-02	lisi	65
3	2022-01-02	wangwu	96
1	2022-01-03	zhangsan	91
2	2022-01-03	lisi	66
3	2022-01-03	wangwu	96
*/
-- 3. Create a plain staging table that receives the raw data
CREATE TABLE test1
(
    id       INT,
    date_val STRING,
    name     STRING,
    score    INT
)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';

-- 4. Load the raw file into the staging table
LOAD DATA LOCAL INPATH '/export/data/hivedatas/partition.txt'
    INTO TABLE test1;

-- 5. Create the final partitioned table
CREATE TABLE test2
(
    id    INT,
    name  STRING,
    score INT
)
    -- The partition column name is free to choose; it determines the folder
    -- name on HDFS, e.g. dt=2022-01-01
    PARTITIONED BY (dt STRING)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ',';

-- 6. Read the staging table and insert into the partitioned table.
-- The last selected column (date_val) supplies the dynamic partition value dt.
INSERT OVERWRITE TABLE test2 PARTITION (dt)
SELECT id,
       name,
       score,
       date_val
FROM test1;

SELECT * FROM test2;



-- -----------------------单级分区：按照月进行分区---------------------------------
1       2022-01-01      zhangsan        80
2       2022-01-01      lisi    70
3       2022-01-01      wangwu  90
1       2022-01-02      zhangsan        90
2       2022-01-02      lisi    65
3       2022-01-02      wangwu  96
1       2022-01-03      zhangsan        91
2       2022-01-03      lisi    66
3       2022-01-03      wangwu  96
1       2022-02-01      zhangsan        80
2       2022-02-01      lisi    70
3       2022-02-01      wangwu  90
1       2022-02-02      zhangsan        90
2       2022-02-02      lisi    65
3       2022-02-02      wangwu  96
1       2022-02-03      zhangsan        91
2       2022-02-03      lisi    66
3       2022-02-03      wangwu  96

-- Reload the staging table with the two-month data set, replacing its contents
LOAD DATA LOCAL INPATH '/export/data/hivedatas/partition2.txt'
    OVERWRITE INTO TABLE test1;

-- IF EXISTS keeps the script re-runnable whether or not the table is present
DROP TABLE IF EXISTS test2_1;
CREATE TABLE test2_1
(
    id       INT,
    date_val STRING,
    name     STRING,
    score    INT
)
    -- The partition column name is free to choose; it determines the folder
    -- name on HDFS, e.g. month=2022-01
    PARTITIONED BY (month STRING)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ',';


-- 6. Read the staging table and insert into the partitioned table.
-- The last selected column supplies the dynamic partition value: the first
-- seven characters of date_val, i.e. 'yyyy-MM'.
INSERT OVERWRITE TABLE test2_1 PARTITION (month)
SELECT id,
       date_val,
       name,
       score,
       substring(date_val, 1, 7)
FROM test1;

多级分区

-- 1. Create the plain staging table

DROP TABLE IF EXISTS test3;
-- The statement is terminated exactly once; the original had an extra
-- stand-alone ';' after it, which is an empty statement.
CREATE TABLE test3
(
    id       INT,
    date_val STRING,
    name     STRING,
    sex      STRING,
    score    INT
)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';


-- 2. Load the raw file into the staging table, replacing its contents
LOAD DATA LOCAL INPATH '/export/data/hivedatas/partition3.txt'
    OVERWRITE INTO TABLE test3;
SELECT * FROM test3;

-- 3. Create the final partitioned table
-- IF EXISTS keeps the script re-runnable whether or not the table is present
DROP TABLE IF EXISTS test4;
CREATE TABLE test4
(
    id    INT,
    name  STRING,
    score INT
)
    PARTITIONED BY (xxx STRING, yyy STRING)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';

-- 4. Read the staging table and insert the result into the partitioned table.
-- The PARTITION clause is spelled out so the statement also runs on Hive
-- versions that require it for partitioned targets. The last two selected
-- columns supply the dynamic partition values: date_val -> xxx, sex -> yyy.
-- NOTE: with no static partition value, this needs
-- hive.exec.dynamic.partition.mode=nonstrict.

INSERT OVERWRITE TABLE test4 PARTITION (xxx, yyy)
SELECT id,
       name,
       score,
       date_val,
       sex
FROM test3;

Hive表操作3-分桶表

在这里插入图片描述

概念

1、分桶就是MR的分区
2、分桶表的表现形式就是分文件,可以通俗的理解为将一个大的表文件拆分成多个小文件
3、分桶的作用有两个：
  作用1：主要是来提高多张表join的效率
  作用2：主要是用于数据的抽样
4、分桶的方式就是拿到分桶字段的值，然后取hash值对分桶的个数取模  
  
  
专业说法：
在表或者分区中使用分桶通常有两个原因，一个是为了高效的join查询，另一个则是为了高效的抽样。
桶其实是在表中加入了特殊的结构，hive在查询的时候可以利用这些结构来提高查询效率。比如，
如果两个表根据相同的字段进行分桶，则在对这两个表进行join关联的时候可以使用map-side关联高效实现。

分桶表的操作

-- 1. Create the bucketed table
CREATE TABLE course
(
    cid    STRING,
    c_name STRING,
    tid    STRING
)
    -- Rows are distributed into 3 buckets according to cid
    CLUSTERED BY (cid) INTO 3 BUCKETS
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';

-- 解释：
-- clustered by (cid) into 3 buckets：表示按照 cid 这一列进行分桶，并将表数据分到 3 个桶中（即 3 个文件）


-- 2. Create the plain (non-bucketed) staging table
CREATE TABLE course_common
(
    cid    STRING,
    c_name STRING,
    tid    STRING
)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';

-- 3. Load the raw file into the staging table
LOAD DATA LOCAL INPATH '/export/data/hivedatas/course.txt'
    INTO TABLE course_common;

SELECT * FROM course_common;


-- 4. Read the staging table and insert the rows into the bucketed table
INSERT OVERWRITE TABLE course
SELECT *
FROM course_common
CLUSTER BY (cid);

SELECT * FROM course;

在这里插入图片描述

作用
- 作用1-提高join的效率

在这里插入图片描述

作用2 -可以用于数据的抽样

1、有时候在大数据分析时，我们并不需要全部的数据参与分析，而只需要抽取一部分具有代表性的数据参与分析，这样可以提高分析的效率，此时就可以使用分桶表来完成

1   zs
2   ls
3   ww
4   zl
5   zq
6   mb
7   lf
8   we
9   zz
10  qw



-- 1. Create the bucketed table
-- IF EXISTS keeps the script re-runnable whether or not the table is present
DROP TABLE IF EXISTS sample_test;
CREATE TABLE sample_test
(
    sid    INT,
    s_name STRING
)
    -- Rows are distributed into 6 buckets according to sid
    CLUSTERED BY (sid) INTO 6 BUCKETS
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';

-- 2. Create the plain (non-bucketed) staging table
CREATE TABLE sample_common
(
    sid    INT,
    s_name STRING
)
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\t';

-- 3. Load the raw file into the staging table, replacing its contents
LOAD DATA LOCAL INPATH '/export/data/hivedatas/sample.txt'
    OVERWRITE INTO TABLE sample_common;

SELECT * FROM sample_common;

SET hive.stats.column.autogather=false;
SET hive.exec.mode.local.auto=true;  -- enable local-mode MapReduce


-- 4. Read the staging table and insert the rows into the bucketed table
INSERT OVERWRITE TABLE sample_test
SELECT *
FROM sample_common
CLUSTER BY (sid);

-- Verify the table that was just populated (the original queried "course",
-- a leftover from the previous example)
SELECT * FROM sample_test;


-- 5. Sample the data
-- TABLESAMPLE (BUCKET x OUT OF y [ON colname]); here 6 buckets / y = 2
-- means 3 buckets are read
SELECT *
FROM sample_test TABLESAMPLE (BUCKET 1 OUT OF 2 ON sid);


/*

select * from student tablesample(bucket x out of y on id);

n：总桶数

x：从第几个桶开始抽取

y：必须是总桶数的因数或倍数（自定义）

z：共需抽取出的桶数（z=n/y）


select * from student tablesample(bucket 1 out of 2 on id);
z 数据属于第几个桶

1 第1个分桶的数据（1）

2 第3个分桶的数据（1+y）

3 第5个分桶的数据（3+y）

4 第7个分桶的数据（5+y）

5 第9个分桶的数据（7+y）
*/

表结构操作

-- Drop a table
-- ('#' does not start a comment in HiveQL; trailing notes must use '--')
DROP TABLE score4;  -- dropping behaves differently for internal and external tables

-- Remove all rows from a table
TRUNCATE TABLE score4;  -- only internal (managed) tables can be truncated

ZLWQ

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【博学谷学习记录】超强总结，用心分享|狂野大数据课程【Hive框架三】的总结分析

Hive表操作
复制链接

扫一扫

专栏目录

【博学谷学习记录】超强总结，用心分享|狂野大数据课程【Hive框架三】的总结分析

Hive的复杂类型

（重点) Hive表操作2-分区表

Hive表操作3-分桶表

表结构操作

“相关推荐”对你有帮助么？