Hive表的类型

happy进

已于 2023-10-10 20:48:15 修改

阅读量46

点赞数

文章标签： hive

于 2023-10-10 20:46:55 首次发布

本文链接：https://blog.csdn.net/weixin_56447071/article/details/133755853

版权

Hive表的类型

# Start Hadoop
start-all.sh

# Start the Hive services: the metastore (metadata storage service) and
# HiveServer2 (remote connection service).
# nohup plus log redirection keeps both services running after this terminal
# session is closed and keeps their output off the console.
nohup hive --service metastore > metastore.log 2>&1 &
nohup hive --service hiveserver2 > hiveserver2.log 2>&1 &

-- vi/vim 编辑器常用操作:
-- Insert 键(或 i): 进入插入模式, 在光标前从左往右输入, 输入完成后按 ESC 返回普通模式
-- Delete 键: 删除光标处的字符, 连续按则从左往右依次删除; 如处于插入模式, 完成后按 ESC 返回普通模式
-- o: 在光标下方新开一行并进入插入模式, 输入完成后按 ESC 返回普通模式
-- dd: 在普通模式下删除光标所在的整行
-- 保存并退出: 在普通模式下输入 :wq (冒号即 Shift+;), 然后按 Enter

-- 危险命令(俗称"删库跑路"): 以 root 权限递归强制删除根目录下的所有内容, 删除前不会询问确认, 切勿在任何真实环境中执行
-- sudo rm -rf /*

-- Force-drop the database: cascade also drops the tables it still contains
drop database if exists test cascade;

-- Recreate the (now empty) test database used by the examples below
create database if not exists test;

2.1 Hive 内部表

内部表（管理表）的数据默认存储在配置文件 hive-site.xml 中 hive.metastore.warehouse.dir 指定的目录下；建表时也可以像下面的示例一样用 location 指定其他路径。删除内部表时，元数据和数据文件会被一并删除。

-- The two hadoop commands below are run in an OS shell, not inside Hive.
-- Create the person directory in the distributed file system; -p also creates
-- missing parent directories and does not fail if the directory already exists.
hadoop fs -mkdir -p /user/hive/warehouse/import/person
-- Upload the local file person.txt into the person directory;
-- -f overwrites an earlier copy so the step can be re-run.
hadoop fs -put -f /data/test/person.txt /user/hive/warehouse/import/person

-- Internal (managed) table reading comma-delimited text
create table if not exists test.person(
	id int comment '学号',
	name string comment '姓名',
	sex string comment '性别')comment '学生表'
row format delimited
fields terminated by ','
-- Point the table at the directory that holds the uploaded file
location '/user/hive/warehouse/import/person';

-- List the columns explicitly instead of relying on select *
select id, name, sex from test.person ;
-- Dropping an internal table removes both its metadata and its data files
drop table test.person ;

2.2 Hive 外部表

-- The two hadoop commands below are run in an OS shell, not inside Hive.
-- Create the person directory in the distributed file system; -p also creates
-- missing parent directories and does not fail if the directory already exists.
hadoop fs -mkdir -p /user/hive/warehouse/import/person
-- Upload the local file person.txt into the person directory;
-- -f overwrites an earlier copy so the step can be re-run.
hadoop fs -put -f /data/test/person.txt /user/hive/warehouse/import/person

-- The external keyword makes this an external table
create external table if not exists test.person(
	id int comment '学号',
	name string comment '姓名',
	sex string comment '性别')comment '学生表'
row format delimited
fields terminated by ','
-- Point the table at the directory that holds the uploaded file
location '/user/hive/warehouse/import/person';
-- Dropping an external table removes only the metadata; the data files and
-- the directory itself are left in place
drop table test.person ;

2.3 Hive 分区表

-- Partitioned table: province and city are partition columns, declared in
-- partitioned by rather than in the regular column list.
create table if not exists test.person (
	id   int    comment '学号',
	name string comment '姓名',
	sex  string comment '性别'
)
comment '学生表'
partitioned by (
	province string comment '省份',
	city     string comment '城市'
)
row format delimited
	fields terminated by ',';

2.4 Hive 分桶表

分桶表就是对指定列的值进行哈希(hash)计算，再用哈希值对桶的个数取模，由结果决定每行数据落入哪个桶；每个桶对应表目录下的一个文件。

-- Bucketed table: rows are clustered by sex into 2 buckets and sorted by id
-- within each bucket.
-- "if not exists" makes the statement safe to re-run, matching the other
-- create table statements in this file.
create table if not exists test.person_b(
	id int comment '学号',
	name string comment '姓名',
	sex string comment '性别')comment '学生表'
clustered by (sex) sorted by (id) into 2 buckets
row format delimited
fields terminated by ',';

分桶和分区的区别

表现形式：分区表是一个目录，而分桶表是文件
创表语句：分区表由 partitioned by 子句指定，分区字段是伪列（不出现在表的普通字段列表中），需要指定数据类型；分桶表由 clustered by 子句指定，分桶字段必须是表中的真实字段，并且需要指定桶的个数
数量上：分区表的文件夹个数（分区个数）可以增长，而分桶表一旦指定不能再增长
作用上：分区可以避免全表扫描，查询时只读取指定分区对应的目录，从而提高查询速度；分桶表中的数据已按分桶列的哈希值分散存储到各个桶文件中，因此可以进行分桶抽样，并且在 join 的时候可以提高 MapReduce 的效率。

2.5 Hive 视图

-- Why use a view?
-- 1. Hide a complex query (joins, subqueries, grouping, filtering) behind a simple name
-- 2. Mask data: filter out sensitive columns, then expose only the view to other users
-- (HiveQL comments must start with "--"; "#" is not a comment marker.)

-- Create the view: average s_score per c_id, keeping only the groups whose
-- average is at least 60
create view if not exists test.score_view as
select
	s.c_id,
	avg(s.s_score) as s_score
from edu.score s
group by s.c_id
having avg(s.s_score) >= 60;


-- Query the view
select c_id, s_score from test.score_view;

-- Show the SQL statement the view was defined with
show create table test.score_view;

-- Drop the view
drop view if exists test.score_view;