hive数据库的使用

你很棒滴

已于 2023-06-05 17:18:54 修改

阅读量204

点赞数

文章标签：数据库 hive hadoop

于 2023-06-02 18:44:58 首次发布

本文链接：https://blog.csdn.net/RayMand168/article/details/131005013

版权

hive的基本语法

# 创建数据库/指定数据库的位置
#默认位置在/user/hive/warehouse
create database if not exists myhive [comment 'ssss'] [location 'position'];
# 使用数据库
use myhive;
# 创建表
create table test (id int);
# 删除表
drop table test;
# 删除数据库/强制删除数据库(库中还有表时需要加cascade)
drop database myhive [cascade];
# 查询当前正在使用的数据库
select current_database();

在这里插入图片描述

创建数据库表的语法

在这里插入图片描述

表字段的类型

在这里插入图片描述

内部表和外部表的区别

在这里插入图片描述

内部表和外部表的应用

外部表：先有数据后有表，先有表后有数据都是可以的。删除表不会影响表位置的文件，删除文件也不会影响表的元数据，可用作外部数据的导入。
内部表：元数据和数据文件绑定在一起，删除表(元数据)时会同时删除数据文件。

# 创建内部表/外部表(外部表的分隔符和位置信息是必须要填的)，用文件建表时，location只用文件所在的文件夹
create table inner_tab (id int);
create external table outer_tab (id int) row format delimited fields terminated by '\t' location '/tmp/outer_lab';
# 内部表和外部表的转换(区分大小写)
alter table inner_tab set tblproperties('EXTERNAL'='TRUE');
alter table outer_tab set tblproperties('EXTERNAL'='FALSE');
# 查看表是内部表还是外部表
desc formatted t1;

hive使用load 和hdfs加载数据

# 使用load，从文件向表里面加载数据
load data [local] inpath 'linux本地'|'hdfs' [overwrite] into table tablename;
# 使用insert,从表向其他表里加载数据
insert into | overwrite table tablename select * from other_table_name;

hive表数据导出

# 加local是导出linux本地
insert overwrite [local] directory '/data/result' [row format delimited fields terminated by '\t'] select * from table_name

# 使用bin/hive执行SQL语句或SQL脚本，并把查询结果重定向到文件，实现数据导出
/export/server/hive/bin/hive -e 'select * from table' > file.txt
/export/server/hive/bin/hive -f 'test.sql' > file.txt

在这里插入图片描述

hive建立分区表

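# 创建单分区(表结构仅为示例，按实际数据调整)
CREATE table myhive.test_partitions(id int,name string) partitioned by (month string) row format delimited fields terminated by ',';
load data local inpath '/data/partitioned_txt.txt' INTO table myhive.test_partitions partition(month='12');
SELECT * FROM myhive.test_partitions;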
# 创建多分区
CREATE table myhive.test_partitions2(id int,name string) partitioned by (year string,month string,day string) row format delimited fields terminated by ',';
SELECT * FROM myhive.test_partitions2;
load data local inpath '/data/partitioned_txt2.txt' INTO table myhive.test_partitions2 partition(year='ss',month='dd',day='aa');
# 注意点：建表时是partitioned by ,插入数据时是partition,建表时分区是什么类型，插入数据时就是什么类型，建字段时，最后一个字段尾部不要带逗号

在这里插入图片描述

hive建立分桶表

# 开启分桶自动优化
set hive.enforce.bucketing=true;
# 创建分桶表
CREATE table myhive.custer(id int,name string) clustered by (id) INTO 3 buckets row format delimited fields terminated by ',';

# 先创建一个三方表，从数据中加载数据到三方表中，注意三方表的分隔方式要与分桶表一致
CREATE table myhive.test_partitions3(id int,name string)row format delimited fields terminated by ',';
load data local inpath '/data/partitioned_txt2.txt' INTO table myhive.test_partitions3 ;
# 然后从三方表中读取数据到分桶表
INSERT overwrite table myhive.custer select * FROM myhive.test_partitions3 cluster by(id);

# 不能用load data加载分桶表，因为分桶表的分桶方式是按照hash值除以桶数量取余进行分桶，期间涉及到MapReduce的计算过程，load data不涉及计算，所以没法用load data

在这里插入图片描述

分桶列能带来性能提升的地方

单值过滤
join
group by

修改表的操作

# 修改表名
alter table old_name rename to new_name;
# 修改表的属性
alter table table_name set tblproperties('EXTERNAL'='TRUE');  # 修改内外部表
alter table table_name set tblproperties('comment'='这是一个表');  # 修改表的注释

# 添加表的分区字段（新分区里面没有数据，需要主动添加或者上传）
alter table table_name add partition(分区字段=值);
# 修改表分区字段的值（尽量少动，因为只是修改了指向，元数据中的名称是不变的）
alter table table_name partition(分区字段='老分区值') rename to partition(分区字段='新分区值');
# 删除分区
alter table table_name drop partition(分区字段=值);

# 清空一个表,只能清空内部表，不能清空外部表
truncate table table_name;

array类型的使用

#数据：1	beijing,tianjin,henan
#创建表
create table test_array(id int,address array<string>) row format delimited fields terminated by '\t' collection items terminated by ',';

#加载数据
load data local inpath '/data/test_array.txt' into table test_array;

#获取数据地址中的第一个元素
SELECT id,address[0] FROM test_array ta;
#查询地址中的个数
select id,size(address) from test_array;
#查找谁在天津工作过
select * from test_array where array_contains(address,'tianjin');

map类型的使用

#数据类型 1		hhha	student:laowang,teacher:gouzi
create table test_map(id int,name string,relation map<string,string>) row format delimited fields terminated by '\t' collection items terminated by ',' map keys terminated by ':';
#加载数据
load data local inpath '/data/map_test.txt' into table test_map;
#查看一条数据中的成员信息
select id,name,relation['teacher'] as teacher,relation['student'] as student from test_map;
#取出map字段中所有的keys,values,返回值是一个array
select map_keys(relation) from test_map;
select map_values(relation) from test_map;
#查看map中是否包含名为student的key，以及是否包含值为gouzi的value
select * from test_map where array_contains(map_keys(relation),'student');
select * from test_map where array_contains(map_values(relation),'gouzi');
#查看relation字段里面键值对的数量
select size(relation) from test_map;

struct类型的使用

#数据类型1		gouzi:111
#创建表
create table test_struct(id int,info struct<name:string,age:int>) row format delimited fields terminated by '\t' collection items terminated by ':';
#加载数据
load data local inpath '/data/test_struct' into table test_struct;
#查看数据
select id,info.name,info.age from test_struct;
#struct类型只需要collection items terminated by ':' 指定成员之间的分隔符(本例为':')，数据只记录values不用记录keys，keys在创建表时就已经固定为两列name,age,可以在定义时根据数据类型多定义几个。

在这里插入图片描述

hive的查询语法

#找出地址中带广东的数据
select * from table_name where useraddress like '%广东%';
#统计已支付和未支付的订单总数
select is_pay,count(*) from table_name group by is_pay;
#在已支付订单中，统计每个用户的最高消费金额
select id,max(totalmoney) from table_name where is_pay =1 group by id;
#统计每个用户的平均消费额
select id,avg(totalmoney) from table_name group by id;
#统计每个用户的平均消费额，并过滤出大于10000的
select id,avg(totalmoney) as avg_price from table_name group by id having avg_price > 10000;
#订单表和用户表join,找出用户的username
select o.order_id,u.username from orders o join users u on o.id =u.id;
select o.order_id,u.username from orders o left join users u on o.id =u.id;


#hive特有的使用 rlike的使用
select * from table_name where useraddress rlike '..省..市';
select * from table_name where useraddress rlike '[广东河南]\\S+';

#union的使用（把两个查询语句的查询结果合并)，union去重，union all不去重,使用前提：两个查询语句查询出来的列数和列类型一致
select * from table_a where id='wangwang'
		union [all]
select * from table_b where id='ssss';
#union写到from中（子查询中）
select id,count(*) from (
select * from table_a where id='wangwang'
		union [all]
select * from table_b where id='ssss'
) as u group by id;
#union写到insert中
insert overwrite table table_name
select * from table_a where id='wangwang'
		union [all]
select * from table_b where id='ssss';

#使用hive进行随机采样

#以username字段进行随机抽样，执行结果是固定的（如果样本为分桶表，此方法执行速度是最快的）
select * from table_name tablesample(bucket 3 out of 10 on username);
#完全随机，每次执行结果不一样
select * from table_name tablesample(bucket 3 out of 10 on rand())
#按照具体数目，具体比例，具体尺寸
select * from table_name tablesample(100 rows);
select * from table_name tablesample(10 percent);
select * from table_name tablesample(1k|1M|1G) #取1kb，取1Mb，取1GB

hive虚拟列的使用 (可以查看行级别的详细参数，用于where，group by的各类计算，协助进行错误排查）

#显示数据行所在的具体文件
INPUT__FILE__NAME
#显示数据行所在的文件偏移量
BLOCK__OFFSET__INSIDE__FILE
#显示数据行在所在HDFS块内的偏移量；使用这个虚拟列的前提是先执行 set hive.exec.rowoffset=true
ROW__OFFSET__INSIDE__BLOCK

select username,INPUT__FILE__NAME,BLOCK__OFFSET__INSIDE__FILE,ROW__OFFSET__INSIDE__BLOCK from table_name;
select username,BLOCK__OFFSET__INSIDE__FILE from table_name where BLOCK__OFFSET__INSIDE__FILE < 1000;
select INPUT__FILE__NAME,count(*) from table_name group by INPUT__FILE__NAME;

hive常用的函数

#查看支持的所有函数
show functions;
#查看函数的具体使用方式
describe function extended count;

数学函数

在这里插入图片描述

二进制函数

日期函数

在这里插入图片描述

判断语句的用法

在这里插入图片描述

字符串相关用法

在这里插入图片描述

脱敏函数
select mask_hash('hadoop');
#输出一长串的hash值

功能性函数

在这里插入图片描述

你很棒滴

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
hive数据库的使用

hive数据库的使用
复制链接

扫一扫

hive数据库的使用

hive的基本语法

创建数据库表的语法

表字段的类型

内部表和外部表的区别

内部表和外部表的应用

hive使用load 和hdfs加载数据

hive表数据导出

hive建立分区表

hive建立分桶表

分桶列能带来性能提升的地方

修改表的操作

array类型的使用

map类型的使用

struct类型的使用

hive的查询语法

hive虚拟列的使用 (可以查看行级别的详细参数，用于where，group by的各类计算，协助进行错误排查）

hive常用的函数

“相关推荐”对你有帮助么？