大数据学习之HIVE_SQL DDL(Data Definition Language,数据定义语言)

热爱编程的小魏

已于 2024-01-16 21:44:32 修改

阅读量553

点赞数 25

文章标签： hive sql hadoop

于 2024-01-13 22:03:20 首次发布

本文链接：https://blog.csdn.net/m0_62371702/article/details/135576443

版权

一、对数据库的操作

-- Create a database. IF NOT EXISTS keeps the script re-runnable.
create database if not exists day05;

-- List all databases.
show databases;

-- Switch the current session to day05.
use day05;

-- Choosing where a database is stored (LOCATION).
-- Variant 1: no LOCATION, so the database directory defaults to
-- /user/hive/warehouse/<database name>.db on HDFS.
create database if not exists day06;

-- Variant 2: explicit LOCATION, so the database directory is /bb on HDFS.
-- A database name can only be created once, so the default-location copy
-- from variant 1 is dropped first; otherwise this statement would be skipped
-- (or, without IF NOT EXISTS, fail because day06 already exists).
drop database if exists day06;
create database if not exists day06 location '/bb';

-- Inspect a specific database.
-- Prints the CREATE DATABASE statement for day06, including its location.
show create database day06;

-- Prints the descriptive metadata Hive stores for day06.
desc database day06;

-- Drop a database.
-- Plain form: only succeeds when day06 contains no tables.
-- IF EXISTS avoids an error when the database is already gone.
drop database if exists day06;

-- CASCADE form: drops day06 together with every table inside it.
-- IF EXISTS is required here because the statement above may already have
-- removed the database.
drop database if exists day06 cascade;

二、对表的操作

1.完整格式

中括号代表里面的内容可以省略不写

-- hive sql 完整的建表格式为
/*
create [external] table 表名(
  列名1 列类型 [comment '字段的描述信息'],
  列名2 列类型 [comment '字段的描述信息'],
  列名3 列类型 [comment '字段的描述信息']
) [comment '表的描述信息']
[分区 partitioned by(分区字段1 类型 [comment '字段的描述信息'], 分区字段2 类型 [comment '字段的描述信息'], ......)]
[分桶 clustered by(分桶字段1, 分桶字段2, ......) [sorted by(排序字段1 asc | desc, 排序字段2 asc | desc, ......)] into 桶的个数 buckets]
[行格式(字段分隔符) row format delimited fields terminated by '分隔符']
[存储方式  stored as TextFile 行存储或者列存储]
[存储位置  location 'hdfs的文件路径']
[表属性信息 tblproperties('属性名'='属性值') 例如: 内外部表, 创建者信息, 压缩协议...]
;
 */

2.创建简单表

-- Minimal table definition: four columns, no optional clauses.
create table student (
    id        int,
    name      string,
    age       int,
    class_num int
);

-- Read every column of every row.
select
    id,
    name,
    age,
    class_num
from student;

-- Read only the id column.
select id
from student;

-- Remove the table.
drop table student;

3.内部表和外部表（关键词external）

3.1内部表

-- Managed (internal) table: declared without the EXTERNAL keyword.
create table student_in (
    id        int,
    name      string,
    age       int,
    class_num int
);

-- Drop the managed table.
-- Per Hive's managed-table semantics this removes the metadata and the
-- table's data as well.
drop table student_in;

3.2外部表

-- External table: the EXTERNAL keyword is the only difference from the
-- managed-table definition.
create external table student_out (
    id        int,
    name      string,
    age       int,
    class_num int
);

-- Drop the external table.
-- Per Hive's external-table semantics this by default removes only the
-- metadata; the underlying data files are left in place.
drop table student_out;

4.分区表

语法：分区 partitioned by(分区字段1 类型 [comment '字段的描述信息'], 分区字段2 类型 [comment '字段的描述信息'], ......)

4.1.静态分区

-- Partitioned table.
-- The partition column must be a name that is NOT already a regular column:
-- here `class` could not be replaced by id, name, age or class_num.
create table student_class_part (
    id        int,
    name      string,
    age       int,
    class_num int
)
partitioned by (class string);

-- The resulting table exposes 5 columns: the 4 regular columns plus the
-- partition column `class`.

-- Static partitioning: every load names the partition column and its value
-- by hand. LOCAL means the inpath is a path on the Linux filesystem; the
-- loaded data ends up stored on HDFS.
load data local inpath '/export/hivedata/class1.txt'
    into table student_class_part partition (class = '1班');
load data local inpath '/export/hivedata/class2.txt'
    into table student_class_part partition (class = '2班');
load data local inpath '/export/hivedata/class3.txt'
    into table student_class_part partition (class = '3班');

-- Read the whole partitioned table.
select
    id,
    name,
    age,
    class_num,
    class
from student_class_part;

-- Students of class 1, filtered on a regular column: still scans the
-- whole table.
select
    id,
    name,
    age,
    class_num,
    class
from student_class_part
where class_num = 1;

-- Students of class 1, filtered on the partition column: reads only the
-- matching partition (directory) and avoids the full scan.
select
    id,
    name,
    age,
    class_num,
    class
from student_class_part
where class = '1班';

4.2.动态分区

-- Dynamic partitioning: the partition of each row is taken from the query
-- result instead of being written as a literal in the statement.

-- Plain (non-partitioned) staging table that feeds the dynamic insert.
-- It must not carry its own partition column: the insert below has to
-- supply exactly the 4 regular columns plus 1 partition value expected by
-- student_class_part.
create table if not exists student (
    id        int,
    name      string,
    age       int,
    class_num int
);

-- Fill the staging table with an ordinary LOAD.
-- NOTE(review): no ROW FORMAT is declared, so the file is read with Hive's
-- default field delimiter, same as student_class_part. Confirm this matches
-- class1.txt.
load data local inpath '/export/hivedata/class1.txt' into table student;

-- LOAD cannot be used for dynamic partitioning; only INSERT ... SELECT can.
-- The statement below is kept for illustration only and is commented out
-- because it raises an error:
-- load data local inpath '/export/hivedata/class1.txt' into table student_class_part partition (class);

-- Strict mode must be switched off before using dynamic partitions.
-- nonstrict: every partition column may be dynamic; strict is the default.
set hive.exec.dynamic.partition.mode=nonstrict;

-- Dynamic-partition insert. The partition value is the LAST select
-- expression. Columns are listed explicitly so the select always produces
-- 4 regular columns + 1 partition value, whatever shape `student` has.
insert into table student_class_part partition (class)
select
    id,
    name,
    age,
    class_num,
    cast(class_num as string) as class
from student;

-- Once a table is partitioned, filter on the partition column so only the
-- matching partition (directory) is read instead of the whole table.
-- Partitions created by the insert above are named after the class_num
-- values; '1班' is the partition name used by the static-partition loads.
select
    id,
    name,
    age,
    class_num,
    class
from student_class_part
where class = '1班';

5.分桶

语法：分桶 clustered by(分桶字段1, 分桶字段2, ......) [sorted by(排序字段1 asc | desc, 排序字段2 asc | desc, ......)] into 桶的个数 buckets

-- Bucketed table: rows are split into 3 buckets by student id.
-- The bucket is chosen as hash(id) modulo 3; the remainder is the bucket
-- number the row goes into.
create table student_buckets (
    id        int,
    name      string,
    age       int,
    class_num string
)
clustered by (id)
into 3 buckets;

-- Bucketed and sorted table: 3 buckets by student id, rows inside each
-- bucket ordered by id ascending.
-- asc = ascending, desc = descending; ascending is the default when the
-- direction is omitted.
create table student_buckets_sort (
    id        int,
    name      string,
    age       int,
    class_num string
)
clustered by (id)
sorted by (id asc)
into 3 buckets;

-- Variant: clustered by (id) sorted by (age desc) into 3 buckets
-- buckets by student id and orders each bucket by age descending.

6.切割方式和comment

语法：行格式(字段分隔符) row format delimited fields terminated by '字段分隔符'

/*
 Sample data:
 1,联想,5000,c001
2,海尔,3000,c001
3,雷神,5000,c001
4,杰克琼斯,800,c002
5,真维斯,200,c002
6,花花公子,440,c002
7,劲霸,2000,c002
8,香奈儿,800,c003
9,相宜本草,200,c003
10,面霸,5,c003
11,好想你枣,56,c004
12,香飘飘奶茶,1,c005
13,海澜之家,1000,c002
 Every value is separated by a comma, so each line has to be split on ','
 to obtain the individual columns.
 */
-- Product table with column-level and table-level comments.
create table products (
    id    int    comment '商品序号',
    name  string comment '商品名称',
    price string comment '商品价格',
    cid   string comment '商品类型id'
)
comment '商品表'
-- Split each line into fields on ','. Use whatever separator the data file
-- actually contains, e.g. '\t' for tab-separated data.
row format delimited
    fields terminated by ',';

7.复杂类型

7.1.复杂类型array

-- Input line format: "zhangsan	beijing,shanghai,tianjin,hangzhou"
-- (name and city list separated by a tab, cities separated by commas).
create table t_array (
    name string        comment '姓名',
    city array<string> comment '旅游城市'
)
comment '旅游信息'
row format delimited
    fields terminated by '\t'
    collection items terminated by ',';

-- Array elements are addressed by zero-based index: city[0] is the first
-- city of each row.
select
    name,
    city[0]
from t_array;

7.2.复杂类型struct

-- Input line format: 1#周杰轮:11
-- '#' separates the columns; ':' separates the members of the struct.
create table t_struct (
    id   int comment '编号',
    info struct<name:string, age:int>
)
row format delimited
    fields terminated by '#'
    collection items terminated by ':';

-- Resulting row as shown by the author: 1,"{""name"":""周杰轮"",""age"":11}"

7.3.复杂类型map

-- Input line format: 1,林杰均,father:林大明#mother:小甜甜#brother:小甜,28
-- ',' separates the columns, '#' separates the map entries, and ':'
-- separates each key from its value.
create table t_map (
    id     int    comment '编号',
    name   string comment '姓名',
    member map<string, string>,
    age    int    comment '年龄'
)
row format delimited
    fields terminated by ','
    collection items terminated by '#'
    map keys terminated by ':';

-- Resulting row as shown by the author: 1,林杰均,"{""father"":""林大明"",""mother"":""小甜甜"",""brother"":""小甜""}",28

hive_sql中DDL大概内容就这么多，不太理解的朋友可以去看看我后面的帖子，我会详细说一下关于分区和分桶的知识

三、建表图解

热爱编程的小魏

关注

25
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
大数据学习之HIVE_SQL DDL(Data Definition Language,数据定义语言)

语法：分区 partitioned by(分区字段1,类型,[comment '字段的描述信息',分区字段2,类型,[comment '字段的描述信息'],......)[分区 partitioned by(分区字段1,类型,[comment '字段的描述信息',分区字段2,类型,[comment '字段的描述信息'],......)]列名1,列类型,[comment '字段的描述信息'],列名2,列类型,[comment '字段的描述信息'],列名3,列类型,[comment '字段的描述信息']
复制链接

扫一扫