hive sql 官方文档:LanguageManual - Apache Hive - Apache Software Foundationhttps://cwiki.apache.org/confluence/display/Hive/LanguageManual
-- List all databases.
show databases;
-- Create the "test" database; this creates a test.db directory under the
-- configured warehouse path (hive.metastore.warehouse.dir).
create database test;
-- Switch the current database.
use database_name;
-- Show the column structure. NOTE(review): `desc demo` describes a TABLE
-- named demo, not a database — confirm which was intended.
desc demo;
-- Show detailed structure info (location, owner, table type, properties, ...).
desc formatted demo;
create table
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later)
[(col_name data_type [column_constraint_specification] [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later)
]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later)
[AS select_statement]; -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];
data_type
: primitive_type
| array_type
| map_type
| struct_type
| union_type -- (Note: Available in Hive 0.7.0 and later)
primitive_type
: TINYINT
| SMALLINT
| INT
| BIGINT
| BOOLEAN
| FLOAT
| DOUBLE
| DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
| STRING
| BINARY -- (Note: Available in Hive 0.8.0 and later)
| TIMESTAMP -- (Note: Available in Hive 0.8.0 and later)
| DECIMAL -- (Note: Available in Hive 0.11.0 and later)
| DECIMAL(precision, scale) -- (Note: Available in Hive 0.13.0 and later)
| DATE -- (Note: Available in Hive 0.12.0 and later)
| VARCHAR -- (Note: Available in Hive 0.12.0 and later)
| CHAR -- (Note: Available in Hive 0.13.0 and later)
array_type
: ARRAY < data_type >
map_type
: MAP < primitive_type, data_type >
struct_type
: STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type
: UNIONTYPE < data_type, data_type, ... > -- (Note: Available in Hive 0.7.0 and later)
row_format
: DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
[NULL DEFINED AS char] -- (Note: Available in Hive 0.13 and later)
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
file_format:
: SEQUENCEFILE
| TEXTFILE -- (Default, depending on hive.default.fileformat configuration)
| RCFILE -- (Note: Available in Hive 0.6.0 and later)
| ORC -- (Note: Available in Hive 0.11.0 and later)
| PARQUET -- (Note: Available in Hive 0.13.0 and later)
| AVRO -- (Note: Available in Hive 0.14.0 and later)
| JSONFILE -- (Note: Available in Hive 4.0.0 and later)
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
column_constraint_specification:
: [ PRIMARY KEY|UNIQUE|NOT NULL|DEFAULT [default_value]|CHECK [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]
default_value:
: [ LITERAL|CURRENT_USER()|CURRENT_DATE()|CURRENT_TIMESTAMP()|NULL ]
constraint_specification:
: [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
[, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE ]
[, CONSTRAINT constraint_name UNIQUE (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
[, CONSTRAINT constraint_name CHECK [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]
如何把数据导入到hive
注意:
1、load操作不会对数据做任何的转换修改操作
2、从本地linux load数据文件是复制文件的过程
3、从hdfs load数据文件是移动文件的过程
4、load操作也支持向分区表中load数据,只不过需要添加分区列的值
在hive窗口执行load命令
-- Load a local (Linux filesystem) file into a Hive table; the file is COPIED
-- into the table's warehouse directory.
load data local inpath '/root/data/data' into table psn;-- /root/data/data is a local Linux path
-- Load a file that already sits in HDFS; the file is MOVED, not copied.
load data inpath '/data/data' into table psn;-- /data/data is an HDFS path
1. 创建普通hive表(不包含行定义格式)
-- Plain managed table with no explicit ROW FORMAT clause, so Hive falls back
-- to its default field/collection/map-key delimiters (^A, ^B, ^C).
CREATE TABLE demo
(
    id      int,
    name    string,
    likes   array<string>,
    address map<string,string>
);
把hdfs数据上传到hive表中
vim /home/hadoop/data
----------
小明1 ["shanghai","beijin"] {"aihao":"no"}
小明2 ["shanghai","beijin"] {"aihao":"no"}
小明3 ["shanghai","beijin"] {"aihao":"yes"}
小明4 ["shanghai","beijin"] {"aihao":"asdas"}
小明5 ["shanghai","beijin","a"] {"aihao":"nosadfsd"}
小明6 ["shanghai","beijin","sad"] {"aihao":"no"}
小明7 ["shanghai","beijin"] {"aihao":"no"}
----------
# Upload the local data file into HDFS (-f overwrites an existing target).
./hdfs dfs -put -f /home/hadoop/data /usr/test
2. 创建自定义行格式的hive表(指定列的分割符)
-- Managed table whose text layout is spelled out explicitly:
-- columns split on ',', collection elements on '-', map key/value pairs on ':'.
CREATE TABLE demo2
(
    id      int,
    name    string,
    likes   array<string>,
    address map<string,string>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '-'
    MAP KEYS TERMINATED BY ':';
数据示例:
1,王五,王者-英雄联盟-吃鸡,beijing:haidian-shanxi:xian
2,赵六,手机-电脑-笔记本,shanghai:diqu-beijing:changpin
3. 创建默认分隔符的hive表(^A、^B、^C)
-- Table that writes out Hive's DEFAULT delimiters explicitly:
-- '\001' (^A) between fields, '\002' (^B) between collection items,
-- '\003' (^C) between map keys and values.
CREATE TABLE psn3
(
    id      int,
    name    string,
    likes   array<string>,
    address map<string,string>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001'
    COLLECTION ITEMS TERMINATED BY '\002'
    MAP KEYS TERMINATED BY '\003';
# -------------------------------------
-- Equivalent definition WITHOUT a ROW FORMAT clause: Hive then applies its
-- default delimiters (^A fields, ^B collection items, ^C map keys), so this
-- is the same layout as the explicit '\001'/'\002'/'\003' version above.
-- NOTE(review): the name collides with the psn3 created earlier — drop that
-- table first (or pick a different name) before running this statement.
-- Fixed: the original statement was missing its terminating semicolon.
create table psn3
(
id int,
name string,
likes array<string>,
address map<string,string>
);
4. 创建hive的外部表(需要添加external和location的关键字)
-- External table: EXTERNAL + LOCATION point Hive at an existing HDFS
-- directory. Dropping the table removes only the metastore entry; the
-- files under /data are left untouched.
CREATE EXTERNAL TABLE demo4
(
    id      int,
    name    string,
    likes   array<string>,
    address map<string,string>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '-'
    MAP KEYS TERMINATED BY ':'
LOCATION '/data';
在之前创建的表(默认创建的都是内部表)都属于hive的内部表(demo、demo2、psn3),而demo4属于hive的外部表,
内部表跟外部表的区别:
1、hive内部表创建的时候数据存储在hive的默认存储目录中,外部表在创建的时候需要指定额外的目录
2、hive内部表删除的时候,会将元数据和数据都删除,而外部表只会删除元数据,不会删除数据
应用场景:
内部表:需要先创建表,然后向表中添加数据,适合做中间表的存储
外部表:可以先创建表,再添加数据,也可以先有数据,再创建表,本质上是将hdfs的某一个目录下的数据与hive的表建立映射关系
5. 分区表
hive的分区表:
hive默认将表的数据保存在某一个hdfs的存储目录下,当需要检索符合条件的某一部分数据的时候,需要全量扫描整个目录下的数据,效率较低;分区表将数据按分区列拆分到不同的子目录中,检索时只需扫描对应分区目录即可
-- Single-column partitioned table: each distinct gender value becomes its
-- own subdirectory (gender=...) under the table's storage directory.
CREATE TABLE demo5
(
    id      int,
    name    string,
    likes   array<string>,
    address map<string,string>
)
PARTITIONED BY (gender string)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '-'
    MAP KEYS TERMINATED BY ':';
-- Multi-column partitioned table: storage is nested directories,
-- gender=.../age=..., one level per partition column in declaration order.
CREATE TABLE psn6
(
    id      int,
    name    string,
    likes   array<string>,
    address map<string,string>
)
PARTITIONED BY (gender string, age int)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '-'
    MAP KEYS TERMINATED BY ':';
多分区的数据意味着在gender文件夹下又有age目录来区分
当需要查询某一分区的数据时,只需要扫描对应的分区目录,而不必全量扫描整张表
注意:
- 当创建完分区表之后,在保存数据的时候,会在hdfs目录中看到分区列会成为一个目录,以多级目录的形式存在
- 当创建多分区表之后,插入数据的时候不可以只添加一个分区列,需要将所有的分区列都添加值
- 多分区表在添加分区列的值的时候,与顺序无关,与分区表的分区列的名称相关,按照名称进行匹配
注意:
- 添加分区列的值的时候,如果定义的是多分区表,那么必须给所有的分区列都赋值
- 删除分区列的值的时候,无论是单分区表还是多分区表,都可以将指定的分区进行删除
修复分区:
在使用hive外部表的时候,可以先将数据上传到hdfs的某一个目录中,然后再创建外部表建立映射关系,如果在上传数据的时候,参考分区表的形式也创建了多级目录,那么此时创建完表之后,是查询不到数据的,原因是分区的元数据没有保存在mysql中,因此需要修复分区,将元数据同步更新到mysql中,此时才可以查询到元数据。具体操作如下:
# Prepare external data: create partition-style directories (age=<value>)
# in HDFS by hand, BEFORE the table exists.
hdfs dfs -mkdir /data
hdfs dfs -mkdir /data/age=10
hdfs dfs -mkdir /data/age=20
# Copy the same local data file into both partition directories.
hdfs dfs -put /root/data/data /data/age=10
hdfs dfs -put /root/data/data /data/age=20
-- External partitioned table mapped onto the pre-built /data/age=... layout.
-- The partitions are NOT visible yet: their metadata must be registered in
-- the metastore first (see MSCK REPAIR TABLE below).
CREATE EXTERNAL TABLE demo7
(
    id      int,
    name    string,
    likes   array<string>,
    address map<string,string>
)
PARTITIONED BY (age int)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '-'
    MAP KEYS TERMINATED BY ':'
LOCATION '/data';
-- Query result: no rows yet — the pre-existing age=... directories have no
-- partition metadata in the metastore.
select * from demo7;
-- Repair the table: scan the table's HDFS location and register the missing
-- partitions in the metastore.
msck repair table demo7;
-- Query again: rows are now visible.
select * from demo7;
以上面的方式创建hive的分区表会存在问题,每次插入的数据都是人为指定分区列的值,我们更加希望能够根据记录中的某一个字段来判断将数据插入到哪一个分区目录下,此时利用我们上面的分区方式是无法完成操作的,需要使用动态分区来完成相关操作,现在学的知识点无法满足,后续有讲解
2. hive sql DML
3.Hive Serde使用示例
当数据格式比较复杂的时候,使用分隔符切割的时候可能会达不到使用要求,此时就可以使用Serde的方式来创建Hive表
示例数据:access-log日志
185.189.182.234 - - [31/May/2022:13:50:16 +0800] "GET / HTTP/1.1" 400 -
157.245.70.127 - - [31/May/2022:13:53:34 +0800] "GET /ab2g HTTP/1.1" 400 -
157.245.70.127 - - [31/May/2022:13:53:34 +0800] "GET /ab2h HTTP/1.1" 400 -
20.55.53.144 - - [31/May/2022:14:19:41 +0800] "GET / HTTP/1.1" 400 --
162.142.125.7 - - [31/May/2022:14:38:01 +0800] "GET / HTTP/1.1" 200 -
162.142.125.7 - - [31/May/2022:14:38:03 +0800] "GET / HTTP/1.1" 200 -
162.142.125.7 - - [31/May/2022:14:38:04 +0800] "PRI * HTTP/2.0" 400 -
222.175.199.226 - - [31/May/2022:15:33:42 +0800] "GET / HTTP/1.1" 301 -
170.130.187.42 - - [31/May/2022:15:37:43 +0800] "GET / HTTP/1.0" 301 -
216.218.206.68 - - [31/May/2022:15:42:06 +0800] "GET / HTTP/1.1" 200 -
54.209.79.124 - - [31/May/2022:15:42:06 +0800] "GET /.env HTTP/1.1" 404 -
54.209.79.124 - - [31/May/2022:15:42:07 +0800] "POST / HTTP/1.1" 405 --
184.105.247.196 - - [31/May/2022:16:47:25 +0800] "GET / HTTP/1.1" 301 -
198.12.90.146 - - [31/May/2022:16:49:09 +0800] "GET / HTTP/1.1" 301 -
- SerDe 用于做序列化和反序列化。
- 构建在数据存储和执行引擎之间,对两者实现解耦。
- Hive通过ROW FORMAT DELIMITED以及SERDE进行内容的读写。
3.1 Hive Serde正则匹配
org.apache.hadoop.hive.serde2.RegexSerDe这个是内置的正则匹配类,也可以自定义jar包上传后使用自己的类
建表语句:
-- Parse raw access-log lines with the built-in RegexSerDe: each capture
-- group in input.regex maps positionally to one column.
-- The sample log lines carry exactly seven fields:
--   host identity user [time] "request" status size
-- Fixed: the last two columns were misnamed referer/agent — the 6th and 7th
-- groups, (-|[0-9]*), capture the HTTP status code and the response size;
-- the regex never captures referer or agent fields.
CREATE TABLE access_logtbone(
  host STRING,      -- client IP address
  identity STRING,  -- identd field, '-' in the samples
  t_user STRING,    -- auth user, '-' in the samples
  time STRING,      -- e.g. 31/May/2022:13:50:16 +0800
  request STRING,   -- request line, e.g. GET / HTTP/1.1
  status STRING,    -- HTTP status code, or '-'
  size STRING)      -- response size, or '-'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) \\[(.*)\\] \"(.*)\" (-|[0-9]*) (-|[0-9]*)"
)
STORED AS TEXTFILE;
导入数据:首先把数据录入所在机器的对应目录文件,使用下方命令导入
-- Load the local file /data/log2 into the table (the file is copied into the
-- table's warehouse directory; each line is parsed by the RegexSerDe on read).
load data local inpath '/data/log2' into table access_logtbone;
查看对应的表数据,会看到已上传成功