create语句
创建数据库
在Impala中,数据库不存储数据,也不会描述数据,而是管理一组表,视图和函数。 在HDFS中实际表现为一个目录,目录中包含表分区和实际存储数据的文件。
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name [COMMENT 'database_comment'] [LOCATION hdfs_path]; # 中括号中的参数为可选参数
当数据库被创建之后,会在HDFS上的impala数据文件夹中为你创建一个带有.db
后缀的文件夹。
[quickstart.cloudera:21000] > CREATE DATABASE IF NOT EXISTS my_database;
Query: create DATABASE my_database
Fetched 0 row(s) in 0.21s
[quickstart.cloudera:21000] > show databases; # 展示集群中所有的数据库
Query: show databases
+-----------------------------------------------+
| name |
+-----------------------------------------------+
| _impala_builtins |
| default |
| my_db |
+-----------------------------------------------+
Fetched 3 row(s) in 0.20s
创建之后可以使用use
命令切换到指定数据库中,使用show databases
命令展示所有数据库。
创建表
在impala中,表是定义数据格式的结构,在表中要指定一下信息。
- 是内部表还是外部表
- 每个字段的数据类型是什么
- 表分区字段是哪个
- 数据文件的格式是什么
- 数据文件在HDFS中的哪个地方
语法
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
(col_name data_type
[COMMENT 'col_comment']
[, ...]
)
[PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
[SORT BY ([column [, column ...]])]
[COMMENT 'table_comment']
[ROW FORMAT row_format]
[WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
[STORED AS file_format]
[LOCATION 'hdfs_path']
[CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
[TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
将查询出来的数据生成表
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] db_name.]table_name
[PARTITIONED BY (col_name[, ...])]
[SORT BY ([column [, column ...]])]
[COMMENT 'table_comment']
+ [ROW FORMAT row_format]
[WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
+ [STORED AS ctas_file_format]
[LOCATION 'hdfs_path']
+ [CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
[TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
AS
select_statement
语法中的选项
基础类型:
TINYINT
| SMALLINT
| INT
| BIGINT
| BOOLEAN
| FLOAT
| DOUBLE
| DECIMAL
| STRING
| CHAR
| VARCHAR
| TIMESTAMP
复杂类型:
struct_type
| array_type
| map_type
struct_type: STRUCT < name : primitive_or_complex_type [COMMENT 'comment_string'], ... >
array_type: ARRAY < primitive_or_complex_type >
map_type: MAP < primitive_type, primitive_or_complex_type >
row格式:
DELIMITED [FIELDS TERMINATED BY 'char' [ESCAPED BY 'char']]
[LINES TERMINATED BY 'char']
行文件格式:
PARQUET
| TEXTFILE
| AVRO
| SEQUENCEFILE
| RCFILE
数据文件格式:
PARQUET
| TEXTFILE
使用查询数据生成表的例子
-- 先生成一张来源表
CREATE TABLE t1 (x INT, y STRING);
INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three');
-- 将当前表完全复制到另外一张表中
CREATE TABLE clone_of_t1 AS SELECT * FROM t1;
+-------------------+
| summary |
+-------------------+
| Inserted 3 row(s) |
+-------------------+
-- 复制表的数据,并使用另一中数据格式存储
CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1;
+-------------------+
| summary |
+-------------------+
| Inserted 3 row(s) |
+-------------------+
-- 复制指定数据到另一张表
CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2;
+-------------------+
| summary |
+-------------------+
| Inserted 2 row(s) |
+-------------------+
-- 复制来源表的元数据,但是不复制数据
CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0;
+-------------------+
| summary |
+-------------------+
| Inserted 0 row(s) |
+-------------------+
-- 重命名某些列,添加列,以及使用固定值填充
CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1;
+-------------------+
| summary |
+-------------------+
| Inserted 3 row(s) |
+-------------------+
SELECT * FROM t5;
+-------+---+---------------------+
| s | a | n |
+-------+---+---------------------+
| ONE | 2 | Entirely new column |
| TWO | 3 | Entirely new column |
| THREE | 4 | Entirely new column |
+-------+---+---------------------+
创建分区表
create table partitions_no (year smallint, month tinyint, s string);
insert into partitions_no values (2016, 1, 'January 2016'),
(2016, 2, 'February 2016'), (2016, 3, 'March 2016');
-- 数据源表并没有任何分区
show partitions partitions_no;
ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no
-- 基于元数据表创建一个带有分区的表
create table partitions_yes partitioned by (year, month)
as select s, year, month from partitions_no;
+-------------------+
| summary |
+-------------------+
| Inserted 3 row(s) |
+-------------------+
-- 查看生成的表的分区信息,可以看到已经被分区了
show partitions partitions_yes;
+