1.创建内部表
create table if not exists 表名(
属性名 属性类型,
...
比如:
name struct<first:string,last:string>,
age int,
hobbies array<string>,
deliveryAdd map<string,string>
)
row format delimited
fields terminated by '|'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile
;
2.创建外部表:
create external table if not exists 表名(
属性名 属性类型,
...
比如:
name struct<first:string,last:string>,
age int,
hobbies array<string>,
deliveryAdd map<string,string>
)
row format delimited
fields terminated by '|'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile
;
创建外部表需要注意的是,表中的数据文件存在hdfs文件系统上,所以在数据库中删除只会删除表结构,表中数据依然存在。如需删除,需要使用以下命令:
hdfs dfs -rm -r -f /文件路径
3.创建分区表
create external table if not exists 表名(
属性名 属性类型,
...
比如:
age int,
hobbies array<string>,
deliveryAdd map<string,string>
)
partitioned by(username string)
row format delimited
fields terminated by '|'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile
;
这里需要注意的是,上述分区是按照username来分区的。上传文件时使用以下命令:
load data local inpath '/文件路径/表1.log' into table 表名 partition(username='表1');
load data local inpath '/文件路径/表2.log' [overwrite] into table 表名 partition(username='表2');
注:overwrite 写在 into table 之前,表示覆盖该分区已有数据;不加则为追加。
若需要查看分区表结构,使用以下命令:
show partitions 表名;
4.创建分桶表(抽象的,方便抽样,提高join查询效率)
二选一:
set hive.enforce.bucketing = true;//优化
set mapreduce.job.reduces = num;//优化。设置reduce任务的数量和分桶数量一致(旧版本属性名为mapred.reduce.tasks)
create external table 表名(
属性名 属性类型,
...
比如:
name struct<first:string,last:string>,
age int,
hobbies array<string>,
deliveryAdd map<string,string>
)
clustered by(name) into n buckets
row format delimited
fields terminated by '|'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile
;
创建表之后,需要做以下操作:
在表创建好后,需要将表中数据上传,放至表中:
load data [local] inpath '文件路径' into table 表名;
local:本地上传
将数据文件挂到hdfs文件系统上用以下命令:
hdfs dfs -put 数据文件 /目录
5.with语法:可以理解成视图。目的:封装重用。是一个临时结果集
with
临时表名 as (select ... from 表名 where 属性名=' ')
select * from 临时表名;
注:最后一个临时表定义之后不能加逗号,多个临时表之间才用逗号分隔。