Hive支持的数据类型如下:
原生类型:
- TINYINT
- SMALLINT
- INT
- BIGINT
- BOOLEAN
- FLOAT
- DOUBLE
- STRING
- BINARY (Hive 0.8.0以上才可用)
- TIMESTAMP (Hive 0.8.0以上才可用)
复合类型:
- arrays: ARRAY<data_type>
- maps: MAP<primitive_type, data_type>
- structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>
- union: UNIONTYPE<data_type, data_type, ...>
这里不逐一讲解各个数据类型本身,只举几个有关复合类型的例子。有一点面向对象编程知识的读者,很容易理解这几种复合数据类型:ARRAY类似于Java的数组,MAP类似于Java的Map(键值对集合),STRUCT类似于JSON对象。下面的例子转载自:http://blog.csdn.net/wf1982/article/details/7474601
在Hive 中如何使用复合数据结构 maps,array,structs
1. Array的使用
创建数据库表,以array作为数据类型
-- person: one row per person; work_locations holds a list of strings.
-- Text layout: columns are separated by a tab, array elements by ','.
CREATE TABLE person (
    name           STRING,
    work_locations ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ',';
数据
biansutao beijing,shanghai,tianjin,hangzhou
linan changchu,chengdu,wuhan
数据入库
-- Load person.txt from the local filesystem; OVERWRITE replaces the table's existing contents.
LOAD DATA LOCAL INPATH '/home/hadoop/person.txt'
OVERWRITE INTO TABLE person;
查询
hive> select * from person;
输出:
biansutao ["beijing","shanghai","tianjin","hangzhou"]
linan ["changchu","chengdu","wuhan"]
Time taken: 0.355 seconds
hive> select name from person;
输出:
linan
biansutao
Time taken: 12.397 seconds
hive> select work_locations[0] from person;
输出:
changchu
beijing
Time taken: 13.214 seconds
hive> select work_locations from person;
输出:
["changchu","chengdu","wuhan"]
["beijing","shanghai","tianjin","hangzhou"]
Time taken: 13.755 seconds
hive> select work_locations[3] from person;
输出:
NULL
hangzhou
Time taken: 12.722 seconds
hive> select work_locations[4] from person;
输出:
NULL
NULL
Time taken: 15.958 seconds
2. Map 的使用
创建数据库表
-- score: one row per name; the score column maps a string key to an integer value.
-- Text layout: columns split on tab, map entries on ',', key from value on ':'.
CREATE TABLE score (
    name  STRING,
    score MAP<STRING, INT>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':';
要入库的数据:
biansutao 数学:80,语文:89,英语:95
jobs 语文:60,数学:80,英语:99
入库数据
-- Load score.txt from the local filesystem; OVERWRITE replaces the table's existing contents.
LOAD DATA LOCAL INPATH '/home/hadoop/score.txt'
OVERWRITE INTO TABLE score;
查询
hive> select * from score;
输出:
biansutao {"数学":80,"语文":89,"英语":95}
jobs {"语文":60,"数学":80,"英语":99}
Time taken: 0.665 seconds
hive> select name from score;
输出:
jobs
biansutao
Time taken: 19.778 seconds
hive> select t.score from score t;
输出:
{"语文":60,"数学":80,"英语":99}
{"数学":80,"语文":89,"英语":95}
Time taken: 19.353 seconds
hive> select t.score['语文'] from score t;
输出:
60
89
Time taken: 13.054 seconds
hive> select t.score['英语'] from score t;
输出:
99
95
Time taken: 13.769 seconds
3. Struct 的使用
创建数据表
-- test: an integer id plus a struct column with two fields (course, score).
-- Text layout: columns split on tab, struct fields split on ','.
CREATE TABLE test (
    id     INT,
    course STRUCT<course:STRING, score:INT>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ',';
数据
1 english,80
2 math,89
3 chinese,95
入库
-- Load test.txt from the local filesystem; OVERWRITE replaces the table's existing contents.
LOAD DATA LOCAL INPATH '/home/hadoop/test.txt'
OVERWRITE INTO TABLE test;
查询
hive> select * from test;
OK
1 {"course":"english","score":80}
2 {"course":"math","score":89}
3 {"course":"chinese","score":95}
Time taken: 0.275 seconds
hive> select course from test;
输出:
{"course":"english","score":80}
{"course":"math","score":89}
{"course":"chinese","score":95}
Time taken: 44.968 seconds
hive> select t.course.course from test t;
输出:
english
math
chinese
Time taken: 15.827 seconds
hive> select t.course.score from test t;
输出:
80
89
95
Time taken: 13.235 seconds
4. 数据组合 (嵌套的复杂数据类型:ROW FORMAT DELIMITED 无法为内层的 ARRAY 单独指定分隔符,因此下面这种用文本文件加载嵌套类型的方式得不到预期结果)
-- NOTE(review): this statement reloads test.txt into table test, not test1;
-- it looks unrelated to the nested-type example that follows. Confirm it is intended.
LOAD DATA LOCAL INPATH '/home/hadoop/test.txt' OVERWRITE INTO TABLE test;
-- test1: an integer id plus a map from a string key to an array of strings.
-- Text layout: columns split on tab, collection items on ',', map key from value on ':'.
-- NOTE(review): no delimiter is declared here for the elements of the inner ARRAY;
-- verify how Hive splits nested collections before relying on ',' for both levels.
CREATE TABLE test1 (
    id INT,
    a  MAP<STRING, ARRAY<STRING>>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':';
1 english:80,90,70
2 math:89,78,86
3 chinese:99,100,82
-- Load test1.txt from the local filesystem; OVERWRITE replaces the table's existing contents.
LOAD DATA LOCAL INPATH '/home/hadoop/test1.txt'
OVERWRITE INTO TABLE test1;