一、内部表和外部表
1、内部表
(1)source file
1,罗大佑,60
2,田馥甄,40
(2)DDL
-- Managed (internal) table: DROP TABLE removes both the metadata and the data files.
CREATE TABLE super_star (
    id   INT,
    name STRING,
    age  INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
(3)加载数据
0: jdbc:hive2://master:10000> load data local inpath '/root/SuperStar.txt' into table test.super_star;
(4)查询数据
-- Verify the loaded rows (tutorial demo; SELECT * is fine here).
SELECT * FROM test.super_star;
2、外部表
(1)source file
1,周杰伦,43
2,林俊杰,42
(2)DDL
-- External table: DROP TABLE removes only the metastore entry; data files survive.
CREATE EXTERNAL TABLE pop_singer (
    id   INT,
    name STRING,
    age  INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
(3)加载数据
0: jdbc:hive2://master:10000> load data local inpath '/root/PopSinger.txt' into table test.pop_singer;
(4)查询数据
-- Verify the loaded rows.
SELECT * FROM pop_singer;
(5)Drop table
-- Dropping an EXTERNAL table: only metadata goes away; the HDFS files remain.
DROP TABLE pop_singer;
(6)重建表,数据仍然还在
-- Re-creating the external table re-attaches the metadata to the surviving files.
CREATE EXTERNAL TABLE test.pop_singer (
    id   INT,
    name STRING,
    age  INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- The old rows are visible again without any reload.
SELECT * FROM test.pop_singer;
二、分区表
1、静态分区
(1)数据源
1,Maple,网球-手游,age:29-gender:male
2,Max,足球-抠脚,age:30-gender:male
(2)DDL
CREATE TABLE partition_table (
    id    INT,
    name  STRING,
    hobby ARRAY<STRING>,
    -- FIX: `add` collides with the Hive keyword ADD (ALTER TABLE ... ADD) and
    -- fails to parse on many Hive versions; backticks keep the original column
    -- name while making the DDL parse reliably. Type casing also normalized
    -- (map<String,...> -> MAP<STRING,...>).
    `add` MAP<STRING, STRING>
)
-- pt_d is a partition column: it is not stored in the data files, only in the path.
PARTITIONED BY (pt_d STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':';
(3)加载数据
0: jdbc:hive2://master:10000> load data local inpath '/root/partitionData.txt' into table test.partition_table partition (pt_d = '20230501');
(4)查询数据
-- Partition column pt_d appears as the last column in the result.
SELECT * FROM partition_table;
2、动态分区
- 加载方式:insert into 分区表 select *,需要的分区字段 from 普通表
(1)数据源
u001 ZSS 23 M beijing
u002 YMM 33 F nanjing
u003 LSS 43 M beijing
u004 ZY 23 F beijing
u005 ZM 23 M beijing
u006 CL 23 M dongjing
u007 LX 23 F beijing
u008 YZ 23 M beijing
u009 YM 23 F nanjing
u010 XM 23 M beijing
u011 XD 23 F beijing
u012 LH 23 M dongjing
u013 FTM 23 F dongjing
(2)普通表DDL
-- Plain staging table: source for the dynamic-partition insert below.
CREATE TABLE IF NOT EXISTS tb_person (
    uid     STRING,
    name    STRING,
    age     INT,
    gender  STRING,
    address STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
(3)加载数据
0: jdbc:hive2://master:10000> load data local inpath '/root/tbPerson.txt' into table test.tb_person;
(4)查询数据
select * from tb_person;
(5)创建分区表
-- Partitioned target table; the partition column `addr` is filled dynamically
-- from the source table's `address` values at insert time.
CREATE TABLE IF NOT EXISTS tb_person_p (
    uid     STRING,
    name    STRING,
    age     INT,
    gender  STRING,
    address STRING
)
PARTITIONED BY (addr STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
(6)数据动态插入分区表
-- Enable dynamic partitioning for this session.
SET hive.exec.dynamic.partition=true;
-- BUG FIX: original said "nonstrick" (typo). The bad value leaves the default
-- "strict" mode in effect, and a fully-dynamic insert (no static partition
-- spec) is then rejected by Hive.
SET hive.exec.dynamic.partition.mode=nonstrict;
-- The extra trailing `address` column feeds the dynamic partition column addr;
-- dynamic partition values must come last in the SELECT list.
INSERT INTO tb_person_p PARTITION (addr)
SELECT uid, name, age, gender, address, address
FROM tb_person;
(7)查询分区表数据
-- One partition per distinct address value; addr shows as the last column.
SELECT * FROM tb_person_p;
三、Row Formats
1、Array
(1)Source File
class01,maple:kelly:max,3
class02,Avery:Paul:jack:Vina,4
(2)DDL
-- ARRAY column: elements inside a field are separated by the collection delimiter ':'.
CREATE TABLE array_table (
    class STRING,
    stus  ARRAY<STRING>,
    num   INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':'
STORED AS TEXTFILE;
(3)数据加载
0: jdbc:hive2://master:10000> load data local inpath '/root/ArrayData.txt' into table test.array_table;
(4)数据查询
-- Array elements are addressed by zero-based index; an out-of-range index yields NULL.
SELECT class,
       stus[0] AS first_stu,
       stus[1] AS second_stu,
       stus[2] AS third_stu
FROM array_table;
2、Map
(1)Source File
1,name:maple#gender:male,10
2,name:max#gender:female,20
(2)DDL
-- MAP column: entries are separated by '#', key from value by ':'.
CREATE TABLE map_table (
    id   INT,
    stus MAP<STRING, STRING>,
    age  INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY ':'
STORED AS TEXTFILE;
(3) 数据加载
0: jdbc:hive2://master:10000> load data local inpath '/root/MapData.txt' into table test.map_table;
(4) 数据查询
-- Map values are looked up by key; a missing key yields NULL.
SELECT id,
       stus['name']   AS stu_name,
       stus['gender'] AS stu_gender,
       age
FROM map_table;
3、Struct
(1) Source File
1,maple:male:18
2,max:female:22
(2)DDL
-- STRUCT column: field values inside the struct are separated by the collection delimiter.
CREATE TABLE struct_table (
    id  INT,
    stu STRUCT<name:STRING, gender:STRING, age:INT>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ":"
STORED AS TEXTFILE;
(3) 数据加载
0: jdbc:hive2://master:10000> load data local inpath '/root/StuctData.txt' into table test.struct_table;
(4) 数据查询
-- Struct fields are accessed with dot notation.
SELECT id,
       stu.name,
       stu.gender,
       stu.age
FROM struct_table;
以上三种Row Format其实是一种简化写法,底层是通过定义SERDE类实现的。对于文本类型,常用的SERDE类包括org.apache.hadoop.hive.serde2.OpenCSVSerde、org.apache.hadoop.hive.serde2.RegexSerDe和org.apache.hive.hcatalog.data.JsonSerDe。如果数据源不方便通过简化写法定义,可以考虑显式指定SERDE类的方式实现,以下举例Json SERDE和正则SERDE。
4、Json SERDE
(1)Source File
{"name":"Maple","city":"北京","gender":"male"}
{"name":"Kelly","city":"上海","gender":"female"}
{"name":"max","city":"广州","gender":"male"}
(2)DDL
-- Each input line is a single JSON object; JsonSerDe maps JSON keys to columns by name.
-- (Hive folds identifiers to lowercase, hence the later load into test.jsontable.)
CREATE TABLE JsonTable (
    name   STRING,
    city   STRING,
    gender STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
(3)数据加载
0: jdbc:hive2://master:10000> load data local inpath '/root/JsonData' into table test.jsontable;
(4) 数据查询
5、RegEx SERDE
(1)Source File
zhangsan_29:beijing|Java
lisi_30:beijing|Java
(2)DDL
-- RegexSerDe: each capture group maps to a column in declaration order:
-- (name)_(age):(city)|(job_title). The '|' must be escaped, and the backslash
-- itself doubled inside the SQL string ("\\|").
CREATE TABLE regex_table (
    name      STRING,
    age       INT,
    city      STRING,
    job_title STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "(.*)_(.*):(.*)\\|(.*)"
)
STORED AS TEXTFILE;
(3)数据加载
0: jdbc:hive2://master:10000> load data local inpath '/root/RegExData.txt' into table test.regex_table;