先创建了一张表按照tradeDate分区的外部表
DROP TABLE IF EXISTS t_hft;
CREATE EXTERNAL TABLE t_hft(
SecurityID STRING,
tradeTime STRING,
PreClosePx DOUBLE
) PARTITIONED BY (tradeDate INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION'/user/hive/warehouse/test.db/endTest/';
然后加入2个分区并引入location
ALTER TABLE t_hft ADD PARTITION (tradeDate = 1) LOCATION '/user/hive/warehouse/test.db/endTest/ttest1';
ALTER TABLE t_hft ADD PARTITION (tradeDate = 2) LOCATION '/user/hive/warehouse/test.db/endTest/ttest2';
备注:hive不支持下划线类似_test.csv文件导入,表中字段如果有中文需要加入``反引号,例如`tradeTime` STRING
hive中ods是操作数据层,是最底层】
hive查看表信息:
desc extend test_table;
删除所有行:
truncate table test_table;
分桶作用:
1.数据sampling(采样)
2.提升某些查询操作效率,列如Map Side join
create table tb_user (
id int ,name string,age int
) partitioned by (timeflag bigint)
clustered by (id)
sorted by (age asc)
into 4 buckets
row format delimited fields
terminated by ',';
show databases;
show tables;
desc test;
-------------
分桶表示例:
#创建分桶表
drop table stu_buck;
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno)
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';
#设置变量,设置分桶为true, 设置reduce数量是分桶的数量个数
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
insert overwrite table student_buck
select * from student cluster by(Sno) sort by(Sage); 报错,cluster 和 sort 不能共存
#开会往创建的分通表插入数据(插入数据需要是已分桶, 且排序的)
#可以使用distribute by(sno) sort by(sno asc) 或是排序和分桶的字段相同的时候使用Cluster by(字段)
#注意使用cluster by 就等同于分桶+排序(sort)
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);
insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);
insert overwrite table stu_buck
select * from student cluster by(Sno);
------------------------
保存select查询结果的几种方式:
1、将查询结果保存到一张新的hive表中
create table t_tmp
as
select * from t_p;
2、将查询结果保存到一张已经存在的hive表中
insert into table t_tmp
select * from t_p;
3、将查询结果保存到指定的文件目录(可以是本地,也可以是hdfs)
insert overwrite local directory '/home/hadoop/test'
select * from t_p;
insert overwrite directory '/aaa/test'
select * from t_p;
-----------------------------------
关于hive中的各种join
准备数据
1,a
2,b
3,c
4,d
7,y
8,u
2,bb
3,cc
7,yy
9,pp
建表:
create table a(id int,name string)
row format delimited fields terminated by ',';
create table b(id int,name string)
row format delimited fields terminated by ',';
导入数据:
load data local inpath '/home/hadoop/a.txt' into table a;
load data local inpath '/home/hadoop/b.txt' into table b;
实验:
** inner join
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 7 | y | 7 | yy |
+-------+---------+-------+---------+--+
**left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | a | NULL | NULL |
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 4 | d | NULL | NULL |
| 7 | y | 7 | yy |
| 8 | u | NULL | NULL |
+-------+---------+-------+---------+--+
**right join
select * from a right join b on a.id=b.id;
**
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | a | NULL | NULL |
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 4 | d | NULL | NULL |
| 7 | y | 7 | yy |
| 8 | u | NULL | NULL |
| NULL | NULL | 9 | pp |
+-------+---------+-------+---------+--+
**
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| 2 | b |
| 3 | c |
| 7 | y |
+-------+---------+--+
-------------
多重插入:
from student
insert into table student_p partition(part='a')
select * where Sno<95011;
insert into table student_p partition(part='a')
select * where Sno<95011;
--------------
导出数据到本地:
insert overwrite local directory '/home/hadoop/student.txt'
select * from student;
-------------
UDF案例:
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';
insert overwrite table t_rating
select split(parsejson(line),'\t')[0]as movieid,split(parsejson(line),'\t')[1] as rate,split(parsejson(line),'\t')[2] as timestring,split(parsejson(line),'\t')[3] as uid from rat_json limit 10;
-------
内置jason函数
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate from rat_json limit 10;
-----------
transform案例:
1、先加载rating.json文件到hive的一个原始表 rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
2、需要解析json数据成四个字段,插入一张新的表 t_rating
insert overwrite table t_rating
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate from rat_json;
3、使用transform+python的方式去转换unixtime为weekday
先编辑一个python脚本文件
########python######代码
vi weekday_mapper.py
#!/bin/python
import sys
import datetime
for line in sys.stdin:
line = line.strip()
movieid, rating, unixtime,userid = line.split('\t')
weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
print '\t'.join([movieid, rating, str(weekday),userid])
保存文件
然后,将文件加入hive的classpath:
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>create TABLE u_data_new as
SELECT
TRANSFORM (movieid, rate, timestring,uid)
USING 'python weekday_mapper.py'
AS (movieid, rate, weekday,uid)
FROM t_rating;
select distinct(weekday) from u_data_new limit 10;