hive按照文件夹分区

最新推荐文章于 2024-08-28 21:04:12 发布

Resemble_

最新推荐文章于 2024-08-28 21:04:12 发布

阅读量2.5k

点赞数

文章标签： hive location 子文件夹分区

本文链接：https://blog.csdn.net/qq_27657429/article/details/62042977

版权

先创建了一张表按照tradeDate分区的外部表

DROP TABLE IF EXISTS t_hft;
CREATE EXTERNAL TABLE t_hft(
SecurityID STRING,
tradeTime STRING,
PreClosePx DOUBLE
) PARTITIONED BY (tradeDate INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION'/user/hive/warehouse/test.db/endTest/';

然后加入2个分区并引入location

ALTER TABLE t_hft ADD PARTITION (tradeDate = 1) LOCATION '/user/hive/warehouse/test.db/endTest/ttest1';
ALTER TABLE t_hft ADD PARTITION (tradeDate = 2) LOCATION '/user/hive/warehouse/test.db/endTest/ttest2';

备注：hive不支持下划线类似_test.csv文件导入，表中字段如果有中文需要加入``反引号，例如`tradeTime` STRING

hive中ods是操作数据层，是最底层】

hive查看表信息：

desc extend test_table;

删除所有行：

truncate table test_table;

分桶作用：

1.数据sampling(采样)
2.提升某些查询操作效率，列如Map Side join

create table tb_user (

id int ,name string,age int

) partitioned by (timeflag bigint)

 clustered by (id)

sorted by (age asc)

 into 4 buckets

row format delimited fields

terminated by ',';

show databases;
show tables;
desc test;

-------------
分桶表示例：

#创建分桶表
drop table stu_buck;
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno)
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';

#设置变量,设置分桶为true, 设置reduce数量是分桶的数量个数
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;

insert overwrite table student_buck
select * from student cluster by(Sno) sort by(Sage); 报错,cluster 和 sort 不能共存

#开会往创建的分通表插入数据(插入数据需要是已分桶, 且排序的)
#可以使用distribute by(sno) sort by(sno asc) 或是排序和分桶的字段相同的时候使用Cluster by(字段)
#注意使用cluster by 就等同于分桶+排序(sort)
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);

insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);

insert overwrite table stu_buck
select * from student cluster by(Sno);

------------------------
保存select查询结果的几种方式：
1、将查询结果保存到一张新的hive表中
create table t_tmp
as
select * from t_p;

2、将查询结果保存到一张已经存在的hive表中
insert into table t_tmp
select * from t_p;

3、将查询结果保存到指定的文件目录（可以是本地，也可以是hdfs）
insert overwrite local directory '/home/hadoop/test'
select * from t_p;

insert overwrite directory '/aaa/test'
select * from t_p;

-----------------------------------
关于hive中的各种join

准备数据
1,a
2,b
3,c
4,d
7,y
8,u

2,bb
3,cc
7,yy
9,pp

建表：
create table a(id int,name string)
row format delimited fields terminated by ',';

create table b(id int,name string)
row format delimited fields terminated by ',';

导入数据：
load data local inpath '/home/hadoop/a.txt' into table a;
load data local inpath '/home/hadoop/b.txt' into table b;

实验：
** inner join
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 7 | y | 7 | yy |
+-------+---------+-------+---------+--+

**left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | a | NULL | NULL |
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 4 | d | NULL | NULL |
| 7 | y | 7 | yy |
| 8 | u | NULL | NULL |
+-------+---------+-------+---------+--+

**right join
select * from a right join b on a.id=b.id;

**
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | a | NULL | NULL |
| 2 | b | 2 | bb |
| 3 | c | 3 | cc |
| 4 | d | NULL | NULL |
| 7 | y | 7 | yy |
| 8 | u | NULL | NULL |
| NULL | NULL | 9 | pp |
+-------+---------+-------+---------+--+

**
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| 2 | b |
| 3 | c |
| 7 | y |
+-------+---------+--+

-------------
多重插入：

from student
insert into table student_p partition(part='a')
select * where Sno<95011;
insert into table student_p partition(part='a')
select * where Sno<95011;

--------------
导出数据到本地：

insert overwrite local directory '/home/hadoop/student.txt'
select * from student;

-------------
UDF案例：
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';

insert overwrite table t_rating
select split(parsejson(line),'\t')[0]as movieid,split(parsejson(line),'\t')[1] as rate,split(parsejson(line),'\t')[2] as timestring,split(parsejson(line),'\t')[3] as uid from rat_json limit 10;

-------
内置jason函数
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate from rat_json limit 10;

-----------
transform案例:

1、先加载rating.json文件到hive的一个原始表 rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

2、需要解析json数据成四个字段，插入一张新的表 t_rating
insert overwrite table t_rating
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate from rat_json;

3、使用transform+python的方式去转换unixtime为weekday
先编辑一个python脚本文件
########python######代码
vi weekday_mapper.py
#!/bin/python
import sys
import datetime

for line in sys.stdin:
line = line.strip()
movieid, rating, unixtime,userid = line.split('\t')
weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
print '\t'.join([movieid, rating, str(weekday),userid])

保存文件
然后，将文件加入hive的classpath：
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>create TABLE u_data_new as
SELECT
TRANSFORM (movieid, rate, timestring,uid)
USING 'python weekday_mapper.py'
AS (movieid, rate, weekday,uid)
FROM t_rating;

select distinct(weekday) from u_data_new limit 10;