Hive is a data warehouse tool built on Hadoop. It maps structured data files to database tables and provides SQL-like query capabilities. This article walks through some basic Hive operations; corrections are welcome.
Common syntax
# Show information
show tables;
show databases;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;

# Create a database
create database test_db;

# Drop a database
drop database db_name;

# Drop a table
drop table table_name;

# Rename a table
ALTER TABLE table_name RENAME TO new_table_name;

# Truncate a table's data
truncate table table_name;
The CREATE TABLE statement
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
Creating an internal (managed) table
create table if not exists my_tb(id int, name string)
row format delimited fields terminated by ',';
Creating an external table
# An external table must specify the path where its data lives
create table if not exists my_ex_tb(id int, name string)
row format delimited fields terminated by ','
location 'hdfs://192.168.38.3:9000/externdb/my_ex_tb/';
When a table is dropped, a managed (internal) table's metadata and data are deleted together; for an external table only the metadata is deleted, and the data is left in place.
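As a quick illustration with the two tables above (a sketch, not new syntax):

-- Managed table: both the metastore entry and the warehouse files are removed
drop table my_tb;
-- External table: only the metastore entry is removed; the files under
-- hdfs://192.168.38.3:9000/externdb/my_ex_tb/ remain on HDFS
drop table my_ex_tb;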
Loading data into the table directory
# Append to the table, or overwrite the whole table (into/overwrite)
load data local inpath '/root/1.txt' [overwrite] into table my_ex_tb;
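Spelled out, the two variants are:

-- Append to whatever is already in the table
load data local inpath '/root/1.txt' into table my_ex_tb;
-- Replace the table's current contents
load data local inpath '/root/1.txt' overwrite into table my_ex_tb;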
Creating a partitioned table
create table if not exists my_par_tb(id int, name string)
partitioned by(country string)
row format delimited fields terminated by ',';

load data local inpath '/root/1.txt' into table my_par_tb partition(country='China');
load data local inpath '/root/1.txt.us' into table my_par_tb partition(country='US');
# Contents of 1.txt
1,张三
2,李四
3,王五

# Contents of 1.txt.us
1,张三
2,李四
3,王五

# Output of select * from my_par_tb
1 张三 China
2 李四 China
3 王五 China
1 张三 US
2 李四 US
3 王五 US

# Query the data in one partition
select * from my_par_tb where country='China';

1 张三 China
2 李四 China
3 王五 China
Adding and dropping partitions
# Add partitions
alter table my_par_tb add partition(country='Eng') partition(country='Ame');

# Drop partitions (note: drop takes a comma-separated list)
alter table my_par_tb drop partition(country='Eng'), partition(country='Ame');

# Show a table's partitions
show partitions my_par_tb;

country=China
country=US
Creating a bucketed table
create table if not exists my_buck_tb(id int, name string)
clustered by(id) sorted by(id)
into 4 buckets
row format delimited fields terminated by ',';

# Enable bucketing
set hive.enforce.bucketing=true;

# Use one reducer per bucket, so rows queried from another table are written out as one file per bucket
set mapreduce.job.reduces=4;

# Query rows from my_tb and insert them into the bucketed table.
# cluster by(id) hash-partitions and sorts the map output by id
# (cluster by is equivalent to distribute by + sort by).
insert into table my_buck_tb
select id,name from my_tb cluster by(id);
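One benefit of bucketing is cheap sampling: TABLESAMPLE can read a single bucket file instead of the whole table. A minimal sketch against the table above:

-- Reads roughly a quarter of the rows (bucket 1 of 4, hashed on id)
select id, name from my_buck_tb tablesample(bucket 1 out of 4 on id);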
Saving query results
By default query results are printed to the screen; they can also be saved to a table.
# Save the result into a new table created on the fly
create table tmp_tb as select * from my_tb;

# Save the result into an existing table
insert into table tmp_tb select * from my_tb;

# Save the result to a directory (local or on HDFS)
# Local
insert overwrite local directory '/root/out_tb/'
select * from my_tb;

# HDFS
insert overwrite directory '/out_tb/'
select * from my_tb;
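By default the exported files use Hive's internal field delimiter. From Hive 0.11 on, a row format can be given on the export as well; a hedged sketch:

insert overwrite local directory '/root/out_tb/'
row format delimited fields terminated by ','
select * from my_tb;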
Join operations
Data in table a:

1,张三
2,李四
3,c
4,a
5,e
6,r

Data in table b:

1,绿间
3,青峰
4,黑子
9,红发

Create the tables:

create table a(id int,name string)
row format delimited fields terminated by ',';

create table b(id int,name string)
row format delimited fields terminated by ',';

Load the data:

load data local inpath '/root/a.txt' into table a;
load data local inpath '/root/b.txt' into table b;
# Inner join (intersection)
select * from a inner join b on a.id=b.id;

+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三    | 1     | 绿间    |
| 3     | c       | 3     | 青峰    |
| 4     | a       | 4     | 黑子    |
+-------+---------+-------+---------+--+
# Left join
select * from a left join b on a.id=b.id;

+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三    | 1     | 绿间    |
| 2     | 李四    | NULL  | NULL    |
| 3     | c       | 3     | 青峰    |
| 4     | a       | 4     | 黑子    |
| 5     | e       | NULL  | NULL    |
| 6     | r       | NULL  | NULL    |
+-------+---------+-------+---------+--+
# Right join
select * from a right join b on a.id=b.id;

+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三    | 1     | 绿间    |
| 3     | c       | 3     | 青峰    |
| 4     | a       | 4     | 黑子    |
| NULL  | NULL    | 9     | 红发    |
+-------+---------+-------+---------+--+
# Full outer join
select * from a full outer join b on a.id=b.id;

+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三    | 1     | 绿间    |
| 2     | 李四    | NULL  | NULL    |
| 3     | c       | 3     | 青峰    |
| 4     | a       | 4     | 黑子    |
| 5     | e       | NULL  | NULL    |
| 6     | r       | NULL  | NULL    |
| NULL  | NULL    | 9     | 红发    |
+-------+---------+-------+---------+--+
# Left semi join (keeps only the left table's columns from the inner-join result)
select * from a left semi join b on a.id = b.id;

+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 1     | 张三    |
| 3     | c       |
| 4     | a       |
+-------+---------+--+
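A left semi join is effectively an IN/EXISTS check; on Hive 0.13+ the same rows can also be written directly as a subquery (a hedged equivalent):

-- Same rows as the left semi join above
select * from a where a.id in (select b.id from b);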
The SELECT statement
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_name [..join..on(a.id=b.id)]
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[CLUSTER BY col_list
  | [DISTRIBUTE BY col_list] [SORT BY | ORDER BY col_list]
]
[LIMIT number]

# Hash-partition and sort the map output by id (cluster by is equivalent to distribute by + sort by)
select id,name from my_tb cluster by(id);
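The difference between the sort clauses is worth spelling out; a minimal sketch (standard HiveQL):

-- order by: one reducer, one totally ordered result (can be slow on large data)
select id, name from my_tb order by id;
-- distribute by + sort by: rows are hashed to reducers by id, each reducer sorts its own share
select id, name from my_tb distribute by id sort by id desc;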
User-defined functions
Hive built-in functions: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
The pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.xiaojie.mm</groupId>
  <artifactId>my_hive</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <hadoop.version>2.6.5</hadoop.version>
    <hive.version>1.2.1</hive.version>
  </properties>

  <dependencies>
    <!-- Hadoop -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <!-- Hive -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>${hive.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-metastore</artifactId>
      <version>${hive.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-pdk</artifactId>
      <version>0.10.0</version>
    </dependency>
    <dependency>
      <groupId>javax.jdo</groupId>
      <artifactId>jdo2-api</artifactId>
      <version>2.3-eb</version>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.1.1</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.7</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.7</version>
      <scope>system</scope>
      <systemPath>/home/miao/apps/install/jdk1.7.0_45/lib/tools.jar</systemPath>
    </dependency>
  </dependencies>
</project>
A custom function that converts uppercase to lowercase
package com.xiaojie.mm;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLower extends UDF {

    // Overload this method; Hive calls evaluate() once per row
    public String evaluate(String field) {
        return field.toLowerCase();
    }
}
Export the jar and copy it to the machine where Hive runs
scp tolower.jar mini1:/root/apps/
Registering the custom function in the Hive client
# Step 1
add JAR /root/apps/tolower.jar;

# Step 2: the quoted string is the fully qualified name of the custom class
# (a temporary function is only valid in the current session)
create temporary function tolower as 'com.xiaojie.mm.ToLower';

# Step 3: use it
select * from a;
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 7     | AAAAA   |
| 1     | 张三    |
| 2     | 李四    |
| 3     | c       |
| 4     | a       |
| 5     | e       |
| 6     | r       |
+-------+---------+--+
select id,tolower(name) from a;

+-----+--------+--+
| id  | _c1    |
+-----+--------+--+
| 7   | aaaaa  |
| 1   | 张三   |
| 2   | 李四   |
| 3   | c      |
| 4   | a      |
| 5   | e      |
| 6   | r      |
+-----+--------+--+
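A temporary function disappears when the session ends. Since Hive 0.13 a function can also be registered permanently; a hedged sketch (the HDFS jar path here is hypothetical):

-- The jar must be readable from a shared location such as HDFS
create function tolower as 'com.xiaojie.mm.ToLower' using jar 'hdfs:///apps/tolower.jar';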
A custom function that looks up a phone number's home region
package com.xiaojie.mm;

import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;

public class GetProvince extends UDF {

    public static HashMap<String, String> provinceMap = new HashMap<String, String>();

    static {
        provinceMap.put("183", "hangzhou");
        provinceMap.put("186", "nanjing");
        provinceMap.put("187", "suzhou");
        provinceMap.put("188", "ningbo");
    }

    public String evaluate(int phonenumber) {
        String phone_num = String.valueOf(phonenumber);
        // Take the first three digits of the phone number
        String phone = phone_num.substring(0, 3);
        // "未知" means "unknown": returned when the prefix is not in the map
        return provinceMap.get(phone) == null ? "未知" : provinceMap.get(phone);
    }
}
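The query further down calls this class through a function named getpro, so it has to be registered first, just like tolower above; a minimal sketch (the jar file name here is hypothetical):

add JAR /root/apps/getprovince.jar;
create temporary function getpro as 'com.xiaojie.mm.GetProvince';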
Raw data:

+----------------------+---------------------+--+
| flow_province.phone  | flow_province.flow  |
+----------------------+---------------------+--+
| 1837878              | 12m                 |
| 1868989              | 13m                 |
| 1878989              | 14m                 |
| 1889898              | 15m                 |
| 1897867              | 16m                 |
| 1832323              | 78m                 |
| 1858767              | 88m                 |
| 1862343              | 99m                 |
| 1893454              | 77m                 |
+----------------------+---------------------+--+
After calling the custom function:

select phone,getpro(phone),flow from flow_province;

+----------+-----------+-------+--+
| phone    | _c1       | flow  |
+----------+-----------+-------+--+
| 1837878  | hangzhou  | 12m   |
| 1868989  | nanjing   | 13m   |
| 1878989  | suzhou    | 14m   |
| 1889898  | ningbo    | 15m   |
| 1897867  | 未知      | 16m   |
| 1832323  | hangzhou  | 78m   |
| 1858767  | 未知      | 88m   |
| 1862343  | nanjing   | 99m   |
| 1893454  | 未知      | 77m   |
+----------+-----------+-------+--+
Custom parsing of JSON data
# Create the table
create table json_tb(line string);

# Load the data
load data local inpath '/root/test_data/a.json' into table json_tb;

# Show the raw data
select line from json_tb limit 10;
+----------------------------------------------------------------+--+
| json_tb.line                                                   |
+----------------------------------------------------------------+--+
| {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}  |
| {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}   |
| {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}   |
| {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}  |
| {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}  |
| {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}  |
| {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}  |
| {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}  |
| {"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}   |
| {"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"}   |
+----------------------------------------------------------------+--+
# The custom function

package com.xiaojie.mm;

import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;

public class JsonParse extends UDF {

    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieBean bean = objectMapper.readValue(jsonLine, MovieBean.class);
            return bean.toString();
        } catch (Exception e) {
            // Swallow parse errors and fall through to the empty-string return
        }
        return "";
    }
}
package com.xiaojie.mm;

public class MovieBean {

    // Movie id
    private String movie;
    // Movie rating
    private String rate;
    // Rating timestamp
    private String timeStamp;
    // User id
    private String uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public String getRate() {
        return rate;
    }

    public void setRate(String rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return this.movie + "\t" + this.rate + "\t" + this.timeStamp + "\t" + this.uid;
    }
}
# Build the jar, upload it to the Hive machine, and create the function
add JAR /root/test_data/json_parse.jar;
create temporary function json_parse as 'com.xiaojie.mm.JsonParse';

# Use the custom JSON parsing function
select json_parse(line) from json_tb limit 10;
+---------------------+--+
| _c0                 |
+---------------------+--+
| 1193 5 978300760 1  |
| 661 3 978302109 1   |
| 914 3 978301968 1   |
| 3408 4 978300275 1  |
| 2355 5 978824291 1  |
| 1197 3 978302268 1  |
| 1287 5 978302039 1  |
| 2804 5 978300719 1  |
| 594 4 978302268 1   |
| 919 4 978301368 1   |
+---------------------+--+
# Save the parsed fields into a new table
create table json_parse_tb as
select split(json_parse(line),'\t')[0] as movieid,
       split(json_parse(line),'\t')[1] as rate,
       split(json_parse(line),'\t')[2] as time,
       split(json_parse(line),'\t')[3] as userid
from json_tb limit 100;
# The built-in JSON function
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as time,
       get_json_object(line,'$.uid') as userid
from json_tb limit 10;
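When several keys are extracted, get_json_object parses the line once per key; the built-in json_tuple parses it once for all of them. A hedged equivalent of the query above:

select t.movieid, t.rate, t.ts, t.userid
from json_tb
lateral view json_tuple(line, 'movie', 'rate', 'timeStamp', 'uid') t as movieid, rate, ts, userid
limit 10;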
TRANSFORM (calling custom scripts)
Hive's TRANSFORM keyword lets a query call a hand-written script from SQL. It is a good fit when Hive lacks a feature and you do not want to write a UDF.
A custom Python script (vim time_parse.py)
#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rate, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rate, str(weekday), userid])
Add the .py file to Hive's working directory
add file time_parse.py;
Calling the custom Python code with TRANSFORM
# TRANSFORM's argument list picks the matching columns out of json_parse_tb
create TABLE json_parse_time_tb as
SELECT
  TRANSFORM (movieid, rate, time, userid)
  USING 'python time_parse.py'
  AS (movieid, rate, weekday, userid)
FROM json_parse_tb;
Viewing the new table's data
select * from json_parse_time_tb;

+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| json_parse_time_tb.movieid  | json_parse_time_tb.rate  | json_parse_time_tb.weekday  | json_parse_time_tb.userid  |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| 1690                        | 3                        | 1                           | 2                          |
| 589                         | 4                        | 1                           | 2                          |
| 3471                        | 5                        | 1                           | 2                          |
| 1834                        | 4                        | 1                           | 2                          |
| 2490                        | 3                        | 1                           | 2                          |
| 2278                        | 3                        | 1                           | 2                          |
| 110                         | 5                        | 1                           | 2                          |
| 3257                        | 3                        | 1                           | 2                          |
| 3256                        | 2                        | 1                           | 2                          |
| 3255                        | 4                        | 1                           | 2                          |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
Case study
Raw data (username, month, clicks):
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5
Goal: each user's clicks per month, plus a cumulative running total.
Step 1: create the table and load the data
# Create the table
create table click_tb(username string,month string,click int)
row format delimited fields terminated by ',';

# Load the data
load data local inpath '/root/test_data/click.txt' into table click_tb;
Step 2: compute each user's clicks per month
select username,month,sum(click) as click_count from click_tb group by username,month;

+-----------+----------+--------------+--+
| username  | month    | click_count  |
+-----------+----------+--------------+--+
| A         | 2015-01  | 33           |
| A         | 2015-02  | 10           |
| B         | 2015-01  | 30           |
| B         | 2015-02  | 15           |
+-----------+----------+--------------+--+
Step 3: inner join the result with itself (intersection)
select * from
(select username,month,sum(click) as click_count from click_tb group by username,month) a
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) b
on
a.username=b.username;

+-------------+----------+----------------+-------------+----------+----------------+--+
| a.username  | a.month  | a.click_count  | b.username  | b.month  | b.click_count  |
+-------------+----------+----------------+-------------+----------+----------------+--+
| A           | 2015-01  | 33             | A           | 2015-01  | 33             |
| A           | 2015-01  | 33             | A           | 2015-02  | 10             |
| A           | 2015-02  | 10             | A           | 2015-01  | 33             |
| A           | 2015-02  | 10             | A           | 2015-02  | 10             |
| B           | 2015-01  | 30             | B           | 2015-01  | 30             |
| B           | 2015-01  | 30             | B           | 2015-02  | 15             |
| B           | 2015-02  | 15             | B           | 2015-01  | 30             |
| B           | 2015-02  | 15             | B           | 2015-02  | 15             |
+-------------+----------+----------------+-------------+----------+----------------+--+
Step 4: produce the final result
select a.username, a.month, min(a.click_count) as click_count, sum(b.click_count) as sum_count
from
(select username,month,sum(click) as click_count from click_tb group by username,month) a
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) b
on
a.username=b.username
where b.month<=a.month
group by a.username,a.month
order by a.username,a.month;

+-------------+----------+--------------+------------+--+
| a.username  | a.month  | click_count  | sum_count  |
+-------------+----------+--------------+------------+--+
| A           | 2015-01  | 33           | 33         |
| A           | 2015-02  | 10           | 43         |
| B           | 2015-01  | 30           | 30         |
| B           | 2015-02  | 15           | 45         |
+-------------+----------+--------------+------------+--+
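For comparison, the same running total can be computed without the self-join using window functions, available since Hive 0.11; a minimal sketch:

select username, month, click_count,
       sum(click_count) over (partition by username order by month) as sum_count
from (select username, month, sum(click) as click_count
      from click_tb
      group by username, month) t;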