Hive函数
-
手册:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
-
hive字符串函数下标从1开始(如substr、instr),数组下标从0开始
-
查看所有内部函数:show functions;
-
直接用常量测试函数:select xxx(x,x,x);
select substr('abcdefg',1,3);
+------+
| _c0 |
+------+
| abc |
+------+
文章目录
常用内置函数
类型转换
cast
select cast('xx' as xxType);
select cast("22" as int);
select cast("400" as float);
select cast("2019-01-22" as date);
时间戳
时间戳常量
#Hive系统时间
select current_timestamp;
+--------------------------+
| _c0 |
+--------------------------+
| 2019-08-27 01:53:11.948 |
+--------------------------+
#unix系统时间
select unix_timestamp();
+-------------+
| _c0 |
+-------------+
| 1566870794 |
+-------------+
current_timestamp是timestamp类型,可以cast转换成date;unix_timestamp()返回bigint,不能直接cast成date。
select cast(current_timestamp as date);
+-------------+
| _c0 |
+-------------+
| 2019-08-27 |
+-------------+
select cast(unix_timestamp() as date);
Error: Error while compiling statement: FAILED: SemanticException Line 0:-1 Wrong arguments 'unix_timestamp': CAST as DATE only allows date,string, or timestamp types (state=42000,code=40000)
时间戳转换
时间戳转换字符串
函数:from_unixtime(bigint,“format”)
select from_unixtime(unix_timestamp(),"yyyy/MM/dd HH:mm:ss");
+----------------------+
| _c0 |
+----------------------+
| 2019/08/27 02:02:26 |
+----------------------+
字符串转时间戳
函数:unix_timestamp(date,pattern)
select unix_timestamp("2019/8/27 10:04:44","yyyy/MM/dd HH:mm:ss");
+-------------+
| _c0 |
+-------------+
| 1566900284 |
+-------------+
函数:to_date(string 只能传标准格式日期)
select to_date("2019-05-21 12:10:55");
+-------------+
| _c0 |
+-------------+
| 2019-05-21 |
+-------------+
数学运算符
保留几位小数: round(5.123456, 3) #5.123
向上取整 ceil(5.4) #6
向下取整 floor(5.4) #5
绝对值 abs(-5.4) #5.4
最大值 greatest(1,2,3) #3 最少两个参数
最小值 least(1,2,3) #1 最少两个参数
max/min是聚合函数,可以求日期,多列最大、小值。
字符串函数
substr(字符串,起始,长度) #下标1开始,第三个参数是长度而非结束下标,省略则截取到末尾
substring_index(字符串,分隔符,个数) #字符串分割为数组返回N个元素
concat(字符串,字符串,…) #拼接字符串
concat_ws(分隔符,字符串,字符串,…) #拼接字符串,中间加分隔符
split(字符串,regex) #切割字符串,返回数组
upper(str) #转大写
to_date(string str) #字符串转日期,只能传标准日期格式"2019-05-21 12:10:55"
hive中的数组可以直接用[n]取值,split(“a,b”,",")[1]
集合函数
sort_array(array) #排序
size(array/map) #array、map大小
map_keys(map) #返回map key数组
map_values(map) #返回map value数组
条件控制函数
case when … then … else … end #与sql一样
if(条件,条件成立返回值,条件不成立返回值) #条件可以使用函数(array_contains())
表生成函数
行转列,结构化数据打散
data
1,zhangsan,化学:物理:数学:语文
2,lisi,化学:数学:生物:物理:卫生
3,wangwu,化学:语文:英语:体育:生物
create table
-- Demo table for the explode()/lateral view section: one row per student,
-- with the ':'-separated subject list loaded as array<string>.
create table subject(id int,name string,subject array<string>)
row format delimited fields terminated by ','
collection items terminated by ':';
+-------------+---------------+-----------------------------+
| subject.id | subject.name | subject.subject |
+-------------+---------------+-----------------------------+
| 1 | zhangsan | ["化学","物理","数学","语文"] |
| 2 | lisi | ["化学","数学","生物","物理","卫生"] |
| 3 | wangwu | ["化学","语文","英语","体育","生物"] |
+-------------+---------------+-----------------------------+
explode
爆炸,炸开
select explode(subject) from subject;
+------+
| col |
+------+
| 化学 |
| 物理 |
| 数学 |
| 语文 |
| 化学 |
| 数学 |
| 生物 |
| 物理 |
| 卫生 |
| 化学 |
| 语文 |
| 英语 |
| 体育 |
| 生物 |
+------+
lateral view
-- lateral view joins every row of subject to the exploded elements of its
-- own subject array, so id/name can be selected next to each single subject
-- (plain explode() alone cannot be mixed with other columns).
select id,name,tmp.sub
from subject
lateral view
explode(subject) tmp as sub
+-----+-----------+----------+
| id | name | tmp.sub |
+-----+-----------+----------+
| 1 | zhangsan | 化学 |
| 1 | zhangsan | 物理 |
| 1 | zhangsan | 数学 |
| 1 | zhangsan | 语文 |
| 2 | lisi | 化学 |
| 2 | lisi | 数学 |
| 2 | lisi | 生物 |
| 2 | lisi | 物理 |
| 2 | lisi | 卫生 |
| 3 | wangwu | 化学 |
| 3 | wangwu | 语文 |
| 3 | wangwu | 英语 |
| 3 | wangwu | 体育 |
| 3 | wangwu | 生物 |
+-----+-----------+----------+
结构化数据打散,可以方便聚合函数使用。
窗口分析函数
分组Top N:row_number() over()
data
1,18,a,male
2,19,b,male
3,22,c,female
4,16,d,female
5,30,e,male
6,26,f,female
create table
-- Demo table for row_number() over(): original DDL had "namestring"
-- (missing space) which is invalid — the column is name string.
create table man(id int,age int,name string,sex string)
row format delimited fields terminated by ',';
+---------+----------+-----------+----------+
| man.id | man.age | man.name | man.sex |
+---------+----------+-----------+----------+
| 1 | 18 | a | male |
| 2 | 19 | b | male |
| 3 | 22 | c | female |
| 4 | 16 | d | female |
| 5 | 30 | e | male |
| 6 | 26 | f | female |
+---------+----------+-----------+----------+
row_number() over()
分组排序
-- Top-N per group: number rows inside each sex by age descending, then keep rn 1 and 2.
select * from (select id,age,name,sex,row_number() over(partition by sex order by age desc) as rn from man) tmp where rn < 3;
+---------+----------+-----------+----------+---------+
| tmp.id | tmp.age | tmp.name | tmp.sex | tmp.rn |
+---------+----------+-----------+----------+---------+
| 6 | 26 | f | female | 1 |
| 3 | 22 | c | female | 2 |
| 5 | 30 | e | male | 1 |
| 2 | 19 | b | male | 2 |
+---------+----------+-----------+----------+---------+
累积报表
data
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
C,2015-01,20
A,2015-02,4
A,2015-02,6
C,2015-02,30
C,2015-02,10
B,2015-02,10
B,2015-02,5
A,2015-03,14
A,2015-03,6
B,2015-03,20
B,2015-03,25
C,2015-03,10
C,2015-03,20
create table
-- Raw event table for the running-total demo: one row per (name, month, counts).
create table times(name string,month string,counts int)
row format delimited fields terminated by ',';
sum()over()
sum(?)over(partition by ? order by ? rows between unbounded preceding and current row)
分组排序然后把每组起始行到每行记录累加。
-- Per-name running total: first collapse the raw rows to one amount per
-- (name, month), then accumulate each name's amounts in month order from
-- the first month up to the current row.
with monthly as (
    select name, month, sum(counts) as amount
    from times
    group by name, month
)
select
    name,
    month,
    amount,
    sum(amount) over (
        partition by name
        order by month
        rows between unbounded preceding and current row
    ) as accumulate
from monthly;
+-------+----------+---------+-------------+
| name | month | amount | accumulate |
+-------+----------+---------+-------------+
| A | 2015-01 | 33 | 33 |
| A | 2015-02 | 10 | 43 |
| A | 2015-03 | 20 | 63 |
| B | 2015-01 | 30 | 30 |
| B | 2015-02 | 15 | 45 |
| B | 2015-03 | 45 | 90 |
| C | 2015-01 | 30 | 30 |
| C | 2015-02 | 40 | 70 |
| C | 2015-03 | 30 | 100 |
+-------+----------+---------+-------------+
自定义函数
步骤
- 写一个java程序实现函数的功能,继承UDF,重载evaluate方法(传入json和下标,返回一个值)(依赖包hive-exec);
- 打包java程序上传到hive服务器;
- 在hive命令行中把jar包添加到hive classpath,add jar;
- 在hive命令行中创建一个函数,关联自定义java类:create temporary function myjson as 'MyJsonParser';
自定义json解析函数
data
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
create table
create table json(json string);
+----------------------------------------------------+
| json.json |
+----------------------------------------------------+
| {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"} |
| {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"} |
| {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"} |
| {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"} |
| {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"} |
| {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"} |
| {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"} |
| {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"} |
+----------------------------------------------------+
java code
import org.apache.hadoop.hive.ql.exec.UDF;
public class MyJsonParser extends UDF {
    /**
     * Custom Hive UDF: pulls one double-quoted token out of a flat JSON record
     * such as {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}.
     * Hive resolves this overloaded evaluate() by reflection.
     *
     * A real JSON library (e.g. fastjson) would be used for anything more
     * complex; split() on '"' is enough for this demo because every key and
     * value is a quoted string, so keys/values land on the odd split slots:
     * index 1 = first value ("movie"), index 3 = second value ("rate"), etc.
     * — matching the calls myjson(json,1), myjson(json,3) in the text.
     *
     * @param json  one flat JSON line (may be a NULL column in Hive)
     * @param index token position; the returned token is split slot 2*index+1
     * @return the selected token, or null when the input is null or the slot
     *         is out of range — Hive convention: a bad row yields NULL instead
     *         of aborting the whole query with NPE/ArrayIndexOutOfBounds
     */
    public String evaluate(String json, int index) {
        if (json == null) {
            return null; // NULL column -> NULL result
        }
        String[] parts = json.split("\"");
        int slot = 2 * index + 1;
        if (slot < 0 || slot >= parts.length) {
            return null; // malformed record or out-of-range index
        }
        return parts[slot];
    }
}
打包,上传到hive服务器
demo1.jar上传到hive服务器某文件夹
add jar
进入hive客户端,将jar包添加到hive classpath
add jar /root/test/demo1.jar
关联自定义函数
create temporary function myjson as 'MyJsonParser';
使用:
select myjson(json,1),myjson(json,3) from json;
+-------+------+
| _c0 | _c1 |
+-------+------+
| 1193 | 5 |
| 661 | 3 |
| 914 | 3 |
| 3408 | 4 |
| 2355 | 5 |
| 1197 | 3 |
| 1287 | 5 |
| 2804 | 5 |
| 594 | 4 |
| 919 | 4 |
+-------+------+
Json解析函数
hive内部自带json解析函数: json_tuple()
还是用上面自定义函数demo中的data,json表。
直接查询:
select json_tuple(json,'movie','rate') as (movie,rate) from json;
+--------+-------+
| movie | rate |
+--------+-------+
| 1193 | 5 |
| 661 | 3 |
| 914 | 3 |
| 3408 | 4 |
| 2355 | 5 |
| 1197 | 3 |
| 1287 | 5 |
| 2804 | 5 |
| 594 | 4 |
| 919 | 4 |
+--------+-------+