一.内置函数
1.类型转换
cast(expr as <type>)
例:
select cast('1' as double);
2.切割
split(string str, string pat)
例:select split('zhangsan,lisi,wangwu','\\,');
3.正则表达截取字符串
regexp_extract(string subject, string pattern, int index)
例:select regexp_extract('hello<B>nice</B>haha','<B>(.*)</B>',1);
4.将字符串前后的空格去掉
trim(string A)
例:select trim(" asdasda");
5.聚合函数
sum(col)
avg(col)
min(col)
max(col)
求和:
求最小值:
6.拼接字符串
concat(string A, string B...)
例:select concat("asdghkajhsd","asdujhakdj");
7.字符串的截取
例:
select substr('ajksd',4,2);
8.炸裂函数
select explode(split("nice|good|well","\\|"));
9.Case when(打分、评级的时候使用)
select name,score,
-- CASE WHEN evaluates branches top-down, so each branch only needs a
-- lower bound: earlier branches already excluded higher scores.
-- (Behavior is identical to spelling out both bounds; NULL scores still
-- fall through every branch to the ELSE.)
case when score >= 90 then 'very good'
when score >= 80 then 'double good'
when score >= 70 then 'good'
when score >= 60 then 'go on'
else 'zhencai'
end level
from student_level order by score desc;
10.炸裂函数的实际应用
-- Explode the comma-separated subject column into one output row per subject.
select name, sub
from test1
lateral view explode(split(subject, ',')) temp as sub;
11.日期处理函数
1.date_format('日期')函数(根据格式整理日期)
select date_format('2020-03-05','yyyy-MM');
2.date_add('日期')函数(加减日期)
select date_add('2022-11-14',-1);
3.date_sub('日期')函数(加减日期)
select date_sub('2022-11-14',1);
4.next_day('日期')函数(取当前天的下一个周一)
select next_day('2022-11-14','MO');
5.date_add(next_day('日期','MO'),-7)函数
select date_add(next_day('2020-11-14','MO'),-7);
6.last_day('日期')函数(求当月最后一天的函数)
select last_day('2020-11-14');
二.处理json数据
现有json数据
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}
创建表,并load数据。
-- Raw staging table: one JSON document per row, kept as an opaque string.
-- NOTE(review): "json" is a type keyword in several SQL engines; a name like
-- json_raw would be safer -- confirm before renaming.
create table json(data string);
-- Load the sample file from the local filesystem into the table.
load data local inpath '/opt/testData/hive/json.txt' into table json;
查询json数据。
-- Extract individual fields from each JSON document via JSONPath.
select
    get_json_object(data, '$.movie') as movie,
    get_json_object(data, '$.rate')  as rate
from json;
三.窗口函数
3.1窗口聚合
sum(pv) over()
1.数据准备
cookie1,2015-04-10,1
cookie1,2015-04-13,3
cookie1,2015-04-11,5
cookie1,2015-04-12,2
cookie1,2015-04-15,4
cookie1,2015-04-16,4
cookie1,2015-04-14,4
cookie2,2015-04-14,3
cookie2,2015-04-15,9
cookie2,2015-04-16,7
cookie2,2015-04-10,2
cookie2,2015-04-11,3
cookie2,2015-04-12,5
cookie2,2015-04-13,6
数据准备地址,后面数据地址相同
2.创建新表
-- Page-view table: one row per (cookie, date), loaded from a CSV file.
create table cookie1 (
    cookieid   string,
    createtime string,
    pv         int
)
row format delimited fields terminated by ',';
3.加载数据
load data local inpath "/opt/software/testData/cookie1.txt" into table cookie1;
4.查看数据
select * from cookie1;
5.函数实现
-- Window-frame variants of sum(pv) within each cookie, ordered by date.
select cookieid,createtime,
pv,
sum(pv) over (partition by cookieid order by createtime rows between unbounded preceding and current row) as pv1, -- explicit ROWS frame: partition start through current row
sum(pv) over (partition by cookieid order by createtime) as pv2, -- no frame given: defaults to RANGE from partition start to current row, so rows tied on createtime are summed together (can differ from pv1 on ties)
sum(pv) over (partition by cookieid) as pv3, -- no order by: total over every row in the partition
sum(pv) over (partition by cookieid order by createtime rows between 3 preceding and current row) as pv4, -- previous 3 rows + current row
sum(pv) over (partition by cookieid order by createtime rows between 3 preceding and 2 following) as pv5, -- previous 3 rows + current row + next 2 rows
sum(pv) over (partition by cookieid order by createtime rows between current row and unbounded following) as pv6 -- current row + all following rows
from cookie1;
3.2 窗口分片
ntile(n) over()
1.数据准备
cookie1,2015-04-10,1
cookie1,2015-04-11,5
cookie1,2015-04-12,7
cookie1,2015-04-13,3
cookie1,2015-04-14,2
cookie1,2015-04-15,4
cookie1,2015-04-16,4
cookie2,2015-04-10,2
cookie2,2015-04-11,3
cookie2,2015-04-12,5
cookie2,2015-04-13,6
cookie2,2015-04-14,3
cookie2,2015-04-15,9
cookie2,2015-04-16,7
2.创建表
create table cookie2(cookieid string, createtime string, pv int)
row format delimited
fields terminated by ',';
3.加载数据
load data local inpath "/opt/software/testData/cookie2.txt" into table cookie2;
4.代码实现
select cookieid,createtime,
pv,
ntile(2) over (partition by cookieid order by createtime) as rn1, -- split each cookie's rows into 2 buckets
ntile(3) over (partition by cookieid order by createtime) as rn2, -- split each cookie's rows into 3 buckets
ntile(4) over (order by createtime) as rn3 -- split ALL rows (no partition) into 4 buckets
from cookie2 -- fix: this section creates and loads cookie2, not cookie1
order by cookieid,createtime;
3.3 窗口排序
row_number() over()
row_number() over (partition by cookieid order by pv desc) as rn1
例
100 99 98 98 97 96
1,2,3,4,5,6
rank() over()
rank() over(partition by cookieid order by pv desc) as rn2
例
100 99 98 98 97 96
1,2,3,3,5,6
dense_rank() over()
dense_rank() over(partition by cookieid order by pv desc) as rn3
例
100 99 98 98 97 96
1,2,3,3,4,5
1. 代码具体实现
使用cookie2数据进行实现
-- Three ranking flavors over pv within each cookie:
--   rank():       ties share a rank, later ranks skip   (1,2,2,4)
--   dense_rank(): ties share a rank, no gaps            (1,2,2,3)
--   row_number(): unique sequence regardless of ties    (1,2,3,4)
select cookieid,
       createtime,
       pv,
       rank()       over (partition by cookieid order by pv desc) as rn1,
       dense_rank() over (partition by cookieid order by pv desc) as rn2,
       row_number() over (partition by cookieid order by pv desc) as rn3
from cookie2
where cookieid = 'cookie1'; -- restrict to one cookie to compare the three outputs
3.4 上下移动函数
LAG(col,n,DEFAULT)
1.数据准备
cookie1,2015-04-10 10:00:02,url2
cookie1,2015-04-10 10:00:00,url1
cookie1,2015-04-10 10:03:04,url3
cookie1,2015-04-10 10:50:05,url6
cookie1,2015-04-10 11:00:00,url7
cookie1,2015-04-10 10:10:00,url4
cookie1,2015-04-10 10:50:01,url5
cookie2,2015-04-10 10:00:02,url22
cookie2,2015-04-10 10:00:00,url11
cookie2,2015-04-10 10:03:04,url33
cookie2,2015-04-10 10:50:05,url66
cookie2,2015-04-10 11:00:00,url77
cookie2,2015-04-10 10:10:00,url44
cookie2,2015-04-10 10:50:01,url55
2.创建表
create table cookie3(cookieid string, createtime string, url string)
row format delimited fields terminated by ',';
3.加载数据
load data local inpath "/opt/software/testData/cookie3.txt" into table cookie3;
4.代码实现
用于将当前列往上移n行
第一个参数为列名。
第二个参数为往上第n行(可选,默认为1)。
第三个参数为默认值(当往上第n行为NULL时候,取默认值,如不指定,则为NULL)。
LAG(createtime,1,'1970-01-01 00:00:00') over (partition by cookieid order by createtime) as last_1_time
LAG(createtime,2) over (partition by cookieid order by createtime) as last_2_time
-- For each visit, look back at the previous one and two visit times.
select
cookieid,
createtime,
url,
-- sequence number of the visit within each cookie, ascending by time
row_number() over (partition by cookieid order by createtime) as rn,
-- previous visit time; the first row in each partition gets the epoch default
LAG(createtime,1,'1970-01-01 00:00:00') over (partition by cookieid order by createtime) as last_1_time,
-- visit time two rows back; NULL for the first two rows (no default supplied)
LAG(createtime,2) over (partition by cookieid order by createtime) as last_2_time
from cookie3;
3.5 首尾值
FIRST_VALUE(url) over ()
分组排序后截至到当前行的第一个值。
FIRST_VALUE(url) over (partition by cookieid order by createtime desc) as last1
LAST_VALUE(url) over ()
分组排序后截至到当前行的最后一个值。
LAST_VALUE(url) over (partition by cookieid order by createtime desc) as last2
1. 代码实现
-- FIRST_VALUE / LAST_VALUE over each cookie's visits, newest first.
select cookieid,createtime,url,
-- position of the row within its cookie, ascending by time
row_number() over (partition by cookieid order by createtime) as rn,
-- with desc ordering, the "first" value is the url of the latest visit
FIRST_VALUE(url) over (partition by cookieid order by createtime desc) as last1,
-- NOTE(review): with an order by and no explicit frame, LAST_VALUE's frame
-- ends at the current row, so last2 is just the current row's own url.
-- Add "rows between unbounded preceding and unbounded following" to get
-- the true last value of the whole partition -- confirm which was intended.
LAST_VALUE(url) over (partition by cookieid order by createtime desc) as last2
from cookie3;
四.自定义函数
当 Hive 提供的内置函数无法满足业务处理需要时,此时就可以考虑使用用户自定义函数。
UDF(user-defined function)作用于单个数据行,产生一个数据行作为输出。
UDAF(用户定义聚集函数 User- Defined Aggregation Funcation):接收多个输入数据行,并产生一个输出数据行。类似于max、min。
UDTF(表格生成函数 User-Defined Table Functions):接收一行输入,输出多行。类似于explode。
4.1自定义UDF
1.创建Maven项目,并导入依赖
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.3.3</version>
<exclusions>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
2.自定义一个java类继承UDF,重载 evaluate 方法
/**
 * Hive UDF that lower-cases a string column.
 *
 * Returns NULL when the input row's value is NULL, matching the convention
 * of Hive's built-in string functions. (The original implementation threw a
 * NullPointerException on NULL rows.)
 */
public class ToLower extends UDF {
    public String evaluate(String field) {
        // Hive passes SQL NULLs as Java nulls; propagate instead of crashing.
        if (field == null) {
            return null;
        }
        return field.toLowerCase();
    }
}
3.打成jar包上传到服务器。
4.添加jar包到hive中
使用add JAR导入jar包
add JAR /opt/testData/hive/UDF-1.0-SNAPSHOT.jar;
list jar;
5.创建临时函数与开发好的 class 关联起来
create temporary function tolowercase as 'ToLower';
在HQL中使用
这种方式创建的临时函数只在一次hive会话中有效,重启会话后就无效。
6.创建永久函数
create function tolowercase as 'ToLower';
7,删除函数
删除临时函数
drop temporary function tolowercase;
删除永久函数
drop function tolowercase;
五.Hive的Shell操作
hive -e 从命令行执行指定的HQL
例:hive -e "select * from student_level"
hive -f 执行 HQL 脚本
例:echo "select * from student_level" > hive.sql
hive -f hive.sql