文章目录
Hive高级函数
一、Hive函数
1、JSON数据处理
知识点:
get_json_object:解析json内容
优点:能够解析嵌套的json
缺点:每次只能解析一个
json_tuple:
优点:每次能够同时解析多个字段
缺点:不能解析嵌套的json。如果需要解析嵌套的,那么只能一层层解析
示例:
create database day09;
use day09;
/*
get_json_object:解析json内容
优点:能够解析嵌套的json
缺点:每次只能解析一个
*/
select
get_json_object('{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}','$.name') as name,
get_json_object('{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}','$.addr.province') as province;
/*
json_tuple:
优点:每次能够同时解析多个字段
缺点:不能解析嵌套的json。如果需要解析嵌套的,那么只能一层层解析
*/
select
json_tuple('{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}','name','age','addr');
select
json_tuple('{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}','name','age','addr') as (name,age,addr);
with tmp_1 as (
select json_tuple('{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}', 'addr') as addr
)
select get_json_object(addr,'$.province') from tmp_1;
with tmp_1 as (
select json_tuple('{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}', 'addr') as addr
)
select json_tuple(addr,'province') from tmp_1;
2、炸裂函数
知识点:
把一个容器的多个数据炸裂出单独展示: explode(字段名称)
炸裂函数配合侧视图使用如下
格式:
select 字段,侧视图中字段名称 from 原始表名
lateral view UDTF函数名称(原始表名中的字段) 侧视图别名 as 侧视图中字段名称1,侧视图中字段名称
解释:
1- 侧视图别名、侧视图中字段名称自己取名字
2- 侧视图别名前面不能有as
3- 侧视图里面只需要定义字段名称即可,不要设置数据类型
简单示例:
use day09;
-- 基础使用
select array(1,2,3,4,5);
select explode(array(1,2,3,4,5));
select map('a',1,'b',2,'c',3);
select explode(map('a',1,'b',2,'c',3));
实战:
-- NBA例子
create table nba(
team_name string,
year_str array<string>
)row format delimited fields terminated by ','
collection items terminated by '|';
-- 加载导入数据
load data inpath '/dir/The_NBA_Championship.txt' into table nba;
-- 验证数据
select * from nba;
-- 炸裂
select explode(year_str) as `year` from nba;
-- UDTF函数一般会和侧视图一起出现
/*
侧视图语法
select 字段,侧视图中字段名称 from 原始表名
lateral view UDTF函数名称(原始表名中的字段) 侧视图别名 as 侧视图中字段名称1,侧视图中字段名称2....
*/
select team_name,year from nba
lateral view explode(year_str) years as year;
-- 侧视图和json_tuple配合使用
with tmp_1 as (
select json_tuple('{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}', 'addr') as addr
)
select json_tuple(addr,'province') from tmp_1;
with tmp_1 as (
select 1 as id,'{"name":"zhangshan","age":18,"addr":{"province":"广东省","city":"广州市"}}' as info
)
select id,name,age from tmp_1
lateral view json_tuple(info,'name','age') info_view as name,age
13、行列转换函数
13.1 行转列
知识点:
collect_set(字段名): 把多个数据放到一个set集合中,会对数据进行去重处理
collect_list(字段名): 把多个数据放到一个list集合中,不会对数据进行去重处理
把多个子串用指定分隔符拼接成一个大字符串: concat_ws("分隔符号",字段名称1,字段名称2..)
注意: concat_ws只能针对字符串内容进行拼接,不支持数值,需要使用cast进行类型转换。
需求:
示例:
use day09;
-- 行转列
create table row_2_column(
col1 string,
col2 string,
col3 int
)row format delimited fields terminated by '\t';
-- 导入数据
load data inpath '/dir/r2c2.txt' into table row_2_column;
-- 验证数据
select * from row_2_column;
-- 行转列
/*
sql编写思路:先写整体,再对局部地方进行细化。先易后难,先写你会的,再写你不会
*/
describe function extended collect_list;
select
col1,col2,
collect_set(col3) as set_col3s,
collect_list(col3) as list_col3s,
concat_ws("_",collect_list(cast(col3 as string))) as col3_str
from row_2_column
group by col1, col2;
13.2 列转行
知识点:
把字符串按照指定分隔符切割: split(字段名称,分隔符号)
需求:
示例:
-- 列转行:explode+lateral view
create table column_2_row(
col1 string,
col2 string,
col3 string
)row format delimited fields terminated by '\t';
-- 导入数据
load data inpath '/dir/c2r2.txt' into table column_2_row;
-- 验证数据
select * from column_2_row;
-- 列转行:explode+lateral view
select
col1,col2,new_col3
from column_2_row
lateral view explode(split(col3,",")) col3_view as new_col3;
-- 细节测试语句
select
explode(split(col3,",")) as col3s -- split切分以后得到是array数据类型
from column_2_row;
笔试题:https://blog.csdn.net/weixin_43648241/article/details/109125978
14、开窗函数
官网链接:https://cwiki.apache.org/confluence/display/ASTERIXDB/Window+Functions
14.1 基础使用
知识点:
开窗函数格式: 开窗函数 over(partition by 分组字段名 [order by 排序字段名 asc|desc] [rows between 开窗开始 and 开窗结束])
partition by: 按照谁进行分组
order by: 对分组后的数据进行排序
rows between and: 限定窗口统计数据范围
开窗函数分类:
第一类: 编号相关。
row_number(): 123456。不管数据有没有重复,单调递增往后进行编号
rank(): 123446。如果遇到相同数据,那么会重复编号,并且会占用后续的编号
dense_rank(): 123445。如果遇到相同数据,那么会重复编号,但是不会占用后续的编号
第二类: 聚合函数。count()、sum()、avg()、max()、min()....
第三类: 取值函数。ntile()、lag()、lead()、first_value()、last_value()
示例:
use day09;
-- 创建表
create table pv_tb(
cookieid string,
datestr string,
pv int
)row format delimited fields terminated by ',';
-- 导入数据
load data inpath '/dir/website_pv_info.txt' into table pv_tb;
-- 验证数据
select * from pv_tb;
-- 编号相关的窗口函数
select
cookieid,
datestr,
pv,
-- row_number:用的最多。单调递增的进行编号,不管重复数据
row_number() over(partition by cookieid order by pv asc) as rs1,
-- rank:单调递增的进行编号,如果遇到重复数据,编号是相同,同时会占用后面的编号资格
rank() over(partition by cookieid order by pv asc) as rs2,
-- dense_rank:单调递增的进行编号,如果遇到重复数据,编号是相同,同时不会占用后面的编号资格
dense_rank() over(partition by cookieid order by pv asc) as rs3
from pv_tb;
select
cookieid,
datestr,
pv,
row_number() over(partition by cookieid order by pv asc) as rn,
-- 如果有order by那么窗口的大小是慢慢逐渐放大的
sum(pv) over(partition by cookieid order by pv asc) as sum_result,
-- 如果没有order by那么窗口的大小直接彻底放大到最大
sum(pv) over(partition by cookieid) as sum_result2
from pv_tb;
窗口的运行原理:
针对sum(pv) over(partition by cookieid order by pv asc) as sum_result语句
14.2 控制数据范围
知识点:
开窗函数控制范围: rows between 范围开始 and 范围结束
具体的语法含义:
1- 范围开始
unbounded preceding: 从窗口开始
数字 preceding: 前几行数据
2- 范围结束
unbounded following: 到窗口结束
数字 following: 后几行数据
3- 特殊的,既能够作为范围开始,也能够作为范围结束
current row: 当前行
示例:
-- 控制窗口统计的数据范围
select
cookieid,
datestr,
pv,
sum(pv) over(partition by cookieid order by pv rows between unbounded preceding and current row) as rs1,
sum(pv) over(partition by cookieid order by pv rows between 2 preceding and current row) as rs2,
sum(pv) over(partition by cookieid order by pv rows between unbounded preceding and unbounded following) as rs3,
sum(pv) over(partition by cookieid order by pv rows between 2 preceding and unbounded following) as rs4,
sum(pv) over(partition by cookieid order by pv rows between 2 preceding and 2 following) as rs5,
sum(pv) over(partition by cookieid order by pv rows between current row and unbounded following) as rs6
from pv_tb;
14.3 其他开窗函数
知识点:
ntile(n): 将窗口内的数据分配到n个桶里面去,返回的结果是桶的编号。可以使用在数据抽样中
lag: 取窗口中上一行的数据
lead: 取窗口中下一行的数据
first_value: 取窗口中第一行的数据
last_value : 取窗口中最后一行的数据
示例:
-- 其他函数
select
cookieid,
datestr,
pv,
row_number() over(partition by cookieid order by pv asc) as rn,
-- ntile(n):将窗口内的数据分配到n个桶里面去,返回的结果是桶的编号
ntile(3) over(partition by cookieid order by pv asc) as rs1,
-- 取窗口中上一行的数据
lag(pv) over(partition by cookieid order by pv asc) as rs2,
-- 取窗口中下一行的数据
lead(pv) over(partition by cookieid order by pv asc) as rs3,
-- 取窗口中第一行的数据
first_value(pv) over(partition by cookieid order by pv asc) as rs4,
-- 取窗口中最后一行的数据
last_value(pv) over(partition by cookieid order by pv asc) as rs5
from pv_tb;
-- 取窗口中上一行的数据
lag(pv) over(partition by cookieid order by pv asc) as rs2,
-- 取窗口中下一行的数据
lead(pv) over(partition by cookieid order by pv asc) as rs3,
-- 取窗口中第一行的数据
first_value(pv) over(partition by cookieid order by pv asc) as rs4,
-- 取窗口中最后一行的数据
last_value(pv) over(partition by cookieid order by pv asc) as rs5 from pv_tb;