Hive SQL——explode拆分函数&多行(列)合并为一行(列)&reflect函数

巷子里的猫X

已于 2022-10-14 19:02:48 修改

阅读量4.8k

点赞数 2

分类专栏：数据分析文章标签： hive hadoop 数据仓库

于 2022-10-13 19:17:51 首次发布

本文链接：https://blog.csdn.net/qq_52421831/article/details/127307269

版权

数据分析专栏收录该内容

21 篇文章 32 订阅

订阅专栏

一、拆分 map 和 array

1.执行Linux命令

cd /data/import/
sudo vi test_explode_map_array.txt

添加以下文件内容

小明产品1,产品2,产品3 性别:男,年龄:24
小花产品4,产品5,产品6 性别:女,年龄:22

2.创建表并加载数据

-- 开启智能本地模式
-- set hive.exec.mode.local.auto=true;

-- 创建表
create table test.test_explode_map_array(
    name string, 
    prod_arr array<string>,
    info_map map<string, string>)
row format delimited
-- 字段分隔符为'\t'
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
stored as textfile;

-- 加载数据（方法一）
load data local inpath '/data/import/test_explode_map_array.txt'
into table test.test_explode_map_array;

-- 加载数据（方法二）
insert into test.test_explode_map_array 
select '小明', array('产品1', '产品2', '产品3'), str_to_map('性别:男,年龄:24');
insert into test.test_explode_map_array 
select '小花', array('产品4', '产品5', '产品6'), str_to_map('性别:女,年龄:22');

3.查询结果1

map_key	map_value
年龄	24
性别	男
年龄	22
性别	女

select
	-- explode拆分后必须加括号
	explode(info_map) as (map_key, map_value)
from test.test_explode_map_array;



-- 查询其他字段, lateral view(侧视图, 虚拟表)
select 
	name, map_key, map_value
from test.test_explode_map_array
-- 必须去掉括号map_key, map_value
lateral view explode(info_map) tmp_table as map_key, map_value;

4.查询结果2

prod_arr_new
产品1
产品2
产品3
产品4
产品5
产品6

select 
	-- explode拆分数组
	explode(prod_arr) as prod_arr_new
from test.test_explode_map_array;

5.查询结果3

name	prod_arr_new
小明	产品1
小明	产品2
小明	产品3
小花	产品4
小花	产品5
小花	产品6


-- 查询其他字段，lateral view(侧视图， 虚拟表)
select
	name, prod_arr_new
from test.test_explode_map_array
lateral view explode(prod_arr) tmp_table as prod_arr_new;

6.Hive三类UDF

UDF：用户自定义函数（user-defined function），输入一个值，返回一个值，一进一出
UDAF：用户自定义聚合函数（user-defined aggregate function），输入多个值，返回一个值，多进一出
UDTF：用户自定义表生成函数（user-defined table-generating function），输入一个值，返回多个值，一进多出

-- UDF：length\year\month\day ...
-- UDAF：sum\count\max ...
-- UDTF：explode

二、拆分 json

1.执行Linux命令

cd /data/import/

sudo vi test_explode_json.txt

添加以下文件内容

a:shandong,b:beijing,c:hebei|1,2,3,4,5,6,7,8,9|[{"source":"7fresh","monthSales":4900,"userCount":1900,"score":"9.9"},{"source":"jd","monthSales":2090,"userCount":78981,"score":"9.8"},{"source":"jdmart","monthSales":6987,"userCount":1600,"score":"9.0"}]

2.创建表并加载数据

-- 开启智能本地模式
-- set hive.exec.mode.local.auto=true;

-- 创建表
create table test.test_explode_json(
    area string,
    goods_id string,
    sale_info string)
row format delimited
-- 字段分隔符为'|'
fields terminated by '|'
stored as textfile;

-- 加载数据（方法一）
load data local inpath '/data/import/test_explode_json.txt'
overwrite into table test.test_explode_json;

-- 加载数据（方法二）
insert into test.test_explode_json
values('a:shandong,b:beijing,c:hebei', 
       '1,2,3,4,5,6,7,8,9', 
       '[{"source":"7fresh","monthSales":4900,"userCount":1900,"score":"9.9"},{"source":"jd","monthSales":2090,"userCount":78981,"score":"9.8"},{"source":"jdmart","monthSales":6987,"userCount":1600,"score":"9.0"}]')

3.查询结果1

goods_id
1
2
3
4
5
6
7
8
9


select
	explode(split(goods_id,',')) as goods_id
from test.test_explode_json;

4.查询结果2

code	area
a	shandong
b	beijing
c	hebei


select
	regexp_extract(area_new, '(.*):(.*)',1) as code
	, regexp_extract(area_new, '(.*):(.*)',2) as arec
from test.test_explode_json
lateral view explode(split(area,',')) tmp_table as area_new

5.侧视图详解

lateral view + explode + split

结果集的行数是如何生成的？

查询1个字段

-- 9个元素= 9行
select
	goods_id_new
from test.test_explode_json
lateral view explode(split(goods_id,',')) tmp_table as goods_id_new;

查询2个字段

-- 9个元素*1个元素 = 9行
select
	goods_id_new,area
from test.test_explode_json
lateral view explode(split(goods_id,',')) tmp_table as goods_id_new;

查询3个字段

-- 9个元素*3个元素*1个元素=27行
select
	goods_id_new,area_new,sale_info 
from test.test_explode_json
lateral view explode(split(goods_id,',')) tmp_table as goods_id_new
lateral view explode(split(area,',')) tmp_table as area_new;

6.查询结果3

source	monthsales	usercount	score
7fresh	4900	1900	9.9
jd	2090	78981	9.8
jdmart	6987	1600	9.0

第一步：将字段sale_info的"[{"和"}]"替换为空字符串

"[{"source":"7fresh","monthSales":4900,"userCount":1900,"score":"9.9"},{"source":"jd","monthSales":2090,"userCount":78981,"score":"9.8"},{"source":"jdmart","monthSales":6987,"userCount":1600,"score":"9.0"}]"

select
	regexp_replace(sale_info, '\\[\\{|\\}\\]','')
from test.test_explode_json

第二步：以"},{"拆分为数组

select
	explode(split(regexp_replace(sale_info, '\\[\\{|\\}\\]',''), '\\},\\{'))
from test.test_explode_json

第三步：将数组拆分为多行

select
	sale_info_new
from test.test_explode_json
lateral view explode(split(regexp_replace(sale_info, '\\[\\{|\\}\\]',''), '\\},\\{')) tmp_table as sale_info_new

第四步：转换为json字符串

select
	concat('{', sale_info_new, '}')
from test.test_explode_json
lateral view explode(split(regexp_replace(sale_info, '\\[\\{|\\}\\]',''), '\\},\\{')) tmp_table as sale_info_new

第五步：将json字符串转换为二维表

select
	get_json_object(concat('{', sale_info_new, '}'), '$.source') as source
	, get_json_object(concat('{', sale_info_new, '}'), '$.monthSales') as monthSales
	, get_json_object(concat('{', sale_info_new, '}'), '$.userCount') as userCount
	, get_json_object(concat('{', sale_info_new, '}'), '$.score') as score
from test.test_explode_json
lateral view explode(split(regexp_replace(sale_info, '\\[\\{|\\}\\]',''), '\\},\\{')) tmp_table as sale_info_new

三、多行(列)合并为一行(列)

region_category	subclass
东北-办公	系固件,纸张,收纳具,信封,器具,美术,用品,装订机,标签
东北-家具	用具,椅子,桌子,书架
东北-技术	电话,配件,复印机,设备

-- concat(str1|col1, str2|col2, …)
-- 字符串合并，支持任意个字符串；

-- concat_ws(sep, str1, str2, ...)
-- 以sep为分隔符合并str1, str2, ...；如分隔符为null，则返回null；跳过合并值为null或空字符串的参数；

-- collect_set(col)去重汇总
-- 只支持基本数据类型(不支持复合数据类型)，将某字段的项去重汇总，返回array数据类型；
-- collect_list(col)不去重汇总



select 
	CONCAT(region, '-',category) as region_category
	, concat_ws(',', collect_set(subclass)) as `产品子类` 
from sm.sm_order_total
group by CONCAT(region, '-',category)

四、一行(列)拆分为多行(列)

-- 通过以上查询语句来创建表
create table test.test_explode_split as 
select 
	CONCAT(region, '-',category) as region_category
	, concat_ws(',', collect_set(subclass)) as `产品子类` 
from sm.sm_order_total
group by CONCAT(region, '-',category)

region	category	subclass_new
东北	办公	系固件
东北	办公	纸张
东北	办公	收纳具
东北	办公	信封
东北	办公	器具
东北	办公	美术
东北	办公	用品
东北	办公	装订机
东北	办公	标签

select 
	regexp_extract(t.region_category, '(.*)-(.*)', 1) as region
	, regexp_extract(t.region_category, '(.*)-(.*)', 2) as category
	, subclass_new 
from test.test_explode_split t
lateral view explode(split(t.subclass, ',')) tmp_table as subclass_new;

五、reflect函数

支持调用java函数

1.执行Linux命令

cd /data/import/
sudo vi test_reflect.txt

添加以下文件内容

11   配饰   7   7
12   配饰   9   5
13   配饰   5   7
14   服饰   9   5
15   服饰   9   4
16   配饰   7   5
17   服饰   8   3
18   配饰   6   5
19   服饰   5   4
20   配饰   9   4

2.调用java的max函数求两列最大值

use test;
-- 创建表
create table test.test_reflect(
    order_id int comment '订单编号',
    product string comment '产品',
    quality int comment '质量',
    service int comment '服务')
row format delimited
fields terminated by '\t';
-- 加载数据
load data local inpath '/data/import/test_reflect.txt'
into table test.test_reflect;



select 
	*
	, reflect('java.lang.Math','max',quality, service) as max_score
from test.test_reflect

3.不同的行执行不同的java函数

配饰：求最大值
服饰：求最小值


select
	t.order_id
	, t.product
	, t.quality 
	, t.service 
	, reflect('java.lang.Math', method_name, t.quality, t.service) as score
from
(
	select
		order_id 
		, product
		, quality
		, service 
		, case product when '配饰' then 'max'
			when '服饰' then 'min' end as method_name
	from test.test_reflect
) as t