-- For more example scenarios, see: https://blog.csdn.net/SKY_02/article/details/113394634
-- Original logic: a plain left join from test_table_1 to test_table_2 on
-- m2_id = id, with no skew mitigation. Kept as the reference version.
select
    m1.id,
    m1.addtime,
    m2.id,
    m2.updatetime
from (
    select
        id,
        addtime,
        m2_id
    from test.test_table_1
) m1
left join (
    -- this table contains rows whose id is 0
    select
        id,
        updatetime
    from test.test_table_2
) m2
    -- per the original author, ~90% of m1.m2_id values are 0, which skews the join
    on m1.m2_id = m2.id
limit 10
;
-- Use when the skew is caused by a small set of values that can be enumerated
-- (here only the value 0).
--   * Left side (m1): rows carrying the hot key get a random bucket in 0..99;
--     every other row gets the fixed bucket 101.
--   * Right side (m2): rows with a normal key get the fixed bucket 101; rows with
--     the hot key are replicated once per bucket 0..99 so that every left-side
--     bucket still finds its match.
-- The bucket list on the right side must cover exactly the values the left-side
-- rand() expression can produce.
select
    m1.id,
    m1.addtime,
    m2.id,
    m2.updatetime
from (
    select
        id,
        addtime,
        m2_id,
        -- rand() is in [0, 1), so cast(rand() * 100 as int) yields 0..99;
        -- 101 is the bucket reserved for non-skewed keys
        case when m2_id in (0) then cast(rand() * 100 as int) else 101 end as rand_num
    from test.test_table_1
) m1
left join (
    -- normal keys: pinned to the single reserved bucket
    select
        id,
        updatetime,
        101 as rand_num
    from test.test_table_2
    where id <> 0 -- the hot key is handled separately below
    union all
    -- hot key: one copy per bucket 0..99
    select
        hot.id,
        hot.updatetime,
        cast(ex.rand_num as int) as rand_num
    from (
        select
            id,
            updatetime
        from test.test_table_2
        where id = 0 -- only the hot key
    ) hot
    -- buckets 0..99, matching the range of cast(rand() * 100 as int) above
    lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99', ',')) ex as rand_num
) m2
    -- per the original author, ~90% of m1.m2_id values are 0, which skews the join
    on m1.m2_id = m2.id
    -- the extra bucket key spreads the hot key across many partitions
    and m1.rand_num = m2.rand_num
limit 10
;
-- Use when the skewed values are too many to enumerate (tens to hundreds) or are
-- not known in advance (today a and b are skewed, tomorrow it may be c, d, e).
-- Every left-side row gets a random bucket in 0..49 and the whole right-side
-- table is replicated once per bucket.
-- Trade-off: replicating a table produces more intermediate data and more IO, so
-- this is not guaranteed to be faster; it does spread the data more evenly and
-- lowers the chance of an OOM caused by skew.
select
    m1.id,
    m1.addtime,
    m2.id,
    m2.updatetime
from (
    select
        id,
        addtime,
        m2_id,
        -- rand() is in [0, 1), so cast(rand() * 50 as int) yields 0..49
        cast(rand() * 50 as int) as rand_num
    from test.test_table_1
) m1
left join (
    -- only columns exposed by the inner subquery may be projected here
    select
        src.id,
        src.updatetime,
        cast(ex.rand_num as int) as rand_num
    from (
        -- this table contains rows whose id is 0
        select
            id,
            updatetime
        from test.test_table_2
    ) src
    -- replicate every row once per bucket 0..49, matching the rand() range above
    lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49', ',')) ex as rand_num
) m2
    -- per the original author, ~90% of m1.m2_id values are 0, which skews the join
    on m1.m2_id = m2.id
    -- the extra bucket key spreads each join key across many partitions
    and m1.rand_num = m2.rand_num
limit 10
;