更多的场景示例请点击:https://blog.csdn.net/SKY_02/article/details/113394634
原逻辑
select
m1.id
,m1.addtime
,m2.id
,m2.updatetime
from
(
select
id
,addtime
,m2_id
from test.test_table_1 t
) m1
left join
(
select id, updatetime
from test.test_table_2 --里面不存在id为0的数据
) m2
on m1.m2_id=m2.id -- m1.m2_id中90%都是0,会产生数据倾斜
limit 10
;
优化后写法1
select
m1.id
,m1.addtime
,m2.id
,m2.updatetime
from
(
select
id
,addtime
,m2_id
-- 对关联列作处理,若产生数据倾斜的值并不需要展示信息,用随机数将其转化为其他关联不上的信息
,case when m2_id > 0 then m2_id else cast(rand() * -1000000000 as int) end as m2_id_skew_for_join -- 用于解决ticketid关联产生的数据倾斜
from test.test_table_1 t
) m1
left join
(
select id, updatetime
from test.test_table_2 --里面不存在id为0的数据
) m2
on m1.m2_id_skew_for_join=m2.id --与新列进行关联
limit 10
优化后写法2
select
m1.id
,m1.addtime
,m2.id
,m2.updatetime
from
(
select
id
,addtime
,m2_id
from test.test_table_1 t
) m1
left join
(
select id, updatetime
from test.test_table_2 --里面不存在id为0的数据
) m2
-- 对关联列作处理,若产生数据倾斜的值并不需要展示信息,用随机数将其转化为其他关联不上的信息
on case when m1.m2_id > 0 then m1.m2_id else cast(rand() * -1000000000 as int) end = m2.id
limit 10