背景:核心任务执行优化前90min,优化后25min
案例一:
-- Original (pre-optimization) job, avg runtime 90+ min: builds a wide
-- order/sku fact table for stat_day 2020-02-19 by joining the order-sku star
-- table against barcode, logistics, shop/user, consumer, coupon, org and
-- daily-cost dimensions. The skew and hint problems are analyzed below.
create table tmp.tmp_test_d as
select
a.sku
,a.barcode
,a.tu_code
,a.buyer_tel
,a.pay_type
,a.deliver_company
,a.need_sync
,a.import_status
,a.source
,a.source_type
-- prefer the daily cost snapshot (tax_price * quantity); fall back to the
-- order row's own cost_price when no snapshot matched
,coalesce(cst.tax_price*a.buy_count,a.cost_price) cost_price
-- driving set "a": order-sku rows enriched with barcode dimension columns.
-- NOTE(review): the mapjoin hint here did not take effect — the actual plan
-- ran a sort-merge join (see analysis below).
from(select /*+ mapjoin(dpb)*/ t0.*,dpb.cid1,dpb.cid2,dpb.cid3,dpb.cname1,dpb.cname2,dpb.cname3,dpb.package_id,dpb.package_type,dpb.item_name item_name2
,dpb.item_oms_cid1,dpb.item_oms_cname1,dpb.item_oms_cid2,dpb.item_oms_cname2,dpb.item_oms_cid3,dpb.item_oms_cname3,dpb.item_oms_cid4,dpb.item_oms_cname4
-- replace missing/non-positive coupon ids with a random negative key,
-- presumably so those rows spread across tasks instead of piling on one key
,case when t0.fullcoupon_id<=0 or t0.fullcoupon_id is null then cast(ceil(rand()*-1000) as bigint) else t0.fullcoupon_id end fullcoupon_id2
from dw.dw_trd_order_sku_star_d t0
left join (select * from dw.dw_prd_barcode_d where stat_day = '20200219') dpb on t0.barcode=dpb.barcode
where t0.stat_day = '20200219'
) a
-- barcode dimension joined a second time, keyed by sku this time
left join (select * from dw.dw_prd_barcode_d where stat_day = '20200219') dpb_sku
on a.sku=dpb_sku.barcode
-- logistics: keep the latest record per (order detail, sku) via rn = 1, and
-- aggregate delivered/received quantities and money over all records
left join(select yj_order_detail_id,sku_no,
max(case when rn =1 then delivery_time end) delivery_time,
max(case when rn =1 then delivery_status end) delivery_status,
max(case when rn =1 then receive_time end) receive_time,
max(case when rn =1 then supplier_code end) supplier_code,
max(case when rn =1 then supplier_name end) supplier_name,
max(case when rn =1 then src_type end) src_type,
sum(case when receive_time is not null then delivery_num end) receive_num,
sum(case when delivery_time is not null then delivery_num end) delivery_num,
sum(case when delivery_time is not null then delivery_money end) delivery_money
from(
select yj_order_detail_id,sku_no,
system_time delivery_time,
status as delivery_status,
qty as delivery_num,
total_price delivery_money,
sign_time receive_time,
supplier_code,supplier_name,src_type,
-- latest-first ordering; rn = 1 marks the most recent logistics record
row_number() over(partition by yj_order_detail_id,sku_no order by system_time desc,order_id desc) rn
from dw.dw_lgs_logistics_control_sku_d where stat_day = '20200219'
)lgc0
group by yj_order_detail_id,sku_no
) lgc on a.sub_order_id = lgc.yj_order_detail_id and a.sku = lgc.sku_no
-- shop -> user mapping; a.shop_id is skewed (hot shop ids), see analysis below
left join (select shop_id,user_id,user_tel,consumer_id from dw.dw_usr_user_shop_d where stat_day = '20200219') usr on a.shop_id = usr.shop_id
-- consumer dimension; a.consumer_id is skewed for the same reason
left join (select consumer_id,user_id from dw.dw_usr_consumer_d where stat_day = '20200219') cor on a.consumer_id = cor.consumer_id
-- coupon dimension, deduplicated to one row per fullcoupon_id
left join (select fullcoupon_id,max(pop_shop_id) pop_shop_id,max(use_scope_desc) use_scope_desc from ods.ods_cpn_fullcoupon_d where stat_day='20200219' group by fullcoupon_id
) fc on a.fullcoupon_id2 = fc.fullcoupon_id
-- second-level department name via the sku's department id
left join(select department_id,max(department_name_secd) department_name_secd from dw.dw_usr_department_org_d where stat_day = '20200219' group by department_id) org on dpb_sku.dept_id = org.department_id
-- daily barcode cost snapshot, matched on sku and the order's pay date
left join(select barcode,date_format(cost_date,'yyyy-MM-dd') cost_date,max(tax_price) tax_price from dw.dw_prd_day_barcode_cost_d where stat_day='20200219' group by barcode,date_format(cost_date,'yyyy-MM-dd')) cst on a.sku = cst.barcode and date_format(a.pay_time,'yyyy-MM-dd') = cst.cost_date
;
原始sql如上,省略了部分字段,运行时长平均90min以上。
观察stage的运行时长。
发现有几个stage比较慢,进去看下是否倾斜了
DAG:
从DAG图能看到是跑到了如下一段join的语句,可以看到有两个task特别大,估计是数据倾斜了,通过sql分析是barcode这个字段会有倾斜,可以去数据库验证下barcode数量分布。
-- Fragment of the slow stage identified from the DAG: the mapjoin hint was
-- requested, but the actual plan ran a sort-merge join on barcode, and two
-- oversized tasks pointed to data skew on the barcode column.
select /*+ mapjoin(dpb)*/ t0.*,dpb.cid1,dpb.cid2,dpb.cid3,dpb.cname1,dpb.cname2,dpb.cname3,dpb.package_id,dpb.package_type,dpb.item_name item_name2
,dpb.item_oms_cid1,dpb.item_oms_cname1,dpb.item_oms_cid2,dpb.item_oms_cname2,dpb.item_oms_cid3,dpb.item_oms_cname3,dpb.item_oms_cid4,dpb.item_oms_cname4
,case when t0.fullcoupon_id<=0 or t0.fullcoupon_id is null then cast(ceil(rand()*-1000) as bigint) else t0.fullcoupon_id end fullcoupon_id2
from dw.dw_trd_order_sku_star_d t0
left join (select * from dw.dw_prd_barcode_d where stat_day = '20200219') dpb on t0.barcode=dpb.barcode
where t0.stat_day = '20200219'
原始sql用了mapjoin,sql作者本意应该是想走mapjoin的,但实际通过DAG或者执行计划可以看到走的是sort merge join。mapjoin在spark中对应的是broadcastHashJoin;看下这张维表的数据量大概在400M左右,可以广播,这是一个可以优化的点。
另外那个24min的stage进去也发现是数据倾斜,分析原因是在如下这段sql:
left join (select shop_id,user_id,user_tel,consumer_id from dw.dw_usr_user_shop_d where stat_day = '20200219') usr on a.shop_id = usr.shop_id
也就是主表的店铺id这个字段有倾斜,查询发现少数shop_id的数据比较密集,这是其中一个优化的点。
后面这段代码
left join (select consumer_id,user_id from dw.dw_usr_consumer_d where stat_day = '20200219') cor on a.consumer_id = cor.consumer_id
跑的慢的原因同上。
根据上面的一些分析进行优化:
-- Cache the barcode dimension (~400M) once, so both joins against it read
-- from memory instead of re-scanning HDFS.
CACHE TABLE dw_prd_barcode_d_cache AS SELECT * from dw.dw_prd_barcode_d where stat_day = '20200220';
-- required for the CROSS JOIN against the random-number helper table below
set spark.sql.crossJoin.enabled=true;
-- shop_id skew handling: explode each hot shop_id into N salted copies
-- ('Null' || shop_id || num, one per row of dim.dim_tb_random_d) so the hot
-- keys spread over many tasks; non-hot shop_ids pass through unchanged.
-- The fact side must salt its shop_id with the same scheme to match.
create temporary view tmp_user_shop_d as
select
concat('Null',a.shop_id,b.num) as shop_id,
user_id,
user_tel,
consumer_id
from dw.dw_usr_user_shop_d a
cross join dim.dim_tb_random_d b
where a.stat_day = '20200220'
and a.shop_id in (...)
union all
select
a.shop_id,
user_id,
user_tel,
consumer_id
from dw.dw_usr_user_shop_d a
where a.stat_day = '20200220'
and a.shop_id not in (...);
-- Optimized job (total runtime ~25 min): same business logic as the original,
-- with (1) the barcode dimension cached and the mapjoin hints grouped so they
-- run in one stage, (2) salted join keys (sku_new/shop_id_new/consumer_id_new)
-- for the skewed columns, and (3) higher shuffle parallelism (set separately).
drop table if exists tmp.dw_trd_order_sku_all_d_20200220_01;
create table tmp.dw_trd_order_sku_all_d_20200220_01 as
-- small dimensions broadcast together in a single stage
select /*+ mapjoin(dpb_sku,fc,org)*/
a.order_id
,a.sub_order_id
,a.child_order_id
,a.sku
,a.barcode
,dpb_sku.oms_cid3
,dpb_sku.oms_cname3
...
-- driving set "a": as before, but now also producing salted variants of the
-- skewed keys; hot values get a random suffix so they spread across tasks
from(select /*+ mapjoin(dpb)*/ t0.*,dpb.cid1,dpb.cid2,dpb.cid3,dpb.cname1,dpb.cname2,dpb.cname3,dpb.package_id,dpb.package_type,dpb.item_name item_name2
,dpb.item_oms_cid1,dpb.item_oms_cname1,dpb.item_oms_cid2,dpb.item_oms_cname2,dpb.item_oms_cid3,dpb.item_oms_cname3,dpb.item_oms_cid4,dpb.item_oms_cname4
,case when t0.fullcoupon_id<=0 or t0.fullcoupon_id is null then cast(ceil(rand()*-1000) as bigint) else t0.fullcoupon_id end fullcoupon_id2
,case when t0.sku in (...'') then concat('Null',ceiling(rand()*1000)) else t0.sku end as sku_new
,case when t0.shop_id in (...'') then concat('Null',t0.shop_id,ceiling(rand()*100)) else t0.shop_id end as shop_id_new
,case when t0.consumer_id in (0) then concat('Null',t0.consumer_id,ceiling(rand()*100)) else t0.consumer_id end as consumer_id_new
from dw.dw_trd_order_sku_star_d t0
-- joins the cached dimension instead of re-reading dw.dw_prd_barcode_d
left join dw_prd_barcode_d_cache dpb on t0.barcode=dpb.barcode
where t0.stat_day = '20200220'
) a
left join dw_prd_barcode_d_cache dpb_sku
on a.sku=dpb_sku.barcode
-- coupon dimension, deduplicated to one row per fullcoupon_id
left join (select fullcoupon_id,max(pop_shop_id) pop_shop_id,max(use_scope_desc) use_scope_desc from ods.ods_cpn_fullcoupon_d where stat_day='20200220' group by fullcoupon_id
) fc on a.fullcoupon_id2 = fc.fullcoupon_id
-- second-level department name via the sku's department id
left join(select department_id,max(department_name_secd) department_name_secd from dw.dw_usr_department_org_d where stat_day = '20200220' group by department_id) org on dpb_sku.dept_id = org.department_id
-- logistics: latest record per (order detail, sku) plus aggregated quantities
left join(select yj_order_detail_id,sku_no,
max(case when rn =1 then delivery_time end) delivery_time,
max(case when rn =1 then delivery_status end) delivery_status,
max(case when rn =1 then receive_time end) receive_time,
max(case when rn =1 then supplier_code end) supplier_code,
max(case when rn =1 then supplier_name end) supplier_name,
max(case when rn =1 then src_type end) src_type,
sum(case when receive_time is not null then delivery_num end) receive_num,
sum(case when delivery_time is not null then delivery_num end) delivery_num,
sum(case when delivery_time is not null then delivery_money end) delivery_money
from(
select yj_order_detail_id,sku_no,
system_time delivery_time,
status as delivery_status,
qty as delivery_num,
total_price delivery_money,
sign_time receive_time,
supplier_code,supplier_name,src_type,
row_number() over(partition by yj_order_detail_id,sku_no order by system_time desc,order_id desc) rn
from dw.dw_lgs_logistics_control_sku_d where stat_day = '20200220'
)lgc0
group by yj_order_detail_id,sku_no
) lgc on a.sub_order_id = lgc.yj_order_detail_id and a.sku = lgc.sku_no
-- salted shop join against the exploded view built above
left join tmp_user_shop_d usr on a.shop_id_new = usr.shop_id
-- salted consumer join (consumer_id = 0 was the hot key)
left join (select consumer_id,user_id from dw.dw_usr_consumer_d where stat_day = '20200220') cor on a.consumer_id_new = cor.consumer_id
-- daily cost joined on the salted sku so hot skus no longer pile up
left join(select barcode,date_format(cost_date,'yyyy-MM-dd') cost_date,max(tax_price) tax_price from dw.dw_prd_day_barcode_cost_d where stat_day='20200220' group by barcode,date_format(cost_date,'yyyy-MM-dd')) cst on a.sku_new = cst.barcode and date_format(a.pay_time,'yyyy-MM-dd') = cst.cost_date
;
-- release the cached dimension once the job is done
uncache table dw_prd_barcode_d_cache;
如上改完后发现速度还是很慢,主要是在left join这块。通过spark ui的任务执行可以发现数据其实已经没有倾斜了,但是每个task要处理的数据量很大,造成大量shuffle落地到disk,猜测io花费了很多时间,这个只要调大并发就可以了。
set spark.sql.shuffle.partitions=3000;
最终处理完,总的任务时长在25分钟左右。
总结:
1、不管是spark还是hive,不要相信自己的代码,一定要去看执行计划和spark的执行ui界面。PS:踩过的坑——一开始还切换到hive计算,发现mapjoin不起效果,直接用hint方式没用,还需要设置最小的mapjoin数据大小参数。同时hive中不同的是,如果mapjoin分发时内存溢出,会有备用执行stage来代替;而spark走broadcastHashJoin一旦失败(内存溢出)就直接退出了。
2、spark中分析sql慢的原因可以从以下几点出发:1、观察哪个stage慢2、对比执行计划分析到慢的是哪段代码。3、慢的原因是什么4、如果倾斜那么可以采用mapjoin,过滤,随机数,union等方式主要看业务。5、如果数据不倾斜可能是task的处理量太大导致shuffle write很大,这样就要扩大task数。
3、spark中多个mapjoin最好放在一起执行,这是执行顺序的问题。我上面调整了几个left join的位置;执行计划是从上到下生成的,相邻的多个mapjoin会放在同一个stage运行。
4、spark中可以使用temp view的方式,节省写入hdfs和读取hdfs的过程;数据量一大,仅读取就至少要5min以上,比较耗时,这个时候就不建议分步落临时表,会有很大的损耗。
5、倾斜的处理方式网上很多,可以查查。我这边用的是打随机数把热点key扩大100倍的方式,明显快很多。
6、spark一些参数调优
executor数量相关
spark.dynamicAllocation.enabled true
spark.dynamicAllocation.minExecutors 0
spark.dynamicAllocation.maxExecutors 40
driver资源相关
spark.driver.cores 2
spark.driver.memory 6g
spark.driver.memoryOverhead 4g
sparksql相关
spark.sql.adaptive.enabled true
spark.sql.adaptive.minNumPostShufflePartitions 5
spark.sql.adaptive.maxNumPostShufflePartitions 2000
spark.sql.adaptive.shuffle.targetPostShuffleInputSize 134217728
spark.sql.shuffle.partitions 500
下面这个文章有详细的参数配置倾斜处理
https://blog.csdn.net/bingdianone/article/details/83824309