Handling Data Skew in SparkSQL with a Broadcast Join

SparkSQL and data skew

The broadcast implementation idea
Pseudocode

Background: while implementing a requirement, the computation was very heavy even though the data volume was modest, and the join key contained many empty values; those rows all landed on a few tasks and skewed the job. We implemented the requirement in Spark SQL and applied a broadcast-join optimization.
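Before the full job, a minimal sketch of the idea (the table and column names here are made up for illustration, not taken from the job below): broadcasting the small side of the join ships it to every executor, so the join runs map-side and no single reduce task receives the skewed keys, while filtering null join keys up front removes the skew at its source.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.broadcast

val spark = SparkSession.builder().appName("BroadcastSketch").enableHiveSupport().getOrCreate()

// Hypothetical tables: `orders` is large and skewed on city_id (many nulls),
// `cities` comfortably fits in executor memory.
val orders = spark.table("orders").where("city_id is not null")
val cities = spark.table("cities")

// broadcast() hints Spark to ship `cities` to every executor; the join then
// happens map-side, with no shuffle of the big table's skewed keys.
val joined = orders.join(broadcast(cities), "city_id")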

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions.{broadcast, col, lit}

object External {
  // target Hive table for the results
  val RESU_SAVE_TABLE_NAME = "dwd_led_tab"

  // core/YARN configs must be in place before the SparkContext starts,
  // so they go through the builder rather than session.conf.set
  val session: SparkSession = SparkSession.builder()
    .appName("External")
    .config("spark.speculation", "true")
    .config("spark.yarn.am.cores", "3")
    .config("spark.network.timeout", "300")
    .config("spark.reducer.maxReqsInFlight", "10")
    .config("spark.reducer.maxBlocksInFlightPerAddress", "10")
    .enableHiveSupport()
    .getOrCreate()

  // disable adaptive execution so the manually tuned plan below stays in effect
  session.conf.set("spark.sql.adaptive.enabled", "false")
  // raise the auto-broadcast threshold to ~1 GB so the grid table qualifies
  session.conf.set("spark.sql.autoBroadcastJoinThreshold", "1048576000")
 
  // Build the leader point set for the given dt partition, dropping rows
  // with missing or zero coordinates (the source of the skewed keys)
  def mm_leader(partition_name: String): Unit = {
    println("partition_name: " + partition_name)
    val mm_leaderDf = session.sql(
      s"""
         |select
         |     default.md5(concat('mm', poi_id)) as leader_id
         |    ,city_id
         |    ,city_name
         |    ,poi_id
         |    ,lng
         |    ,lat
         |from external_open.dwd_goods_mm_df
         |where dt = '${partition_name}'
         |and lng is not null
         |and lat is not null
         |and lng != 0.0
         |and lat != 0.0
         |""".stripMargin
    ).repartition(numPartitions = 1000)
	
    // Grid/fence dimension table: small enough to broadcast, cached because
    // it is reused downstream
    val mm_poiDf = session.sql(
      s"""
         |select
         |     grid_id
         |    ,big_grid_id
         |    ,city_id
         |    ,fence as contour
         |from es2hive_group_leader_grid
         |where dt = from_unixtime(unix_timestamp(concat_ws('-', '${partition_name}', '23'), 'yyyy-MM-dd-HH'), 'yyyyMMddHH')
         |and status = 1
         |and fence is not null
         |and fence != ''
         |""".stripMargin
    ).cache()

    // Broadcast the small grid table: the join runs map-side, so the skewed
    // city_id keys never funnel through a single shuffle partition
    val leader_mm_grids = mm_leaderDf.join(broadcast(mm_poiDf), "city_id")

    // NOTE: the post never shows how `is_within` is derived; presumably a
    // point-in-polygon check of (lng, lat) against `contour`, omitted here
    leader_mm_grids.where(col("is_within") === 1).createTempView("refu_mm_tab")
    println("leader_mm_grids registered as temp view refu_mm_tab")

    // Deduplicate: keep exactly one grid per leader (the smallest grid_id)
    val external_leader = session.sql(
      s"""
         |select
         |     leader_id
         |    ,city_id
         |    ,city_name
         |    ,poi_id
         |    ,lng
         |    ,lat
         |    ,grid_id
         |    ,big_grid_id
         |    ,contour
         |    ,is_within
         |from (
         |    select
         |         leader_id
         |        ,city_id
         |        ,city_name
         |        ,poi_id
         |        ,lng
         |        ,lat
         |        ,grid_id
         |        ,big_grid_id
         |        ,contour
         |        ,is_within
         |        ,row_number() over (partition by leader_id order by grid_id asc) as rank
         |    from refu_mm_tab
         |    where is_within = 1
         |) t1
         |where rank = 1
         |""".stripMargin)
		
    // Drop the target partition first so reruns are idempotent
    session.sql(
      s"""
         |ALTER TABLE ${RESU_SAVE_TABLE_NAME} DROP IF EXISTS PARTITION (dt = '${partition_name}')
         |""".stripMargin)

    session.catalog.refreshTable(RESU_SAVE_TABLE_NAME)

    // insertInto matches columns by position, so the partition column `dt`
    // goes last; 30 output partitions keeps the file count reasonable
    external_leader.withColumn("dt", lit(partition_name))
      .repartition(numPartitions = 30)
      .write.mode(SaveMode.Append)
      .insertInto(RESU_SAVE_TABLE_NAME)
}
  def main(args: Array[String]): Unit = {
    // get_partition is a project helper the post does not show; it extracts
    // the dt partition value from the command-line arguments
    val partition_name = get_partition(args)
    mm_leader(partition_name)
  }
}
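The listing calls get_partition without showing it. A plausible minimal stand-in (my assumption, not the author's code) reads the dt value from the first command-line argument and falls back to yesterday's date:

import java.time.LocalDate
import java.time.format.DateTimeFormatter

// Hypothetical helper: dt from args(0) if present, otherwise yesterday
// in yyyy-MM-dd form, matching the dt filters used above
def get_partition(args: Array[String]): String =
  if (args.nonEmpty) args(0)
  else LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))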

After packaging the code and submitting it to the cluster, the job ran roughly 90% faster than the previous implementation. Tested and verified!
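One way to confirm the broadcast actually took effect (my suggestion, not part of the original post) is to print the physical plan right after the join inside mm_leader and check that a BroadcastHashJoin node, rather than a SortMergeJoin, appears:

// Inside mm_leader, right after the join:
leader_mm_grids.explain()  // expect BroadcastHashJoin in the physical plan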
