SparkSQL处理数据倾斜
广播实现思路
伪代码
起因是我们在做一个需求时计算量非常大;数据量虽然不大,但存在大量空值,导致了数据倾斜。因此我们采用 Spark SQL 实现该需求,并配合广播(broadcast)优化方案。
object External {

  /** Hive result table: final leader-to-grid mapping, partitioned by dt. */
  val RESU_SAVE_TABLE_NAME = "dwd_led_tab"

  // BUG FIX: the original wrote `val session = session.conf.set(...)`, a
  // self-referential definition that cannot compile and never builds the
  // SparkSession. Build it explicitly with Hive support.
  val session: SparkSession = SparkSession.builder()
    .appName("External")
    .enableHiveSupport()
    .getOrCreate()

  // Keep AQE off so the explicit repartition / broadcast hints below survive planning.
  session.conf.set("spark.sql.adaptive.enabled", false)
  session.conf.set("spark.speculation", true)
  session.conf.set("spark.yarn.am.cores", 3)
  session.conf.set("spark.network.timeout", 300)
  session.conf.set("spark.reducer.maxReqsInFlight", 10)
  session.conf.set("spark.reducer.maxBlocksInFlightPerAddress", 10)
  // BUG FIX: "spark.sql.outoBroadcastJoinThreshold" is a misspelled key that
  // Spark silently ignores; the real key is autoBroadcastJoinThreshold.
  // ~1 GB so the grid table below qualifies for a broadcast join.
  session.conf.set("spark.sql.autoBroadcastJoinThreshold", 1048576000)
  // NOTE(review): the original called `sparkSession(session)` here, a method not
  // defined anywhere in this snippet; removed. Confirm it was not a project helper.

  /**
   * For every leader poi in partition `partition_name`, keeps the single grid
   * it falls inside (is_within = 1, rank 1 ordered by grid_id) and rewrites
   * that dt partition of RESU_SAVE_TABLE_NAME.
   *
   * @param partition_name dt partition value, expected format yyyy-MM-dd
   */
  def mm_leader(partition_name: String): Unit = {
    println("partition_name: " + partition_name)

    // Leader points with valid, non-zero coordinates.
    // BUG FIX 1: `${partition-name}` parsed as `partition - name` (two undefined
    // identifiers) — it must interpolate the `partition_name` parameter.
    // BUG FIX 2: `repartition(numpartitions = ...)` — the named parameter is
    // `numPartitions` (wrong case does not compile).
    val mm_leaderDf = session.sql(
      s"""
         |select
         | default.md5(concat('mm',poi_id)) as leader_id
         | ,city_id
         | ,city_name
         | ,poi_id
         | ,lng
         | ,lat
         |from external_open.dwd_goods_mm_df
         |where dt = '${partition_name}'
         |and lng is not null
         |and lat is not null
         |and lng != 0.0
         |and lat != 0.0
         |""".stripMargin
    ).repartition(numPartitions = 1000)

    // Grid polygons — small enough to cache and broadcast.
    // BUG FIX: output pattern was 'yyyyMmddHH'; lowercase 'm' means minutes, so
    // the computed dt literal could never match the table's yyyyMMddHH partitions.
    val mm_poiDf = session.sql(
      s"""
         |select
         | grid_id
         | ,big_grid_id
         | ,city_id
         | ,fence as contour
         |from es2hive_group_leader_grid
         |where dt = from_unixtime(unix_timestamp(concat_ws('-','${partition_name}',23),'yyyy-MM-dd-HH'),'yyyyMMddHH')
         |and status = 1
         |and fence is not null
         |and fence != ''
         |""".stripMargin
    ).cache()

    // BUG FIX: `broadcase` -> functions.broadcast. The broadcast hint ships the
    // small grid side to every executor, avoiding a skewed shuffle on city_id.
    val leader_mm_grids = mm_leaderDf.join(functions.broadcast(mm_poiDf), usingColumn = "city_id")

    // NOTE(review): neither input query selects an `is_within` column — presumably a
    // point-in-polygon UDF produces it upstream of this snippet; confirm its origin.
    // createOrReplaceTempView so a retry in the same session does not throw
    // "view already exists" (createTempView did).
    leader_mm_grids.where(col(colName = "is_within").equalTo(other = 1)).createOrReplaceTempView("refu_mm_tab")
    println("leader_mm_grids ====+++++++++++ TempView")

    // Keep exactly one grid per leader: rank 1 by grid_id among is_within = 1 rows.
    val external_leader = session.sql(
      s"""
         |select
         | leader_id
         | ,city_id
         | ,city_name
         | ,poi_id
         | ,lng
         | ,lat
         | ,grid_id
         | ,big_grid_id
         | ,contour
         | ,is_within
         |from(
         | select
         |  leader_id
         |  ,city_id
         |  ,city_name
         |  ,poi_id
         |  ,lng
         |  ,lat
         |  ,grid_id
         |  ,big_grid_id
         |  ,contour
         |  ,is_within
         |  ,row_number() over(partition by leader_id order by grid_id asc) rank
         | from refu_mm_tab
         | where is_within = 1
         |)t1
         |where rank = 1
         |""".stripMargin)

    // Idempotent write: drop the target dt partition first, then append.
    session.sql(
      s"""
         |ALTER TABLE ${RESU_SAVE_TABLE_NAME} DROP IF EXISTS PARTITION(dt = '${partition_name}')
         |""".stripMargin)
    session.catalog.refreshTable(RESU_SAVE_TABLE_NAME)

    // insertInto resolves columns by position, so dt is appended last to land in
    // the partition column; 30 output files per run.
    external_leader.withColumn(colName = "dt", functions.lit(partition_name))
      .repartition(numPartitions = 30)
      .write.mode(SaveMode.Append)
      .format(source = "hive")
      .insertInto(RESU_SAVE_TABLE_NAME)
  }

  /**
   * Entry point: derives the target dt partition from program args and runs the job.
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): get_partition is not defined in this snippet — presumably a
    // project helper that parses args(0); confirm before building.
    val partition_name = get_partition(args)
    mm_leader(partition_name)
  }
}
代码写完后,直接打包上传到集群运行验证,实测效率比之前提高了约百分之九十。
亲测有效!
亲测有效!
亲测有效!