1. Cause
When a single table runs a group by, the keys are unevenly distributed across reduce tasks (data skew).
2. Explanation
- In theory, Spark SQL already performs partial aggregation in the map stage (the equivalent of MapReduce's combine), so a plain group by rarely skews.
- However, if the input is, say, 64 files and one or two of them contain a hot key, the map tasks reading those files are still noticeably slower.
- The executor that ends up with the hot key is also the one prone to OOM; a quick way to confirm the imbalance is shown in the sketch below.
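A minimal diagnostic sketch, assuming skewedDF is a hypothetical DataFrame that has already been shuffled by the group-by key (the name is illustrative, not from the original): spark_partition_id() tags each row with its partition id, so one oversized count exposes the skew.
import org.apache.spark.sql.functions.{spark_partition_id, desc}

// Rows per shuffle partition; a single huge partition confirms the skew.
skewedDF
  .groupBy(spark_partition_id().as("partition_id"))
  .count()
  .orderBy(desc("count"))
  .show()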
3. Diagrams
3.1 Data skew illustration
As the figure above shows (map-side partial aggregation not considered), uuid1 is the skewed key.
3.2 Handling the hot key
4. Solution steps
4.1 Simulating data skew
Create the table product(uuid, money) as follows:
import spark.implicits._ // needed for toDF on a local Seq

Seq(
  ("uuid1", 12),
  ("uuid1", 13),
  ("uuid1", 14),
  ("uuid1", 15),
  ("uuid1", 22),
  ("uuid1", 23),
  ("uuid2", 32)
).toDF("uuid", "money").createTempView("product")
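A quick check that the view really is skewed: uuid1 holds 6 of the 7 rows.
spark.sql("select uuid, count(*) as cnt from product group by uuid").show()
// uuid1 -> 6, uuid2 -> 1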
4.2 SQL for the normal case
val simpleSql =
"""
|select
|uuid,
|sum(money)
|from product
|group by uuid
|""".stripMargin
spark.sql(simpleSql).explain(mode = "simple")
spark.sql(simpleSql).show(false)
// Corresponding physical plan: the HashAggregate with partial_sum below the
// Exchange is the map-side pre-aggregation mentioned in section 2
// == Physical Plan ==
// AdaptiveSparkPlan isFinalPlan=false
// +- HashAggregate(keys=[uuid#7], functions=[sum(money#8)])
// +- Exchange hashpartitioning(uuid#7, 200), ENSURE_REQUIREMENTS, [plan_id=14]
// +- HashAggregate(keys=[uuid#7], functions=[partial_sum(money#8)])
// +- LocalTableScan [uuid#7, money#8]
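For reference, the sums follow from the sample data above (12+13+14+15+22+23 = 99 for uuid1); row order in show() may vary:
// +-----+----------+
// |uuid |sum(money)|
// +-----+----------+
// |uuid1|99        |
// |uuid2|32        |
// +-----+----------+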
4.3 SQL with salting and de-salting (two-stage aggregation)
import scala.util.Random

// Prepend a random salt in [0, num) so one hot key spreads over num buckets.
def randomPrefixUDF(uuid: String, num: Int): String = {
  Random.nextInt(num).toString + "_" + uuid
}

// Strip the salt; limit = 2 keeps uuids that themselves contain "_" intact.
def removeRandomPrefixUDF(randomUuid: String): String = {
  randomUuid.split("_", 2)(1)
}
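A quick sanity check of the pair (the salt is random, so only the round trip is asserted):
val salted = randomPrefixUDF("uuid1", 6) // e.g. "3_uuid1"
assert(removeRandomPrefixUDF(salted) == "uuid1")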
val prefixSql =
"""
|select
| uuid,
| sum(money) totalmoney
|from
| (
| select
| remove_random_prefix(random_uuid) uuid,
| money
| from
| (
| select
| random_uuid,
| sum(money) money
| from
| (
| select
| random_prefix(uuid, 6) random_uuid,
| money
| from
| product
| ) t1
| group by random_uuid
| ) t2
| ) t3
|group by
| uuid
""".stripMargin
spark.udf.register("random_prefix", ( uuid: String, num: Int ) => randomPrefixUDF(uuid, num))
spark.udf.register("remove_random_prefix", ( randomUuid: String ) => removeRandomPrefixUDF(randomUuid))
spark.sql(prefixSql).explain(mode = "simple")
spark.sql(prefixSql).show()
// == Physical Plan ==
// AdaptiveSparkPlan isFinalPlan=false
// +- HashAggregate(keys=[uuid#18], functions=[sum(money#17L)])
// +- Exchange hashpartitioning(uuid#18, 200), ENSURE_REQUIREMENTS, [plan_id=25]
// +- HashAggregate(keys=[uuid#18], functions=[partial_sum(money#17L)])
// +- HashAggregate(keys=[random_uuid#16], functions=[sum(money#8)])
// +- Exchange hashpartitioning(random_uuid#16, 200), ENSURE_REQUIREMENTS, [plan_id=21]
// +- HashAggregate(keys=[random_uuid#16], functions=[partial_sum(money#8)])
// +- LocalTableScan [random_uuid#16, money#8]
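The plan confirms the two-shuffle shape: first by random_uuid, then by the de-salted uuid. For reference, the same pattern can be written with the DataFrame API instead of SQL plus UDFs; a sketch under my own naming (saltedSum, partial_sum, and saltBuckets are not from the original):
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

// Stage 1: salt the key and pre-aggregate per salted key, spreading the hot
// key over saltBuckets shuffle partitions. Stage 2: de-salt and sum again.
def saltedSum(spark: SparkSession, table: String, key: String,
              value: String, saltBuckets: Int): DataFrame = {
  spark.table(table)
    .withColumn("salted_key",
      concat(floor(rand() * saltBuckets).cast("string"), lit("_"), col(key)))
    .groupBy("salted_key").agg(sum(value).as("partial_sum"))
    .withColumn(key, split(col("salted_key"), "_", 2)(1)) // limit = 2, as above
    .groupBy(key).agg(sum("partial_sum").as("total_" + value))
}

saltedSum(spark, "product", "uuid", "money", 6).show()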
4.4 Sampling 10% without replacement to locate the skewed key
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

def sampleTopKey(sparkSession: SparkSession, tableName: String, keyColumn: String): Array[(Int, Row)] = {
  val df: DataFrame = sparkSession.sql(s"select $keyColumn from $tableName")
  df.select(keyColumn).sample(withReplacement = false, fraction = 0.1).rdd // 10% sample, no replacement
    .map(k => (k, 1)).reduceByKey(_ + _)     // count occurrences of each key
    .map(k => (k._2, k._1)).sortByKey(false) // sort by count, descending
    .take(10)                                // top 10 most frequent keys
}
/**
 * Sample 10% without replacement to locate the skewed key.
 * Here the output confirms that uuid1 is the key causing the skew.
 */
sampleTopKey(spark, "product", "uuid").foreach(println)
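The same check works without dropping to the RDD API; a DataFrame-only sketch (note that on a table as tiny as this demo's 7 rows, a 10% sample may return nothing — the fraction is meant for large tables):
spark.table("product")
  .sample(withReplacement = false, fraction = 0.1)
  .groupBy("uuid").count()
  .orderBy(org.apache.spark.sql.functions.desc("count"))
  .show(10, false)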
4.5 SQL after the hot key is identified
Only the identified hot key ('uuid1') is salted, here with a larger factor of 10; the remaining keys take the normal group by path, and the two results are combined with union all:
val prefixSql2 =
"""
|select
| uuid,
| sum(money) totalmoney
|from
| (
| select
| remove_random_prefix(random_uuid) uuid,
| money
| from
| (
| select
| random_uuid,
| sum(money) money
| from
| (
| select
| random_prefix(uuid, 10) random_uuid,
| money
| from
| product
| where uuid in ('uuid1')
| ) t1
| group by random_uuid
| ) t2
| ) t3
|group by
| uuid
|union all
|select
| uuid,
| sum(money) totalmoney
|from
| product
|where uuid not in ('uuid1')
|group by uuid
""".stripMargin
spark.sql(prefixSql2).explain(mode = "simple")
spark.sql(prefixSql2).show()
// == Physical Plan ==
// AdaptiveSparkPlan isFinalPlan=false
// +- Union
// :- HashAggregate(keys=[uuid#21], functions=[sum(money#20L)])
// : +- Exchange hashpartitioning(uuid#21, 200), ENSURE_REQUIREMENTS, [plan_id=53]
// : +- HashAggregate(keys=[uuid#21], functions=[partial_sum(money#20L)])
// : +- HashAggregate(keys=[random_uuid#19], functions=[sum(money#8)])
// : +- Exchange hashpartitioning(random_uuid#19, 200), ENSURE_REQUIREMENTS, [plan_id=49]
// : +- HashAggregate(keys=[random_uuid#19], functions=[partial_sum(money#8)])
// : +- LocalTableScan [random_uuid#19, money#8]
// +- HashAggregate(keys=[uuid#7], functions=[sum(money#8)])
// +- Exchange hashpartitioning(uuid#7, 200), ENSURE_REQUIREMENTS, [plan_id=55]
// +- HashAggregate(keys=[uuid#7], functions=[partial_sum(money#8)])
// +- LocalTableScan [uuid#7, money#8]
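As a closing note, the hardcoded ('uuid1') list can be built from the 4.4 sample instead of being typed by hand; a sketch (the count threshold is a hypothetical cut-off, tune it per table):
val hotKeys = sampleTopKey(spark, "product", "uuid")
  .filter(_._1 > 3)        // hypothetical threshold on the sampled count
  .map(_._2.getString(0))
val inList = hotKeys.map(k => s"'$k'").mkString(", ")
// Splice inList into prefixSql2 in place of ('uuid1'), in both the IN and NOT IN clauses.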