1. Cause
When a single table runs a group by, the keys are unevenly distributed across reduce tasks (data skew).
2. Explanation
- In theory, Spark SQL already performs partial aggregation in the map stage (the equivalent of MapReduce's combine), so a plain group by rarely skews.
- However, if the input is, say, 64 files and one or two of them contain a hot key, the map tasks reading those files are still noticeably slower.
- The executor that ends up with the hot key is also the one prone to OOM; a quick way to confirm the imbalance is shown in the sketch below.
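A minimal diagnostic sketch, assuming skewedDF is a hypothetical DataFrame that has already been shuffled by the group-by key (the name is illustrative, not from the original): spark_partition_id() tags each row with its partition id, so one oversized count exposes the skew.
import org.apache.spark.sql.functions.{spark_partition_id, desc}

// Rows per shuffle partition; a single huge partition confirms the skew.
skewedDF
  .groupBy(spark_partition_id().as("partition_id"))
  .count()
  .orderBy(desc("count"))
  .show()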
3. Diagrams
3.1 Data skew illustration
As the figure above shows (map-side partial aggregation not considered), uuid1 is the skewed key.
3.2 Handling the hot key
4. Solution steps
4.1 Simulating data skew
Create the table product(uuid, money) as follows:
import spark.implicits._ // needed for toDF on a local Seq

Seq(
  ("uuid1", 12),
  ("uuid1", 13),
  ("uuid1", 14),
  ("uuid1", 15),
  ("uuid1", 22),
  ("uuid1", 23),
  ("uuid2", 32)
).toDF("uuid", "money").createTempView("product")
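A quick check that the view really is skewed: uuid1 holds 6 of the 7 rows.
spark.sql("select uuid, count(*) as cnt from product group by uuid").show()
// uuid1 -> 6, uuid2 -> 1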
4.2 SQL for the normal case
val simpleSql =
"""
|select
|uuid,
|sum(money)
|from product
|group by uuid
|""".stripMargin
spark.sql(simpleSql).explain(mode = "simple")
spark.sql(simpleSql).show(false)
// Corresponding physical plan: the HashAggregate with partial_sum below the
// Exchange is the map-side pre-aggregation mentioned in section 2
// == Physical Plan ==
// AdaptiveSparkPlan isFinalPlan=false
// +- HashAggregate(keys=[uuid#7], functions=[sum(money#8)])
// +- Exchange hashpartitioning(uuid#7, 200), ENSURE_REQUIREMENTS, [plan_id=14]
// +- HashAggregate(keys=[uuid#7], functions=[partial_sum(money#8)])
// +- LocalTableScan [uuid#7, money#8]
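For reference, the sums follow from the sample data above (12+13+14+15+22+23 = 99 for uuid1); row order in show() may vary:
// +-----+----------+
// |uuid |sum(money)|
// +-----+----------+
// |uuid1|99        |
// |uuid2|32        |
// +-----+----------+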
4.3 SQL with salting and de-salting (two-stage aggregation)
import scala.util.Random

// Prepend a random salt in [0, num) so one hot key spreads over num buckets.
def randomPrefixUDF(uuid: String, num: Int): String = {
  Random.nextInt(num).toString + "_" + uuid
}

// Strip the salt; limit = 2 keeps uuids that themselves contain "_" intact.
def removeRandomPrefixUDF(randomUuid: String): String = {
  randomUuid.split("_", 2)(1)
}
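A quick sanity check of the pair (the salt is random, so only the round trip is asserted):
val salted = randomPrefixUDF("uuid1", 6) // e.g. "3_uuid1"
assert(removeRandomPrefixUDF(salted) == "uuid1")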
val prefixSql =
"""
|select
| uuid,
| sum(money) totalmoney
|from
| (
| select
| remove_random_prefix(random_uuid) uuid,
| money
| from
| (
| select
| random_uuid,
| sum(money) money
| from
| (
| select
| random_prefix(uuid, 6) random_uuid,
| money
| from
| product
| ) t1
| group by random_uuid
| ) t2
| ) t3
|group by
| uuid
""".stripMargin
spark.udf.register("random_prefix", ( uuid: String, num: Int ) => randomPrefixUDF(uuid, num))
spark.udf.register("remove_random_prefix", ( randomUuid: String ) => removeRandomPrefixUDF(randomUuid))
spark.sql(prefixSql).explain(mode = "simple")
spark.sql(prefixSql).show()
// == Physical Plan ==
// AdaptiveSparkPlan isFinalPlan=false
// +- HashAggregate(keys=[uuid#18], functions=[sum(money#17L)])
// +- Exchange hashpartitioning(uuid#18, 200), ENSURE_REQUIREMENTS, [plan_id=25]
// +- HashAggregate(keys=[uuid#18], functions=[partial_sum(money#17L)])
// +- HashAggregate(keys=[random_uuid#16], functions=[sum(money#8)])
// +- Exchange hashpartitioning(random_uuid#16, 200), ENSURE_REQUIREMENTS, [plan_id=21]
// +- HashAggregate(keys=[random_uuid#16], functions=[partial_sum(money#8)])
// +- LocalTableScan [random_uuid#16, money#8]
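The plan confirms the two-shuffle shape: first by random_uuid, then by the de-salted uuid. For reference, the same pattern can be written with the DataFrame API instead of SQL plus UDFs; a sketch under my own naming (saltedSum, partial_sum, and saltBuckets are not from the original):
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

// Stage 1: salt the key and pre-aggregate per salted key, spreading the hot
// key over saltBuckets shuffle partitions. Stage 2: de-salt and sum again.
def saltedSum(spark: SparkSession, table: String, key: String,
              value: String, saltBuckets: Int): DataFrame = {
  spark.table(table)
    .withColumn("salted_key",
      concat(floor(rand() * saltBuckets).cast("string"), lit("_"), col(key)))
    .groupBy("salted_key").agg(sum(value).as("partial_sum"))
    .withColumn(key, split(col("salted_key"), "_", 2)(1)) // limit = 2, as above
    .groupBy(key).agg(sum("partial_sum").as("total_" + value))
}

saltedSum(spark, "product", "uuid", "money", 6).show()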
4.4 Sampling 10% without replacement to locate the skewed key
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

def sampleTopKey(sparkSession: SparkSession, tableName: String, keyColumn: String): Array[(Int, Row)] = {
  val df: DataFrame = sparkSession.sql(s"select $keyColumn from $tableName")
  df.select(keyColumn).sample(withReplacement = false, fraction = 0.1).rdd // 10% sample, no replacement
    .map(k => (k, 1)).reduceByKey(_ + _)     // count occurrences of each key
    .map(k => (k._2, k._1)).sortByKey(false) // sort by count, descending
    .take(10)                                // top 10 most frequent keys
}
/**
 * Sample 10% without replacement to locate the skewed key.
 * Here the output confirms that uuid1 is the key causing the skew.
 */
sampleTopKey(spark, "product", "uuid").foreach(println)
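The same check works without dropping to the RDD API; a DataFrame-only sketch (note that on a table as tiny as this demo's 7 rows, a 10% sample may return nothing — the fraction is meant for large tables):
spark.table("product")
  .sample(withReplacement = false, fraction = 0.1)
  .groupBy("uuid").count()
  .orderBy(org.apache.spark.sql.functions.desc("count"))
  .show(10, false)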
4.5 SQL after the hot key is identified
Only the identified hot key ('uuid1') is salted, here with a larger factor of 10; the remaining keys take the normal group by path, and the two results are combined with union all:
val prefixSql2 =
"""
|select
| uuid,
| sum(money) totalmoney
|from
| (
| select
| remove_random_prefix(random_uuid) uuid,
| money
| from
| (
| select
| random_uuid,
| sum(money) money
| from
| (
| select
| random_prefix(uuid, 10) random_uuid,
| money
| from
| product
| where uuid in ('uuid1')
| ) t1
| group by random_uuid
| ) t2
| ) t3
|group by
| uuid
|union all
|select
| uuid,
| sum(money) totalmoney
|from
| product
|where uuid not in ('uuid1')
|group by uuid
""".stripMargin
spark.sql(prefixSql2).explain(mode = "simple")
spark.sql(prefixSql2).show()
// == Physical Plan ==
// AdaptiveSparkPlan isFinalPlan=false
// +- Union
// :- HashAggregate(keys=[uuid#21], functions=[sum(money#20L)])
// : +- Exchange hashpartitioning(uuid#21, 200), ENSURE_REQUIREMENTS, [plan_id=53]
// : +- HashAggregate(keys=[uuid#21], functions=[partial_sum(money#20L)])
// : +- HashAggregate(keys=[random_uuid#19], functions=[sum(money#8)])
// : +- Exchange hashpartitioning(random_uuid#19, 200), ENSURE_REQUIREMENTS, [plan_id=49]
// : +- HashAggregate(keys=[random_uuid#19], functions=[partial_sum(money#8)])
// : +- LocalTableScan [random_uuid#19, money#8]
// +- HashAggregate(keys=[uuid#7], functions=[sum(money#8)])
// +- Exchange hashpartitioning(uuid#7, 200), ENSURE_REQUIREMENTS, [plan_id=55]
// +- HashAggregate(keys=[uuid#7], functions=[partial_sum(money#8)])
// +- LocalTableScan [uuid#7, money#8]
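As a closing note, the hardcoded ('uuid1') list can be built from the 4.4 sample instead of being typed by hand; a sketch (the count threshold is a hypothetical cut-off, tune it per table):
val hotKeys = sampleTopKey(spark, "product", "uuid")
  .filter(_._1 > 3)        // hypothetical threshold on the sampled count
  .map(_._2.getString(0))
val inList = hotKeys.map(k => s"'$k'").mkString(", ")
// Splice inList into prefixSql2 in place of ('uuid1'), in both the IN and NOT IN clauses.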