Spark version: 2.4.0
Code location: org.apache.spark.rdd.PairRDDFunctions
reduceByKey(func: (V, V) => V): RDD[(K, V)]
reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
Both overloads ultimately delegate to combineByKeyWithClassTag.
Example:
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object ReduceByKeyDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("ReduceByKeyDemo")
      .config("spark.master", "local")
      .config("spark.driver.host", "localhost")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("ERROR")

    // demo 1: sum the values for each key
    val source: RDD[(Int, Int)] = sc.parallelize(Seq((1, 1), (1, 2), (2, 2), (2, 3)))
    val res: RDD[(Int, Int)] = source.reduceByKey(_ + _)
    res.foreach(println)
    println("------------")
    // The same result via groupByKey + map, but without map-side combining
    val groupByKeyRDD: RDD[(Int, Iterable[Int])] = source.groupByKey()
    groupByKeyRDD.map(tup => (tup._1, tup._2.sum)).foreach(println)

    // demo 2: per key, accumulate a squared difference and a record count
    val y: RDD[(String, Int, Int, Int, Int)] = sc.parallelize(List(
      ("key1", 1, 0, 2, 0),
      ("key1", 1, 0, 2, 0),
      ("key2", 1, 0, 2, 0),
      ("key3", 1, 0, 3, 0),
      ("key2", 1, 0, 3, 0)
    ))
    val byKey: RDD[(String, (Int, Int))] = y.map {
      case (key, scrsrp, ncrsrp, l_scrsrp, l_ncrsrp) =>
        val diff = (l_scrsrp - l_ncrsrp) - (scrsrp - ncrsrp)
        (key, (diff * diff, 1))
    }
    byKey.foreach(println)
    println("--------")
    // Sum the squared differences and the counts per key
    byKey.reduceByKey((x1, x2) => (x1._1 + x2._1, x1._2 + x2._2)).foreach(println)

    spark.stop()
  }
}
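For reference, when the demo is run locally the output is roughly as follows (the order of the lines may differ, since foreach prints from the tasks): in demo 1, reduceByKey produces (1,3) and (2,5), and the groupByKey-plus-map variant prints the same sums, only without map-side combining; in demo 2, each input row is first mapped to key -> (squaredDifference, 1), e.g. (key1,(1,1)), and reduceByKey then sums both fields per key, giving (key1,(2,2)), (key2,(5,2)) and (key3,(4,1)).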
The three relevant method snippets from the Spark source:
Method 1
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
* parallelism level.
*/
def reduceByKey(func: (V, V) => V): RDD[(K, V)] = self.withScope {
// Ultimately calls the reduceByKey(partitioner, func) overload shown in Method 3 below.
// func is the user-supplied reduce function: it takes two values of type (V, V) and returns a V.
// The result of reduceByKey is an RDD[(K, V)].
reduceByKey(defaultPartitioner(self), func) // the partitioner here is defaultPartitioner
}
def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
val rdds = (Seq(rdd) ++ others)
val hasPartitioner = rdds.filter(_.partitioner.exists(_.numPartitions > 0))
// keep the RDDs that already have a partitioner with at least one partition
val hasMaxPartitioner: Option[RDD[_]] = if (hasPartitioner.nonEmpty) {
Some(hasPartitioner.maxBy(_.partitions.length))
} else {
None
}
// default number of partitions: spark.default.parallelism if configured, otherwise the largest partition count among the RDDs
val defaultNumPartitions = if (rdd.context.conf.contains("spark.default.parallelism")) {
rdd.context.defaultParallelism
} else {
rdds.map(_.partitions.length).max
}
// If the existing max partitioner is an eligible one, or its partitions number is larger
// than or equal to the default number of partitions, use the existing partitioner.
if (hasMaxPartitioner.nonEmpty && (isEligiblePartitioner(hasMaxPartitioner.get, rdds) ||
defaultNumPartitions <= hasMaxPartitioner.get.getNumPartitions)) {
hasMaxPartitioner.get.partitioner.get
} else {
new HashPartitioner(defaultNumPartitions) // otherwise fall back to a HashPartitioner with the default number of partitions
}
}
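To see defaultPartitioner's choice in practice, here is a minimal sketch (it assumes the sc from the demo above; the partition counts are illustrative): when an upstream RDD already carries a partitioner, the parameterless reduceByKey reuses it, otherwise a new HashPartitioner is built from the default number of partitions.

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq((1, 1), (1, 2), (2, 3)), numSlices = 4)

// Parent already has a partitioner: reduceByKey inherits HashPartitioner(8)
val prePartitioned = pairs.partitionBy(new HashPartitioner(8))
println(prePartitioned.reduceByKey(_ + _).getNumPartitions) // 8

// No existing partitioner and spark.default.parallelism not set:
// the max parent partition count (4) becomes the HashPartitioner's size
println(pairs.reduceByKey(_ + _).getNumPartitions) // 4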
Method 2
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce. Output will be hash-partitioned with numPartitions partitions.
*/
// The first parameter is the user-defined (V, V) => V reduce function; the second is the caller-supplied number of partitions.
def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)] = self.withScope {
// Also delegates to the reduceByKey(partitioner, func) overload in Method 3, here with an explicit HashPartitioner.
reduceByKey(new HashPartitioner(numPartitions), func)
}
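A quick sketch of the numPartitions overload (again assuming source from the demo): passing a partition count is simply shorthand for supplying new HashPartitioner(numPartitions).

val reducedTo3 = source.reduceByKey(_ + _, 3)
println(reducedTo3.getNumPartitions) // 3
println(reducedTo3.partitioner)      // a HashPartitioner with 3 partitions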
/**
* A [[org.apache.spark.Partitioner]] that implements hash-based partitioning using
* Java's `Object.hashCode`.
*
* Java arrays have hashCodes that are based on the arrays' identities rather than their contents,
* so attempting to partition an RDD[Array[_]] or RDD[(Array[_], _)] using a HashPartitioner will
* produce an unexpected or incorrect result.
*/
class HashPartitioner(partitions: Int) extends Partitioner {
require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")
def numPartitions: Int = partitions
def getPartition(key: Any): Int = key match {
case null => 0
case _ => Utils.nonNegativeMod(key.hashCode, numPartitions)
}
override def equals(other: Any): Boolean = other match {
case h: HashPartitioner =>
h.numPartitions == numPartitions
case _ =>
false
}
override def hashCode: Int = numPartitions
}
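Utils.nonNegativeMod is private[spark], so the small standalone sketch below re-implements the same modulo logic to show how getPartition maps a key (including negative hashCodes) to a partition index:

def nonNegativeMod(x: Int, mod: Int): Int = {
  val rawMod = x % mod
  rawMod + (if (rawMod < 0) mod else 0) // Scala's % can be negative, so shift back into [0, mod)
}

val numPartitions = 4
println(nonNegativeMod("key1".hashCode, numPartitions)) // partition index for "key1" in [0, 4)
println(nonNegativeMod(-7, numPartitions))              // a negative hash code still maps to 1, inside [0, 4)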
Method 3
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce.
*/
// The first parameter is the partitioner, the second is the user-defined reduce function.
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner) // combineByKeyWithClassTag is where the actual aggregation behind reduceByKey happens
}
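The call above shows how reduceByKey maps onto the combineByKey family: the combiner type C is just V, the first value for a key becomes its own combiner, and the same func merges values both within a partition and across partitions. A sketch of the equivalent call through the public combineByKey (assuming source from the demo):

val viaCombine: RDD[(Int, Int)] = source.combineByKey(
  (v: Int) => v,                // createCombiner: the first value starts the combiner
  (c: Int, v: Int) => c + v,    // mergeValue: fold further values in, within a partition
  (c1: Int, c2: Int) => c1 + c2 // mergeCombiners: merge partial results from different partitions
)
// viaCombine yields the same (1,3), (2,5) as source.reduceByKey(_ + _)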
def combineByKeyWithClassTag[C](
createCombiner: V => C, // turns the first value seen for a key into the initial combiner
mergeValue: (C, V) => C, // merges a value into a combiner within a single partition (map side)
mergeCombiners: (C, C) => C, // merges combiners coming from different partitions (reduce side)
partitioner: Partitioner, // the partitioning strategy
mapSideCombine: Boolean = true, // whether to combine on the map side before the shuffle
serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
if (keyClass.isArray) {
if (mapSideCombine) {
throw new SparkException("Cannot use map-side combining with array keys.")
}
if (partitioner.isInstanceOf[HashPartitioner]) {
throw new SparkException("HashPartitioner cannot partition array keys.")
}
}
// Aggregator is the class that performs the actual aggregation
val aggregator = new Aggregator[K, V, C](
self.context.clean(createCombiner),
self.context.clean(mergeValue),
self.context.clean(mergeCombiners))
// If the RDD is already partitioned with the same partitioner, no shuffle is needed:
// just combine the values within each existing partition.
if (self.partitioner == Some(partitioner)) {
self.mapPartitions(iter => {
val context = TaskContext.get()
new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
}, preservesPartitioning = true)
} else {
// Otherwise shuffle: the type parameters [K, V, C] mean the input RDD is (K, V) and the resulting RDD is (K, C).
new ShuffledRDD[K, V, C](self, partitioner)
.setSerializer(serializer)
.setAggregator(aggregator)
.setMapSideCombine(mapSideCombine)
}
}
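The two branches above can be observed in the lineage. A sketch (assuming sc from the demo): when the parent RDD is already partitioned by the same partitioner, reduceByKey stays in the mapPartitions branch and no new ShuffledRDD appears; otherwise a ShuffledRDD with map-side combining is created.

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq((1, 1), (1, 2), (2, 3)))
val partitioned = pairs.partitionBy(new HashPartitioner(4))

// Same partitioner as the parent: values are combined per partition, no extra shuffle
println(partitioned.reduceByKey(_ + _).toDebugString)
// No matching partitioner on the parent: the lineage contains a ShuffledRDD
println(pairs.reduceByKey(_ + _).toDebugString)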