Spark version: 2.4.0
Code location: org.apache.spark.rdd.PairRDDFunctions
reduceByKey(func: (V, V) => V): RDD[(K, V)]
reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
Both overloads ultimately delegate to combineByKeyWithClassTag.
Example:
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object ReduceByKeyDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("ReduceByKeyDemo")
      .config("spark.master", "local")
      .config("spark.driver.host", "localhost")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("ERROR")

    // demo 1: sum the values for each key
    val source: RDD[(Int, Int)] = sc.parallelize(Seq((1, 1), (1, 2), (2, 2), (2, 3)))
    val res: RDD[(Int, Int)] = source.reduceByKey(_ + _)
    res.foreach(println)
    println("------------")
    // The same result via groupByKey + map, but without map-side combining
    val groupByKeyRDD: RDD[(Int, Iterable[Int])] = source.groupByKey()
    groupByKeyRDD.map(tup => (tup._1, tup._2.sum)).foreach(println)

    // demo 2: per key, accumulate a squared difference and a record count
    val y: RDD[(String, Int, Int, Int, Int)] = sc.parallelize(List(
      ("key1", 1, 0, 2, 0),
      ("key1", 1, 0, 2, 0),
      ("key2", 1, 0, 2, 0),
      ("key3", 1, 0, 3, 0),
      ("key2", 1, 0, 3, 0)
    ))
    val byKey: RDD[(String, (Int, Int))] = y.map {
      case (key, scrsrp, ncrsrp, l_scrsrp, l_ncrsrp) =>
        val diff = (l_scrsrp - l_ncrsrp) - (scrsrp - ncrsrp)
        (key, (diff * diff, 1))
    }
    byKey.foreach(println)
    println("--------")
    // Sum the squared differences and the counts per key
    byKey.reduceByKey((x1, x2) => (x1._1 + x2._1, x1._2 + x2._2)).foreach(println)

    spark.stop()
  }
}
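For reference, when the demo is run locally the output is roughly as follows (the order of the lines may differ, since foreach prints from the tasks): in demo 1, reduceByKey produces (1,3) and (2,5), and the groupByKey-plus-map variant prints the same sums, only without map-side combining; in demo 2, each input row is first mapped to key -> (squaredDifference, 1), e.g. (key1,(1,1)), and reduceByKey then sums both fields per key, giving (key1,(2,2)), (key2,(5,2)) and (key3,(4,1)).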
The three relevant method snippets from the Spark source:
Method 1
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
* parallelism level.
*/
def reduceByKey(func: (V, V) => V): RDD[(K, V)] = self.withScope {
// Ultimately calls the reduceByKey(partitioner, func) overload shown in Method 3 below.
// func is the user-supplied reduce function: it takes two values of type (V, V) and returns a V.
// The result of reduceByKey is an RDD[(K, V)].
reduceByKey(defaultPartitioner(self), func) // the partitioner here is defaultPartitioner
}
def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
val rdds = (Seq(rdd) ++ others)
val hasPartitioner = rdds.filter(_.partitioner.exists(_.numPartitions > 0))
// keep the RDDs that already have a partitioner with at least one partition
val hasMaxPartitioner: Option[RDD[_]] = if (hasPartitioner.nonEmpty) {
Some(hasPartitioner.maxBy(_.partitions.length))
} else {
None
}
// default number of partitions: spark.default.parallelism if configured, otherwise the largest partition count among the RDDs
val defaultNumPartitions = if (rdd.context.conf.contains("spark.default.parallelism")) {
rdd.context.defaultParallelism
} else {
rdds.map(_.partitions.length).max
}
// If the existing max partitioner is an eligible one, or its partitions number is larger
// than or equal to the default number of partitions, use the existing partitioner.
if (hasMaxPartitioner.nonEmpty && (isEligiblePartitioner(hasMaxPartitioner.get, rdds) ||
defaultNumPartitions <= hasMaxPartitioner.get.getNumPartitions)) {
hasMaxPartitioner.get.partitioner.get
} else {
new HashPartitioner(defaultNumPartitions) // otherwise fall back to a HashPartitioner with the default number of partitions
}
}
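To see defaultPartitioner's choice in practice, here is a minimal sketch (it assumes the sc from the demo above; the partition counts are illustrative): when an upstream RDD already carries a partitioner, the parameterless reduceByKey reuses it, otherwise a new HashPartitioner is built from the default number of partitions.

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq((1, 1), (1, 2), (2, 3)), numSlices = 4)

// Parent already has a partitioner: reduceByKey inherits HashPartitioner(8)
val prePartitioned = pairs.partitionBy(new HashPartitioner(8))
println(prePartitioned.reduceByKey(_ + _).getNumPartitions) // 8

// No existing partitioner and spark.default.parallelism not set:
// the max parent partition count (4) becomes the HashPartitioner's size
println(pairs.reduceByKey(_ + _).getNumPartitions) // 4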
Method 2
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce. Output will be hash-partitioned with numPartitions partitions.
*/
// The first parameter is the user-defined (V, V) => V reduce function; the second is the caller-supplied number of partitions.
def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)] = self.withScope {
// Also delegates to the reduceByKey(partitioner, func) overload in Method 3, here with an explicit HashPartitioner.
reduceByKey(new HashPartitioner(numPartitions), func)
}
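A quick sketch of the numPartitions overload (again assuming source from the demo): passing a partition count is simply shorthand for supplying new HashPartitioner(numPartitions).

val reducedTo3 = source.reduceByKey(_ + _, 3)
println(reducedTo3.getNumPartitions) // 3
println(reducedTo3.partitioner)      // a HashPartitioner with 3 partitions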
/**
* A [[org.apache.spark.Partitioner]] that implements hash-based partitioning using
* Java's `Object.hashCode`.
*
* Java arrays have hashCodes that are based on the arrays' identities rather than their contents,
* so attempting to partition an RDD[Array[_]] or RDD[(Array[_], _)] using a HashPartitioner will
* produce an unexpected or incorrect result.
*/
class HashPartitioner(partitions: Int) extends Partitioner {
require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")
def numPartitions: Int = partitions
def getPartition(key: Any): Int = key match {
case null => 0
case _ => Utils.nonNegativeMod(key.hashCode, numPartitions)
}
override def equals(other: Any): Boolean = other match {
case h: HashPartitioner =>
h.numPartitions == numPartitions
case _ =>
false
}
override def hashCode: Int = numPartitions
}
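Utils.nonNegativeMod is private[spark], so the small standalone sketch below re-implements the same modulo logic to show how getPartition maps a key (including negative hashCodes) to a partition index:

def nonNegativeMod(x: Int, mod: Int): Int = {
  val rawMod = x % mod
  rawMod + (if (rawMod < 0) mod else 0) // Scala's % can be negative, so shift back into [0, mod)
}

val numPartitions = 4
println(nonNegativeMod("key1".hashCode, numPartitions)) // partition index for "key1" in [0, 4)
println(nonNegativeMod(-7, numPartitions))              // a negative hash code still maps to 1, inside [0, 4)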
Method 3
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce.
*/
// The first parameter is the partitioner, the second is the user-defined reduce function.
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner) // combineByKeyWithClassTag is where the actual aggregation behind reduceByKey happens
}
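The call above shows how reduceByKey maps onto the combineByKey family: the combiner type C is just V, the first value for a key becomes its own combiner, and the same func merges values both within a partition and across partitions. A sketch of the equivalent call through the public combineByKey (assuming source from the demo):

val viaCombine: RDD[(Int, Int)] = source.combineByKey(
  (v: Int) => v,                // createCombiner: the first value starts the combiner
  (c: Int, v: Int) => c + v,    // mergeValue: fold further values in, within a partition
  (c1: Int, c2: Int) => c1 + c2 // mergeCombiners: merge partial results from different partitions
)
// viaCombine yields the same (1,3), (2,5) as source.reduceByKey(_ + _)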
def combineByKeyWithClassTag[C](
createCombiner: V => C, // turns the first value seen for a key into the initial combiner
mergeValue: (C, V) => C, // merges a value into a combiner within a single partition (map side)
mergeCombiners: (C, C) => C, // merges combiners coming from different partitions (reduce side)
partitioner: Partitioner, // the partitioning strategy
mapSideCombine: Boolean = true, // whether to combine on the map side before the shuffle
serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
if (keyClass.isArray) {
if (mapSideCombine) {
throw new SparkException("Cannot use map-side combining with array keys.")
}
if (partitioner.isInstanceOf[HashPartitioner]) {
throw new SparkException("HashPartitioner cannot partition array keys.")
}
}
// Aggregator is the class that performs the actual aggregation
val aggregator = new Aggregator[K, V, C](
self.context.clean(createCombiner),
self.context.clean(mergeValue),
self.context.clean(mergeCombiners))
// If the RDD is already partitioned with the same partitioner, no shuffle is needed:
// just combine the values within each existing partition.
if (self.partitioner == Some(partitioner)) {
self.mapPartitions(iter => {
val context = TaskContext.get()
new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
}, preservesPartitioning = true)
} else {
// Otherwise shuffle: the type parameters [K, V, C] mean the input RDD is (K, V) and the resulting RDD is (K, C).
new ShuffledRDD[K, V, C](self, partitioner)
.setSerializer(serializer)
.setAggregator(aggregator)
.setMapSideCombine(mapSideCombine)
}
}
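The two branches above can be observed in the lineage. A sketch (assuming sc from the demo): when the parent RDD is already partitioned by the same partitioner, reduceByKey stays in the mapPartitions branch and no new ShuffledRDD appears; otherwise a ShuffledRDD with map-side combining is created.

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq((1, 1), (1, 2), (2, 3)))
val partitioned = pairs.partitionBy(new HashPartitioner(4))

// Same partitioner as the parent: values are combined per partition, no extra shuffle
println(partitioned.reduceByKey(_ + _).toDebugString)
// No matching partitioner on the parent: the lineage contains a ShuffledRDD
println(pairs.reduceByKey(_ + _).toDebugString)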