Spark version: 2.4.0
Code location: org.apache.spark.rdd.PairRDDFunctions
combineByKey ultimately delegates to combineByKeyWithClassTag.
Usage example:
The example below is adapted from https://www.jianshu.com/p/d7552ea4f882
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

case class Juice(volume: Int) {
  def add(j: Juice): Juice = Juice(volume + j.volume)
  def getV: Int = volume
}

case class Fruit(kind: String, weight: Int) {
  def makeJuice: Juice = Juice(weight)
}

object CombineByKeyDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("CombineByKeyDemo")
      .config("spark.master", "local")
      .config("spark.driver.host", "localhost")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("ERROR")

    val apple1 = Fruit("apple", 5)
    val apple2 = Fruit("apple", 8)
    val orange1 = Fruit("orange", 10)
    val fruit: RDD[(String, Fruit)] =
      sc.parallelize(List(("apple", apple1), ("orange", orange1), ("apple", apple2)))

    val juice: RDD[(String, Juice)] = fruit.combineByKey(
      (v: Fruit) => v.makeJuice,                  // createCombiner: the first Fruit seen for a key becomes a Juice
      (c: Juice, v: Fruit) => c.add(v.makeJuice), // mergeValue: fold further Fruits into the partition-local Juice
      (c1: Juice, c2: Juice) => c1.add(c2)        // mergeCombiners: merge Juices from different partitions
    )

    val res: Array[(String, Juice)] = juice.collect()
    res.foreach(tup => println((tup._1, tup._2.getV)))

    spark.stop()
  }
}
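Running the demo locally, the two apple records (weights 5 and 8) collapse into a single Juice, so the output should look roughly like this (collect order is not guaranteed):

(apple,13)
(orange,10)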
The relevant source is shown below:
/**
* Generic function to combine the elements for each key using a custom set of aggregation
* functions. This method is here for backward compatibility. It does not provide combiner
* classtag information to the shuffle.
* @see `combineByKeyWithClassTag`
*/
def combineByKey[C](
    createCombiner: V => C,         // builds the initial combiner from the first value seen for a key
    mergeValue: (C, V) => C,        // merges another value into the combiner, within one partition
    mergeCombiners: (C, C) => C,    // merges combiners produced by different partitions
    partitioner: Partitioner,       // the partitioning strategy
    mapSideCombine: Boolean = true, // whether to pre-combine on the map side before the shuffle
    serializer: Serializer = null   // serializer for the shuffle data
  ): RDD[(K, C)] = self.withScope {
  combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners,
    partitioner, mapSideCombine, serializer)(null)
}
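For context, the other *ByKey operators in PairRDDFunctions funnel into this same code path; for instance, reduceByKey in Spark 2.4.x is essentially the following thin wrapper (paraphrased from the same file, not part of the snippet above):

// reduceByKey: the value type doubles as the combiner type, so one function
// serves as both mergeValue and mergeCombiners.
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
  combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
}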
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
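  // (Explanatory note, not in the Spark source: JVM arrays use reference
  // equality and identity hashCode, so hash-based combining and
  // HashPartitioner cannot group equal array keys reliably; hence the
  // two checks above.)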
  // Aggregator is the class that actually performs the aggregation
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    // Already partitioned as requested: combine within each partition, no shuffle needed
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // Type parameters [K, V, C]: the input RDD holds (K, V) pairs and the result is an RDD of (K, C) pairs
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
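As the branch on self.partitioner shows, when the input RDD is already partitioned by the exact partitioner passed in, combineByKey combines values in place and never builds a ShuffledRDD. A minimal sketch of how a caller can hit that branch (illustrative names, reusing the fruit RDD from the demo above):

import org.apache.spark.HashPartitioner

val partitioner = new HashPartitioner(4)
// partitionBy shuffles once and records the partitioner on the result
val prePartitioned = fruit.partitionBy(partitioner)

// prePartitioned.partitioner == Some(partitioner), so this call takes the
// mapPartitions branch above: combining happens in place, with no new shuffle.
val juiceNoShuffle = prePartitioned.combineByKey(
  (v: Fruit) => v.makeJuice,
  (c: Juice, v: Fruit) => c.add(v.makeJuice),
  (c1: Juice, c2: Juice) => c1.add(c2),
  partitioner
)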