Spark version: 2.4.0
Code location: org.apache.spark.rdd.PairRDDFunctions
foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)]
foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)]
Both overloads ultimately delegate to combineByKeyWithClassTag.
Usage example:
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object FoldByKeyDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("FoldByKeyDemo")
      .config("spark.master", "local")
      .config("spark.driver.host", "localhost")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("ERROR")

    val sourceRdd = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 2), ("b", 3)))
    // Fold the values of each key into a sum, starting from the zero value 0
    val resRdd: RDD[(String, Int)] = sourceRdd.foldByKey(0)(
      (acc: Int, v: Int) => acc + v
    )
    resRdd.foreach(println)

    spark.stop()
  }
}
Output:
(a,3)
(b,5)
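Note the scaladoc requirement that the zero value be neutral for func: foldByKey folds it in once per key per partition, not once per key, so a non-neutral zero changes the result depending on the partitioning. A minimal sketch, reusing sc from the demo above:

// Sketch: zeroValue is folded in once per key per partition.
// With a non-neutral zero of 10 and addition, the result depends on partitioning:
val twoParts = sc.parallelize(Seq(("a", 1), ("a", 2)), 2) // "a" split across two partitions
twoParts.foldByKey(10)(_ + _).foreach(println) // (a,23): 10 is folded in twice
val onePart = sc.parallelize(Seq(("a", 1), ("a", 2)), 1)
onePart.foldByKey(10)(_ + _).foreach(println)  // (a,13): 10 is folded in once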
Source code:
Methods 1 and 2 both delegate to method 3.
Method 1:
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
// Required parameters: (initial value: zeroValue: V, number of partitions: numPartitions: Int)(combine function: func: (V, V) => V)
def foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  foldByKey(zeroValue, new HashPartitioner(numPartitions))(func)
}
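As a quick illustration (a sketch reusing sourceRdd from the demo above), passing numPartitions makes the shuffle use a HashPartitioner of that size:

// Sketch: method 1 wraps numPartitions in a HashPartitioner(4)
val res4: RDD[(String, Int)] = sourceRdd.foldByKey(0, 4)((acc, v) => acc + v)
println(res4.getNumPartitions) // 4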
Method 2:
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
// Required parameters: (initial value: zeroValue: V)(combine function: func: (V, V) => V)
def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  foldByKey(zeroValue, defaultPartitioner(self))(func)
}
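Method 2 leaves the choice to defaultPartitioner(self), which reuses the largest suitable partitioner among the parent RDDs if one exists, and otherwise builds a HashPartitioner sized by spark.default.parallelism (or, if that is unset, the largest parent's partition count). A sketch, again reusing sourceRdd:

// Sketch: with no explicit partitioner, the result's partitioner comes from defaultPartitioner
val resDefault = sourceRdd.foldByKey(0)(_ + _)
println(resDefault.partitioner)      // e.g. Some(org.apache.spark.HashPartitioner@...)
println(resDefault.getNumPartitions) // here: matches sourceRdd's partition count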
Method 3:
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(
    zeroValue: V,
    partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  // Serialize the zero value to a byte array so that we can get a new clone of it on each key
  val zeroBuffer = SparkEnv.get.serializer.newInstance().serialize(zeroValue)
  val zeroArray = new Array[Byte](zeroBuffer.limit())
  zeroBuffer.get(zeroArray)

  // When deserializing, use a lazy val to create just one instance of the serializer per task
  lazy val cachedSerializer = SparkEnv.get.serializer.newInstance()
  val createZero = () => cachedSerializer.deserialize[V](ByteBuffer.wrap(zeroArray))

  val cleanedFunc = self.context.clean(func)
  combineByKeyWithClassTag[V](
    (v: V) => cleanedFunc(createZero(), v), // createCombiner: fold the zero value into a key's first value
    cleanedFunc,  // mergeValue: merge values within a partition (map-side combine)
    cleanedFunc,  // mergeCombiners: merge partial results across partitions after the shuffle
    partitioner)
}
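Putting it together: the demo's sourceRdd.foldByKey(0)(_ + _) expands to roughly the sketch below, written directly against combineByKeyWithClassTag (the serialization-based cloning of the zero value is elided; the overload without a partitioner falls back to defaultPartitioner):

// Sketch: a hand-expanded equivalent of foldByKey(0)(_ + _)
val equivalent: RDD[(String, Int)] = sourceRdd.combineByKeyWithClassTag[Int](
  (v: Int) => 0 + v,                     // createCombiner: fold the zero value into a key's first value
  (acc: Int, v: Int) => acc + v,         // mergeValue: merge values within a partition
  (acc1: Int, acc2: Int) => acc1 + acc2  // mergeCombiners: merge partial results across partitions
)
equivalent.foreach(println) // (a,3) and (b,5), matching foldByKey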