combineByKey
This function aggregates a pair RDD by key: for each key, all of its values are merged into a single combined result.
Parameters
def combineByKey[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C): RDD[(K, C)]
createCombiner: V => C — how to turn the first value seen for a key (within a partition) into an initial combiner
mergeValue: (C, V) => C — how to merge a subsequent value into the existing combiner for that key, within a partition
mergeCombiners: (C, C) => C — how to merge two combiners for the same key across partitions; an example tracing all three is sketched below
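To see when each of the three functions fires, here is a minimal trace sketch. It assumes an existing SparkContext named sc; the println instrumentation is illustrative only, not part of the API. The RDD is forced to two partitions so that mergeCombiners actually runs:

// Trace each callback as combineByKey executes.
// Partition 0 holds ("a", 1); partition 1 holds ("a", 2) and ("a", 3).
val traced = sc.parallelize(Seq(("a", 1), ("a", 2), ("a", 3)), 2)
  .combineByKey(
    (v: Int) => { println(s"createCombiner($v)"); v },
    (c: Int, v: Int) => { println(s"mergeValue($c, $v)"); c + v },
    (c1: Int, c2: Int) => { println(s"mergeCombiners($c1, $c2)"); c1 + c2 }
  )
println(traced.collect().toList) // List((a,6))

Note that createCombiner runs once per key per partition (twice for "a" here), which is why mergeCombiners is still needed to merge the per-partition results.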
Computing sum, count, max, min, and avg
package pro.eddievim.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object CombineByKeyDemo {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = new SparkContext(
      new SparkConf().setMaster("local").setAppName("combine-by-key"))
    sc.setLogLevel("ERROR")

    val data: RDD[(String, Int)] = sc.parallelize(List(
      ("eddie", 10),
      ("july", 80),
      ("july", 12),
      ("eddie", 1024),
      ("eddie", 7),
      ("july", 12),
      ("june", 12),
      ("june", 62)
    ))

    println("--sum--count--max--min--avg--")

    println("---sum---")
    // The combiner is the running sum; both merge steps simply add.
    data.combineByKey(
      (a: Int) => a,
      (a: Int, b: Int) => a + b,
      (a: Int, b: Int) => a + b
    ).foreach(println)

    println("---count---")
    // Map every value to 1 first, so counting reduces to summing.
    data.mapValues(_ => 1).combineByKey(
      (a: Int) => a,
      (a: Int, b: Int) => a + b,
      (a: Int, b: Int) => a + b
    ).foreach(println)

    println("---max---")
    data.combineByKey(
      (a: Int) => a,
      (a: Int, b: Int) => Math.max(a, b),
      (a: Int, b: Int) => Math.max(a, b)
    ).foreach(println)

    println("---min---")
    data.combineByKey(
      (a: Int) => a,
      (a: Int, b: Int) => Math.min(a, b),
      (a: Int, b: Int) => Math.min(a, b)
    ).foreach(println)

    println("---avg---")
    // Carry (sum, count) as the combiner, then divide at the end.
    data.mapValues((_, 1)).combineByKey(
      (a: (Int, Int)) => a,
      (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2),
      (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)
    ).mapValues(a => a._1 * 1.0 / a._2).foreach(println)

    sc.stop()
  }
}
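The print order within each section may vary, but the per-key results work out to:

sum:   (eddie,1041) (july,104) (june,74)
count: (eddie,3) (july,3) (june,2)
max:   (eddie,1024) (july,80) (june,62)
min:   (eddie,7) (july,12) (june,12)
avg:   (eddie,347.0) (july,≈34.67) (june,37.0)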