// Requirement: for each key, take the maximum value within every partition, then add those per-partition maxima together.
// Implementation:
package com.huc.Spark.KeyAndValue
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Test04_aggregateByKey {

  /**
   * Demonstrates `aggregateByKey`: for every key, take the per-partition
   * maximum of its values (seqOp), then sum those maxima across partitions
   * (combOp). A sum/sum variant is included for contrast, plus a version
   * using `Int.MinValue` as the neutral element so the max is correct even
   * for negative inputs.
   */
  def main(args: Array[String]): Unit = {
    // 1. Spark configuration: app name plus a local master using all cores.
    val sparkConf: SparkConf = new SparkConf().setAppName("SparkCore").setMaster("local[*]")

    // 2. The SparkContext is the entry point for submitting a Spark application.
    val sc: SparkContext = new SparkContext(sparkConf)

    // 3. Build a pair RDD spread across 2 partitions.
    val pairs: RDD[(String, Int)] = sc.makeRDD(
      List(("a", 1), ("a", 3), ("a", 5), ("b", 7), ("b", 2), ("b", 4), ("b", 6), ("a", 7)),
      2
    )

    // Contrast case: summing both within and between partitions
    // (equivalent to reduceByKey(_ + _)); kept to compare against
    // the max-then-sum variants below.
    val summed: RDD[(String, Int)] = pairs.aggregateByKey(0)(_ + _, _ + _)

    // Per-partition max (seqOp), then sum across partitions (combOp).
    // NOTE(review): a zero value of 0 is only a valid identity for max
    // while all values are non-negative — see the Int.MinValue variant below.
    println(pairs.aggregateByKey(0)(math.max(_, _), _ + _).collect().mkString(","))
    println(summed.collect().mkString(","))

    // Same max-then-sum, but with Int.MinValue as the neutral element so
    // the per-partition maximum is correct even if negative values appear.
    val maxThenSum: RDD[(String, Int)] =
      pairs.aggregateByKey(Int.MinValue)((acc, v) => math.max(acc, v), (a, b) => a + b)
    println(maxThenSum.collect().mkString(","))

    // 4. Release the SparkContext and its resources.
    sc.stop()
  }
}