reduce:
reduce operates on an RDD of plain (non key-value) elements: it traverses the data and aggregates all elements into a single result.
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object reduceTest {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("reduceTest")
    val sc = new SparkContext(conf)
    // Build an RDD of Ints and sum them with reduce
    val nums: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6))
    val i: Int = nums.reduce((x, y) => x + y)
    println(i)  // 21
    sc.stop()
  }
}
Evaluated step by step: 1+2=3, 3+3=6, 6+4=10, 10+5=15, 15+6=21, so the printed result is 21.
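Note that Spark first reduces within each partition and then combines the partial results, so the function passed to reduce should be commutative and associative. The pairwise folding itself can be seen with a plain Scala collection, no Spark required (an illustrative sketch only):

// Same left-to-right folding on a local collection (illustration, not Spark)
val local: Int = Array(1, 2, 3, 4, 5, 6).reduce((x, y) => x + y)
println(local)  // 21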
reduceByKey:
reduceByKey operates on an RDD of key-value pairs (two-element tuples): it aggregates the values that share the same key and returns a new RDD.
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object reduceByKeyTest {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("reduceByKeyTest")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.parallelize(Array("hello java", "hello spark", "hello scala"))
    // Split each line into words
    val word: RDD[String] = lines.flatMap(_.split(" "))
    // Pair each word with the count 1
    val A: RDD[(String, Int)] = word.map((_, 1))
    // Sum the counts of identical keys
    val B: RDD[(String, Int)] = A.reduceByKey(_ + _)
    B.foreach(println)
    sc.stop()
  }
}
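Because foreach runs on the executors, the output order is not deterministic; the result contains (hello,3), (java,1), (spark,1), (scala,1) in some order. To print the pairs in a fixed order on the driver, one option is to collect and sort them (a small sketch, reusing the RDD B from the example above):

// Collect the pairs to the driver and print them in sorted key order (illustrative sketch)
B.collect().sorted.foreach(println)
// Prints: (hello,3), (java,1), (scala,1), (spark,1)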