package com.lyzx.day16

import org.apache.spark.{SparkContext, SparkConf}
class T2 {
    /**
      * reduceByKey = groupByKey + reduce
      * Functionally it is equivalent to doing a groupByKey first and then a reduce,
      * but it is more efficient than groupByKey + reduce.
      * For a key-value RDD, it applies the given operation to all values that share the same key.
      *
      * Since reduceByKey is conceptually groupByKey followed by reduce,
      * it involves a shuffle (groupByKey involves a shuffle).
      * As mentioned before:
      *   functionally: reduceByKey = groupByKey + reduce
      *   efficiency:   reduceByKey > groupByKey + reduce
      * The reason is that reduceByKey runs a combiner on the map side.
      * For example, in the wordCount example the map side produces
      *   [("java",(1,1,1,1)),("c",(1,1,1,1))]
      * With a map-side combiner this becomes [("java",4),("c",4)],
      * so the fetch on the reduce side is much cheaper.
      */
    def f1(sc:SparkContext): Unit ={
        val arr = List(1,2,3,4,5,5,4,3,2,1)

        //rdd holds [1,2,3,4,5,5,4,3,2,1]
        val rdd = sc.parallelize(arr)

        //mapRdd holds [(1,10),(2,20),(3,30),...,(5,50),...,(1,10)]
        val mapRdd = rdd.map(item=>(item,item*10))

        //values with the same key are summed, e.g. (1,20),(2,40),(3,60),(4,80),(5,100)
        val reduceRdd = mapRdd.reduceByKey(_+_)
        reduceRdd.foreach(println)
    }
}
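
// A minimal sketch (not part of the original class) illustrating the claim in the
// comment above: groupByKey followed by a per-key sum produces the same result as
// reduceByKey, but without the map-side combiner. The object name GroupVsReduceDemo
// and the sample data are assumptions made for illustration only.
object GroupVsReduceDemo {
    def run(sc: SparkContext): Unit = {
        val words = sc.parallelize(List("java", "c", "java", "scala", "java"))
        val pairs = words.map(w => (w, 1))

        // reduceByKey: values for the same key are pre-aggregated on the map side
        // (combiner), so less data is shuffled.
        val byReduce = pairs.reduceByKey(_ + _)

        // groupByKey + sum: all (word, 1) pairs are shuffled first, then summed.
        val byGroup = pairs.groupByKey().mapValues(_.sum)

        // Both yield e.g. ("java",3), ("c",1), ("scala",1).
        byReduce.collect().foreach(println)
        byGroup.collect().foreach(println)
    }
}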

object T2{
    def main(args: Array[String]) {
        val conf = new SparkConf().setAppName("myTest").setMaster("local")
        val sc = new SparkContext(conf)

        val t = new T2()
        t.f1(sc)
        sc.stop()
    }
}