Implementing reduceByKey and groupByKey with aggregateByKey in Spark; the concrete functions are below.
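For reference, aggregateByKey takes a per-key zero value plus two functions: seqOp folds a value into the accumulator within a partition, and combOp merges accumulators across partitions. Its simplest overload in PairRDDFunctions looks like this:

def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]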
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

/**
 * Implement reduceByKey with the aggregateByKey operator.
 */
def aggregateByKeyToReduceBy(sc: SparkContext): Unit = {
val rdd = sc.parallelize(List(("a",1),("b",2),("c",3),("a",2),("a",3),("b",3),("c",2)))
rdd.reduceByKey(_+_).foreach(println)
println("上面为reduceByKey的运算结果,下面为使用aggregateByKey算子的运算结果")
val reduceByKeyRdd: RDD[(String, Int)] = rdd.aggregateByKey(0)(
seqOp1,
combOp1
)
reduceByKeyRdd.foreach(println)
}
// c is the running sum for a key (starting from the zero value 0) and v is one of the rdd's values; this aggregates within a single partition
def seqOp1(c: Int, v: Int): Int = {
c + v
}
// Like seqOp1, but merges the partial sums of the same key across partitions
def combOp1(c1: Int, c2: Int): Int = {
c1 + c2
}
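Since both functions here are plain addition and the zero value is 0, the call can also be written inline; on the sample data it produces the same totals as reduceByKey. A sketch, assuming the same rdd as above:

// Inline equivalent of aggregateByKeyToReduceBy
rdd.aggregateByKey(0)(_ + _, _ + _).foreach(println) // (a,6), (b,5), (c,5), in no fixed order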
/**
 * Implement groupByKey with the aggregateByKey operator.
 */
def aggregateByKeyToGroupBy(sc: SparkContext): Unit = {
val rdd = sc.parallelize(List(("a",1),("b",2),("c",3),("a",2),("a",3),("b",3),("c",2)))
rdd.groupByKey().foreach(println)
println("上面为用groupByKey效果,下面为使用aggregateByKey算子实现的效果")
val groupByKeyRdd: RDD[(String, ArrayBuffer[Int])] = rdd.aggregateByKey(ArrayBuffer[Int]())(
seqOp,
combOp
)
groupByKeyRdd.foreach(println)
}
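The two printouts differ only in collection type: groupByKey yields Iterable values (printed as CompactBuffer), while this version yields ArrayBuffer; the grouped contents are the same, e.g. (a,CompactBuffer(1, 2, 3)) versus (a,ArrayBuffer(1, 2, 3)).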
// arrayBuffer collects the values seen so far for a key (starting from the empty buffer) and v is one of the rdd's values; this collects values within a single partition
def seqOp(arrayBuffer: ArrayBuffer[Int], v: Int): ArrayBuffer[Int] = {
arrayBuffer.append(v)
arrayBuffer
}
// Like seqOp, but merges the per-partition buffers of the same key across partitions; ++= returns the left-hand buffer
def combOp(arrayBuffer1: ArrayBuffer[Int], arrayBuffer2: ArrayBuffer[Int]): ArrayBuffer[Int] = {
arrayBuffer1 ++= arrayBuffer2
}
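Finally, a minimal driver to run both demos; a sketch assuming local mode, with an illustrative master URL and app name:

def main(args: Array[String]): Unit = {
  // Illustrative settings: local[*] uses all local cores; the app name is arbitrary
  val conf = new SparkConf().setMaster("local[*]").setAppName("aggregateByKeyDemo")
  val sc = new SparkContext(conf)
  aggregateByKeyToReduceBy(sc)
  aggregateByKeyToGroupBy(sc)
  sc.stop()
}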