美图欣赏:
一.aggregate复杂例子:
aggregate算子要加上前面的初始值
源码:
aggregate
(zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
1.
scala> import scala.math._
import scala.math._
scala> var rdd1 = sc.parallelize(List("12","34","567","8901"),2)
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> def fun1(index:Int,iter:Iterator[String]):Iterator[String]={
| iter.toList.map(x => "[partID: "+index+",value:"+x+"]").iterator}
fun1: (index: Int, iter: Iterator[String])Iterator[String]
scala> rdd1.mapPartitionsWithIndex(fun1).collect
res1: Array[String] = Array(
[partID: 0,value:12], [partID: 0,value:34],
[partID: 1,value:567], [partID: 1,value:8901])
scala> rdd1.aggregate("")((x,y)=>math.max(x.length,y.length).toString,(x,y)=>x+y)
scala> rdd1.aggregate("")((x,y)=>math.max(x.length,y.length).toString,(x,y)=>x+y)
res3: String = 42
scala> rdd1.aggregate("")((x,y)=>math.max(x.length,y.length).toString,(x,y)=>x+y)
res4: String = 24
分析:
第一个分区:“12”,“34”
第一次比较:“”,“12”=2.toString ==》 “2”
第二次比较:“2”,“34”=2.toString ==》 “2”
第二个分区:“567”,“8901”
第一次比较:“”,“567”=3.toString ==》“3”
第二次比较:“3”,“8901”=4.toString ==》 “4”
“24”或者“42”
scala> var rdd1 = sc.parallelize(List("12","23","345",""),2)
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[2] at parallelize at <console>:24
scala> rdd1.mapPartitionsWithIndex(fun1).collect
res6: Array[String] = Array([partID: 0,value:12], [partID: 0,value:23], [partID: 1,value:345], [partID: 1,value:])
scala> rdd1.aggregate("")((x,y)=>math.min(x.length,y.length).toString,(x,y)=>x+y)
res7: String = 10
scala> rdd1.aggregate("")((x,y)=>math.min(x.length,y.length).toString,(x,y)=>x+y)
res9: String = 01
分析:
第一个分区:“12”,“23”
第一次比较:“”,“12”=0.toString ==》 “0”
第二次比较:“0”,“23”=1.toString ==》 “1”
第二个分区:“345”,“”
第一次比较:“”,“345”=0.toString ==》“0”
第二次比较:“0”,“”=0.toString ==》 “0”
“10”或者“01”
scala> var rdd1 = sc.parallelize(List("12","23","","345"),2)
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[5] at parallelize at <console>:24
scala> rdd1.mapPartitionsWithIndex(fun1).collect
res10: Array[String] = Array([partID: 0,value:12], [partID: 0,value:23], [partID: 1,value:], [partID: 1,value:345])
scala> rdd1.aggregate("")((x,y)=>math.min(x.length,y.length).toString,(x,y)=>x+y)
res11: String = 11
分析:
第一个分区:“12”,“23”
第一次比较:“”,“12”=0.toString ==》 “0”
第二次比较:“0”,“23”=1.toString ==》 “1”
第二个分区:“”,“345”
第一次比较:“”,“”=0.toString ==》“0”
第二次比较:“0”,“345”=1.toString ==》 “1”
2.
aggregate算子要加上前面的初始值
scala> def func1(index: Int, iter: Iterator[(Int)]) : Iterator[String] = {iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator}
func1: (index: Int, iter: Iterator[Int])Iterator[String]
scala> val rdd1 = sc.parallelize(List(1,2,3,4,5,6,7,8,9), 2)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[5] at parallelize at <console>:24
scala> rdd1.mapPartitionsWithIndex(func1).collect
res4: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:0, val: 4], [partID:1, val: 5], [partID:1, val: 6], [partID:1, val: 7], [partID:1, val: 8], [partID:1, val: 9])
scala> rdd1.aggregate(0)(math.max(_, _), _ + _)
res5: Int = 13
scala> rdd1.aggregate(5)(math.max(_, _), _ + _)
res6: Int = 19
二.aggregateByKey算子例子:
aggregateByKey算子在最后合并各分区结果时不需要再加上前面的初始值(初始值只参与各分区内的计算)
源码:
(zeroValue: U, partitioner: Partitioner)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
scala> import scala.math._
import scala.math._
scala> var rdd1 = sc.parallelize(List(("Tom",20),("Tom",25),("Plus",2),("Plus",18),("Make",30),("Make",20),("Tom",10)),2)
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[10] at parallelize at <console>:24
scala> def fun2(index:Int,iter:Iterator[(String,Int)]):Iterator[String]={
| iter.toList.map(x => "[partId: "+index+",value: "+x+"]").iterator}
fun2: (index: Int, iter: Iterator[(String, Int)])Iterator[String]
scala> rdd1.mapPartitionsWithIndex(fun2)
res18: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[8] at mapPartitionsWithIndex at <console>:29
scala> rdd1.mapPartitionsWithIndex(fun2).collect
res20: Array[String] = Array(
[partId: 0,value: (Tom,20)], [partId: 0,value: (Tom,25)], [partId: 0,value: (Plus,2)], [partId: 1,value: (Plus,18)], [partId: 1,value: (Make,30)], [partId: 1,value: (Make,20)], [partId: 1,value: (Tom,10)])
scala> rdd1.aggregateByKey(0)(math.max(_,_),_+_)
res21: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[12] at aggregateByKey at <console>:27
scala> var rdd2 = rdd1.aggregateByKey(0)(math.max(_,_),_+_)
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[13] at aggregateByKey at <console>:26
scala> rdd2.collect
res22: Array[(String, Int)] = Array((Tom,35), (Plus,20), (Make,30))
第一个分区:(Tom,20),(Tom,25),(Plus,2)
max:Tom:25,Plus:2
第二个分区:(Plus,18), (Make,30),(Make,20),(Tom,10)
max:Plus:18,Make:30,Tom:10
相加:Tom:35,Plus:20,Make:30
scala> var rdd2 = rdd1.aggregateByKey(0)(_+_,_+_)
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[14] at aggregateByKey at <console>:26
scala> rdd2.collect
res23: Array[(String, Int)] = Array((Tom,55), (Plus,20), (Make,50))
————保持饥饿,保持学习
Jackson_MVP