http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html
/\\\\\\
mapPartitionsWithIndex
// Source RDD for the examples that follow: the integers 1 to 9 spread over 2 partitions.
val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9), numSlices = 2)
/** Tags every element of a partition with the index of that partition.
  *
  * Intended to be passed to `mapPartitionsWithIndex`, which supplies both arguments.
  *
  * @param index index of the partition being processed
  * @param iter  the elements held by that partition
  * @return one string per element, formatted as `[partID:<index>, val: <element>]`
  */
def myfunc(index: Int, iter: Iterator[(Int)]) : Iterator[String] = {
  for (element <- iter) yield s"[partID:$index, val: $element]"
}
rdd1.mapPartitionsWithIndex(myfunc).collect
res1: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:0, val: 4], [partID:1, val: 5], [partID:1, val: 6], [partID:1, val: 7], [partID:1, val: 8], [partID:1, val: 9])
repartition
val rdd2 = rdd1.repartition(3)
rdd2.partitions.length
res5: Int = 3
rdd2.mapPartitionsWithIndex(myfunc).collect
res6: Array[String] = Array([partID:0, val: 3], [partID:0, val: 6], [partID:0, val: 9], [partID:1, val: 1], [partID:1, val: 4], [partID:1, val: 7], [partID:2, val: 2], [partID:2, val: 5], [partID:2, val: 8])
coalesce
// rdd1 has 2 partitions; asking for 3 without a shuffle leaves the layout unchanged,
// as the output below shows (only partition IDs 0 and 1 appear).
val rdd2 = rdd1.coalesce(numPartitions = 3)
rdd2.mapPartitionsWithIndex(myfunc).collect
res7: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:0, val: 4], [partID:1, val: 5], [partID:1, val: 6], [partID:1, val: 7], [partID:1, val: 8], [partID:1, val: 9])
// With shuffle enabled the data is redistributed, so all 3 requested partitions are used.
val rdd2 = rdd1.coalesce(3, shuffle = true)
rdd2.mapPartitionsWithIndex(myfunc).collect
res8: Array[String] = Array([partID:0, val: 3], [partID:0, val: 6], [partID:0, val: 9], [partID:1, val: 1], [partID:1, val: 4], [partID:1, val: 7], [partID:2, val: 2], [partID:2, val: 5], [partID:2, val: 8])
collectAsMap
val rdd = sc.parallelize(List(("a",1),("b",2)))
rdd.collectAsMap
res9: scala.collection.Map[String,Int] = Map(b -> 2, a -> 1)
rdd.collect
res10: Array[(String, Int)] = Array((a,1), (b,2))
countByKey, countByValue
val rdd1 = sc.parallelize(List(("a",1),("b",2),("b",2),("c",2),("c",2)))
rdd1.count
res11: Long = 5
rdd1.countByKey
res12: scala.collection.Map[String,Long] = Map(a -> 1, b -> 2, c -> 2)
rdd1.countByValue
res13: scala.collection.Map[(String, Int),Long] = Map((c,2) -> 2, (a,1) -> 1, (b,2) -> 2)
filterByRange
val rdd1 = sc.parallelize(List(("a",1),("e",5),("d",4),("c",2),("b",6),("c",3)))
rdd1.filterByRange("b","d").collect
res14: Array[(String, Int)] = Array((d,4), (c,2), (b,6), (c,3))
flatMapValues
val a = sc.parallelize(List(("a","1 2"),("b","3 4")))
a.flatMapValues(_.split(" ")).collect
res15: Array[(String, String)] = Array((a,1), (a,2), (b,3), (b,4))
foldByKey
val rdd1 = sc.parallelize(List("dog","wolf","cat","beer"),2)
rdd1.map(x => (x.length,x)).collect
res17: Array[(Int, String)] = Array((3,dog), (4,wolf), (3,cat), (4,beer))
val rdd2 = rdd1.map(x => (x.length,x))
rdd2.foldByKey("")(_+_).collect
res19: Array[(Int, String)] = Array((4,wolfbeer), (3,dogcat))
keyBy
val rdd1 = sc.parallelize(List("dog","wolf","elephant","salmon"),3)
val rdd2 = rdd1.keyBy(_.length)
rdd2.collect
res21: Array[(Int, String)] = Array((3,dog), (4,wolf), (8,elephant), (6,salmon))
val rdd3 = rdd1.keyBy(_(0))
rdd3.collect
res22: Array[(Char, String)] = Array((d,dog), (w,wolf), (e,elephant), (s,salmon))
/\\\\\\\\