import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
* Created by liupeng on 2017/6/15.
*/
/**
 * Demonstrates Spark's `mapPartitionsWithIndex`, which hands each partition's
 * iterator to a function together with that partition's index — useful for
 * seeing which elements ended up in which partition.
 *
 * Created by liupeng on 2017/6/15.
 */
object T_mapPartitionsWithIndex {
  // Required on Windows so Spark/Hadoop can locate winutils.exe.
  System.setProperty("hadoop.home.dir", "F:\\hadoop-2.6.5")

  /**
   * Tags every element of a partition with that partition's index.
   *
   * Returns a lazy iterator instead of buffering the partition into an
   * ArrayBuffer — the recommended shape for mapPartitions-style functions,
   * since the partition is never fully materialized in memory. The println
   * fires as Spark consumes the iterator.
   *
   * @param index the partition index supplied by mapPartitionsWithIndex
   * @param iter  the elements of that partition
   * @return an iterator of "index:name" strings
   */
  def fun_index(index: Int, iter: Iterator[String]): Iterator[String] =
    iter.map { name =>
      val tagged = s"$index:$name"
      println(tagged)
      tagged
    }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("mapPartitionsWithIndex_test").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Sample data, spread over 2 partitions.
      val names: List[String] = List("liupeng", "xuliuxi", "xiaoma")
      val nameRDD = sc.parallelize(names, 2)
      // mapPartitionsWithIndex exposes each partition's index, so the printed
      // "index:name" lines show who was grouped together.
      val nameWithPartitionIndex = nameRDD.mapPartitionsWithIndex(fun_index)
      println(nameWithPartitionIndex.count())
    } finally {
      sc.stop() // release the SparkContext even if the job fails
    }
  }
}
/*
Run output:
0:liupeng
1:xuliuxi
1:xiaoma
1:xuliuxi
1:xiaoma
3
*/