A small classic example.
Output: 17/08/17 14:56:36 INFO TaskSchedulerImpl: Adding task set 0.0 with 3 tasks
17/08/17 14:56:36 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 4825 bytes)
17/08/17 14:56:36 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
partitionId:0value:1
partitionId:0value:2
partitionId:0value:3
17/08/17 14:56:37 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 703 bytes result sent to driver
17/08/17 14:56:37 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, PROCESS_LOCAL, 4825 bytes)
17/08/17 14:56:37 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
partitionId:1value:4
partitionId:1value:5
partitionId:1value:6
17/08/17 14:56:37 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 660 bytes result sent to driver
17/08/17 14:56:37 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 183 ms on localhost (executor driver) (1/3)
17/08/17 14:56:37 INFO TaskSetManager: Starting task 2.0 in stage 0.0 (TID 2, localhost, executor driver, partition 2, PROCESS_LOCAL, 4882 bytes)
17/08/17 14:56:37 INFO Executor: Running task 2.0 in stage 0.0 (TID 2)
partitionId:2value:7
partitionId:2value:8
partitionId:2value:9
partitionId:2value:10
17/08/17 14:56:37 INFO Executor: Finished task 2.0 in stage 0.0 (TID 2). 621 bytes result sent to driver
17/08/17 14:56:37 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 59 ms on localhost (executor driver) (2/3)
17/08/17 14:56:37 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 39 ms on localhost (executor driver) (3/3)
17/08/17 14:56:37 INFO DAGScheduler: ResultStage 0 (collect at MapPartitionsWithIndexOperator.scala:44) finished in 0.256 s
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import scala.collection.mutable.ListBuffer
object MapPartitionsWithIndexOperator {

  /**
   * Demonstrates the `mapPartitionsWithIndex` transformation: prints every
   * element of the RDD together with the id of the partition it belongs to.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    /**
     * SparkConf carries the runtime parameters of the application:
     * the run mode (master), the application name and the resources
     * the application needs.
     */
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("Map_Operator")

    /**
     * SparkContext is the single gateway to the cluster; it is
     * responsible for task distribution and for retrying failed tasks.
     */
    val sc = new SparkContext(conf)

    /**
     * makeRDD: the first argument is the collection of elements,
     * the second is the number of partitions — here an RDD[Int]
     * holding 1..10 split across 3 partitions.
     */
    val rdd = sc.makeRDD(1 to 10, 3)

    /**
     * mapPartitionsWithIndex traverses the RDD partition by partition,
     * handing the function the partition index plus an iterator over
     * that partition's elements.
     *
     * Mapping the iterator lazily (instead of buffering the whole
     * partition into a mutable ListBuffer, as the original did) keeps
     * per-partition memory usage constant; the println side effect fires
     * as collect() drains each partition's iterator, so the printed
     * output is unchanged.
     */
    val collected = rdd.mapPartitionsWithIndex(
      (index, iterator) =>
        iterator.map { num =>
          println("partitionId:" + index + "value:" + num)
          num
        },
      preservesPartitioning = false
    ).collect()

    // Release cluster resources.
    sc.stop()
  }
}