package com.cn.spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
 * Demonstrates the difference between `RDD.map` (one element at a time)
 * and `RDD.mapPartitions` (one whole partition, as an Iterator, at a time).
 *
 * Uses an explicit `main` instead of `extends App`: the `App` trait's
 * DelayedInit-based initialization has ordering pitfalls and is discouraged
 * for non-trivial entry points.
 */
object SparkMap {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local[1]")
      .setAppName("TestMapAndMapPartitions")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")

    val rdd: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6))

    // `map` invokes the function once per element.
    val mapped: RDD[Int] = rdd.map(x => x)
    mapped.foreach(println)

    println("===========================")

    // `mapPartitions` invokes the function once per partition; it receives
    // an Iterator over that partition's elements and must return an Iterator.
    // Mapping the iterator lazily avoids materializing the whole partition
    // in memory (the previous version buffered every element into an
    // ArrayBuffer first, which defeats the purpose of mapPartitions and can
    // OOM on large partitions). The emitted elements are identical.
    val partitionMapped: RDD[Int] = rdd.mapPartitions(iter => iter.map(ele => ele))
    partitionMapped.foreach(println)

    // Release the SparkContext's resources; the original never stopped it.
    sc.stop()
  }
}