import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.immutable.HashMap
import scala.collection.mutable.ArrayBuffer
/**
* Created by liupeng on 2017/6/15.
*/
object T_mapPartitions {
//Point Spark at a local Hadoop install (needed for winutils.exe on Windows)
System.setProperty("hadoop.home.dir","F:\\hadoop-2.6.5")
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("mapPartitions_test").setMaster("local")
val sc = new SparkContext(conf)
//Prepare some sample data
val names : List[String] = List("liupeng", "xuliuxi", "xiaoma")
val nameRDD = sc.parallelize(names)
//The map operator processes one record of a partition at a time.
//The mapPartitions operator iterates partition by partition, processing all records of a partition in a single call.
//Recommended usage scenario:
//If your RDD does not hold a particularly large amount of data, using mapPartitions instead of map can speed up processing.
//But with, say, 10 billion records and 1 billion records in a single partition, mapPartitions is not recommended,
//because buffering a whole partition can cause an out-of-memory error (see the lazy sketch after the output below).
//Unwrap an Option[Int] score, defaulting to 0 when the name is not in the map.
def showCapital(x: Option[Int]) = x match {
case Some(s) => s
case None => 0
}
def scoreFunc(iter: Iterator[String]): Iterator[Int] = {
val res = ArrayBuffer[Int]()
//Build the lookup table once per partition, not once per record.
val scoreMap = HashMap("liupeng" -> 150, "xuliuxi" -> 120, "xiaoma" -> 100)
while (iter.hasNext)
{
val name: String = iter.next()
//Look up the score via showCapital, so an unknown name yields 0 instead of throwing.
val score = showCapital(scoreMap.get(name))
res += score
println(name)
}
res.iterator
}
val result = nameRDD.mapPartitions(scoreFunc)
result.foreach(println)
}
}
Output (local mode, so a single partition):
liupeng
xuliuxi
xiaoma
150
120
100
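
As written, scoreFunc drains each partition into an ArrayBuffer before returning its iterator, which is exactly the out-of-memory risk the comments warn about for very large partitions. The following is a minimal sketch of a lazy alternative that maps over the iterator instead of buffering it; the object name T_mapPartitionsLazy and the function scoreFuncLazy are illustrative names, not part of the original example.

import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.immutable.HashMap

object T_mapPartitionsLazy {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("mapPartitions_lazy_test").setMaster("local")
    val sc = new SparkContext(conf)
    val names: List[String] = List("liupeng", "xuliuxi", "xiaoma")
    val nameRDD = sc.parallelize(names)

    def scoreFuncLazy(iter: Iterator[String]): Iterator[Int] = {
      //Per-partition setup still happens only once, which is the point of mapPartitions.
      val scoreMap = HashMap("liupeng" -> 150, "xuliuxi" -> 120, "xiaoma" -> 100)
      //iter.map is lazy: each record is scored only when the consumer pulls it,
      //so the partition is never materialized into an intermediate buffer.
      iter.map(name => scoreMap.getOrElse(name, 0))
    }

    nameRDD.mapPartitions(scoreFuncLazy).foreach(println)
    sc.stop()
  }
}

This keeps the per-partition benefit (the score map is built once per partition rather than once per record) while avoiding the per-partition buffer, so it degrades more gracefully when partitions are very large.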