import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.immutable.HashMap
import scala.collection.mutable.ArrayBuffer
/**
* Created by liupeng on 2017/6/15.
*/
object T_mapPartitions {
//Point Spark at a local Hadoop install (needed for winutils.exe on Windows)
System.setProperty("hadoop.home.dir","F:\\hadoop-2.6.5")
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("mapPartitions_test").setMaster("local")
val sc = new SparkContext(conf)
//Prepare some sample data
val names : List[String] = List("liupeng", "xuliuxi", "xiaoma")
val nameRDD = sc.parallelize(names)
//The map operator processes one record of a partition at a time.
//The mapPartitions operator iterates partition by partition, processing all records of a partition in a single call.
//Recommended usage scenario:
//If your RDD does not hold a particularly large amount of data, using mapPartitions instead of map can speed up processing.
//But with, say, 10 billion records and 1 billion records in a single partition, mapPartitions is not recommended,
//because buffering a whole partition can cause an out-of-memory error (see the lazy sketch after the output below).
//Unwrap an Option[Int] score, defaulting to 0 when the name is not in the map.
def showCapital(x: Option[Int]) = x match {
case Some(s) => s
case None => 0
}
def scoreFunc(iter: Iterator[String]): Iterator[Int] = {
val res = ArrayBuffer[Int]()
//Build the lookup table once per partition, not once per record.
val scoreMap = HashMap("liupeng" -> 150, "xuliuxi" -> 120, "xiaoma" -> 100)
while (iter.hasNext)
{
val name: String = iter.next()
//Look up the score via showCapital, so an unknown name yields 0 instead of throwing.
val score = showCapital(scoreMap.get(name))
res += score
println(name)
}
res.iterator
}
val result = nameRDD.mapPartitions(scoreFunc)
result.foreach(println)
}
}
Output (local mode, so a single partition):
liupeng
xuliuxi
xiaoma
150
120
100
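
As written, scoreFunc drains each partition into an ArrayBuffer before returning its iterator, which is exactly the out-of-memory risk the comments warn about for very large partitions. The following is a minimal sketch of a lazy alternative that maps over the iterator instead of buffering it; the object name T_mapPartitionsLazy and the function scoreFuncLazy are illustrative names, not part of the original example.

import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.immutable.HashMap

object T_mapPartitionsLazy {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("mapPartitions_lazy_test").setMaster("local")
    val sc = new SparkContext(conf)
    val names: List[String] = List("liupeng", "xuliuxi", "xiaoma")
    val nameRDD = sc.parallelize(names)

    def scoreFuncLazy(iter: Iterator[String]): Iterator[Int] = {
      //Per-partition setup still happens only once, which is the point of mapPartitions.
      val scoreMap = HashMap("liupeng" -> 150, "xuliuxi" -> 120, "xiaoma" -> 100)
      //iter.map is lazy: each record is scored only when the consumer pulls it,
      //so the partition is never materialized into an intermediate buffer.
      iter.map(name => scoreMap.getOrElse(name, 0))
    }

    nameRDD.mapPartitions(scoreFuncLazy).foreach(println)
    sc.stop()
  }
}

This keeps the per-partition benefit (the score map is built once per partition rather than once per record) while avoiding the per-partition buffer, so it degrades more gracefully when partitions are very large.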