Spark中常用转换算子（非键值对）示例

qq_45973211

于 2022-08-26 08:52:46 发布

阅读量197

点赞数

分类专栏： Spark 文章标签： spark 大数据 scala

本文链接：https://blog.csdn.net/qq_45973211/article/details/126535966

版权

Spark 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

/**
 * Demo of common non-key-value RDD transformations: map, filter, flatMap,
 * mapPartitions and mapPartitionsWithIndex.
 *
 * Every `*Oper` method builds a small in-memory RDD, applies one transformation
 * and prints the resulting elements with `foreach`.
 */
object RDDDemo {
  /**
   * Entry point: creates a SparkContext with master `local[2]` and runs each demo in turn.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("Map算子").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(sparkConf)

    // try/finally so the context is stopped even when one of the demos throws.
    try {
      mapOper(sc)
      filterOper(sc)
      flatMapOper(sc)
      mapPartitionsOper(sc)
      mapPartitionsWithIndexOper(sc)
    } finally {
      sc.stop()
    }
  }

  /**
   * map operator: applies a function to every element, producing exactly one
   * output element per input element.
   *
   * @param sc the SparkContext used to create the sample RDDs
   */
  def mapOper(sc: SparkContext): Unit = {
    // Int => Int: triple every number (single partition).
    val numbers: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 1)
    val tripled: RDD[Int] = numbers.map(_ * 3)
    tripled.foreach(println)

    // String => (String, Int): pair every word with the count 1.
    val words: RDD[String] = sc.makeRDD(Array("hadoop", "scala", "flink", "hadoop"))
    val wordPairs: RDD[(String, Int)] = words.map(word => (word, 1))
    wordPairs.foreach(println)
  }

  /**
   * filter operator: keeps only the elements for which the predicate is true.
   * Here the odd numbers of 1 to 100 are kept.
   *
   * @param sc the SparkContext used to create the sample RDD
   */
  def filterOper(sc: SparkContext): Unit = {
    val numbers: RDD[Int] = sc.makeRDD(1 to 100)
    // The comparison already yields the Boolean; no if/else wrapper is needed.
    val odds: RDD[Int] = numbers.filter(_ % 2 == 1)
    odds.foreach(println)
  }

  /**
   * flatMap operator (flattening): the function returns a collection per input
   * element and every item of that collection becomes one element of the new RDD.
   *
   * @param sc the SparkContext used to create the sample RDD
   */
  def flatMapOper(sc: SparkContext): Unit = {
    val lines: RDD[String] = sc.makeRDD(Array("i am people", "dog is an animal"))
    // Each entry of the split array becomes one record of the resulting RDD.
    val words: RDD[String] = lines.flatMap(_.split(" "))
    println("============flatMap============")
    words.foreach(println)
  }

  /**
   * mapPartitions operator: same purpose as map, but the function is applied once
   * per partition (receiving an iterator over that partition's data) instead of
   * once per element.
   *
   * Signature for reference:
   * {{{
   * def mapPartitions[U: ClassTag](
   *     f: Iterator[T] => Iterator[U],       // must return an iterator
   *     preservesPartitioning: Boolean = false): RDD[U]
   * }}}
   *
   * @param sc the SparkContext used to create the sample RDD
   */
  def mapPartitionsOper(sc: SparkContext): Unit = {
    val numbers: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 4)
    val tripled: RDD[Int] = numbers.mapPartitions { partition =>
      // Transform the partition's iterator lazily and return it directly, rather
      // than copying the whole partition into a mutable buffer first.
      partition.map { num =>
        println(s"处理了一个分区数据$num")
        num * 3
      }
    }
    tripled.foreach(println)
  }

  /**
   * mapPartitionsWithIndex operator: same as mapPartitions, except that the
   * function also receives an Int holding the index of the partition.
   *
   * @param sc the SparkContext used to create the sample RDD
   */
  def mapPartitionsWithIndexOper(sc: SparkContext): Unit = {
    val numbers: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 4)
    val tripled: RDD[Int] = numbers.mapPartitionsWithIndex { (index, partition) =>
      // Lazily mapped iterator; no intermediate buffer of the whole partition.
      partition.map { num =>
        println(s"正在处理$index 分区的数据$num")
        num * 3
      }
    }
    tripled.foreach(println)
  }
}

/**
 * Demo of further non-key-value RDD operators: sample / takeSample, union,
 * intersection, distinct and repartition.
 */
object RDDDemo01 {
  /**
   * Entry point: creates a SparkContext with master `local[2]` and runs each demo in turn.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("RDDDemo01").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(sparkConf)

    // try/finally so the context is stopped even when one of the demos throws.
    try {
      sampleOper(sc)
      unionOper(sc)
      intersectionOper(sc)
      // distinctOper is defined in this object but was never invoked before.
      distinctOper(sc)
      repartitionOper(sc)
    } finally {
      sc.stop()
    }
  }

  /**
   * sample and takeSample operators.
   *
   * Signature of sample for reference:
   * {{{
   * def sample(
   *     withReplacement: Boolean,
   *     fraction: Double,
   *     seed: Long = Utils.random.nextLong): RDD[T]
   * }}}
   *  - withReplacement: whether to sample with replacement
   *  - fraction: the ratio of elements to draw
   *  - seed: initial value for the sampling algorithm
   *
   * Difference between the two: sample yields another RDD, whereas takeSample
   * returns the final result collection (an Array).
   *
   * @param sc the SparkContext used to create the sample RDD
   */
  def sampleOper(sc: SparkContext): Unit = {
    val source: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3), 4)

    val beforeSample: RDD[Int] = source.mapPartitionsWithIndex { (index, data) =>
      // An Iterator can be traversed only once: calling mkString on `data` and then
      // returning `data` would hand back an exhausted iterator (an empty partition).
      // Materialise the partition so it can be both printed and passed on.
      val elems: List[Int] = data.toList
      println(s"sample之前有一个$index 数据为${elems.mkString(",")}")
      elems.iterator
    }

    val sampled: RDD[Int] = source.sample(withReplacement = false, fraction = 0.5)
    val afterSample: RDD[Int] = sampled.mapPartitionsWithIndex { (index, data) =>
      // Same single-pass concern as above.
      val elems: List[Int] = data.toList
      println(s"sample之后有一个$index 数据为${elems.mkString(",")}")
      elems.iterator
    }

    // takeSample operator: draws 10 elements without replacement into a local Array.
    val taken: Array[Int] = source.takeSample(withReplacement = false, num = 10)
    println(taken.mkString("::"))

    // The collected values are discarded; collect() is only called to force
    // evaluation of the two RDDs so that the per-partition printlns run.
    beforeSample.collect()
    afterSample.collect()
  }

  /**
   * union operator: concatenation of two RDDs; duplicates are not removed.
   *
   * @param sc the SparkContext used to create the sample RDDs
   */
  def unionOper(sc: SparkContext): Unit = {
    val left: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9))
    val right: RDD[Int] = sc.makeRDD(Array(11, 12, 13, 14))
    val combined: RDD[Int] = left.union(right)
    // println rather than print: without a separator the elements run together
    // (e.g. "11121314") and merge with the output of the next demo.
    combined.foreach(println)
  }

  /**
   * intersection operator: the elements present in both RDDs.
   *
   * @param sc the SparkContext used to create the sample RDDs
   */
  def intersectionOper(sc: SparkContext): Unit = {
    val left: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9))
    val right: RDD[Int] = sc.makeRDD(Array(1, 12, 3, 14))
    val common: RDD[Int] = left.intersection(right)
    // println rather than print so every element appears on its own line.
    common.foreach(println)
  }

  /**
   * distinct operator: removes duplicate elements from the RDD.
   *
   * @param sc the SparkContext used to create the sample RDD
   */
  def distinctOper(sc: SparkContext): Unit = {
    val withDuplicates: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 3, 9))
    // The argument is the number of partitions of the resulting RDD.
    val unique: RDD[Int] = withDuplicates.distinct(3)
    unique.foreach(println)
  }

  /**
   * repartition operator: redistributes the RDD into a new number of partitions.
   * Prints the partition count before and after.
   *
   * @param sc the SparkContext used to create the sample RDD
   */
  def repartitionOper(sc: SparkContext): Unit = {
    val source: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 3, 9), 10)
    println("重新分区前的分区数：" + source.getNumPartitions)
    val repartitioned: RDD[Int] = source.repartition(2)
    println("重新分区后的分区数：" + repartitioned.getNumPartitions)
  }
}