Spark算子使用大集合

最新推荐文章于 2024-07-24 10:46:45 发布
无声---
最新推荐文章于 2024-07-24 10:46:45 发布
阅读量176
点赞数
分类专栏： spark 文章标签： spark
本文链接：https://blog.csdn.net/qq_43605617/article/details/110496848
版权
spark 专栏收录该内容
6 篇文章 0 订阅
订阅专栏
Spark算子使用大集合

package com.atguigu.sparkTest.RDDS
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.shell.Command
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkConf, SparkContext}
import org.junit._
import scala.math
import scala.math.Ordered
import scala.reflect.ClassTag

/** Employee record used as the sort key in `testSortByKey`; `name` is declared `var` and is therefore mutable. */
case class Emp(var name: String, age: Int)
/**
 * Person with a natural ordering: older people first (age descending),
 * ties broken by name in ascending order.
 */
case class Person(var name: String, age: Int) extends Ordered[Person] {
  override def compare(that: Person): Int = {
    // Negating the age comparison reverses it, which yields the descending order.
    val byAgeDescending = -this.age.compareTo(that.age)
    if (byAgeDescending != 0) byAgeDescending
    else this.name.compareTo(that.name)
  }
}
/**
 * Partitioner that routes `Person` keys by age: people aged 20 or more go to
 * partition 1, everything else (younger people and non-`Person` keys) goes to partition 0.
 *
 * @param num number of partitions reported to Spark
 */
class MyPartition(var num: Int) extends Partitioner {
  override def numPartitions: Int = num

  override def getPartition(key: Any): Int = key match {
    // Partition 1 only exists when at least two partitions were requested; without
    // the guard a single-partition instance would return an out-of-range index.
    case p: Person if p.age >= 20 && num > 1 => 1
    case _ => 0
  }

  // Value equality lets Spark recognise two equivalent partitioners as the same one.
  override def equals(other: Any): Boolean = other match {
    case that: MyPartition => that.num == num
    case _ => false
  }

  override def hashCode(): Int = num
}
/**
 * ----------------------------------------------------------------------------------
 *
 *                          Spark算子练习
 *                                     1.单Value算子
 *                                     2.双Value算子
 *                                     3.Key-Value算子
 *                                     4.行动算子
 *
 * ----------------------------------------------------------------------------------
 */


/**
 * ----------------------------------------------------------------------------------
 *
 *                                    单Value算子
 *
 * ----------------------------------------------------------------------------------
 */
class RDDDemo {
  // Configuration with master "local[*]" and application name "My app".
  val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("My app")
  // Context used by every test method of this class; it is stopped in close().
  val sparkContext = new SparkContext(conf)

  /** Stops the Spark context after each test. */
  @After
  def close(): Unit = sparkContext.stop()

  /**
   * Deletes a leftover "output" directory before each test, so that tests
   * writing to "output" start from a clean state.
   */
  @Before
  def start(): Unit = {
    // newInstance returns a private, uncached FileSystem. Closing it cannot
    // invalidate the cached instance that FileSystem.get hands out to other callers.
    val fileSystem: FileSystem = FileSystem.newInstance(new Configuration())
    try {
      // Output directory
      val path = new Path("output")
      if (fileSystem.exists(path)) {
        fileSystem.delete(path, true)
      }
    } finally {
      // Release the handle even when exists/delete throws.
      fileSystem.close()
    }
  }

  /**
   * map: applies a function to every element of the RDD.
   */
  @Test
  def testMap(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
    val incremented = numbers.map(n => n + 1)
    incremented.saveAsTextFile("output")
  }

  /**
   * Extracts the requested URL path (the 7th space-separated field) from each
   * line of the server log apache.log.
   *
   * Lines with fewer than seven fields are skipped instead of failing the job
   * with an ArrayIndexOutOfBoundsException.
   */
  @Test
  def testMapEx(): Unit = {
    val datas: RDD[String] = sparkContext.textFile("input/apache.log")
    datas.map(_.split(" "))
      .filter(_.length > 6)
      .map(fields => fields(6))
      .saveAsTextFile("output")
  }

  /**
   * mapPartitions: the function is called once per partition and receives
   * that partition's elements as an iterator.
   */
  @Test
  def testMapPartitions(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
    numbers
      .mapPartitions(partition => partition.map(_ + 1))
      .saveAsTextFile("output")
  }

  /** Emits the maximum of each partition. */
  @Test
  def testMapPartitionsEx(): Unit = {
    val rdd1 = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6, 7), 2)
    val partitionMaxima = rdd1.mapPartitions(partition => Iterator.single(partition.max))
    partitionMaxima.saveAsTextFile("output")
  }

  /**
   * mapPartitionsWithIndex: like mapPartitions, but the function also
   * receives the partition index. Here every element is tagged with it.
   */
  @Test
  def testMapPartitionsWithIndex(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
    val tagged = numbers.mapPartitionsWithIndex(
      (partitionId, elements) => elements.map(element => (partitionId, element)))
    tagged.saveAsTextFile("output")
  }

  /** Keeps only the elements of the partition with index 1 and prints them as a list. */
  @Test
  def testMapPartitionsWithIndexEx(): Unit = {
    val secondPartitionOnly = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
      .mapPartitionsWithIndex { (partitionId, elements) =>
        if (partitionId == 1) elements else Iterator.empty
      }
    println(secondPartitionOnly.collect().toList)
  }

  /**
   * flatMap: flattens nested collections; each inner list is transformed
   * and its elements become elements of the resulting RDD.
   */
  @Test
  def testFlatMap(): Unit = {
    val nested = sparkContext.makeRDD(List(List(1, 2), List(3, 4), List(5, 6)), 2)
    nested
      .flatMap(inner => inner.map(n => n + 1))
      .saveAsTextFile("output")
  }

  /**
   * Flattens an RDD that mixes lists of Int and plain Int values, then prints
   * the numbers in descending order from a single partition.
   */
  @Test
  def testFlatMapEx(): Unit = {
    val rdd: RDD[Any] = sparkContext.makeRDD(List(List(1, 2), 3, List(4, 5)), 2)
    val flattened: RDD[Int] = rdd.flatMap {
      // The element type of a List is erased at runtime, so match on List[_]
      // and select the Int elements explicitly instead of an unchecked List[Int] pattern.
      case list: List[_] => list.collect { case n: Int => n }
      case num: Int => List(num)
    }
    val sortedDescending: RDD[Int] = flattened.sortBy(x => x, ascending = false, numPartitions = 1)
    sortedDescending.foreach(println)
  }

  /**
   * glom: turns each partition into an array of its elements.
   * Here the maximum of every partition array is written out.
   */
  @Test
  def testGlom(): Unit = {
    val partitionArrays = sparkContext.makeRDD(List(1, 2, 3, 4), 2).glom()
    partitionArrays
      .map(partition => partition.max)
      .saveAsTextFile("output")
  }

  // Sum of the per-partition maxima: take the max inside each partition, then add the maxima up.
  @Test
  def testGlomEx(): Unit = {
    val partitionMaxima = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
      .glom()
      .map(partition => partition.max)
    val total: Double = partitionMaxima.sum()
    println(total)
  }

  /**
   * groupBy: groups the elements of all partitions by the key the function returns
   * (here: remainder of division by 2).
   */
  @Test
  def testGroupBy(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 1)
    val byParity = numbers.groupBy(n => n % 2)
    byParity.saveAsTextFile("output")
  }

  /** Groups words by their first character (case-sensitive) and saves the groups. */
  @Test
  def testGroupByEx(): Unit = {
    // The chain ends in saveAsTextFile, which returns Unit, so the result is not bound to a name.
    sparkContext.makeRDD(List("Hello", "hive", "hbase", "Hadoop", "kafka", "kk"), 2)
      .groupBy(word => word.charAt(0))
      .saveAsTextFile("output")
  }

  // Counts the requests per timestamp field (the 4th space-separated field) of apache.log.
  // Counting with reduceByKey avoids collecting every log line of a key just to take its size.
  // Lines with fewer than four fields are skipped.
  @Test
  def testGroupByEx1(): Unit = {
    sparkContext.textFile("input/apache.log")
      .map(_.split(" "))
      .filter(_.length > 3)
      .map(fields => (fields(3), 1))
      .reduceByKey(_ + _)
      .foreach(println)
  }

  /**
   * Word count over the files in "input"; prints the partitioner of the
   * reduceByKey result and saves the counts.
   */
  @Test
  def testPartitioner(): Unit = {
    val wordCounts: RDD[(String, Int)] = sparkContext.textFile("input")
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey((left, right) => left + right)
    println(wordCounts.partitioner)
    wordCounts.saveAsTextFile("output")
    // NOTE(review): blocks for roughly 11.5 days; presumably keeps the driver alive
    // so the job can be inspected in the Spark web UI - confirm before running unattended.
    Thread.sleep(1000000000)
  }

  /** Prints the partitioner of an RDD before and after distinct and groupBy, then saves the groups. */
  @Test
  def testPartitioner1(): Unit = {
    val numbers: RDD[Int] = sparkContext.makeRDD(List(1, 2, 3, 4, 4, 4, 4, 4, 5, 5, 5), 2)
    println(numbers.partitioner)

    val unique: RDD[Int] = numbers.distinct()
    println(unique.partitioner)

    val byParity = unique.groupBy(n => n % 2)
    println(byParity.partitioner)

    byParity.saveAsTextFile("output")
    // NOTE(review): blocks for roughly 11.5 days; presumably keeps the driver alive
    // so the job can be inspected in the Spark web UI - confirm before running unattended.
    Thread.sleep(1000000000)
  }

  /**
   * HashPartitioner: the target partition is the key's hash code modulo
   * the number of partitions.
   */
  @Test
  def testHashPartitioner(): Unit = {
    val keyed: RDD[(Int, Int)] = sparkContext
      .makeRDD(List(2, 4, 4, 4, 4, 4, 6, 6, 6, 6), 2)
      .map(n => (n, 1))
    keyed
      .partitionBy(new HashPartitioner(2))
      .saveAsTextFile("output")
  }

  /**
   * RangePartitioner: samples the keys first to determine the range boundaries,
   * then assigns each key to a range by binary search.
   */
  @Test
  def testRangePartitioner(): Unit = {
    val keyed: RDD[(Int, Int)] = sparkContext
      .makeRDD(List(2, 4, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8), 2)
      .map(n => (n, 1))
    val rangePartitioner: RangePartitioner[Int, Int] = new RangePartitioner[Int, Int](3, keyed)
    keyed
      .partitionBy(rangePartitioner)
      .saveAsTextFile("output")
  }
  /**
   * mapValues: transforms only the value of each (key, value) pair.
   * Expected output:
   * (a,89)
   * (b,96)
   * (a,92)
   * (c,89)
   */
  @Test
  def testMapValues(): Unit = {
    val scores = sparkContext.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("c", 88)), 1)
    scores
      .mapValues(_ + 1)
      .saveAsTextFile("output")
  }
  /**
   * filter: keeps the elements that satisfy the predicate (here: even numbers).
   */
  @Test
  def testFilter(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
    val evens = numbers.filter(n => n % 2 == 0)
    evens.saveAsTextFile("output")
  }

  // Prints the request paths (7th space-separated field) of the apache.log lines
  // that contain the date 17/05/2015. Lines with fewer than seven fields are skipped
  // instead of failing the job with an ArrayIndexOutOfBoundsException.
  @Test
  def testFilterEx(): Unit = {
    sparkContext.textFile("input/apache.log")
      .filter(_.contains("17/05/2015"))
      .map(_.split(" "))
      .filter(_.length > 6)
      .map(fields => fields(6))
      .foreach(println)
  }

  /**
   * sample: derives a sampled RDD from an RDD.
   * withReplacement - whether an element may be picked more than once.
   * fraction        - with withReplacement = true, a number greater than 0;
   *                   with withReplacement = false, a number in [0, 1]
   *                   (0: take nothing, 1: take everything).
   * seed            - random seed; the same seed produces the same result.
   */
  @Test
  def testSample(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
    numbers
      .sample(withReplacement = false, fraction = 1.0)
      .foreach(print)
  }

  /**
   * distinct: removes duplicate elements; the argument is the number of
   * partitions of the result.
   */
  @Test
  def testDistinct(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6, 6, 6), 2)
    val unique = numbers.distinct(3)
    unique.saveAsTextFile("output")
    // NOTE(review): blocks for about 17 minutes; presumably keeps the driver alive
    // for inspection in the Spark web UI - confirm.
    Thread.sleep(1000000)
  }

  /**
   * Partitions (Person, Int) pairs with the custom partitioner MyPartition.
   */
  @Test
  def testMyPartitioner(): Unit = {
    val people = List(Person("tom", 18), Person("alisa", 22))
    val keyed: RDD[(Person, Int)] = sparkContext.makeRDD(people).map(person => (person, 1))
    keyed
      .partitionBy(new MyPartition(2))
      .saveAsTextFile("output")
  }

  /**
   * coalesce: changes the number of partitions, normally from more to fewer.
   * numPartitions - target partition count; when shuffle is false and the target is
   *                 larger than the parent's partition count, the count does not
   *                 grow and stays at the parent's value.
   * shuffle       - true: shuffle the data; false: no shuffle.
   */
  @Test
  def testCoalesce(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
    val coalesced = numbers.coalesce(3)
    coalesced.saveAsTextFile("output")
  }

  /**
   * repartition: always shuffles; normally used to increase the number of partitions.
   */
  @Test
  def testRepartition(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
    val widened = numbers.repartition(4)
    widened.saveAsTextFile("output")
    // NOTE(review): blocks for about 2.8 hours; presumably keeps the driver alive
    // for inspection in the Spark web UI - confirm.
    Thread.sleep(10000000)
  }
  /**
   * sortBy: sorts the elements by the key the function returns.
   */
  @Test
  def testSortBy(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4, 9, 0, 2), 1)
    val sorted = numbers.sortBy(n => n)
    sorted.saveAsTextFile("output")
  }

  @Test
  def testSortBy1(): Unit = {
    val rdd = sparkContext.makeRDD(
      List(Person("alisa", 12), Person("kob", 20), Person("carry", 22), Person("jack", 20)), 1)

    /**
     * Sort by Person.age first, then by name, both ascending:
     * .sortBy(x=>{(x.age,x.name)})
     */
    /**
     * Custom sort orders:
     *         1. Sort by age <descending>, then by name <ascending>, using
     * implicit def Tuple2[T1, T2]
     * (implicit ord1: Ordering[T1], ord2: Ordering[T2]): Ordering[(T1, T2)]
     * Pass the ordering explicitly in sortBy's second (curried) parameter list:
     * Ordering.Tuple2[key type, value type](ordering for the first component <reverse flips it>,
     * ordering for the second component), followed by a ClassTag for the tuple type.
     * .sortBy(x=>{(x.age,x.name)})(Ordering.Tuple2[Int,String](Ordering.Int.reverse,Ordering.String)
     * ,ClassTag(classOf[(Int, String)]))
     */
    /**
     *         2. Define a custom comparator: new Ordering[type to sort, may be an object]
     *            and override compare
     * val orderPerson = new Ordering[Person] {
     * override def compare(x: Person, y: Person): Int = {
     * var result: Int = x.age.compareTo(y.age)
     * if (result == 0) {
     * result = x.name.compareTo(y.name)
     * }
     * result
     * }
     * }
     * Pass the custom comparator directly in the curried parameter list,
     * together with ClassTag(type the comparator compares):
     *          rdd.sortBy(x=>x)(orderPerson,ClassTag(classOf[Person]))
     * .saveAsTextFile("output")
     *         3. Let the class itself extend Ordered[type to compare] and override compare
     * case class Person(var name: String, age: Int) extends Ordered[Person] {
     * override def compare(that: Person): Int = {
     * var result = -this.age.compareTo(that.age)
     * if (result == 0) {
     * result = this.name.compareTo(that.name)
     * }
     * result
     * }
     * }
     */
    // Variant 3 is the one in effect: Person extends Ordered[Person], so sorting by the
    // element itself orders by age descending and then by name ascending.
    rdd.sortBy(x => x)
      .saveAsTextFile("output")
  }

  /**
   * pipe: runs an external command (for example a shell script) and
   * produces an RDD from it.
   */
  @Test
  def testPipe(): Unit = {
    sparkContext.makeRDD(List("hi", "Hello", "how", "are", "you"), 1)
      // NOTE(review): the command is a machine-specific Windows path to java.exe;
      // this test presumably only works where that installation exists - confirm.
      .pipe(
        """
          |D:\myjava\bin\java.exe -version
          |""".stripMargin)
      .saveAsTextFile("output")
  }

  /**
   * ----------------------------------------------------------------------------------
   *
   *                                    双Value算子
   *
   * ----------------------------------------------------------------------------------
   */

  /**
   * intersection: the elements present in both RDDs.
   */
  @Test
  def testIntersection(): Unit = {
    val left = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6), 2)
    val right = sparkContext.makeRDD(List(4, 5, 6, 7, 8, 9), 2)
    val common = left.intersection(right)
    common.saveAsTextFile("output")
  }

  /**
   * union: the elements of both RDDs combined.
   */
  @Test
  def testUnion(): Unit = {
    val left = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6), 2)
    val right = sparkContext.makeRDD(List(4, 5, 6, 7, 8, 9), 2)
    val combined = left.union(right)
    combined.saveAsTextFile("output")
  }

  /**
   * subtract: keeps the elements of the first RDD that do not occur in the second.
   */
  @Test
  def testSubtract(): Unit = {
    val left = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6), 2)
    val right = sparkContext.makeRDD(List(4, 5, 6, 7, 8, 9), 2)
    val onlyInLeft = left.subtract(right)
    onlyInLeft.saveAsTextFile("output")
  }

  /**
   * cartesian: the Cartesian product of two RDDs. Output of one run:
   * (4,7)(4,4)(1,4)(1,7)(1,5)(4,5)(4,8)(4,6)(1,6)(1,8)(2,4)(5,4)
   * (4,9)(5,5)(2,5)(1,9)(2,6)(5,6)(5,7)(3,4)(2,7)(3,5)(5,8)(6,4)
   * (5,9)(3,6)(2,8)(6,7)(6,5)(6,8)(2,9)(6,9)(6,6)(3,7)(3,8)(3,9)
   */
  @Test
  def testCartesian(): Unit = {
    val left = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6), 2)
    val right = sparkContext.makeRDD(List(4, 5, 6, 7, 8, 9), 2)
    val pairs = left.cartesian(right)
    pairs.foreach(print)
  }

  /**
   * zip: pairs the elements of two RDDs position by position. Both RDDs must hold
   * the same number of elements; the element types may differ.
   * (1,4)(2,5)(3,6)(4,7)(5,8)(kk,9)
   */
  @Test
  def testZip(): Unit = {
    // Mixing Int and String makes the element type Any; it is spelled out here on purpose.
    val left = sparkContext.makeRDD(List[Any](1, 2, 3, 4, 5, "kk"), 2)
    val right = sparkContext.makeRDD(List(4, 5, 6, 7, 8, 9), 2)
    val zipped = left.zip(right)
    zipped.saveAsTextFile("output")
  }

  /**
   * zipWithIndex: pairs every element with its index.
   * (1,0)(2,1)(3,2)(4,3)(5,4)(kk,5)
   */
  @Test
  def testZipWithIndex(): Unit = {
    val elements = sparkContext.makeRDD(List[Any](1, 2, 3, 4, 5, "kk"), 1)
    val indexed = elements.zipWithIndex()
    indexed.saveAsTextFile("output")
  }

  /**
   * zipPartitions: combines the partitions of two RDDs pairwise with
   * f: (Iterator[T], Iterator[B]) => Iterator[V].
   *
   * Here f is iter1.zipAll(iter2, fillerForIter1, fillerForIter2), which returns an iterator:
   * when iter1 has fewer elements than iter2 it is padded with the first filler,
   * when iter2 has fewer elements than iter1 it is padded with the second filler.
   * (1,4)(2,5)(3,6)(4,7)(5,8)(4,9)(111,2)
   */
  @Test
  def testZipPartitions(): Unit = {
    val left: RDD[Int] = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 4), 1)
    val right: RDD[Int] = sparkContext.makeRDD(List(4, 5, 6, 7, 8, 9, 2), 1)
    val padded = left.zipPartitions(right) { (leftIter, rightIter) =>
      leftIter.zipAll(rightIter, 111, 111)
    }
    padded.foreach(print)
    // Alternative sink: padded.saveAsTextFile("output")
  }
  /**
   * ----------------------------------------------------------------------------------
   *
   *                                    Key-Value算子
   *
   * ----------------------------------------------------------------------------------
   */
  /**
   * partitionBy with the two built-in partitioners.
   * HashPartitioner: the number given is the number of partitions produced, e.g.
   *   partitionBy(new HashPartitioner(4))
   * RangePartitioner: the number given is not necessarily the number of partitions produced;
   * it also depends on the sample taken from the RDD passed as the second argument
   * (that RDD needs distinct keys; equal pairs only count once), e.g.
   *   partitionBy(new RangePartitioner(4,sparkContext.makeRDD(List((1,2),(2,3),(1,2),(2,3),(1,2),(7,7)))))
   */
  @Test
  def testPartitionBy(): Unit = {
    val keyed = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6), 2).map(n => (n, 1))
    // Hash variant: keyed.partitionBy(new HashPartitioner(4))
    val sampleSource = sparkContext.makeRDD(List((1, 2), (2, 3), (1, 2), (2, 3), (1, 2), (7, 7)))
    keyed
      .partitionBy(new RangePartitioner(4, sampleSource))
      .saveAsTextFile("output")
  }

  /**
   * reduceByKey: operates on an RDD of (K, V) pairs.
   * The values are grouped by key and combined with func: (V, V) => V:
   * the value of the first pair is the first argument, the value of the second pair
   * is the second argument, and the function returns a new value of type V.
   */
  @Test
  def testReduceByKey(): Unit = {
    val words = sparkContext.textFile("input/word.txt").flatMap(line => line.split(" "))
    val counts = words
      .map(word => (word, 1))
      .reduceByKey((left, right) => left + right, 1)
    counts.saveAsTextFile("output")
  }

  /**
   * groupByKey: for (K, V) RDDs; groups the values by key,
   * k -> CompactBuffer(v1, v2, v3). Here the grouped values are summed afterwards.
   */
  @Test
  def testGroupByKey(): Unit = {
    val grouped = sparkContext.textFile("input/word.txt")
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .groupByKey()
    grouped
      .map { case (word, ones) => (word, ones.sum) }
      .saveAsTextFile("output")
  }

  /**
   * def aggregateByKey[U: ClassTag] - for (K, V) RDDs
   * (zeroValue: U)        user-supplied initial value
   * (seqOp: (U, V) => U,  applied inside each partition to the values of one key
   * combOp: (U, U) => U)  merges the per-partition results of one key; causes a shuffle
   * : RDD[(K, U)]
   */
  @Test
  def testAggregateByKey(): Unit = {
    val pairs = sparkContext.makeRDD(List((1, 2), (4, 2), (2, 3), (0, 9), (1, 3), (2, 4)))
    val aggregated = pairs.aggregateByKey(100, 2)(
      (acc, value) => acc + value,
      (left, right) => left + right)
    aggregated.saveAsTextFile("output")

    // Output recorded for one run, listed per output partition:
    // 0 (0,109),(2,107),(4,102)
    // 1 (1,205)
  }

  /**
   * def foldByKey - for (K, V) RDDs
   * (zeroValue: V)      the initial value
   * (func: (V, V) => V) applied per key: on the first call the first argument is the
   *                     initial value and the second is the value of the first pair;
   *                     on each following call the first argument is the previous result
   *                     and the second is the value of the next pair, and so on.
   * : RDD[(K, V)]
   */
  @Test
  def testFoldByKey(): Unit = {
    val pairs = sparkContext.makeRDD(List((1, 2), (4, 2), (2, 3), (0, 9), (1, 3), (2, 4)))
    val folded = pairs.foldByKey(100, 2)((acc, value) => acc + value)
    folded.saveAsTextFile("output")
    // Output recorded for one run, listed per output partition:
    // 0 (0,109),(2,107),(4,102)
    // 1 (1,205)
  }

  /**
   * def combineByKey[C] - for (K, V) RDDs
   * createCombiner: V => C       turns the first value of a key into the combiner type C,
   *                              e.g. v => (v, 1)
   * mergeValue: (C, V) => C      folds each further value of the key into the combiner
   * mergeCombiners: (C, C) => C  merges the combiners of the same key from different
   *                              partitions; causes a shuffle
   * : RDD[(K, C)]
   *
   * Here C is (sum, count), and the final map divides sum by count with integer division.
   */
  @Test
  def testCombineByKey(): Unit = {
    val scores = sparkContext.makeRDD(
      List(("a", 88), ("b", 95), ("a", 91), ("b", 93), ("a", 95), ("b", 98)), 2)
    val sumAndCount = scores.combineByKey(
      (score: Int) => (score, 1),
      (acc: (Int, Int), score: Int) => (acc._1 + score, acc._2 + 1),
      (left: (Int, Int), right: (Int, Int)) => (left._1 + right._1, left._2 + right._2)
    )
    sumAndCount
      .map { case (key, (sum, count)) => (key, sum / count) }
      .saveAsTextFile("output")
  }

  /**
   * Summoning an implicit value: `implicitly[Int]` resolves to the implicit Int in scope.
   * NOTE(review): an implicit of a common type such as Int in class-wide scope is easy
   * to pick up by accident; it is kept here because the demo below depends on it.
   */
  implicit var a: Int = 10

  @Test
  def testImplicitly(): Unit = {
    println(implicitly[Int])
  }

  /**
   * sortByKey: the objects must first be wrapped into a (K, V) RDD.
   * sortByKey picks up the implicit Ordering[K] that is in scope.
   * The ordering below sorts by name descending, then by age ascending.
   */
  @Test
  def testSortByKey(): Unit = {
    // An immutable implicit with an explicit type keeps implicit resolution predictable.
    implicit val empOrdering: Ordering[Emp] = new Ordering[Emp] {
      override def compare(x: Emp, y: Emp): Int = {
        // Negation reverses the name comparison, giving descending order.
        val byNameDescending = -x.name.compareTo(y.name)
        if (byNameDescending != 0) byNameDescending
        else x.age.compareTo(y.age)
      }
    }
    sparkContext.makeRDD(
      List(Emp("a", 10), Emp("b", 11), Emp("b", 10), Emp("a", 20)), 1)
      .map(emp => (emp, 1))
      .sortByKey()
      .map(_._1)
      .saveAsTextFile("output")
  }

  /**
   * join (inner join): pairs every value of a key in rdd1 with every value of the
   * same key in rdd2; keys that occur on only one side are dropped.
   * (a,(88,89))
   * (a,(88,91))
   * (a,(91,89))
   * (a,(91,91))
   * (b,(95,95))
   */
  @Test
  def testJoin(): Unit = {
    val left = sparkContext.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("c", 88)), 2)
    val right = sparkContext.makeRDD(List(("a", 89), ("b", 95), ("a", 91), ("d", 66)), 2)
    val joined = left.join(right, 1)
    joined.saveAsTextFile("output")
  }

  /**
   * leftOuterJoin: every element of rdd1 is kept; a matching rdd2 value is wrapped
   * in Some, a missing match is None. Keys that occur only in rdd2 are dropped.
   * (a,(88,Some(89)))
   * (a,(88,Some(91)))
   * (a,(91,Some(89)))
   * (a,(91,Some(91)))
   * (b,(95,Some(95)))
   * (c,(88,None))
   */
  @Test
  def testLeftJoin(): Unit = {
    val left = sparkContext.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("c", 88)), 2)
    val right = sparkContext.makeRDD(List(("a", 89), ("b", 95), ("a", 91), ("d", 66)), 2)
    val joined = left.leftOuterJoin(right, 1)
    joined.saveAsTextFile("output")
  }

  /**
   * rightOuterJoin: every element of rdd2 (the right side) is kept; a matching rdd1 value
   * is wrapped in Some, a missing match is None. Keys that occur only in rdd1 are dropped.
   * (d,(None,66))
   * (a,(Some(88),89))
   * (a,(Some(88),91))
   * (a,(Some(91),89))
   * (a,(Some(91),91))
   * (b,(Some(95),95))
   */
  @Test
  def testRightJoin(): Unit = {
    val left = sparkContext.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("c", 88)), 2)
    val right = sparkContext.makeRDD(List(("a", 89), ("b", 95), ("a", 91), ("d", 66)), 2)
    val joined = left.rightOuterJoin(right, 1)
    joined.saveAsTextFile("output")
  }

  /**
   * fullOuterJoin: the elements of both sides are kept; each side is Some when the key
   * has a value there and None when it does not.
   * (d,(None,Some(66)))
   * (a,(Some(88),Some(89)))
   * (a,(Some(88),Some(91)))
   * (a,(Some(91),Some(89)))
   * (a,(Some(91),Some(91)))
   * (b,(Some(95),Some(95)))
   * (c,(Some(88),None))
   */
  @Test
  def testFullJoin(): Unit = {
    val left = sparkContext.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("c", 88)), 2)
    val right = sparkContext.makeRDD(List(("a", 89), ("b", 95), ("a", 91), ("d", 66)), 2)
    val joined = left.fullOuterJoin(right, 1)
    joined.saveAsTextFile("output")
  }

  /**
   * cogroup: for every key, collects the values of the left RDD and the values of the
   * right RDD into two separate collections.
   * (d,(CompactBuffer(),CompactBuffer(66)))
   * (a,(CompactBuffer(88, 91),CompactBuffer(89, 91)))
   * (b,(CompactBuffer(95),CompactBuffer(95)))
   * (c,(CompactBuffer(88),CompactBuffer()))
   */
  @Test
  def testCoGroup(): Unit = {
    val left = sparkContext.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("c", 88)), 2)
    val right = sparkContext.makeRDD(List(("a", 89), ("b", 95), ("a", 91), ("d", 66)), 2)
    val grouped = left.cogroup(right, 1)
    grouped.saveAsTextFile("output")
  }

  /**
   * ----------------------------------------------------------------------------------
   *
   *                                    行动算子
   *
   * ----------------------------------------------------------------------------------
   */

  /**
   * 行动算子用来提交Job！和转换算子不同的时，转换算子一般是懒执行的！转换算子需要行动算子触发！
   */

  /**
   * aggregate: each partition is folded starting from the initial value, then the
   * partition results are combined, again starting from the initial value.
   * The result here is 13: the elements sum to 10, each of the two partitions
   * contributes the initial value 1 once, and the final combination adds it once more.
   */
  @Test
  def testAggregate(): Unit = {
    val result = sparkContext.makeRDD(List(1, 2, 3, 4), 2)
      .aggregate(1)((acc, n) => acc + n, (left, right) => left + right)
    println(result)
    // NOTE(review): blocks for about 2.8 hours; presumably keeps the driver alive
    // for inspection in the Spark web UI - confirm.
    Thread.sleep(10000000)
  }

  /**
   * reduce: combines the elements of the RDD (here: sums them).
   *  def reduce(f: (T, T) => T): T
   *    First call:  the first argument is the first element, the second argument
   *                 is the second element.
   *    Later calls: the first argument is the previous result, the second argument
   *                 is the next element.
   */
  @Test
  def testReduce(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4, 5, 6, 7, 8))
    val total = numbers.reduce((left, right) => left + right)
    println(total)
  }

  /**
   * collect: returns all elements of the RDD to the driver as an Array.
   * Use with care: with too many elements the driver may run out of memory.
   */
  @Test
  def testCollect(): Unit = {
    val collected = sparkContext.makeRDD(List(9, 34, 4, 5)).collect()
    collected.foreach(print)
  }

  /**
   * count: the number of elements in the RDD. Only the resulting number is
   * returned to the driver.
   */
  @Test
  def testCount(): Unit = {
    val elementCount = sparkContext.makeRDD(List(9, 34, 4, 5)).count()
    println(elementCount)
  }

  /**
   * take: the first N elements.
   * Use with care: taking too many elements may exhaust the driver's memory.
   */
  @Test
  def testTake(): Unit = {
    val firstThree = sparkContext.makeRDD(List(9, 34, 4, 5)).take(3)
    firstThree.foreach(println)
  }

  /**
   * takeOrdered: the first N elements after sorting.
   * Use with care: taking too many elements may exhaust the driver's memory.
   */
  @Test
  def testTakeOrdered(): Unit = {
    // Ordering.Int.reverse sorts in descending order.
    val descending = Ordering.Int.reverse
    val topThree = sparkContext.makeRDD(List(9, 34, 4, 5)).takeOrdered(3)(descending)
    topThree.foreach(println)
  }

  /**
   * first: returns the first element.
   */
  @Test
  def testFirst(): Unit = {
    val words = sparkContext.makeRDD(List("kk", "pop", "dad"))
    println(words.first())
  }

  /**
   * fold: a simplified aggregate in which the same function is used inside the
   * partitions and between them, e.g. .fold(0)(_ + _).
   *   first parameter list:  the initial value
   *   second parameter list: the combining function
   * The initial value is applied once per partition and once more when the partition
   * results are merged on the driver. With the single partition used here the result
   * is 12 (10 + 1 + 1); with two partitions it would be 13.
   */
  @Test
  def testFold(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4), 1)
    val folded = numbers.fold(1)((left, right) => left + right)
    println(folded)
  }

  /**
   * countByValue: counts how often each distinct element occurs.
   * Map(4 -> 2, 1 -> 1, 3 -> 1, 2 -> 1)
   */
  @Test
  def testCountByValue(): Unit = {
    val occurrences = sparkContext.makeRDD(List(1, 2, 3, 4, 4), 1).countByValue()
    println(occurrences)
  }

  /**
   * countByKey: for RDD[(K, V)]; counts the number of pairs per key.
   * Map(1 -> 2, 2 -> 1, 3 -> 1)
   */
  @Test
  def testCountByKey(): Unit = {
    val pairs = sparkContext.makeRDD(List((1, 2), (1, 3), (2, 3), (3, 4)))
    println(pairs.countByKey())
  }

  /**
   * foreach: runs the function on every element of the RDD.
   */
  @Test
  def testForeach(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 3, 4, 5, 6, 7))
    numbers.foreach(print)
  }

  /**
   * saveAsTextFile: stores the RDD's data as text files.
   */
  @Test
  def testSave(): Unit = {
    val numbers = sparkContext.makeRDD(List(1, 2, 3, 4, 4), 1)
    numbers.saveAsTextFile("output")
  }

  /**
   * saveAsSequenceFile: stores a (K, V) RDD as a SequenceFile.
   */
  @Test
  def testSave1(): Unit = {
    val pairs = sparkContext.makeRDD(List((1, 2), (1, 3), (2, 3), (3, 4)), 1)
    pairs.saveAsSequenceFile("output")
  }

  /**
   * Constructing a RangePartitioner already triggers a job.
   */
  @Test
  def test(): Unit = {
    val sampleSource = sparkContext.makeRDD(List((1, 2)))
    // Only the construction matters here; the partitioner itself is never used.
    new RangePartitioner[Int, Int](3, sampleSource)
    // NOTE(review): blocks for about 17 minutes; presumably keeps the driver alive
    // so the triggered job can be inspected in the Spark web UI - confirm.
    Thread.sleep(1000000)
  }
}
无声---
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Spark算子使用大集合

Spark算子使用大集合 package com.atguigu.sparkTest.RDDS; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.shell.Command; import org.apache.hadoop.fs.{FileSystem, Path}; import org.apache.spark.rdd.RDD; import org.apache.spark.{HashPartitioner, P…
复制链接

扫一扫
专栏目录