Some Spark Operators (Part 1)

Spark RDD operators

  • cartesian: Cartesian product

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context: SparkContext = new SparkContext(conf)
      val array1 = Array(1, 2, 3, 4, 5, 6)
      val rdd1: RDD[Int] = context.parallelize(array1)
      val array2 = Array(6, 7, 8, 9, 10)
      val rdd2: RDD[Int] = context.parallelize(array2)
      // Cartesian product
      println("----------cartesian-----------")
      val result1: RDD[(Int, Int)] = rdd1.cartesian(rdd2)
      result1.foreach(x => println(x))
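
    The result pairs every element of rdd1 with every element of rdd2, so its size is rdd1.count * rdd2.count (6 × 5 = 30 here). A quick sanity check, reusing result1 from above:

      // the Cartesian product holds |rdd1| * |rdd2| pairs (6 * 5 = 30 for this data)
      println(result1.count())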
    
  • union: merges two RDDs into one

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      val array1=Array(1,2,3,4,5,6)
      val rdd1: RDD[Int] = context.parallelize(array1)
      val array2=Array(6,7,8,9,10)
      val rdd2: RDD[Int] = context.parallelize(array2)
      val result2: RDD[Int] = rdd1.union(rdd2)
      result2.foreach(x=>println(x))
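
    Note that union keeps duplicates (6 appears in both arrays, so it shows up twice). A small follow-up sketch on the result2 above, using distinct to deduplicate:

      // union does not deduplicate: 11 elements, of which 6 appears twice
      println(result2.count())
      val deduped: RDD[Int] = result2.distinct()
      println(deduped.count())   // 10 after removing the duplicate 6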
    
  • join and subtract: key-based join and set difference

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      val list1: List[(String, Int)] =List(("a",1),("b",2),("c",3))
      val list2:List[(String,String)]=List(("a","haungbo"),("b","xuzheng"),("c","wuzun"))
      val rdd3: RDD[(String,Int)] = context.parallelize(list1)
      val rdd4: RDD[(String,String)] = context.parallelize(list2)
      // inner join by key
      println("----------join-----------")
      val result3: RDD[(String, (Int, String))] = rdd3.join(rdd4)
      result3.foreach(x => println(x))
      // set difference (rdd1 and rdd2 are the Int RDDs from the examples above)
      println("----------subtract-----------")
      val result4: RDD[Int] = rdd2.subtract(rdd1)
      result4.foreach(x => println(x))
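
    join is a key-based inner join, not a set union. If an actual set intersection of the two Int RDDs is wanted, intersection does that directly; a minimal sketch reusing rdd1 and rdd2 from the earlier examples:

      println("----------intersection-----------")
      // elements present in both rdd1 and rdd2 (only 6 for this data)
      val common: RDD[Int] = rdd1.intersection(rdd2)
      common.foreach(x => println(x))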
    
  • reduceByKey vs. reduceByKeyLocally: they differ in return type

      package day01
      
      import org.apache.spark.rdd.RDD
      import org.apache.spark.{SparkConf, SparkContext}
      
      object RDD_Test {
        def main(args: Array[String]): Unit = {
          val conf: SparkConf = new SparkConf()
          conf.setAppName("MyPartition")
          conf.setMaster("local")
          val context: SparkContext = new SparkContext(conf)
          val data: List[(String, Int)] = List(("math", 89), ("english", 84),
           ("hadoop", 56), ("math", 87),
            ("english", 122), ("hadoop", 89), ("hadoop", 100))
          val rdd: RDD[(String, Int)] = context.makeRDD(data)
          //RDD[(String, Int)]------------->RDD[(String, Int)]
          val result1: RDD[(String, Int)] = rdd.reduceByKey((x, y) => x + y)
          //RDD[(String, Int)]------------->collection.Map[String, Int]
          val result2: collection.Map[String, Int] = rdd.reduceByKeyLocally((x, y) => x + y)
        }
      }
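
    The practical difference: reduceByKey is a lazy transformation that returns an RDD, while reduceByKeyLocally is an action that immediately returns a Scala Map on the driver. A short sketch of how each result would be consumed, assuming the result1/result2 from the program above:

      // result1 is an RDD: it still needs an action such as collect/foreach
      result1.collect().foreach(println)
      // result2 is already a local Map on the driver: plain Scala iteration
      result2.foreach { case (subject, total) => println(subject + "\t" + total) }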
    
  • groupByKey: group values by key

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      println("----------groupByKey-----------")
      val data: List[(String, Int)] = List(("math", 89), ("english", 84), ("hadoop", 56), ("math", 87),
        ("english", 122), ("hadoop", 89), ("hadoop", 100))
      val rdd1: RDD[(String, Int)] = context.makeRDD(data)
      val result1: RDD[(String, Iterable[Int])] = rdd1.groupByKey(3)
      result1.foreach(x=>{
        println(x._1+"\t"+x._2.mkString(","))
      })
      // compute the average score per key
      result1.map(x=>{
        val sourceavg: Double = x._2.sum.toDouble/x._2.toList.size
        (x._1,sourceavg)
      }).foreach(x=>println(x))
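
    The same average can be written a bit more compactly with mapValues, which leaves the key untouched; a minimal alternative on the result1 grouping above:

      // mapValues transforms only the Iterable of scores, keeping the key
      result1.mapValues(scores => scores.sum.toDouble / scores.size)
        .foreach(x => println(x))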
    
  • reduceByKey

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      println("----------groupByKey-----------")
      val data: List[(String, Int)] = List(("math", 89), ("english", 84), ("hadoop", 56), ("math", 87),
        ("english", 122), ("hadoop", 89), ("hadoop", 100))
      val rdd1: RDD[(String, Int)] = context.makeRDD(data)
      println("----------reduceByKey-----------")
      val rdd2: RDD[(String, Int)] = rdd1.reduceByKey((x:Int, y:Int)=>x+y)
      rdd2.foreach(x=>println(x))
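
    foldByKey computes the same per-key totals with an explicit zero value; a minimal sketch on the same rdd1:

      println("----------foldByKey-----------")
      // same totals as reduceByKey, but with an explicit initial value of 0
      val rdd3: RDD[(String, Int)] = rdd1.foldByKey(0)((x, y) => x + y)
      rdd3.foreach(x => println(x))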
    
  • Using sortByKey and sortBy

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      println("----------groupByKey-----------")
      val data: List[(String, Int)] = List(("math", 89), ("english", 84), ("hadoop", 56), ("math", 87),
        ("english", 122), ("hadoop", 89), ("hadoop", 100))
      val rdd1: RDD[(String, Int)] = context.makeRDD(data)
      val rdd2: RDD[(String, Int)] = rdd1.reduceByKey((x:Int, y:Int)=>x+y)
      println("----------sortByKey-----------")
      // total score per subject in descending order (swap key/value, sortByKey, swap back)
      val rdd3 = rdd2.map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1))
      rdd3.foreach(x=>println(x))
      println("----------sortBy-----------")
      // sortBy can be used directly on key-value pairs
      rdd2.sortBy((x:(String,Int))=>x._2,true).foreach(x=>println(x))
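
    sortBy can also express the descending ordering by total score directly, avoiding the key/value swap needed for sortByKey; a minimal sketch on the same rdd2:

      // descending order by value, no swap needed
      rdd2.sortBy(x => x._2, ascending = false).foreach(x => println(x))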
    
  • Using aggregate and fold

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      val array1=Array(1,2,3,4,5,6,7,8,9,10)
      val rdd4: RDD[Int] = context.parallelize(array1)
      // sum of array1
      val sum: Int = rdd4.aggregate(0)((x:Int, y:Int)=>x+y, (x:Int, y:Int)=>x+y)
      println(sum)
      // average of array1: accumulate (sum, count), then divide
      val result2: (Int, Int) = rdd4.aggregate((0, 0))((u: (Int, Int), x: Int) => (u._1 + x, u._2 + 1),
        (x: (Int, Int), y: (Int, Int)) => (x._1 + y._1, x._2 + y._2))
      println(result2._1.toDouble/result2._2)
      println("----------fold求平均值-----------")
      //求平均值
      val result3 = rdd4.map(x=>(x,1)).fold((0,0))((x,y)=>(x._1+y._1,x._2+y._2))
      println(result3._1.toDouble/result3._2)
      // the same average with Scala's foldLeft on the local array
      val result4: (Int, Int) = array1.foldLeft((0, 0))((x, y) => (x._1 + y, x._2 + 1))
      println(result4._1.toDouble / result4._2)
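
    For comparison, the same average is also available from the built-in numeric actions on RDD[Int]; a minimal sketch on the same rdd4:

      // sum() and mean() come from DoubleRDDFunctions via an implicit conversion
      println(rdd4.sum() / rdd4.count())
      println(rdd4.mean())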
    
  • Using aggregateByKey

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      println("----------groupByKey-----------")
      val data: List[(String, Int)] = List(("math", 89), ("english", 84), ("hadoop", 56), ("math", 87),
        ("english", 122), ("hadoop", 89), ("hadoop", 100))
      val rdd1: RDD[(String, Int)] = context.makeRDD(data)
      println("----------aggregateByKey-----------")
      // compute the average score per key
      /**
        * zeroValue: U            the initial accumulator value
        * seqOp: (U, V) => U      merges each value with the accumulator within a partition
        * combOp: (U, U) => U     merges the accumulators of different partitions
        * aggregateByKey = groupByKey + aggregate: it aggregates only the values within each key group
        */
      val rdd5: RDD[(String, (Int, Int))] = rdd1.aggregateByKey((0, 0))(
        (x: (Int, Int), y: Int) => (x._1 + y, x._2 + 1),
        (x: (Int, Int), y: (Int, Int)) => (x._1 + y._1, x._2 + y._2))
      rdd5.foreach(x => {
        val sourceAvg = x._2._1.toDouble / x._2._2
        println(x._1, sourceAvg)
      })
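
    Instead of printing inside foreach, the (sum, count) pairs can be turned into an RDD of averages with mapValues; a small sketch on rdd5:

      // turn each (sum, count) accumulator into the average for that key
      val avgByKey: RDD[(String, Double)] = rdd5.mapValues(x => x._1.toDouble / x._2)
      avgByKey.foreach(x => println(x))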
    
  • Using combineByKey

      val conf: SparkConf = new SparkConf()
      conf.setAppName("MyPartition")
      conf.setMaster("local")
      val context:SparkContext = new SparkContext(conf)
      println("----------groupByKey-----------")
      val data: List[(String, Int)] = List(("math", 89), ("english", 84), ("hadoop", 56), ("math", 87),
        ("english", 122), ("hadoop", 89), ("hadoop", 100))
      val rdd1: RDD[(String, Int)] = context.makeRDD(data)
    
      println("----------combineByKey-----------")
    
      /**
        * Reference: https://www.jianshu.com/p/b77a6294f31c
        * createCombiner: V => C       takes the first value seen for a key and turns it into the
        *                              accumulator type C (an initialization step)
        * mergeValue: (C, V) => C      merges a value V into the accumulator C (within a partition)
        * mergeCombiners: (C, C) => C  merges two accumulators C (across partitions)
        */
      val rdd6: RDD[(String, (Int, Int))] = rdd1.combineByKey(x => (x, 1),
        (x: (Int, Int), y: Int) => (x._1 + y, x._2 + 1),
        (x: (Int, Int), y: (Int, Int)) => (x._1 + y._1, x._2 + y._2))
      val result6: Array[(String, Double)] = rdd6.map(x => {
        (x._1, x._2._1.toDouble / x._2._2)
      }).collect
      result6.foreach(x=>println(x))
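
    aggregateByKey is essentially a specialization of combineByKey whose createCombiner merges the first value into a fixed zero value, so the example above builds the same (sum, count) accumulators as the aggregateByKey section. A sketch of the equivalent aggregateByKey version on the same rdd1, for comparison:

      // same averages via aggregateByKey with a (sum, count) accumulator
      val viaAggregate: Array[(String, Double)] = rdd1
        .aggregateByKey((0, 0))((acc, v) => (acc._1 + v, acc._2 + 1),
          (a, b) => (a._1 + b._1, a._2 + b._2))
        .mapValues(acc => acc._1.toDouble / acc._2)
        .collect()
      viaAggregate.foreach(x => println(x))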
    