1、combineByKey
combineByKey takes three functions. First, createCombiner turns the first value seen for each key in a partition into an initial combined value; next, mergeValue folds the remaining values of that key within the same partition into that combined value;
finally, mergeCombiners merges the per-partition results for the same key across partitions.
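The code below shows only the main methods; they assume the usual imports, roughly:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import scala.collection.mutable.{ArrayBuffer, ListBuffer}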
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("combineByKey")
val sc = new SparkContext(conf)
val rdd1: RDD[(String, Int)] = sc.makeRDD(List[(String, Int)](
("zhangsan", 10), ("zhangsan", 20), ("wangwu", 30),
("lisi", 40), ("zhangsan", 50), ("lisi", 60),
("wangwu", 70), ("wangwu", 80), ("lisi", 90)
), 3)
rdd1.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[(String,Int)]()
iter.foreach(tp=>{
arr.append(tp)
println("rdd1 partition index = "+index+",value = "+tp)
})
arr.iterator
}).count()
/**
* partition 0: ("zhangsan", 10), ("zhangsan", 20), ("wangwu", 30)
* partition 1: ("lisi", 40), ("zhangsan", 50), ("lisi", 60)
* partition 2: ("wangwu", 70), ("wangwu", 80), ("lisi", 90)
*
* after createCombiner on the first value of each key:
* partition 0: ("zhangsan", 10hello),("wangwu", 30hello)
* partition 1: ("lisi", 40hello), ("zhangsan", 50hello)
* partition 2: ("wangwu", 70hello),("lisi", 90hello)
*
* after merging the remaining values within each partition (mergeValue):
* partition 0: ("zhangsan", 10hello@20),("wangwu", 30hello)
* partition 1: ("lisi", 40hello@60), ("zhangsan", 50hello)
* partition 2: ("wangwu", 70hello@80),("lisi", 90hello)
*
* after merging across partitions (mergeCombiners): ("zhangsan", 10hello@20#50hello),("lisi", 40hello@60#90hello),("wangwu", 30hello#70hello@80)
*/
// rdd1.combineByKey((v:Int)=>{v+"hello"},(s:String,v:Int)=>{s+"@"+v},(s1:String,s2:String)=>{s1+"#"+s2})
val result: RDD[(String, String)] = rdd1.combineByKey(v=>{v+"hello"}, (s:String, v)=>{s+"@"+v}, (s1:String, s2:String)=>{s1+"#"+s2})
result.foreach(println)
}
2、aggregateByKey
aggregateByKey takes a zero value. Within each partition, the values of each key are folded into the zero value by the first function; then the per-partition results for the same key are merged across partitions by the second function.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("aggregateByKey").setMaster("local")
val sc = new SparkContext(conf)
val rdd1 = sc.makeRDD(List[(String,Int)](
("zhangsan",10),("zhangsan",20),("wangwu",30),
("lisi",40),("zhangsan",50),("lisi",60),
("wangwu",70),("wangwu",80),("lisi",90)
),3)
rdd1.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[(String,Int)]()
iter.foreach(tp=>{
arr.append(tp)
println("rdd1 partition index = "+index+",value = "+tp)
})
arr.iterator
}).count()
/**
* partition 0:
* ("zhangsan",10)
* ("zhangsan",20)
* ("wangwu",30)
* partition 1:
* ("lisi",40)
* ("zhangsan",50)
* ("lisi",60)
* partition 2:
* ("wangwu",70)
* ("wangwu",80)
* ("lisi",90)
*
* after folding within each partition, starting from the zero value "hello":
* 0:("zhangsan",hello~10~20),("wangwu",hello~30)
* 1:("zhangsan",hello~50),("lisi",hello~40~60)
* 2:("lisi",hello~90),("wangwu",hello~70~80)
*
* after merging across partitions: ("zhangsan",hello~10~20#hello~50),("lisi",hello~40~60#hello~90),("wangwu",hello~30#hello~70~80)
*/
val result: RDD[(String, String)] = rdd1.aggregateByKey("hello")((s, v)=>{s+"~"+v}, (s1, s2)=>{s1+"#"+s2})
result.foreach(print)
}
3、coalesce
coalesce(numPartitions, shuffle = false) increases or decreases the number of partitions; by default there is no shuffle.
If you try to grow from fewer to more partitions with shuffle disabled, coalesce has no effect.
coalesce(num, true) = repartition(num)
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local").setAppName("coalesce")
val sc = new SparkContext(conf)
val rdd1: RDD[String] = sc.parallelize(List[String](
"love1", "love2", "love3", "love4",
"love5", "love6", "love7", "love8",
"love9", "love10", "love11", "love12"),3)
val rdd2 :RDD[String] = rdd1.mapPartitionsWithIndex((index,iter)=>{
val list = ListBuffer[String]()
iter.foreach(one=>{
list.append(s"rdd1 partition = 【$index】,value = 【$one】")
})
list.iterator
},true)
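// growing 3 -> 4 partitions with shuffle = false has no effect: rdd3 keeps 3 partitions; coalesce(4, true) (i.e. repartition(4)) would really give 4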
val rdd3 = rdd2.coalesce(4,false)
val rdd4 = rdd3.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[String]()
iter.foreach(one=>{
arr.append(s"rdd3 partition = 【$index】,value = 【$one】")
})
arr.iterator
})
val results : Array[String] = rdd4.collect()
results.foreach(println)
sc.stop()
}
4、cogroup
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("cogroup")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[(String,String)](("zhangsan","female"),("zhangsan","female1"),("lisi","male"),("wangwu","female"),("maliu","male")),3)
val rdd2 = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("lisi",190),("wangwu",20),("tianqi",21)),4)
val resultRDD: RDD[(String, (Iterable[String], Iterable[Int]))] = rdd1.cogroup(rdd2)
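// every key from either RDD is kept; values are grouped per side, e.g. zhangsan -> (Iterable(female, female1), Iterable(18)) and tianqi -> (Iterable(), Iterable(21))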
resultRDD.foreach(info=>{
val key = info._1
val value1: List[String] = info._2._1.toList
val value2: List[Int] = info._2._2.toList
println("key = "+key+",value1 = "+value1+",value2 = "+value2)
})
println("resultRDD partitioin length = "+resultRDD.getNumPartitions)
sc.stop()
}
5、distinct
distinct removes duplicate elements. It produces a shuffle and is internally implemented as map + reduceByKey + map (see the sketch after the example).
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("distinct")
val sc = new SparkContext(conf)
val infos = sc.parallelize(List[String]("a","a","b","b","c","c","d"),4)
val result: RDD[String] = infos.distinct()
result.foreach(println)
sc.stop()
}
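As noted above, distinct is built from map + reduceByKey + map; a sketch of the equivalent pipeline, to be read as part of the main method above:
val manual: RDD[String] = infos.map(s => (s, null)).reduceByKey((a, _) => a).map(_._1)
manual.foreach(println) // prints the same distinct elements as infos.distinct()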
6、filter
filter keeps only the records for which the given function returns true.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("filter")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.makeRDD(List[Int](1,2,3,4,5))
val result = infos.filter(one=>{
one>3
})
result.foreach(println)
sc.stop()
}
7、flatMap
flatMap is one-to-many: processing one input record can produce multiple output records.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("map").setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[String]("hello spark","hello hdfs","hello bjsxt"))
val result = infos.flatMap(one=>{
one.split(" ")
})
result.foreach(println)
}
8、flatMapValues
(K,V) -> (K,V). Works on a (K,V)-format RDD: for each key, a single value can be expanded into multiple values, each paired with the original key.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("flatMapValues")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos: RDD[(String, String)] = sc.makeRDD(List[(String, String)](("zhangsan", "18"), ("lisi", "20"), ("wangwu", "30")))
val transInfo: RDD[(String, String)] = infos.mapValues(s => {
s + " " + "zhangsan18"
})
val result = transInfo.flatMapValues(s=>{
s.split(" ")
})
result.foreach(print)
sc.stop()
}
9、fullOuterJoin
fullOuterJoin of a (K,V)-format RDD with a (K,W)-format RDD keeps every key that appears in either RDD and yields (K,(Option[V],Option[W]))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("fullOuterJoin")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female"),("maliu","male")),3)
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20),("tianqi",21)),4)
val fullOuterJoin: RDD[(String, (Option[String], Option[Int]))] = nameRDD.fullOuterJoin(scoreRDD)
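// keys present on only one side are padded with None, e.g. (maliu,(Some(male),None)) and (tianqi,(None,Some(21)))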
fullOuterJoin.foreach(println)
println("fullOuterJoin RDD partition length = "+fullOuterJoin.getNumPartitions)
sc.stop()
}
10、groupBy
groupBy groups the data according to the rule given by the function.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("groupBy")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List[(String,Double)](("zhangsan",66.5),("lisi",33.2),("zhangsan",66.7),("lisi",33.4),("zhangsan",66.8),("wangwu",29.8)))
val result: RDD[(Boolean, Iterable[(String, Double)])] = rdd.groupBy(one => {
one._2 > 34
})
result.foreach(print)
// val rdd1: RDD[String] = sc.parallelize(List[String](
// "love1", "love2", "love3", "love4",
// "love5", "love6", "love7", "love8",
// "love9", "love10", "love11", "love12"),3)
//
// val result: RDD[(String, Iterable[String])] = rdd1.groupBy(one=>{one.split("")(4)})
// result.foreach(print)
}
11、groupByKey
groupByKey collects all values with the same key together: (K,V) => (K,Iterable[V])
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("groupByKey")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List[(String,Double)](("zhangsan",66.5),("lisi",33.2),("zhangsan",66.7),("lisi",33.4),("zhangsan",66.8),("wangwu",29.8)))
val rdd1 = rdd.groupByKey()
rdd1.foreach(info=>{
val name = info._1
val value: Iterable[Double] = info._2
val list: List[Double] = info._2.toList
print("name = "+name+",list = "+list)
})
sc.stop()
}
12、intersection
intersection returns the intersection of two RDDs. The two RDDs must have the same element type, and the result RDD takes the partition count of the parent with more partitions.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("intersection")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("zhangsan","lisi","wangwu"),5)
val rdd2 = sc.parallelize(List[String]("zhangsan","lisi","maliu"),4)
val intersectionRDD: RDD[String] = rdd1.intersection(rdd2)
intersectionRDD.foreach(println)
println("intersectionRDD partition length = "+intersectionRDD.getNumPartitions)
sc.stop()
}
13、join
join produces a shuffle. A (K,V)-format RDD joined with a (K,W)-format RDD on equal keys
yields (K,(V,W)).
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("join")
conf.setMaster("local")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female")))
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20)))
val joinRDD: RDD[(String, (String, Int))] = nameRDD.join(scoreRDD)
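// only keys present in both RDDs are kept: (zhangsan,(female,18)),(lisi,(male,19)),(wangwu,(female,20))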
joinRDD.foreach(println)
}
14、leftOuterJoin
leftOuterJoin of a (K,V)-format RDD with a (K,W)-format RDD
keeps every key that appears in the left RDD and yields (K,(V,Option[W]))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("leftOuterJoin")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female"),("maliu","male")))
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20),("tianqi",21)))
val leftOuterJoin: RDD[(String, (String, Option[Int]))] = nameRDD.leftOuterJoin(scoreRDD)
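// maliu has no score, so it appears as (maliu,(male,None)); tianqi exists only on the right side and is dropped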
leftOuterJoin.foreach(println)
sc.stop()
}
15、map
map is one-to-one: one record goes in and one record comes out.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("map").setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[String]("hello spark","hello hdfs","hello bjsxt"))
val result = infos.map(one=>{one.split(" ")})
result.foreach(arr=>{arr.foreach(println)})
sc.stop()
}
16、mapPartitions
mapPartitions iterates over the data one whole partition at a time. Compared with map, which handles one record per call, it performs better when there is per-partition setup cost, as in the database-connection example below.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("mapPartitions")
val sc = new SparkContext(conf)
val infos = sc.parallelize(List[String]("a","b","c","d","e","f","g"),4)
val result = infos.mapPartitions(iter=>{
println("创建数据库连接... ... ")
val array = ArrayBuffer[String]()
while(iter.hasNext){
val s = iter.next()
println("拼接sql... ... "+s)
array.append(s)
}
println("关闭数据库连接... ... ")
array.iterator
})
result.count()
sc.stop()
}
17、mapPartitionsWithIndex
mapPartitionsWithIndex gives you each partition's index together with the data in that partition.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("mapPartitionWithIndex")
val sc = new SparkContext(conf)
val lines = sc.textFile("./data/words",5)
val result = lines.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[String]()
iter.foreach(one=>{
arr.append(s"partition = 【$index】,value = $one")
})
arr.iterator
},true)
result.foreach(println)
sc.stop()
}
18、mapValues
mapValues works on (K,V)-format data and transforms only the value; the key stays unchanged.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("mapValues")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos: RDD[(String, String)] = sc.makeRDD(List[(String, String)](("zhangsan", "18"), ("lisi", "20"), ("wangwu", "30")))
val result: RDD[(String, String)] = infos.mapValues(s => {
s + " " + "zhangsan18"
})
result.foreach(print)
sc.stop()
}
19、reduceByKey
reduceByKey first groups records by key, then aggregates the values within each group with the given function. It works on (K,V)-format RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("reduceByKey")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(List[(String,Int)](("zhangsan",1),("zhangsan",2),("zhangsan",3),("lisi",100),("lisi",200)))
val result = infos.reduceByKey((v1,v2)=>{v1+v2})
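// zhangsan: 1+2+3 = 6, lisi: 100+200 = 300, so the result is (zhangsan,6),(lisi,300)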
result.foreach(println)
sc.stop()
}
20、repartition
repartition changes the number of partitions (up or down) and always produces a shuffle. coalesce(num, true) = repartition(num)
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("repartition")
val sc = new SparkContext(conf)
val rdd1: RDD[String] = sc.parallelize(List[String](
"love1", "love2", "love3", "love4",
"love5", "love6", "love7", "love8",
"love9", "love10", "love11", "love12"),3)
val rdd2 :RDD[String] = rdd1.mapPartitionsWithIndex((index,iter)=>{
val list = ListBuffer[String]()
iter.foreach(one=>{
list.append(s"rdd1 partition = 【$index】,value = 【$one】")
})
list.iterator
},true)
// val rdd3 = rdd2.repartition(4)
val rdd3 = rdd2.repartition(3)
val rdd4 = rdd3.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[String]()
iter.foreach(one=>{
arr.append(s"rdd3 partition = 【$index】,value = 【$one】")
})
arr.iterator
})
val results : Array[String] = rdd4.collect()
results.foreach(println)
sc.stop()
}
21、rightOuterJoin
rightOuterJoin of a (K,V)-format RDD with a (K,W)-format RDD
keeps every key that appears in the right RDD and yields (K,(Option[V],W))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("rightOuterJoin")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female"),("maliu","male")),3)
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20),("tianqi",21)),4)
val rightOuterJoin: RDD[(String, (Option[String], Int))] = nameRDD.rightOuterJoin(scoreRDD)
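// tianqi has no name entry, so it appears as (tianqi,(None,21)); maliu exists only on the left side and is dropped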
rightOuterJoin.foreach(println)
println("rightOuterJoin RDD partition length = "+rightOuterJoin.getNumPartitions)
sc.stop()
}
22、sample
sample draws a random sample; the parameters are sample(withReplacement, fraction, seed).
Difference between using and not using a seed:
with a seed, the same data source and the same arguments produce exactly the same sample on every run;
without a seed, each run over the same data source produces a different random sample.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("sample")
conf.setMaster("local")
val sc = new SparkContext(conf)
val lines = sc.textFile("./data/sampleData.txt")
val result = lines.sample(true,0.01,100)
result.foreach(println)
sc.stop()
}
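A minimal sketch of the seed behaviour described above, using an in-memory RDD instead of the file (names are illustrative):
val nums = sc.parallelize(1 to 1000)
println(nums.sample(false, 0.01, 100).collect().mkString(","))  // same seed (100) => the same elements every run
println(nums.sample(false, 0.01, 100).collect().mkString(","))  // identical to the previous line
println(nums.sample(false, 0.01).collect().mkString(","))       // no seed => a different sample each run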
23、sortBy
sortBy sorts by the key extracted by the first argument; the second argument (true/false) selects ascending or descending order.
It does not have to be applied to a (K,V)-format RDD.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("sortBy")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[(String,String)](("f","f"),("a","a"),("c","c"),("b","b")))
val result = infos.sortBy(tp=>{
tp._1
},false)
result.foreach(println)
val infos1 = sc.parallelize(Array[Int](400,200,500,100,300))
val result1 = infos1.sortBy(one=>{one/100},false)
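// sorts by one/100 in descending order: 500, 400, 300, 200, 100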
result1.foreach(println)
sc.stop()
}
24、sortByKey
sortByKey sorts by key (ascending by default) and works on (K,V)-format RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("sortByKey")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[(String,String)](("f","f"),("a","a"),("c","c"),("b","b")))
val result = infos.sortByKey(false)
result.foreach(println)
sc.stop()
}
25、subtract
subtract returns the difference of two RDDs: the elements of the first RDD that do not appear in the second.
The two RDDs must have the same element type; the result keeps the partition count of the RDD on which subtract is called.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("subtract")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("zhangsan","lisi","wangwu"),5)
val rdd2 = sc.parallelize(List[String]("zhangsan","lisi","maliu"),4)
val subtractRDD: RDD[String] = rdd1.subtract(rdd2)
subtractRDD.foreach(println)
println("subtractRDD partition length = "+subtractRDD.getNumPartitions)
sc.stop()
}
26、union
union merges two RDDs. They must have the same element type; they do not have to be (K,V)-format RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("union")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("zhangsan","lisi","wangwu","maliu"),3)
val rdd2 = sc.parallelize(List[String]("a","b","c","d"),4)
val unionRDD: RDD[String] = rdd1.union(rdd2)
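// union simply concatenates partitions, so the result has 3 + 4 = 7 partitions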
unionRDD.foreach(println)
println("unionRDD partitioin length = "+unionRDD.getNumPartitions)
sc.stop()
}
27、zip
zip combines two RDDs into one (K,V)-format RDD. Both RDDs must have the same number of partitions and the same number of elements in each partition.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("zip")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("a","b","c"),2)
val rdd2 = sc.parallelize(List[Int](1,2,3),numSlices = 2)
val result: RDD[(String, Int)] = rdd1.zip(rdd2)
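// elements are paired position by position: (a,1),(b,2),(c,3)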
result.foreach(print)
}
28、zipWithIndex
zipWithIndex pairs each element of the RDD with its index, producing a (K,V)-format RDD whose value is the element's Long index.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("zip")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("a","b","c"),2)
val rdd2 = sc.parallelize(List[Int](1,2,3),numSlices = 2)
val result: RDD[(String, Long)] = rdd1.zipWithIndex()
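// (a,0),(b,1),(c,2); rdd2 is not used by zipWithIndex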
result.foreach(print)
}