[1.4]Spark RDD经典Transformation算子实战

最新推荐文章于 2022-03-18 10:16:25 发布

一方架构

最新推荐文章于 2022-03-18 10:16:25 发布

阅读量1.9k

点赞数 1

分类专栏：开发文章标签： spark 大数据

本文链接：https://blog.csdn.net/pengych_321/article/details/51514196

版权

开发专栏收录该内容

58 篇文章 1 订阅

订阅专栏

参考

DT大数据梦工厂
 Spark API

场景

transformation类算子：map、filter、flatMap、groupByKey、reduceByKey、join、leftOuterJoin、rightOuterJoin与cogroup实战

实验

scala版

package main.scala

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.util.collection.CompactBuffer

/**
 * Hands-on demo of commonly used RDD transformation operators:
 * map, filter, flatMap, groupByKey, reduceByKey, join, leftOuterJoin,
 * rightOuterJoin and cogroup.
 *
 * Every demo builds a small RDD, applies one transformation, collects the
 * result to the driver and prints it line by line.
 */
object Trasformation {

  /** Application name used when `sparkContext` is called without a name. */
  private val DefaultAppName = "Trasformation Ops"

  /**
   * Default input of the word-count demo. Written as an absolute `file:///` URI:
   * a `file:` scheme followed by a relative path is not a valid URI.
   */
  private val DefaultWordCountInput = "file:///home/pengyucheng/java/rdd2dfram.txt"

  /** Runs every demo in turn against one local SparkContext. */
  def main(args: Array[String]): Unit = {
    val sc = sparkContext("just for fun")
    // Stop the context even when one of the demos throws, so it is not leaked.
    try {
      mapTransformation(sc)
      filterTransformation(sc)
      flatMapTransformation(sc)
      groupByKeyTransformation(sc)
      reduceByKeyTransformation(sc)
      joinTransformation(sc)
      leftJoinTransformation(sc)
      rightJoinTransformation(sc)
      cogroupTransformation(sc)
    } finally {
      sc.stop()
    }
  }

  /**
   * Creates a SparkContext that runs locally on all available cores.
   *
   * @param name optional application name; several values are joined with a
   *             single space, and `DefaultAppName` is used when none is given
   * @return a new SparkContext with master `local[*]`
   */
  def sparkContext(name: String*): SparkContext = {
    val appName = if (name.isEmpty) DefaultAppName else name.mkString(" ")
    val conf = new SparkConf().setMaster("local[*]").setAppName(appName)
    new SparkContext(conf)
  }

  /*
   * def map[U](f: Int => U)(implicit evidence$243: ClassTag[U]): RDD[U]
   *
   * Doubles every number in 1..10 and prints the results.
   */
  def mapTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 10)
    nums.map(_ * 2).collect().foreach(println)
  }

  /*
   * def filter(f: T => Boolean): RDD[T]
   *
   * Keeps the even numbers in 1..10 and prints them.
   */
  def filterTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 10)
    nums.filter(_ % 2 == 0).collect().foreach(println)
  }

  /*
   * def flatMap[U](f: String => TraversableOnce[U])(implicit evidence$204: ClassTag[U]): RDD[U]
   *
   * Splits every line on a single space and prints the resulting words.
   */
  def flatMapTransformation(sc: SparkContext): Unit = {
    val bigData = Seq("Scala Spark", "Java Hadoop")
    sc.parallelize(bigData).flatMap(_.split(" ")).collect().foreach(println)
  }

  /*
   * def groupByKey(): RDD[(Int, Iterable[String])]
   *
   * Groups the names by score (the key) and prints one (score, names) pair per key.
   */
  def groupByKeyTransformation(sc: SparkContext): Unit = {
    val scores = Seq((100, "Spark"), (90, "flink"), (60, "hadoop"), (90, "docker"))
    sc.parallelize(scores).groupByKey().collect().foreach(println)
  }

  /*
   * def reduceByKey(func: (V, V) => V): RDD[(K, V)]
   *
   * Word count: splits every line of the text file at `path` on a single space,
   * pairs each word with 1 and sums the counts per word.
   * `path` defaults to `DefaultWordCountInput`.
   */
  def reduceByKeyTransformation(sc: SparkContext, path: String = DefaultWordCountInput): Unit = {
    sc.textFile(path)
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collect()
      .foreach(println)
  }

  /*
   * def join[W](other: RDD[(Int, W)]): RDD[(Int, (String, W))]
   *
   * Inner join of names and scores on the student id.
   */
  def joinTransformation(sc: SparkContext): Unit = {
    val studentNames = Seq((1, "Spark"), (2, "hadoop"), (3, "Tachyon"))
    val studentScores = Seq((1, 100), (2, 60), (3, 90))

    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)

    names.join(scores).collect().foreach(println)
  }

  /*
   * def leftOuterJoin[W](other: RDD[(Int, W)]): RDD[(Int, (String, Option[W]))]
   *
   * Keeps every name; the score is an Option because id 4 has no entry in
   * the scores data.
   */
  def leftJoinTransformation(sc: SparkContext): Unit = {
    val studentNames = Seq((1, "Spark"), (2, "hadoop"), (3, "Tachyon"), (4, "Other"))
    val studentScores = Seq((1, 100), (2, 60), (3, 90))

    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)

    names.leftOuterJoin(scores).collect().foreach(println)
  }

  /*
   * def rightOuterJoin[W](other: RDD[(Int, W)]): RDD[(Int, (Option[String], W))]
   *
   * Keeps every score; the name is wrapped in an Option. Id 4 exists only in
   * the names data, so it does not appear in the result.
   */
  def rightJoinTransformation(sc: SparkContext): Unit = {
    val studentNames = Seq((1, "Spark"), (2, "hadoop"), (3, "Tachyon"), (4, "Other"))
    val studentScores = Seq((1, 100), (2, 60), (3, 90))

    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)

    names.rightOuterJoin(scores).collect().foreach(println)
  }

  /*
   * def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
   *
   * For every student id, pairs the collection of its names with the
   * collection of its scores.
   */
  def cogroupTransformation(sc: SparkContext): Unit = {
    val studentNames = Seq((1, "spark"), (2, "tackyon"), (3, "hadoop"))
    val studentScores = Seq((1, 100), (2, 90), (3, 70), (1, 110), (2, 95), (2, 60))

    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)

    names.cogroup(scores).collect().foreach(println)
  }
}

执行结果

-flatMap
16/05/27 09:38:17 INFO DAGScheduler: ResultStage 2 (collect at Trasformation.scala:43) finished in 0.036 s
Scala
Spark
Java
Hadoop
16/05/27 09:38:17 INFO DAGScheduler: Job 2 finished: collect at Trasformation.scala:43, took 0.062018 s

-groupByKey
16/05/27 09:38:17 INFO DAGScheduler: Job 3 finished: collect at Trasformation.scala:48, took 0.226175 s
(100,CompactBuffer(Spark))
(90,CompactBuffer(flink, docker))
(60,CompactBuffer(hadoop))
16/05/27 09:38:17 INFO SparkUI: Stopped Spark web UI at http://192.168.0.5:4040

-join
16/05/27 10:01:57 INFO DAGScheduler: Job 5 finished: collect at Trasformation.scala:65, took 0.099787 s
(2,(hadoop,60))
(1,(Spark,100))
(3,(Tachyon,90))
16/05/27 10:01:57 INFO SparkUI: Stopped Spark web UI at http://192.168.0.5:4041

-leftjoin
16/05/27 10:12:55 INFO DAGScheduler: Job 0 finished: collect at Trasformation.scala:86, took 0.805630 s
(4,(Other,None))
(2,(hadoop,Some(60)))
(1,(Spark,Some(100)))
(3,(Tachyon,Some(90)))
16/05/27 10:12:55 INFO SparkContext: Starting job: collect at Trasformation.scala:99

-rightjoin
16/05/27 10:12:55 INFO DAGScheduler: Job 1 finished: collect at Trasformation.scala:99, took 0.139523 s
(2,(Some(hadoop),60))
(1,(Some(Spark),100))
(3,(Some(Tachyon),90))
16/05/27 10:12:55 INFO ContextCleaner: Cleaned accumulator 3

-cogroup
16/05/27 11:10:03 INFO DAGScheduler: Job 0 finished: collect at Trasformation.scala:110, took 0.866773 s
(2,(CompactBuffer(tackyon),CompactBuffer(90, 95, 60)))
(1,(CompactBuffer(spark),CompactBuffer(100, 110)))
(3,(CompactBuffer(hadoop),CompactBuffer(70)))
16/05/27 11:10:03 INFO SparkUI: Stopped Spark web UI at http://192.168.0.5:4040

java版（cogroup）

package cool.pengych.spark.core;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
// Java version of the cogroup operator demo.
public class CogroupTest {
    /**
     * Builds a names RDD and a scores RDD keyed by student id, cogroups them
     * and prints the id, the names and the scores of every resulting group.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        // The application name matches this demo (it used to be a copy-paste leftover).
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CogroupTest");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Close the context even when the job fails, so it is not leaked.
        try {
            List<Tuple2<Integer,String>> nameList = Arrays.asList(
                    new Tuple2<Integer,String>(1,"spark"),
                    new Tuple2<Integer,String>(2,"tackyon"),
                    new Tuple2<Integer,String>(3,"hadoop")
                    );

            List<Tuple2<Integer,Integer>> scoreList = Arrays.asList(
                    new Tuple2<Integer,Integer>(1,100),
                    new Tuple2<Integer,Integer>(2,90),
                    new Tuple2<Integer,Integer>(3,70),
                    new Tuple2<Integer,Integer>(1,110),
                    new Tuple2<Integer,Integer>(2,95),
                    new Tuple2<Integer,Integer>(2,60)
                    );

            JavaPairRDD<Integer, String> names = sc.parallelizePairs(nameList);
            JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(scoreList);

            /*
             * <Integer> JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> org.apache.spark.api.java.JavaPairRDD.cogroup(JavaPairRDD<Integer, Integer> other)
             */
            JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> cogroups = names.cogroup(scores);

            // foreach is an action: it runs the printing function once per (id, (names, scores)) element.
            cogroups.foreach(new VoidFunction<Tuple2<Integer,Tuple2<Iterable<String>,Iterable<Integer>>>>() {
                private static final long serialVersionUID = 1L;
                @Override
                public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> t) throws Exception {
                    System.out.println("ID:" + t._1);
                    System.out.println("Name:" + t._2._1);
                    System.out.println("Score:" + t._2._2);
                }
            });
        } finally {
            sc.close();
        }
    }
}

执行结果

16/05/27 10:57:56 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 3 ms
ID:2
Name:[tackyon]
Score:[90, 95, 60]
16/05/27 10:57:56 INFO Executor: Finished task 0.0 in stage 2.0 (TID 4). 1165 bytes result sent to driver
ID:1
Name:[spark]
Score:[100, 110]
ID:3
Name:[hadoop]
Score:[70]
16/05/27 10:57:56 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 4) in 141 ms on localhost (1/2)

总结

1、join、leftOuterJoin、rightOuterJoin相当于关系型数据库中的inner join、left outer join、right outer join。leftOuterJoin:以左边的RDD为基准进行join，右边的RDD中可能没有对应的元素，所以右边的值用 Option 进行了封装（有值为 Some(v)，无值为 None）;rightOuterJoin同理，被封装的是左边的值.
2、cogroup：先把两个RDD各自按key分组，再按key把两边的分组结果配对，得到 (K, (Iterable[V], Iterable[W]))，类似于先group再join；如果某个key只出现在其中一个RDD中，另一边对应的是空集合.
3、CompactBuffer(Spark API居然找不到这个类的说明，源码有说明：org.apache.spark.util.collection.CompactBuffer) & ArrayBuffer:对于 groupBy等操作后的entry中同一K对应的V数量比较小的情况下，CompactBuffer比ArrayBuffer更高效。
“
* An append-only buffer similar to ArrayBuffer, but more memory-efficient for small buffers.
* ArrayBuffer always allocates an Object array to store the data, with 16 entries by default,
* so it has about 80-100 bytes of overhead. In contrast, CompactBuffer can keep up to two
* elements in fields of the main object, and only allocates an Array[AnyRef] if there are more
* entries than that. This makes it more efficient for operations like groupBy where we expect
* some keys to have very few elements.
”