[Note Migration][Spark][5] RDD — Resilient Distributed Dataset (Part 2)

Picking up from RDD — Resilient Distributed Dataset (Part 1), we first cover the remaining advanced operator, cogroup.
(8) Advanced operator cogroup, and the two ways it differs from the join operator:
[1] join connects two RDDs by key and produces one output record per matching pair, whereas cogroup, after connecting the two RDDs by key, gathers together all records that share the same key. Each element of the returned JavaPairRDD has the form <common key, Tuple2<Iterable of values from the calling RDD, Iterable of values from the argument RDD>>.
[2] join does not emit keys that fail to match on both sides, whereas cogroup emits every key; for such one-sided keys, one of the Iterables is empty (note: an empty container with no contents, not a null without any container structure). A Scala sketch contrasting the two behaviors follows the Java example below.
[3] Java implementation example

       private static void cogroupOp(){
             //1. Create the SparkConf
             SparkConf conf = new SparkConf().setAppName("cogroup").setMaster("local");

             //2. Create the JavaSparkContext
             JavaSparkContext sc = new JavaSparkContext(conf);

             //3. Build mock collections
             List<Tuple2<Integer, String>> stuList = Arrays.asList(
                          new Tuple2<Integer, String>(1, "tom-z"),
                          new Tuple2<Integer, String>(2, "tom-x"),
                          new Tuple2<Integer, String>(3, "tom"),
                          new Tuple2<Integer, String>(4, "amy"),
                          new Tuple2<Integer, String>(5, "fiona")
                         );

             List<Tuple2<Integer, Integer>> scoreList = Arrays.asList(
                          new Tuple2<Integer, Integer>(1, 80),
                          new Tuple2<Integer, Integer>(2, 30),
                          new Tuple2<Integer, Integer>(1, 70),
                          new Tuple2<Integer, Integer>(2, 60),
                          new Tuple2<Integer, Integer>(3, 100)
                         );

             //4. Parallelize the collections into two pair RDDs
             JavaPairRDD<Integer, String> students = sc.parallelizePairs(stuList);
             JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(scoreList);

             //5. Apply the cogroup operator to the two pair RDDs
             JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> resRDD = students.cogroup(scores);

             //6. Print the result RDD
             resRDD.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {

                   private static final long serialVersionUID = 1L;

                   @Override
                   public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> tuple) throws Exception {
                         System.out.println("id(key): " + tuple._1);
                         System.out.println("stu name list: " + tuple._2._1);
                         System.out.println("stu score list: " + tuple._2._2);
                         System.out.println("===============================");
                   }
             });

             //7. Close the SparkContext
             sc.close();
       }
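
To make difference [2] concrete, here is a minimal Scala sketch (not part of the original notes; it assumes an existing local SparkContext named sc, and the sample data is illustrative only) contrasting join and cogroup on a key that exists in only one of the two RDDs:

      // Key 3 exists only in `left`.
      val left  = sc.parallelize(Seq((1, "tom"), (2, "amy"), (3, "fiona")))
      val right = sc.parallelize(Seq((1, 80), (2, 30), (2, 60)))

      // join drops key 3 because it has no match on the right side;
      // the output holds only (1,(tom,80)), (2,(amy,30)), (2,(amy,60))
      left.join(right).collect().foreach(println)

      // cogroup keeps key 3: its right-hand Iterable is simply empty (not null),
      // printed roughly as (3,(CompactBuffer(fiona),CompactBuffer()))
      left.cogroup(right).collect().foreach(println)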

  1. Action operators (demonstrated in local mode)
    [Seven commonly used: reduce/collect/count/take/saveAsTextFile/countByKey/foreach]
    (1) reduce
    [1] Java implementation example
           /**
           * Sum the numbers in the collection
           */
           private static void reduceOp(){
                SparkConf conf = new SparkConf().setAppName("reduce").setMaster("local");

                JavaSparkContext sc = new JavaSparkContext(conf);

                JavaRDD<Integer> javaRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));

                Integer res = javaRDD.reduce(new Function2<Integer, Integer, Integer>() {

                       private static final long serialVersionUID = 1L;

                       @Override
                       public Integer call(Integer val1, Integer val2) throws Exception {
                            Integer temp = val1 + val2;
                            System.out.println(val1 + "+" + val2 + "=" + temp);
                            return temp;
                       }
                });

                System.out.println(res);

                sc.close();
          }
    
    [2] Scala implementation example
      def reduce(){
        val conf = new SparkConf().setAppName("reduceOp").setMaster("local")
        val sc = new SparkContext(conf)
        val numbers = Array(1, 2, 3, 4, 5)
        val srcRDD = sc.parallelize(numbers, 5)
        val res = srcRDD.reduce(_ + _)
        println(res)
      }
    
    (2) collect
    [1] collect fetches all elements of an RDD from the remote cluster to the local Driver. It is generally discouraged: if the RDD holds a lot of data, pulling it to the Driver consumes a great deal of network bandwidth and performs poorly, and with a very large RDD it can easily throw an OutOfMemory error. The final RDD is usually processed with the foreach operator instead.
    [2] Java implementation example
           private static void collectOp() {
            SparkConf conf = new SparkConf().setAppName("collect").setMaster("local");

            JavaSparkContext sc = new JavaSparkContext(conf);

            JavaRDD<Integer> javaRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));

            JavaRDD<Integer> resRDD = javaRDD.map(new Function<Integer, Integer>() {

                   private static final long serialVersionUID = 1L;

                   @Override
                   public Integer call(Integer num) throws Exception {
                         return num * 2;
                   }
            });

             //Use collect to pull the resRDD data, distributed across the remote cluster, back to the Driver
             //Doing so may also throw an OutOfMemory error
             //Usually, foreach is used instead to process the elements of the final RDD
            List<Integer> numList = resRDD.collect();

             for(Integer num : numList){
                  System.out.println(num);
            }
      }
    
    [3] Scala implementation example
      def collect(){
        val conf = new SparkConf().setAppName("collectOp").setMaster("local")
        val sc = new SparkContext(conf)
        val numbers = Array(1, 2, 3, 4, 5)
        val srcRDD = sc.parallelize(numbers, 5)
        val resRDD = srcRDD.map(num => num * 2)
        val localRes = resRDD.collect()
        for(num <- localRes){
          println(num)
        }
      }
    
    (3) count: rarely used
    [1] Java implementation example
      private static void countOp() {
           SparkConf conf = new SparkConf().setAppName("count").setMaster("local");

           JavaSparkContext sc = new JavaSparkContext(conf);

           JavaRDD<Integer> javaRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));

           long count = javaRDD.count();

           System.out.println("Total elements:" + count);

           sc.close();
     }
    
    [2] Scala implementation example
      def countOp(){
       val conf = new SparkConf().setAppName("countOp").setMaster("local")
       val sc = new SparkContext(conf)
       val numbers = Array(1, 2, 3, 4, 5)
       val srcRDD = sc.parallelize(numbers, 5)
       val count = srcRDD.count()
       println(count)
     }
    
    (4) take
    [1] Similar to collect, take fetches the first n elements of the calling RDD to the local Driver; the difference lies in how much data is retrieved.
    [2] Java implementation example
       private static void takeOp(){
           SparkConf conf = new SparkConf().setAppName("take").setMaster("local");

           JavaSparkContext sc = new JavaSparkContext(conf);

           JavaRDD<Integer> javaRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));

           List<Integer> numList = javaRDD.take(3);

           for(Integer num : numList){
                System.out.println(num);
           }
     }
    
    [3] Scala implementation example
       def takeOp{
           val conf = new SparkConf().setAppName("takeOp").setMaster("local")
           val sc = new SparkContext(conf)
           val numbers = Array(1, 2, 3, 4, 5)
           val srcRDD = sc.parallelize(numbers, 5)
           val res = srcRDD.take(3)
           for(num <- res){
             println(num)
           }
      }
    
    (5) saveAsTextFile
    [1] Saves the RDD's data to files (local filesystem or HDFS). Note that only a directory path can be specified; in practice, the data is written under that directory as a _SUCCESS marker file plus part-xxxxx files. A small Scala sketch follows the Java example below.
    [2] Java implementation example
     private static void saveAsFileOp(){
           SparkConf conf = new SparkConf().setAppName("save").setMaster("local");

           JavaSparkContext sc = new JavaSparkContext(conf);

           JavaRDD<Integer> javaRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));

           javaRDD.saveAsTextFile("C:\\Users\\Z-Jay\\Desktop\\saveAsFile.txt");

           sc.close();
     }
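
    As a reminder that the argument is a directory rather than a single file, here is a minimal Scala sketch (not from the original notes; it assumes a local SparkContext named sc and uses a hypothetical output path) that writes an RDD out and reads the part files back:

     // saveAsTextFile receives a directory path; Spark creates the directory
     // (it must not already exist) and writes part-00000, part-00001, ...
     // plus a _SUCCESS marker file into it.
     val nums = sc.parallelize(1 to 6, 2)
     nums.saveAsTextFile("/tmp/saveAsFileDemo")   // hypothetical output directory

     // sc.textFile on the same directory reads all part files back.
     println(sc.textFile("/tmp/saveAsFileDemo").count())   // 6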
    
    (6) countByKey
    [1] Has the same effect as SELECT key, COUNT(*) … GROUP BY key in SQL; returns a Map from each key to its record count.
    [2] Java implementation example
      private static void countByKey(){
            SparkConf conf = new SparkConf().setAppName("countByKey").setMaster("local");

            JavaSparkContext sc = new JavaSparkContext(conf);

            List<Tuple2<String, Integer>> scores = Arrays.asList(
                         new Tuple2<String, Integer>("class01", 1),
                         new Tuple2<String, Integer>("class02", 0),
                         new Tuple2<String, Integer>("class01", 1),
                         new Tuple2<String, Integer>("class01", 0),
                         new Tuple2<String, Integer>("class02", 0)
                        );

            JavaPairRDD<String, Integer> javaPairRDD = sc.parallelizePairs(scores);

            Map<String, Long> resMap = javaPairRDD.countByKey();

            System.out.println(resMap);
      }
    
    [3] Scala implementation example
       def countByKeyOp(){
        val conf = new SparkConf().setAppName("countByKey").setMaster("local")
        val sc = new SparkContext(conf)
        val stuList = Array(
          Tuple2("class01", "leo"),
          Tuple2("class02", "tom"),
          Tuple2("class01", "amy"),
          Tuple2("class01", "jack"),
          Tuple2("class02", "lily")
        )
        val stus = sc.parallelize(stuList, 5)
        val resMap = stus.countByKey()
        println(resMap)
      }
    
    (7) foreach: the most commonly used; it iterates over the elements on the remote cluster (executor side), which is far more efficient than pulling everything to the local Driver with collect.
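    [1] A minimal Scala sketch (not from the original notes; it assumes a local SparkContext named sc) — note that in cluster mode the println output appears in the executor logs rather than on the Driver:

      // foreach applies the function to each element on the executors,
      // without pulling the whole RDD back to the Driver (unlike collect).
      val nums = sc.parallelize(Array(1, 2, 3, 4, 5))
      nums.foreach(num => println(num * 2))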