spark 聚合和广播

本文通过代码示例演示 Spark 的两种共享变量:广播变量(Broadcast)和累加器(Accumulator)。

1 广播变量:把只读的变量或少量数据广播到各个节点,避免随每个任务重复传输,从而起到优化作用

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;

/**
 * Example of a Spark broadcast variable.
 *
 * <p>Wraps an integer factor in a {@link Broadcast}, multiplies every element
 * of a small RDD by the broadcast value, and prints each product.
 *
 * @author Administrator
 */
public class BroadcastVariable {

   /**
    * Runs the example against a Spark master named {@code "local"}.
    *
    * @param args command-line arguments; not used
    */
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("BroadcastVariable")
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);

      // Close the context in a finally block so it is released even when
      // the job below throws.
      try {
         // In Java a shared (broadcast) variable is created by calling
         // broadcast() on the context; the result is a Broadcast<T>.
         final int factor = 3;
         final Broadcast<Integer> factorBroadcast = sc.broadcast(factor);

         List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5);

         JavaRDD<Integer> numbers = sc.parallelize(numberList);

         // Multiply every number in the collection by the externally defined factor.
         JavaRDD<Integer> multipleNumbers = numbers.map(new Function<Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1) throws Exception {
               // Read the wrapped value through value() instead of capturing
               // the outer local directly.
               int broadcastFactor = factorBroadcast.value();
               return v1 * broadcastFactor;
            }

         });

         // Print every product.
         multipleNumbers.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer t) throws Exception {
               System.out.println(t);
            }

         });
      } finally {
         sc.close();
      }
   }

}

2 Accumulator 累加器:通常用来做计数、求和等聚合统计,例如统计用户 session 会话的聚合数据

import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

/**
 * Example of a Spark accumulator variable.
 *
 * <p>Creates an integer {@link Accumulator} starting at 0, adds every element
 * of a small RDD to it inside a {@code foreach}, and prints the accumulated
 * value from the driver.
 *
 * @author Administrator
 */
public class AccumulatorVariable {

   /**
    * Runs the example against a Spark master named {@code "local"}.
    *
    * @param args command-line arguments; not used
    */
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Accumulator")
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);

      // Close the context in a finally block so it is released even when
      // the job below throws.
      try {
         // Create the accumulator by calling accumulator() on the context,
         // with 0 as the initial value.
         final Accumulator<Integer> sum = sc.accumulator(0);

         List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5);
         JavaRDD<Integer> numbers = sc.parallelize(numberList);

         numbers.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer t) throws Exception {
               // Inside the function, add() accumulates the element into the total.
               sum.add(t);
            }

         });

         // In the driver program, value() returns the accumulated total.
         System.out.println(sum.value());
      } finally {
         sc.close();
      }
   }

}

阅读更多
想对作者说点什么?

博主推荐

换一批

没有更多推荐了,返回首页