package lcy.spark.sharedVariables.accumulators;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.AccumulatorV2;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.List;
/**
 * Custom accumulator demo: a BigInteger accumulator built on AccumulatorV2.
 */
public class CustomAccumulatorDemo {

    // Note: the accumulation must not depend on ordering. An order-sensitive
    // accumulator (e.g. a string-concatenating StringAccumulator) would
    // produce incorrect results, because add/merge can run in any order.
    public static class BigIntegerAccumulator extends AccumulatorV2<BigInteger, BigInteger> {

        private BigInteger num = BigInteger.ZERO;

        public BigIntegerAccumulator() {
        }

        public BigIntegerAccumulator(BigInteger num) {
            this.num = new BigInteger(num.toString());
        }

        // Whether this accumulator is still in its zero state.
        @Override
        public boolean isZero() {
            return num.compareTo(BigInteger.ZERO) == 0;
        }

        // Create a copy of this accumulator (used when shipping it to tasks).
        @Override
        public AccumulatorV2<BigInteger, BigInteger> copy() {
            return new BigIntegerAccumulator(num);
        }

        @Override
        public void reset() {
            num = BigInteger.ZERO;
        }

        @Override
        public void add(BigInteger num) {
            this.num = this.num.add(num);
        }

        // Merge partial results from other task-local accumulator copies.
        @Override
        public void merge(AccumulatorV2<BigInteger, BigInteger> other) {
            num = num.add(other.value());
        }

        @Override
        public BigInteger value() {
            return num;
        }
    }
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
        SparkContext sc = spark.sparkContext();

        // Instantiate the custom accumulator directly with new...
        BigIntegerAccumulator bigIntegerAccumulator = new BigIntegerAccumulator();
        // ...then register it on the SparkContext.
        sc.register(bigIntegerAccumulator, "bigIntegerAccumulator");

        List<BigInteger> numList = Arrays.asList(
                new BigInteger("9999999999999999999999"),
                new BigInteger("9999999999999999999999"),
                new BigInteger("9999999999999999999999"));
        Dataset<BigInteger> num = spark.createDataset(numList, Encoders.kryo(BigInteger.class));

        // Accumulate inside a transformation; the add only runs once an action fires.
        Dataset<BigInteger> num2 = num.map((MapFunction<BigInteger, BigInteger>) x -> {
            bigIntegerAccumulator.add(x);
            return x;
        }, Encoders.kryo(BigInteger.class));

        num2.count(); // action: triggers the map and the accumulator updates
        System.out.println("bigIntegerAccumulator: " + bigIntegerAccumulator.value());
    }
}
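With the three identical inputs above, the single count() action runs the map once per element, so the program should print bigIntegerAccumulator: 29999999999999999999997 (3 × 9999999999999999999999), a sum that would already overflow a built-in long accumulator.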
Notes:
- On the driver, read the accumulator's value with bigIntegerAccumulator.value().
- In executor-side code (for example inside the map function), use the accumulator object itself, i.e. call bigIntegerAccumulator.add(...); the aggregated value is only meaningful on the driver.
- Accumulators do not change the lazy nature of our RDDs/Datasets: the computation and the accumulator update only complete after an action runs.
- But if two actions share one transformation (such as a map that adds to the accumulator inside it), the transformation re-executes for every action, so the accumulator is updated once per action, producing results we do not want (see the sketch below).
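A minimal sketch of this pitfall and a common mitigation, written as a hypothetical continuation of main() above (it reuses num, num2, bigIntegerAccumulator, and the imports already shown); relying on cache() here is an assumption about the workload, since an evicted cached partition would be recomputed and accumulate again:

        // Two actions on the uncached num2 each re-run the map lambda,
        // so every element is added to the accumulator twice:
        num2.count();         // first action: the sum is accumulated once
        num2.collectAsList(); // map re-executes: the same sum is added again

        // Mitigation sketch: cache the mapped dataset so the transformation
        // (and its accumulator updates) normally runs only once.
        Dataset<BigInteger> cached = num.map((MapFunction<BigInteger, BigInteger>) x -> {
            bigIntegerAccumulator.add(x);
            return x;
        }, Encoders.kryo(BigInteger.class)).cache();

        cached.count();         // computes the dataset, fills the cache, updates the accumulator
        cached.collectAsList(); // served from the cache; no further accumulator updates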