package lcy.spark.sharedVariables.accumulators;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.AccumulatorV2;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.List;
/**
 * Custom accumulator demo: a BigInteger accumulator built on AccumulatorV2.
 */
public class CustomAccumulatorDemo {

    // Note: the accumulation must not depend on ordering. An order-sensitive
    // accumulator (e.g. a string-concatenating StringAccumulator) would
    // produce incorrect results, because add/merge can run in any order.
    public static class BigIntegerAccumulator extends AccumulatorV2<BigInteger, BigInteger> {

        private BigInteger num = BigInteger.ZERO;

        public BigIntegerAccumulator() {
        }

        public BigIntegerAccumulator(BigInteger num) {
            this.num = new BigInteger(num.toString());
        }

        // Whether this accumulator is still in its zero state.
        @Override
        public boolean isZero() {
            return num.compareTo(BigInteger.ZERO) == 0;
        }

        // Create a copy of this accumulator (used when shipping it to tasks).
        @Override
        public AccumulatorV2<BigInteger, BigInteger> copy() {
            return new BigIntegerAccumulator(num);
        }

        @Override
        public void reset() {
            num = BigInteger.ZERO;
        }

        @Override
        public void add(BigInteger num) {
            this.num = this.num.add(num);
        }

        // Merge partial results from other task-local accumulator copies.
        @Override
        public void merge(AccumulatorV2<BigInteger, BigInteger> other) {
            num = num.add(other.value());
        }

        @Override
        public BigInteger value() {
            return num;
        }
    }
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
        SparkContext sc = spark.sparkContext();

        // Instantiate the custom accumulator directly with new...
        BigIntegerAccumulator bigIntegerAccumulator = new BigIntegerAccumulator();
        // ...then register it on the SparkContext.
        sc.register(bigIntegerAccumulator, "bigIntegerAccumulator");

        List<BigInteger> numList = Arrays.asList(
                new BigInteger("9999999999999999999999"),
                new BigInteger("9999999999999999999999"),
                new BigInteger("9999999999999999999999"));
        Dataset<BigInteger> num = spark.createDataset(numList, Encoders.kryo(BigInteger.class));

        // Accumulate inside a transformation; the add only runs once an action fires.
        Dataset<BigInteger> num2 = num.map((MapFunction<BigInteger, BigInteger>) x -> {
            bigIntegerAccumulator.add(x);
            return x;
        }, Encoders.kryo(BigInteger.class));

        num2.count(); // action: triggers the map and the accumulator updates
        System.out.println("bigIntegerAccumulator: " + bigIntegerAccumulator.value());
    }
}
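With the three identical inputs above, the single count() action runs the map once per element, so the program should print bigIntegerAccumulator: 29999999999999999999997 (3 × 9999999999999999999999), a sum that would already overflow a built-in long accumulator.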
Notes:
- On the driver, read the accumulator's value with bigIntegerAccumulator.value().
- In executor-side code (for example inside the map function), use the accumulator object itself, i.e. call bigIntegerAccumulator.add(...); the aggregated value is only meaningful on the driver.
- Accumulators do not change the lazy nature of our RDDs/Datasets: the computation and the accumulator update only complete after an action runs.
- But if two actions share one transformation (such as a map that adds to the accumulator inside it), the transformation re-executes for every action, so the accumulator is updated once per action, producing results we do not want (see the sketch below).
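A minimal sketch of this pitfall and a common mitigation, written as a hypothetical continuation of main() above (it reuses num, num2, bigIntegerAccumulator, and the imports already shown); relying on cache() here is an assumption about the workload, since an evicted cached partition would be recomputed and accumulate again:

        // Two actions on the uncached num2 each re-run the map lambda,
        // so every element is added to the accumulator twice:
        num2.count();         // first action: the sum is accumulated once
        num2.collectAsList(); // map re-executes: the same sum is added again

        // Mitigation sketch: cache the mapped dataset so the transformation
        // (and its accumulator updates) normally runs only once.
        Dataset<BigInteger> cached = num.map((MapFunction<BigInteger, BigInteger>) x -> {
            bigIntegerAccumulator.add(x);
            return x;
        }, Encoders.kryo(BigInteger.class)).cache();

        cached.count();         // computes the dataset, fills the cache, updates the accumulator
        cached.collectAsList(); // served from the cache; no further accumulator updates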