// Java Spark custom accumulator implementation (AccumulatorV2<String, String>)
import org.apache.spark.util.AccumulatorV2;
import scala.runtime.BoxedUnit;
import java.util.*;
/**
 * Custom Spark accumulator that aggregates three kinds of state across tasks:
 * a tab-separated string (the official {@code value()}), a key/value map, and a
 * list of arbitrary objects.
 *
 * <p>Fixes over the previous revision:
 * <ul>
 *   <li>{@code copy()} now deep-copies {@code map} and {@code list}; it used to
 *       share the map reference, so mutating the copy corrupted the original
 *       (violates Spark's copy contract), and it dropped the list entirely.</li>
 *   <li>{@code reset()} and {@code isZero()} now account for {@code list}.</li>
 *   <li>{@code merge()} now merges {@code list} and no longer contains the
 *       decompiler artifact {@code BoxedUnit var4 = BoxedUnit.UNIT;}.</li>
 *   <li>Raw types replaced with parameterized ones throughout.</li>
 * </ul>
 *
 * <p>NOTE(review): like all {@code AccumulatorV2} subclasses this object is not
 * thread-safe by itself; Spark serializes per-task copies and merges them on the
 * driver, which is the only supported access pattern.
 */
public class VectorAccumulatorV2 extends AccumulatorV2<String, String> {

    public Map<String, Object> map = new HashMap<>();
    public List<Object> list = new ArrayList<>();
    private String string = "";

    /** An accumulator is "zero" only when all three accumulated structures are empty. */
    @Override
    public boolean isZero() {
        return string.isEmpty() && map.isEmpty() && list.isEmpty();
    }

    /**
     * Returns an independent copy of this accumulator.
     * The containers are copied (shallow per-element) so that later mutations of
     * the copy cannot leak into this instance.
     */
    @Override
    public AccumulatorV2<String, String> copy() {
        VectorAccumulatorV2 copy = new VectorAccumulatorV2();
        copy.string = this.string;
        copy.map = new HashMap<>(this.map);
        copy.list = new ArrayList<>(this.list);
        return copy;
    }

    /** Clears all accumulated state (string, map, and list). */
    @Override
    public void reset() {
        string = "";
        map.clear();
        list.clear();
    }

    /**
     * Appends {@code o} to the accumulated string, tab-separated.
     * Note: the very first add still produces a leading tab — preserved for
     * backward compatibility with existing consumers of {@link #value()}.
     *
     * @param o value to append; must not be null
     */
    @Override
    public void add(String o) {
        string = string + "\t" + o;
    }

    /**
     * Merges all entries of the given map into the accumulated map
     * (existing keys are overwritten).
     *
     * @param mapPara entries to add
     */
    public void addMap(Map<String, Object> mapPara) {
        map.putAll(mapPara);
    }

    /**
     * Appends a single element to the accumulated list.
     *
     * @param o element to add
     */
    public void addList(Object o) {
        list.add(o);
    }

    /**
     * Merges another {@code VectorAccumulatorV2} into this one: string is
     * concatenated, map entries are put over ours, list elements are appended.
     *
     * @throws UnsupportedOperationException if {@code other} is a different
     *         accumulator subclass
     */
    @Override
    public void merge(AccumulatorV2<String, String> other) {
        if (other instanceof VectorAccumulatorV2) {
            VectorAccumulatorV2 that = (VectorAccumulatorV2) other;
            this.string += that.string;
            this.map.putAll(that.map);
            this.list.addAll(that.list);
        } else {
            throw new UnsupportedOperationException(
                "Cannot merge " + this.getClass().getName()
                    + " with " + other.getClass().getName());
        }
    }

    /** @return the accumulated tab-separated string (Spark's official OUT value) */
    @Override
    public String value() {
        return string;
    }

    /** @return a defensive copy of the accumulated map */
    public Map<String, Object> getMapValue() {
        return new HashMap<>(map);
    }

    /** @return a defensive copy of the accumulated list (consistent with {@link #getMapValue()}) */
    public List<Object> getListValue() {
        return new ArrayList<>(list);
    }
}
// Usage example (driver-side code; depends on sc, list, GaoD, logger defined elsewhere)
// Build an RDD of (id -> address) tuples from the in-memory list.
JavaRDD<Tuple2<String, Map<String, String>>> parallelizeRDD = sc.parallelize(list);
// Register the custom accumulator with the SparkContext so task copies get merged back.
VectorAccumulatorV2 vector = new VectorAccumulatorV2();
sc.sc().register(vector, "myAccumulator");
// Geocode each address via GaoD and emit (id -> {province, city, area}) pairs.
// NOTE(review): accumulator updates happen inside a transformation (flatMapToPair),
// not an action — Spark only guarantees exactly-once accumulator semantics inside
// actions; task retries can re-apply these updates. Verify this is acceptable.
JavaPairRDD<String, Map<String, Object>> batchJavaPairRDD = parallelizeRDD.flatMapToPair(new PairFlatMapFunction<Tuple2<String, Map<String, String>>, String, Map<String, Object>>() {
private static final long serialVersionUID = 1L;
// NOTE(review): GaoD is constructed per deserialized function instance on each
// executor — confirm it is serializable and cheap/safe to create there.
public GaoD gaoD = new GaoD();
@Override
public Iterator<Tuple2<String, Map<String, Object>>> call(Tuple2<String, Map<String, String>> tuple2) throws Exception {
List<Tuple2<String, Map<String, Object>>> tuple2List = new ArrayList<>();
String inputAddress = "";
String inputId = "";
// NOTE(review): this loop keeps only the LAST entry of the map — presumably the
// map always holds exactly one (id -> address) pair; confirm against the producer.
for (Map.Entry entry : tuple2._2.entrySet()) {
inputAddress = (String) entry.getValue();
inputId = (String) entry.getKey();
}
// Skip tuples with a missing id or address (isNull is defined elsewhere in this file).
if (isNull(inputAddress) || isNull(inputId)) {
return tuple2List.iterator();
}
// One geocode lookup per input tuple; result is keyed by id.
Map<String, Object> geocodeList = gaoD.getGeocodeList(inputAddress, inputId);
if (geocodeList.size() > 0) {
for (Map.Entry entry : geocodeList.entrySet()) {
Map<String, Object> map = new HashMap<>();
Map allMap = (Map) entry.getValue();
// getdefult (defined elsewhere) presumably substitutes a default for null values — verify.
map.put("address_province", getdefult(allMap.get("province")));
map.put("address_city", getdefult(allMap.get("city")));
map.put("address_area", getdefult(allMap.get("district")));
tuple2List.add(new Tuple2<>(entry.getKey().toString(), map));
}
}
// NOTE(review): reset() is called on the executor for EVERY record, wiping previously
// accumulated state — the AccumulatorV2 contract says reset() is for the driver/Spark,
// not user code inside tasks. As written, only the last record's map per task copy
// survives until merge. Confirm whether this is intentional before changing it.
vector.reset();
vector.addMap(gaoD.getFinalMap());
return tuple2List.iterator();
}
// Keep only results that produced more than one field (i.e. a real geocode hit).
}).filter(new Function<Tuple2<String, Map<String, Object>>, Boolean>() {
@Override
public Boolean call(Tuple2<String, Map<String, Object>> tuple2) throws Exception {
return tuple2._2.size() > 1;
}
// Persist to disk so the count() action below doesn't recompute the geocoding calls.
}).persist(StorageLevel.DISK_ONLY());
// count() is the action that actually triggers execution and merges accumulator copies;
// reading the accumulator before an action would see no updates.
logger.warn("batchJavaPairRDD.count()=======" + batchJavaPairRDD.count());
logger.warn("vector map====" + vector.getMapValue());