aggregateByKey works like the aggregate function, except that it operates on the values that share the same key. Unlike aggregate, the initial (zero) value is applied only in the first, per-partition reduce (seqOp); it is not applied again in the second reduce (combOp) that merges the partition results.
An example follows:
package com.cb.spark.sparkrdd;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import scala.Tuple2;

public class AggregateByKeyExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("AggregateByKey").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Six key-value pairs distributed across two partitions.
        JavaPairRDD<String, Integer> pairRDD = jsc.parallelizePairs(
                Arrays.asList(new Tuple2<String, Integer>("cat", 2), new Tuple2<String, Integer>("cat", 5),
                        new Tuple2<String, Integer>("mouse", 4), new Tuple2<String, Integer>("cat", 12),
                        new Tuple2<String, Integer>("dog", 12), new Tuple2<String, Integer>("mouse", 2)),
                2);

        // Print each element with its partition index to show how the data is split.
        JavaRDD<String> mapPartitionsWithIndexRDD = pairRDD
                .mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<String, Integer>>, Iterator<String>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterator<String> call(Integer arg0, Iterator<Tuple2<String, Integer>> arg1)
                            throws Exception {
                        List<String> list = new ArrayList<>();
                        while (arg1.hasNext()) {
                            Tuple2<String, Integer> tuple2 = arg1.next();
                            list.add("partition " + arg0 + ":(" + tuple2._1 + "," + tuple2._2 + ")");
                        }
                        return list.iterator();
                    }
                }, true);
        mapPartitionsWithIndexRDD.foreach(x -> System.out.println(x));

        // Zero value 0: seqOp takes the per-partition maximum for each key,
        // combOp then sums those maxima across partitions, e.g.
        // cat = max(0,2,5) + max(0,12) = 5 + 12 = 17
        JavaPairRDD<String, Integer> aggregateByKeyRDD = pairRDD.aggregateByKey(0, (a, b) -> Math.max(a, b),
                (x, y) -> x + y);
        // (dog,12)
        // (cat,17)
        // (mouse,6)
        aggregateByKeyRDD.foreach(x -> System.out.println(x));

        // Zero value 100: every per-partition maximum becomes 100, so keys that
        // appear in both partitions sum to 200, e.g. cat = 100 + 100. Note the
        // zero value is not added again by combOp.
        JavaPairRDD<String, Integer> aggregateByKeyRDD1 = pairRDD.aggregateByKey(100, (a, b) -> Math.max(a, b),
                (x, y) -> x + y);
        // (dog,100)
        // (cat,200)
        // (mouse,200)
        aggregateByKeyRDD1.foreach(x -> System.out.println(x));

        jsc.stop();
    }
}
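To make the difference in zero-value handling concrete, here is a minimal sketch that calls the plain aggregate function instead; it assumes the same jsc context as above, and the numbers RDD is introduced here purely for illustration. With aggregate, the zero value also seeds the second reduce:

JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6), 2);
// seqOp: per-partition maximum, seeded with the zero value 100
// combOp: sum of the partition results, also seeded with the zero value 100
Integer total = numbers.aggregate(100, (a, b) -> Math.max(a, b), (x, y) -> x + y);
// each of the two partitions yields max(100, ...) = 100, and the final
// reduce starts from the zero value as well: 100 + 100 + 100 = 300
System.out.println(total); // 300

The analogous aggregateByKey computation would give 200, because there the zero value enters only the per-partition seqOp, never the combOp.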