cogroup 和 groupWith 是一组非常强大的函数,可以按键把多个键值对 RDD 组合在一起:除了调用该方法的 RDD 之外,一次最多还可以传入 3 个其他 RDD。
示例如下:
package com.cb.spark.sparkrdd;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
/**
 * Demonstrates {@code cogroup} and {@code groupWith} on {@link JavaPairRDD}s.
 *
 * <p>For every key that occurs in any of the participating RDDs, the result
 * holds one group of values per RDD; an RDD that does not contain the key
 * contributes an empty group. The sample outputs in the comments below only
 * illustrate the content: the order in which keys are printed is not
 * guaranteed.
 */
public class CogroupExample {

    /**
     * Builds a few small pair RDDs on a local Spark master and prints the
     * result of two-way and three-way cogroups to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Cogroup").setMaster("local");

        // try-with-resources stops the context even when one of the actions
        // below throws, instead of relying on a trailing stop() call.
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> keys = jsc.parallelize(Arrays.asList(1, 2, 1, 3), 2);
            JavaPairRDD<Integer, String> b = keys.mapToPair(k -> new Tuple2<>(k, "b"));
            JavaPairRDD<Integer, String> c = keys.mapToPair(k -> new Tuple2<>(k, "c"));

            // Two RDDs with identical keys:
            // (1,([b, b],[c, c])) (3,([b],[c])) (2,([b],[c]))
            b.cogroup(c).foreach(group -> System.out.print(group + " "));

            JavaPairRDD<Integer, String> fruits = jsc.parallelizePairs(
                    Arrays.asList(
                            new Tuple2<Integer, String>(1, "apple"),
                            new Tuple2<Integer, String>(2, "banana"),
                            new Tuple2<Integer, String>(3, "orange"),
                            new Tuple2<Integer, String>(4, "kiwi")),
                    2);
            JavaPairRDD<Integer, String> devices = jsc.parallelizePairs(
                    Arrays.asList(
                            new Tuple2<Integer, String>(5, "computer"),
                            new Tuple2<Integer, String>(1, "laptop"),
                            new Tuple2<Integer, String>(1, "desktop"),
                            new Tuple2<Integer, String>(4, "ipad")),
                    2);

            // Two RDDs with partially overlapping keys; a missing key yields an empty group:
            // (1,([apple],[laptop, desktop]))(3,([orange],[]))(5,([],[computer]))
            // (4,([kiwi],[ipad]))(2,([banana],[]))
            fruits.cogroup(devices).foreach(group -> System.out.print(group));

            // Three RDDs: every key now carries three groups (fruits, devices, b):
            // (4,([kiwi],[ipad],[]))(2,([banana],[],[b]))
            // (1,([apple],[laptop, desktop],[b, b]))(3,([orange],[],[b]))(5,([],[computer],[]))
            fruits.cogroup(devices, b).foreach(group -> System.out.print(group));

            // groupWith on two RDDs prints the same groups as the two-way cogroup above:
            // (1,([apple],[laptop, desktop]))(3,([orange],[]))(5,([],[computer]))
            // (4,([kiwi],[ipad]))(2,([banana],[]))
            fruits.groupWith(devices).foreach(group -> System.out.print(group));
        }
    }
}