Spark RDD(五)
groupByKey
groupByKey会将RDD[key,value] 按照相同的key进行分组,形成RDD[key,Iterable[value]]的形式
scala版本
// Build a pair RDD and group all values under the same key into one Iterable.
val scores = sc.parallelize(Seq(("a", 97), ("a", 87), ("b", 99), ("b", 97), ("c", 100)))
val grouped = scores.groupByKey()
// collect pulls the grouped pairs to the driver; print one (key, Iterable) per line.
grouped.collect().foreach(println)
java版本
// Parallelize a list of tuples; note this is still a JavaRDD of Tuple2, not yet a pair RDD.
JavaRDD<Tuple2<String, Integer>> tupleRDD = sc.parallelize(Arrays.asList(
        new Tuple2<>("a", 1),
        new Tuple2<>("a", 11),
        new Tuple2<>("a", 3),
        new Tuple2<>("b", 5),
        new Tuple2<>("b", 2),
        new Tuple2<>("d", 3),
        new Tuple2<>("c", 9)
));
// Convert JavaRDD<Tuple2<String, Integer>> into a JavaPairRDD so key-based
// operations such as groupByKey become available.
JavaPairRDD<String, Integer> pairs = JavaPairRDD.fromJavaRDD(tupleRDD);
// groupByKey gathers every value sharing a key into a single Iterable.
JavaPairRDD<String, Iterable<Integer>> grouped = pairs.groupByKey();
// Bring the result to the driver and print each (key, values) pair.
for (Tuple2<String, Iterable<Integer>> entry : grouped.collect()) {
    System.out.println(entry);
}
cogroup
对两个RDD中的<k,v>元素,每个RDD中相同key中的元素分别聚合成一个集合
scala版本
- rdd1.cogroup(rdd2)会将rdd1和rdd2按照相同的key进行分组,得到(key,(Iterable[value1],Iterable[value2]))的形式
- cogroup也可以多个进行分组
// Three pair RDDs that share some keys, used to demonstrate cogroup.
val rdd1 = sc.parallelize(List(("a",1),("a",11),("b",5),("b",2),("c",9)))
val rdd2 = sc.parallelize(List(("b",15),("b",2),("d",13),("c",4)))
val rdd3 = sc.parallelize(List(("a",85),("b",45),("d",33),("c",99)))
// In Scala, cogroup is called directly on the pair RDD — no JavaPairRDD
// conversion is needed (the fromJavaRDD calls belong to the Java API only
// and their results were never used).
// Result per key: (key, (Iterable[values from rdd1], Iterable[values from rdd2], ...))
val Rdd1CoRdd2 = rdd1.cogroup(rdd2)
// cogroup also accepts several RDDs at once.
val Rdd1CoRdd2CoRdd3 = rdd1.cogroup(rdd2,rdd3)
println("--------rdd1 cogroup rdd2------")
Rdd1CoRdd2.collect.foreach(println)
println("------rdd1 cogroup (rdd2,rdd3)------ ")
Rdd1CoRdd2CoRdd3.collect.foreach(println)
java版本
// Two (plus one spare) tuple RDDs with overlapping keys for the cogroup demo.
JavaRDD<Tuple2<String, Float>> rdd1 = sc.parallelize(Arrays.asList(
        new Tuple2<String, Float>("a", 11f),
        new Tuple2<String, Float>("a", 111f),
        new Tuple2<String, Float>("b", 11f),
        new Tuple2<String, Float>("c", 11f)
));
JavaRDD<Tuple2<String, Float>> rdd2 = sc.parallelize(Arrays.asList(
        new Tuple2<String, Float>("b", 21f),
        new Tuple2<String, Float>("c", 11f),
        new Tuple2<String, Float>("d", 21f),
        new Tuple2<String, Float>("b", 21f)
));
JavaRDD<Tuple2<String, Float>> rdd3 = sc.parallelize(Arrays.asList(
        new Tuple2<String, Float>("a", 31f),
        new Tuple2<String, Float>("b", 31f),
        new Tuple2<String, Float>("d", 31f),
        new Tuple2<String, Float>("a", 31f)
));
// Promote each tuple RDD to a pair RDD so cogroup is available.
JavaPairRDD<String, Float> rdd1Pair = JavaPairRDD.fromJavaRDD(rdd1);
JavaPairRDD<String, Float> rdd2Pair = JavaPairRDD.fromJavaRDD(rdd2);
JavaPairRDD<String, Float> rdd3Pair = JavaPairRDD.fromJavaRDD(rdd3);
// cogroup yields, for every key seen in either RDD, a pair of Iterables:
// (values from rdd1Pair, values from rdd2Pair). Missing keys get an empty Iterable.
JavaPairRDD<String, Tuple2<Iterable<Float>, Iterable<Float>>> cogrouped = rdd1Pair.cogroup(rdd2Pair);
// Collect to the driver and print each grouped entry.
for (Tuple2<String, Tuple2<Iterable<Float>, Iterable<Float>>> entry : cogrouped.collect()) {
    System.out.println(entry);
}
subtractByKey
subtractByKey和基本转换操作中的subtract类似,只不过这里是针对K的,返回在主RDD中出现,并且不在otherRDD中出现的元素
scala版本
// Two pair RDDs keyed by string; the value (1) is irrelevant to subtractByKey.
val rdd1 = sc.parallelize(Seq("a", "b", "c", "d")).map(key => (key, 1))
val rdd2 = sc.parallelize(Seq("b", "c", "e", "f")).map(key => (key, 1))
// Keep only pairs whose key appears in rdd1 but NOT in rdd2 -> ("a",1), ("d",1).
rdd1.subtractByKey(rdd2).collect().foreach(println)
java版本
// Source key sets: subtractByKey keeps rdd1 keys absent from rdd2 ("a" and "d").
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("a", "b", "c", "d"));
JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("c", "c", "e", "f", "b"));
// Turn each plain string into a (key, 1) pair; the value plays no role in subtractByKey.
JavaPairRDD<String, Integer> pairRDD1 = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String key) throws Exception {
        return new Tuple2<>(key, 1);
    }
});
JavaPairRDD<String, Integer> pairRDD2 = rdd2.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String key) throws Exception {
        return new Tuple2<>(key, 1);
    }
});
// Pairs from pairRDD1 whose key never occurs in pairRDD2.
JavaPairRDD<String, Integer> remaining = pairRDD1.subtractByKey(pairRDD2);
System.out.println("------subtractByKeyRDD------");
for (Tuple2<String, Integer> pair : remaining.collect()) {
    System.out.println(pair);
}
join
内连接,用于将两个key相同的rdd关联起来,与mysql中的join类似
scala版本
// 转换成pairRDD
// Build two pair RDDs keyed by string (value is always 1).
val rdd1 = sc.parallelize(List("a","b","c")).map((_,1))
val rdd2 = sc.parallelize(List("c","c","d","e","f","b")).map((_,1))
// Inner join on key: only keys present in BOTH RDDs survive ("b" and "c").
// The original called rdd.join(rdd4) on names that were never defined.
rdd1.join(rdd2).collect.foreach(println)
java版本
// Left and right key sets for the inner join; only "b" and "c" exist in both.
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("a", "b", "c"));
JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("c", "c", "d", "e", "f", "b"));
// Lift each string to a (key, 1) pair so the pair-RDD join API applies.
JavaPairRDD<String, Integer> pairRDD1 = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String key) throws Exception {
        return new Tuple2<>(key, 1);
    }
});
JavaPairRDD<String, Integer> pairRDD2 = rdd2.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String key) throws Exception {
        return new Tuple2<>(key, 1);
    }
});
// Inner join: one output row per matching (left value, right value) combination.
JavaPairRDD<String, Tuple2<Integer, Integer>> joined = pairRDD1.join(pairRDD2);
for (Tuple2<String, Tuple2<Integer, Integer>> row : joined.collect()) {
    System.out.println(row);
}
leftOuterJoin及rightOuterJoin
左外连接和右外连接。左外连接以左表为基准,会将左表元素全部罗列,右表返回option值;右外连接以右表为基准,会将右表元素全部罗列,左表返回option值
scala版本
// Pair RDDs for the outer-join demos; keys "b" and "c" overlap.
val left = sc.parallelize(Seq("a", "b", "c")).map(key => (key, 1))
val right = sc.parallelize(Seq("c", "c", "d", "e", "f", "b")).map(key => (key, 1))
println("----------leftOuterJoin------------")
// Every left key appears; the right side is an Option (None when unmatched).
left.leftOuterJoin(right).collect().foreach(println)
println("----------rightOuterJoin------------")
// Every right key appears; the left side is an Option (None when unmatched).
left.rightOuterJoin(right).collect().foreach(println)
java版本
// Left ("a","b","c") and right ("c","c","d","e","f","b") key sets for the outer joins.
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("a", "b", "c"));
JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("c", "c", "d", "e", "f", "b"));
// Lift each string to a (key, 1) pair so the pair-RDD join API applies.
JavaPairRDD<String, Integer> pairRDD1 = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String s) throws Exception {
        return new Tuple2<>(s, 1);
    }
});
JavaPairRDD<String, Integer> pairRDD2 = rdd2.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String s) throws Exception {
        return new Tuple2<>(s, 1);
    }
});
// leftOuterJoin keeps every left key; the right value is Optional (absent when unmatched).
// BUG FIX: the original joined pairRDD1 with itself (pairRDD1.leftOuterJoin(pairRDD1)),
// which demonstrated nothing — it must join with pairRDD2 like the rightOuterJoin below.
JavaPairRDD<String, Tuple2<Integer, Optional<Integer>>> leftOuterJoinRDD = pairRDD1.leftOuterJoin(pairRDD2);
List<Tuple2<String, Tuple2<Integer, Optional<Integer>>>> collect = leftOuterJoinRDD.collect();
System.out.println("------leftOuterJoin------");
for (Tuple2<String, Tuple2<Integer, Optional<Integer>>> t : collect) {
    System.out.println(t);
}
// rightOuterJoin keeps every right key; the left value is Optional this time.
JavaPairRDD<String, Tuple2<Optional<Integer>, Integer>> rightOuterJoinRDD = pairRDD1.rightOuterJoin(pairRDD2);
List<Tuple2<String, Tuple2<Optional<Integer>, Integer>>> collect2 = rightOuterJoinRDD.collect();
System.out.println("------rightOuterJoin------");
for (Tuple2<String, Tuple2<Optional<Integer>, Integer>> t : collect2) {
    System.out.println(t);
}