groupByKey
// Groups the values of each key into one Iterable, yielding one (key, values) pair per key.
def groupByKey(): RDD[(K, Iterable[V])]
// Same grouping; numPartitions presumably sets the partition count of the result — confirm against the Spark API docs.
def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
// Same grouping; partitioner presumably decides how keys are spread over partitions — confirm against the Spark API docs.
def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]
groupByKey会将RDD[(key, value)]按照相同的key进行分组,形成RDD[(key, Iterable[value])]的形式,类似于SQL中的GROUP BY,效果接近MySQL中GROUP BY配合GROUP_CONCAT把同一组的值聚合在一起
下面的例子我们对不同学生的成绩进行分组
scala版本
// Build a local SparkContext that uses every available core.
val conf = new SparkConf().setMaster("local[*]").setAppName("GroupByKey_CogroupScala")
val sc = new SparkContext(conf)
// (student, score) pairs; a student may appear more than once.
val scoreDetailRdd = sc.parallelize(List(("Godv",97), ("Godv",95),("Forever",87),("Forever",99),("ZGG01",100)))
// Gather every score that shares a student name into one Iterable per student.
val scoreGroupByKeyRdd = scoreDetailRdd.groupByKey()
scoreGroupByKeyRdd.collect().foreach(println)
// Flatten the groups back into the original (student, score) form.
println("-----------")
// collect() first so the rows are printed by the driver: RDD.foreach runs on the
// executors, whose stdout is not the driver console outside of local mode.
scoreGroupByKeyRdd.collect().foreach { case (name, scores) =>
  // Print an explicit tuple rather than relying on println(name, score) auto-tupling.
  scores.foreach(score => println((name, score)))
}
打印得出结果(各key的输出顺序取决于分区与并行执行情况,可能与下面不同)
(Forever,CompactBuffer(87, 99))
(Godv,CompactBuffer(97, 95))
(ZGG01,CompactBuffer(100))
-----------
(Godv,97)
(Godv,95)
(ZGG01,100)
(Forever,87)
(Forever,99)
java版本
// Local Spark context running on all available cores.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("groupbyJava");
JavaSparkContext sc = new JavaSparkContext(conf);
// (student, score) records; the same student can occur several times.
List<Tuple2<String, Integer>> list = Arrays.asList(
        new Tuple2<>("Godv", 97),
        new Tuple2<>("Godv", 95),
        new Tuple2<>("Forever", 87),
        new Tuple2<>("Forever", 99),
        new Tuple2<>("ZGG01", 100));
JavaRDD<Tuple2<String, Integer>> scoreRdd = sc.parallelize(list);
// Re-wrap the tuple RDD as a pair RDD so the key-based operations become available.
JavaPairRDD<String, Integer> scorePairRdd = JavaPairRDD.fromJavaRDD(scoreRdd);
// Group the scores per student and bring the result back to the driver as a map.
Map<String, Iterable<Integer>> scoreMap = scorePairRdd.groupByKey().collectAsMap();
// Walk the entries directly instead of looking each key up again.
for (Map.Entry<String, Iterable<Integer>> entry : scoreMap.entrySet()) {
    System.out.println(entry.getKey() + " : " + entry.getValue());
}
打印得出结果
Godv : [97, 95]
Forever : [87, 99]
ZGG01 : [100]
cogroup
groupByKey是对单个 RDD 的数据进行分组,还可以使用一个叫作 cogroup() 的函数对多个共享同一个键的 RDD 进行分组
例如
RDD1.cogroup(RDD2) 会将RDD1和RDD2按照相同的key进行分组,得到RDD[(key, (Iterable[value1], Iterable[value2]))]的形式;如果某个key只出现在其中一个RDD中,另一个RDD对应的Iterable为空
cogroup也可以同时对多个RDD进行分组(Spark的cogroup一次最多接受3个其他RDD)
例如RDD1.cogroup(RDD2,RDD3), 可以得到RDD[(key, (Iterable[value1], Iterable[value2], Iterable[value3]))]的形式
scala版本
// Scores above 85.
val scoreDetail1 = sc.parallelize(List("Godv" -> 91, "Godv" -> 98, "Forever" -> 95, "ZGG01" -> 88))
// Scores between 60 and 85.
val scoreDetail2 = sc.parallelize(List("Godv" -> 70, "ZGG01" -> 68, "Cpt" -> 72))
// Scores below 60.
val scoreDetail3 = sc.parallelize(List("Godv" -> 55, "Forever" -> 58, "xxxLu" -> 43))

// Two-way cogroup: per student, the scores from the first RDD and from the second RDD.
println("-----60以上的学生成绩-----")
val rdd1CGrdd2: RDD[(String, (Iterable[Int], Iterable[Int]))] =
  scoreDetail1.cogroup(scoreDetail2)
rdd1CGrdd2.collect().foreach(row => println(row))

// Three-way cogroup: per student, one Iterable of scores for each of the three RDDs.
println("-----所有学生的全部成绩-----")
val rdd1CGrdd2rdd3: RDD[(String, (Iterable[Int], Iterable[Int], Iterable[Int]))] =
  scoreDetail1.cogroup(scoreDetail2, scoreDetail3)
rdd1CGrdd2rdd3.collect().foreach(row => println(row))
打印得到结果
-----60以上的学生成绩-----
(Forever,(CompactBuffer(95),CompactBuffer()))
(Cpt,(CompactBuffer(),CompactBuffer(72)))
(Godv,(CompactBuffer(91, 98),CompactBuffer(70)))
(ZGG01,(CompactBuffer(88),CompactBuffer(68)))
-----所有学生的全部成绩-----
(xxxLu,(CompactBuffer(),CompactBuffer(),CompactBuffer(43)))
(Forever,(CompactBuffer(95),CompactBuffer(),CompactBuffer(58)))
(Cpt,(CompactBuffer(),CompactBuffer(72),CompactBuffer()))
(Godv,(CompactBuffer(91, 98),CompactBuffer(70),CompactBuffer(55)))
(ZGG01,(CompactBuffer(88),CompactBuffer(68),CompactBuffer()))
java版本
// Scores above 85.
JavaRDD<Tuple2<String, Integer>> scoreRdd1 = sc.parallelize(Arrays.asList(
        new Tuple2<>("Godv", 91),
        new Tuple2<>("Godv", 98),
        new Tuple2<>("Forever", 95),
        new Tuple2<>("ZGG01", 88)));
// Scores between 60 and 85.
JavaRDD<Tuple2<String, Integer>> scoreRdd2 = sc.parallelize(Arrays.asList(
        new Tuple2<>("Godv", 70),
        new Tuple2<>("ZGG01", 68),
        new Tuple2<>("Cpt", 72)));
// Scores below 60.
JavaRDD<Tuple2<String, Integer>> scoreRdd3 = sc.parallelize(Arrays.asList(
        new Tuple2<>("Godv", 55),
        new Tuple2<>("Forever", 58),
        new Tuple2<>("xxxLu", 43)));
// Re-wrap each tuple RDD as a pair RDD so the key-based operations become available.
JavaPairRDD<String, Integer> scorePairRdd1 = JavaPairRDD.fromJavaRDD(scoreRdd1);
JavaPairRDD<String, Integer> scorePairRdd2 = JavaPairRDD.fromJavaRDD(scoreRdd2);
JavaPairRDD<String, Integer> scorePairRdd3 = JavaPairRDD.fromJavaRDD(scoreRdd3);
// Three-way cogroup: per student, one Iterable of scores for each of the three RDDs.
JavaPairRDD<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> rdd1CGrdd2rdd3
        = scorePairRdd1.cogroup(scorePairRdd2, scorePairRdd3);
List<Tuple2<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>>> collect = rdd1CGrdd2rdd3.collect();
// Iterate with the full generic type instead of the raw Tuple2, keeping the loop type-checked.
for (Tuple2<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> row : collect) {
    System.out.println(row);
}
打印得出结果
(xxxLu,([],[],[43]))
(Forever,([95],[],[58]))
(Cpt,([],[72],[]))
(Godv,([91, 98],[70],[55]))
(ZGG01,([88],[68],[]))