// Demonstrates the groupByKey operator. It returns another JavaPairRDD whose
// first generic type is unchanged while the second becomes Iterable<V>:
// grouping by key collects every value sharing a key into one Iterable.
public static void myGroupByKey(){
    SparkConf conf=new SparkConf()
            .setMaster("local")
            .setAppName("myGroupByKey");
    JavaSparkContext sc=new JavaSparkContext(conf);
    // FIX: use the parameterized type instead of a raw List —
    // parallelizePairs expects a List<Tuple2<String, String>>.
    List<Tuple2<String, String>> list=Arrays.asList(
            new Tuple2<String,String>("c1","cai"),
            new Tuple2<String,String>("c2","niao"),
            new Tuple2<String,String>("c1","huo"),
            new Tuple2<String,String>("c2","niao"));
    JavaPairRDD<String, String> listRdd= sc.parallelizePairs(list);
    JavaPairRDD<String, Iterable<String>> groupRdd=listRdd.groupByKey();
    // Print each key followed by all of its grouped values.
    groupRdd.foreach(new VoidFunction<Tuple2<String, Iterable<String>>>(){
        @Override
        public void call(Tuple2<String, Iterable<String>> tuple)
                throws Exception {
            System.out.println("key:"+tuple._1);
            // Enhanced for-loop over the Iterable replaces the manual
            // Iterator/while loop of the original.
            for (String value : tuple._2) {
                System.out.println("-----values:"+value);
            }
        }
    });
    // FIX: release Spark resources — the original leaked the context.
    sc.close();
}
Result (console output):
key:c2
-----values:niao
-----values:niao
key:c1
-----values:cai
-----values:huo