/**
 * Demonstrates Spark's {@code combineByKey} by computing a per-key average.
 *
 * <p>Pipeline: split one input line into words, map each word to {@code (word, 1)},
 * sum the 1s per word with {@code reduceByKey}, then run {@code combineByKey} over
 * the resulting counts to build a (sum, count) accumulator per key and print each
 * key's average.
 *
 * @param sc active Spark context used to create the demo RDD
 */
public static void combineByKeyDemo(JavaSparkContext sc){
    // Build (word, count) pairs from a single space-separated line.
    JavaPairRDD<String, Integer> pairRdd = sc.parallelize(Arrays.asList("a a a d d c c"))
            .flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterator<String> call(String s) throws Exception {
                    return Arrays.asList(s.split(" ")).iterator();
                }
            })
            .mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String word) throws Exception {
                    return new Tuple2<>(word, 1);
                }
            })
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer a, Integer b) {
                    return a + b;
                }
            });

    /**
     * Running (sum, count) accumulator used by combineByKey to compute averages.
     * Must be Serializable because instances are shipped between Spark executors.
     */
    class AvgCount implements Serializable {
        public int total; // running sum of the values folded in for a key
        public int num;   // number of values folded in so far

        public AvgCount(int total, int num) {
            this.total = total;
            this.num = num;
        }

        /** Average of all folded-in values; the cast avoids integer division. */
        public double avg() {
            return total / (double) num;
        }
    }

    /*
     * combineByKey signature (Scala):
     *   def combineByKey[C](
     *       createCombiner: V => C,       // initialize a combiner from the first value seen for a key
     *       mergeValue:     (C, V) => C,  // fold another value into a combiner (within one partition)
     *       mergeCombiners: (C, C) => C,  // merge two combiners (across partitions)
     *       partitioner: Partitioner,
     *       mapSideCombine: Boolean = true,
     *       serializer: Serializer = null)
     */
    JavaPairRDD<String, AvgCount> avgs = pairRdd.combineByKey(
            // createCombiner: the first value for a key in a partition starts a
            // fresh accumulator (this step acts like an initialization).
            new Function<Integer, AvgCount>() {
                @Override
                public AvgCount call(Integer value) throws Exception {
                    return new AvgCount(value, 1);
                }
            },
            // mergeValue: fold a further value into an existing accumulator
            // (runs within a single partition). Mutating the accumulator in
            // place is fine here — combineByKey owns it exclusively.
            new Function2<AvgCount, Integer, AvgCount>() {
                @Override
                public AvgCount call(AvgCount acc, Integer value) throws Exception {
                    acc.total += value;
                    acc.num += 1;
                    return acc;
                }
            },
            // mergeCombiners: merge accumulators built on different partitions.
            new Function2<AvgCount, AvgCount, AvgCount>() {
                @Override
                public AvgCount call(AvgCount a, AvgCount b) throws Exception {
                    a.total += b.total;
                    a.num += b.num;
                    return a;
                }
            });

    // Collect the results to the driver and print one "key:average" line per key.
    Map<String, AvgCount> countMap = avgs.collectAsMap();
    for (Map.Entry<String, AvgCount> entry : countMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue().avg());
    }
}
// Study notes on the Spark RDD combineByKey operation.
// (Original blog post last published 2022-10-31 20:25:11.)