package com.bjsxt.spark.transformations;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
public class Operator_AggregateByKey {

    /**
     * Demonstrates {@code JavaPairRDD.aggregateByKey}.
     *
     * <p>The data is split across 2 partitions:
     * partition 0 holds (1,99) (2,78) (1,89); partition 1 holds (2,3) (3,3) (3,30).
     *
     * <p>With zero value 80, the seq function (max) folds each partition's values
     * per key: partition 0 yields (1,99) (2,80); partition 1 yields (2,80) (3,80).
     * The comb function (sum) then merges per-partition results across partitions,
     * so only key 2 (present in both partitions) is combined: 80 + 80 = 160.
     * Final result: (1,99) (2,160) (3,80).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("AggregateOperator").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        try {
            List<Tuple2<Integer, Integer>> dataList = new ArrayList<Tuple2<Integer, Integer>>();
            dataList.add(new Tuple2<Integer, Integer>(1, 99));
            dataList.add(new Tuple2<Integer, Integer>(2, 78));
            dataList.add(new Tuple2<Integer, Integer>(1, 89));
            dataList.add(new Tuple2<Integer, Integer>(2, 3));
            dataList.add(new Tuple2<Integer, Integer>(3, 3));
            dataList.add(new Tuple2<Integer, Integer>(3, 30));
            // Two partitions: elements are assigned in order, 3 per partition.
            JavaPairRDD<Integer, Integer> dataRdd = sc.parallelizePairs(dataList, 2);

            // Log every element together with its partition index. Expected output:
            //   partitions --0,value ---(1,99)
            //   partitions --0,value ---(2,78)
            //   partitions --0,value ---(1,89)
            //   partitions --1,value ---(2,3)
            //   partitions --1,value ---(3,3)
            //   partitions --1,value ---(3,30)
            dataRdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer, Integer>>, Iterator<Tuple2<Integer, Integer>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Integer, Integer>> call(Integer index,
                        Iterator<Tuple2<Integer, Integer>> iter) throws Exception {
                    // Pass the elements through unchanged while logging them.
                    // (The original left this list empty, so the mapped RDD
                    // silently collected to nothing.)
                    List<Tuple2<Integer, Integer>> list = new ArrayList<Tuple2<Integer, Integer>>();
                    while (iter.hasNext()) {
                        // Consume the iterator exactly once per loop iteration.
                        Tuple2<Integer, Integer> element = iter.next();
                        System.out.println("partitions --" + index + ",value ---" + element);
                        list.add(element);
                    }
                    return list.iterator();
                }
            }, true).collect();
            System.out.println("*****************");

            JavaPairRDD<Integer, Integer> aggregateByKey = dataRdd.aggregateByKey(80,
                    // seq: folds values into the zero value WITHIN each partition.
                    // Expected calls: seq(80,99) seq(80,78) seq(99,89) in partition 0;
                    //                 seq(80,3) seq(80,3) seq(80,30) in partition 1.
                    new Function2<Integer, Integer, Integer>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Integer t1, Integer t2) throws Exception {
                            System.out.println("seq: " + t1 + "\t " + t2);
                            return Math.max(t1, t2);
                        }
                    },
                    // comb: merges per-partition results ACROSS partitions.
                    // Only key 2 appears in both partitions, so: comb(80,80) -> 160.
                    new Function2<Integer, Integer, Integer>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Integer t1, Integer t2) throws Exception {
                            System.out.println("comb: " + t1 + "\t " + t2);
                            return t1 + t2;
                        }
                    });

            // Expected result (order may vary):
            //   2    160
            //   1    99
            //   3    80
            List<Tuple2<Integer, Integer>> resultRdd = aggregateByKey.collect();
            for (Tuple2<Integer, Integer> tuple2 : resultRdd) {
                System.out.println(tuple2._1 + "\t" + tuple2._2);
            }
        } finally {
            // Release the local Spark context; the original leaked it.
            sc.stop();
        }
    }
}
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
// NOTE(review): this entire class (and the import lines just above it) is a
// verbatim duplicate paste of the same class earlier in this file. Java allows
// only one top-level class of a given name per file, so the file cannot compile
// as-is — confirm and delete one of the two copies.
public class Operator_AggregateByKey {

    /**
     * Demonstrates {@code JavaPairRDD.aggregateByKey}.
     *
     * <p>The data is split across 2 partitions:
     * partition 0 holds (1,99) (2,78) (1,89); partition 1 holds (2,3) (3,3) (3,30).
     *
     * <p>With zero value 80, the seq function (max) folds each partition's values
     * per key: partition 0 yields (1,99) (2,80); partition 1 yields (2,80) (3,80).
     * The comb function (sum) then merges per-partition results across partitions,
     * so only key 2 (present in both partitions) is combined: 80 + 80 = 160.
     * Final result: (1,99) (2,160) (3,80).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("AggregateOperator").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        try {
            List<Tuple2<Integer, Integer>> dataList = new ArrayList<Tuple2<Integer, Integer>>();
            dataList.add(new Tuple2<Integer, Integer>(1, 99));
            dataList.add(new Tuple2<Integer, Integer>(2, 78));
            dataList.add(new Tuple2<Integer, Integer>(1, 89));
            dataList.add(new Tuple2<Integer, Integer>(2, 3));
            dataList.add(new Tuple2<Integer, Integer>(3, 3));
            dataList.add(new Tuple2<Integer, Integer>(3, 30));
            // Two partitions: elements are assigned in order, 3 per partition.
            JavaPairRDD<Integer, Integer> dataRdd = sc.parallelizePairs(dataList, 2);

            // Log every element together with its partition index. Expected output:
            //   partitions --0,value ---(1,99)
            //   partitions --0,value ---(2,78)
            //   partitions --0,value ---(1,89)
            //   partitions --1,value ---(2,3)
            //   partitions --1,value ---(3,3)
            //   partitions --1,value ---(3,30)
            dataRdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer, Integer>>, Iterator<Tuple2<Integer, Integer>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Integer, Integer>> call(Integer index,
                        Iterator<Tuple2<Integer, Integer>> iter) throws Exception {
                    // Pass the elements through unchanged while logging them.
                    // (The original left this list empty, so the mapped RDD
                    // silently collected to nothing.)
                    List<Tuple2<Integer, Integer>> list = new ArrayList<Tuple2<Integer, Integer>>();
                    while (iter.hasNext()) {
                        // Consume the iterator exactly once per loop iteration.
                        Tuple2<Integer, Integer> element = iter.next();
                        System.out.println("partitions --" + index + ",value ---" + element);
                        list.add(element);
                    }
                    return list.iterator();
                }
            }, true).collect();
            System.out.println("*****************");

            JavaPairRDD<Integer, Integer> aggregateByKey = dataRdd.aggregateByKey(80,
                    // seq: folds values into the zero value WITHIN each partition.
                    // Expected calls: seq(80,99) seq(80,78) seq(99,89) in partition 0;
                    //                 seq(80,3) seq(80,3) seq(80,30) in partition 1.
                    new Function2<Integer, Integer, Integer>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Integer t1, Integer t2) throws Exception {
                            System.out.println("seq: " + t1 + "\t " + t2);
                            return Math.max(t1, t2);
                        }
                    },
                    // comb: merges per-partition results ACROSS partitions.
                    // Only key 2 appears in both partitions, so: comb(80,80) -> 160.
                    new Function2<Integer, Integer, Integer>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Integer t1, Integer t2) throws Exception {
                            System.out.println("comb: " + t1 + "\t " + t2);
                            return t1 + t2;
                        }
                    });

            // Expected result (order may vary):
            //   2    160
            //   1    99
            //   3    80
            List<Tuple2<Integer, Integer>> resultRdd = aggregateByKey.collect();
            for (Tuple2<Integer, Integer> tuple2 : resultRdd) {
                System.out.println(tuple2._1 + "\t" + tuple2._2);
            }
        } finally {
            // Release the local Spark context; the original leaked it.
            sc.stop();
        }
    }
}