Spark version: 2.0.0
package hellospark;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Demonstrates {@code join} and {@code leftOuterJoin} on Spark pair RDDs.
 *
 * <p>Two small in-memory lists of comma-separated records are parallelized and turned
 * into key/value pairs keyed by the first field of each record. The pairs are then
 * joined, sorted by key, collected to the driver, and printed to standard output.
 */
public class JoinTest {

    /**
     * Builds both pair RDDs, runs the inner and left outer joins, and prints the results.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JoinTest");

        // JavaSparkContext is Closeable: try-with-resources guarantees the context is
        // stopped even when one of the jobs below throws, instead of leaking it.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Left side: "<key>,<first name>,<last name>".
            List<String> list1 = new ArrayList<String>();
            list1.add("SB10001,Roger,Federer");
            list1.add("SB10002,Pete,Sampras");
            list1.add("SB10003,Rafael,Nadal");
            JavaRDD<String> rdd1 = sc.parallelize(list1);

            // Map each record to (key, "<first name> <last name>").
            JavaPairRDD<String, String> rdd1Pair =
                    rdd1.mapToPair(new PairFunction<String, String, String>() {
                        @Override
                        public Tuple2<String, String> call(String t) throws Exception {
                            String[] strings = t.split(",");
                            return new Tuple2<String, String>(
                                    strings[0], strings[1] + " " + strings[2]);
                        }
                    });

            // Right side: "<key>,<integer value>". Key SB10002 is deliberately absent so
            // that the two join flavours produce different results.
            List<String> list2 = new ArrayList<String>();
            list2.add("SB10001,100");
            list2.add("SB10003,200");
            JavaRDD<String> rdd2 = sc.parallelize(list2);

            // Map each record to (key, parsed integer value).
            JavaPairRDD<String, Integer> rdd2Pair =
                    rdd2.mapToPair(new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String t) throws Exception {
                            String[] strings = t.split(",");
                            return new Tuple2<String, Integer>(
                                    strings[0], Integer.parseInt(strings[1]));
                        }
                    });

            // Inner join: only keys present on both sides are kept.
            JavaPairRDD<String, Tuple2<String, Integer>> joinTuples =
                    rdd1Pair.join(rdd2Pair).sortByKey();
            System.out.println("join");
            System.out.println(joinTuples.collect());

            // Left outer join: every left-side key is kept; the right-side value is
            // wrapped in Spark's Optional and is empty when there is no match.
            JavaPairRDD<String, Tuple2<String, Optional<Integer>>> leftJoinTuples =
                    rdd1Pair.leftOuterJoin(rdd2Pair).sortByKey();
            System.out.println("leftOuterJoin");
            System.out.println(leftJoinTuples.collect());
        }
    }
}
提交 Spark 作业后,打印结果如下:
join
[(SB10001,(Roger Federer,100)), (SB10003,(Rafael Nadal,200))]
leftOuterJoin
[(SB10001,(Roger Federer,Optional[100])), (SB10002,(Pete Sampras,Optional.empty)), (SB10003,(Rafael Nadal,Optional[200]))]