Spark join in Java

Spark version: 2.0.0

The example below builds two pair RDDs keyed by an account ID (player names and scores), then runs an inner join and a left outer join on them.

package hellospark;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class JoinTest {

    /**
     * @param args
     */
    public static void main(String[] args) {
        // The master URL is expected to be supplied by spark-submit.
        SparkConf conf = new SparkConf().setAppName("JoinTest");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Player details keyed by account ID: "id,firstName,lastName".
        List<String> list1 = new ArrayList<String>();
        list1.add("SB10001,Roger,Federer");
        list1.add("SB10002,Pete,Sampras");
        list1.add("SB10003,Rafael,Nadal");
        JavaRDD<String> rdd1 = sc.parallelize(list1);
        // Map each CSV line to (accountId, "firstName lastName").
        JavaPairRDD<String,String> rdd1Pair = rdd1.mapToPair(new PairFunction<String,String, String>() {
            @Override
            public Tuple2<String, String> call(String t) throws Exception {
                String[] strings = t.split(",");
                return new Tuple2<String, String>(strings[0],strings[1]+" "+strings[2]);
            }
        });

        // Scores keyed by account ID; SB10002 has no score, so it
        // appears only in the outer joins.
        List<String> list2 = new ArrayList<String>();
        list2.add("SB10001,100");
        list2.add("SB10003,200");
        JavaRDD<String> rdd2 = sc.parallelize(list2);
        // Map each CSV line to (accountId, score).
        JavaPairRDD<String,Integer> rdd2Pair = rdd2.mapToPair(new PairFunction<String,String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String t) throws Exception {
                String[] strings = t.split(",");
                return new Tuple2<String, Integer>(strings[0],Integer.parseInt(strings[1]));
            }
        });

        // Inner join: keeps only the keys present in both RDDs.
        JavaPairRDD<String,Tuple2<String,Integer>> joinTuples = rdd1Pair.join(rdd2Pair).sortByKey();
        System.out.println("join");
        System.out.println(joinTuples.collect());

        // Left outer join: keeps every key from rdd1Pair; the right-side value
        // is wrapped in Optional (Optional.empty when there is no match).
        JavaPairRDD<String,Tuple2<String,Optional<Integer>>> leftJoinTuples = rdd1Pair.leftOuterJoin(rdd2Pair).sortByKey();
        System.out.println("leftOuterJoin");
        System.out.println(leftJoinTuples.collect());

        sc.stop();
    }

}
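
A job like this is typically launched with spark-submit; a minimal sketch, where the jar name and the local master URL are placeholders for your own build and cluster:

spark-submit --class hellospark.JoinTest --master local[2] hellospark.jar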

Submitting the Spark job prints the following:
join
[(SB10001,(Roger Federer,100)), (SB10003,(Rafael Nadal,200))]
leftOuterJoin
[(SB10001,(Roger Federer,Optional[100])), (SB10002,(Pete Sampras,Optional.empty)), (SB10003,(Rafael Nadal,Optional[200]))]
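
Beyond join and leftOuterJoin, pair RDDs also offer rightOuterJoin and fullOuterJoin, which wrap the possibly-missing side(s) in Optional. A minimal sketch, assuming it is appended to the main method above before sc.stop() (the variable names are illustrative):

        // Right outer join: keeps every key from rdd2Pair; the left-side value is Optional.
        JavaPairRDD<String,Tuple2<Optional<String>,Integer>> rightJoinTuples = rdd1Pair.rightOuterJoin(rdd2Pair).sortByKey();
        System.out.println("rightOuterJoin");
        System.out.println(rightJoinTuples.collect());

        // Full outer join: keeps every key from both sides; both values are Optional.
        JavaPairRDD<String,Tuple2<Optional<String>,Optional<Integer>>> fullJoinTuples = rdd1Pair.fullOuterJoin(rdd2Pair).sortByKey();
        System.out.println("fullOuterJoin");
        System.out.println(fullJoinTuples.collect());

With this sample data, rightOuterJoin keeps only SB10001 and SB10003 (every key in the scores RDD), while fullOuterJoin also keeps SB10002 with Optional.empty for the missing score.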
