四种join算子

原始数据集

parallelizePairs 将由 Tuple2 组成的 List 转换成键值对 RDD(JavaPairRDD)

   SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("join");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<Integer, String> nameRDD = sc.parallelizePairs(Arrays.asList(
                new Tuple2<Integer, String>(0, "aa"),
                new Tuple2<Integer, String>(1, "a"),
                new Tuple2<Integer, String>(2, "b"),
                new Tuple2<Integer, String>(3, "c")
        ));

        JavaPairRDD<Integer, Integer> scoreRDD = sc.parallelizePairs(Arrays.asList(
                new Tuple2<Integer, Integer>(1, 100),
                new Tuple2<Integer, Integer>(2, 200),
                new Tuple2<Integer, Integer>(3, 300),
                new Tuple2<Integer, Integer>(4, 400)
        ));

join

// Join the two RDDs by key; the second argument (3) is the partition count passed to join.
JavaPairRDD<Integer, Tuple2<String, Integer>> join = nameRDD.join(scoreRDD, 3);
        int joinPartitionCount = join.partitions().size();
        System.out.println("join.partitions().size()--------" + joinPartitionCount);

        // Callback that prints every joined (key, (name, score)) record.
        VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>> printRecord =
                new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> record) throws Exception {
                System.out.println(record);
            }
        };
        join.foreach(printRecord);
    }

结果

(1,(a,100))
(3,(c,300))
(2,(b,200))

注意: join 的第二个参数指定的是结果 RDD 的分区数

leftOuterJoin

        // Left outer join: the score side of each value is wrapped in an Optional,
        // since the score is not guaranteed to be present for every name.
        JavaPairRDD<Integer, Tuple2<String, Optional<Integer>>> leftOuterJoin = nameRDD.leftOuterJoin(scoreRDD);
        System.out.println("leftOuterJoin.partitions().size()--------"+leftOuterJoin.partitions().size());
        leftOuterJoin.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Optional<Integer>>>>() {
            private static final long serialVersionUID = 1L;

            /**
             * Prints the score on its own line when one is present, then the whole record.
             *
             * @param t a (key, (name, optional score)) record of the joined RDD
             */
            @Override
            public void call(
                    Tuple2<Integer, Tuple2<String, Optional<Integer>>> t)
                    throws Exception {
                Optional<Integer> option = t._2._2;
                // Guard the get(): the score side may be absent.
                if (option.isPresent()){
                    System.out.println(option.get());
                }
                System.out.println(t);
            }
        });

结果

(0,(aa,Optional.absent()))
100
(1,(a,Optional.of(100)))
300
(3,(c,Optional.of(300)))
200
(2,(b,Optional.of(200)))

rightOuterJoin

代码

// Right outer join: the name side of each value is wrapped in an Optional,
// since the name is not guaranteed to be present for every score.
JavaPairRDD<Integer, Tuple2<Optional<String>, Integer>> rightOuterJoin = nameRDD.rightOuterJoin(scoreRDD);
        // The label names the RDD actually being inspected (it previously said "leftOuterJoin").
        System.out.println("rightOuterJoin.partitions().size()--------" + rightOuterJoin.partitions().size());
        rightOuterJoin.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Optional<String>, Integer>>>() {
            private static final long serialVersionUID = 1L;

            /**
             * Prints three tagged lines per record: the value pair (①), the name when
             * one is present (②), and the whole record (③).
             *
             * @param t a (key, (optional name, score)) record of the joined RDD
             */
            @Override
            public void call(Tuple2<Integer, Tuple2<Optional<String>, Integer>> t) throws Exception {
                System.out.println(t._2 +"①");
                // Guard the get(): the name side may be absent.
                if (t._2._1.isPresent()){
                    System.out.println(t._2._1.get()+ "②");
                }
                System.out.println(t + "③");
            }
        });

结果

(Optional.absent(),400)①
(4,(Optional.absent(),400))③
(Optional.of(a),100)①
a②
(1,(Optional.of(a),100))③
(Optional.of(c),300)①
c②
(3,(Optional.of(c),300))③
(Optional.of(b),200)①
b②
(2,(Optional.of(b),200))③

fullOuterJoin

    // Full outer join: both sides of each value are wrapped in an Optional,
    // since either the name or the score may be missing for a key.
    JavaPairRDD<Integer, Tuple2<Optional<String>, Optional<Integer>>> fullOuterJoin = nameRDD.fullOuterJoin(scoreRDD);
        // The label names the RDD actually being inspected (it previously said "leftOuterJoin").
        System.out.println("fullOuterJoin.partitions().size()--------" + fullOuterJoin.partitions().size());
        fullOuterJoin.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Optional<String>, Optional<Integer>>>>() {
            private static final long serialVersionUID = 1L;

            /**
             * Prints the name tagged with ① and the score tagged with ②, each only
             * when that side of the record is present.
             *
             * @param t a (key, (optional name, optional score)) record of the joined RDD
             */
            @Override
            public void call(Tuple2<Integer, Tuple2<Optional<String>, Optional<Integer>>> t) throws Exception {
                Tuple2<Optional<String>, Optional<Integer>> pair = t._2;
                if (pair._1.isPresent()){
                    System.out.println(pair._1.get()+"①");
                }
                if (pair._2.isPresent()){
                    System.out.println(pair._2.get()+"②");
                }
            }
        });

结果

400②
19/04/23 01:20:32 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool 
aa①
a①
19/04/23 01:20:32 INFO DAGScheduler: ResultStage 2 (foreach at OperatorJion.java:35) finished in 0.094 s
100②
c①
300②
b①
200②
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值