Spark RDD Operators: Partition Operations with mapPartitions and mapPartitionsWithIndex

mapPartitions

mapPartitions works partition by partition: the function receives an iterator over all elements of one partition and returns an iterator of results. When the mapping needs to create expensive helper objects repeatedly, mapPartitions is more efficient than map, because the setup can run once per partition instead of once per element.
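A minimal Scala sketch of that difference, assuming the local SparkContext sc used in the examples below: the "expensive" setup object is constructed once per element with map, but only once per partition with mapPartitions.

val rdd = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)

// map: the StringBuilder is constructed 6 times, once per element
val viaMap = rdd.map { n =>
  new StringBuilder("x=").append(n * 2).toString
}

// mapPartitions: the StringBuilder is constructed twice, once per partition,
// and reused for every element of that partition
val viaMapPartitions = rdd.mapPartitions { it =>
  val sb = new StringBuilder()
  it.map(n => { sb.clear(); sb.append("x=").append(n * 2).toString })
}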
An example: multiply each element by 2.
Java version

package Action;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.*;
public class Partition {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("partition");
        JavaSparkContext sc = new JavaSparkContext(conf);
        
        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
        //multiply each element by 2
        JavaRDD<Integer> mapPartitions = rdd.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer>() {
            @Override
            public Iterator<Integer> call(Iterator<Integer> i) throws Exception {
                List<Integer> list = new ArrayList<>();
                while (i.hasNext()) {
                    Integer num = i.next();
                    list.add(num * 2);
                }
                return list.iterator();
            }
        });
        mapPartitions.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer integer) throws Exception {
                System.out.println(integer);
            }
        });
        System.out.println("==================");
        //emit the results as key-value pairs
        JavaRDD<Tuple2<Integer, Integer>> tuple2JavaRDD = rdd.mapPartitions(new FlatMapFunction<Iterator<Integer>, Tuple2<Integer, Integer>>() {
            @Override
            public Iterator<Tuple2<Integer, Integer>> call(Iterator<Integer> it) throws Exception {
                List<Tuple2<Integer, Integer>> list = new ArrayList<>();
                while (it.hasNext()) {
                    Integer next = it.next();
                    list.add(new Tuple2<Integer, Integer>(next, next * 2));
                }
                return list.iterator();
            }
        });
        tuple2JavaRDD.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {
            @Override
            public void call(Tuple2<Integer, Integer> tuple2) throws Exception {
                System.out.println(tuple2);
            }
        });
        
    }
}

Scala version

import org.apache.spark.{SparkConf, SparkContext}

object partitions {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("partition")
    val sc = new SparkContext(conf)

    val rdd = sc.parallelize(List(1,2,3,4,5))
    def mapPart(it:Iterator[Int]):Iterator[(Int,Int)]={
      var list=List[(Int,Int)]()
      while (it.hasNext) {
        val i = it.next()
        list = (i, i * 2) :: list
      }
      list.iterator
    }
    val result = rdd.mapPartitions(mapPart)
    result.foreach(x=>println(x))
   
  }
}
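Since mapPartitions simply turns one Iterator into another, the same transformation can also be written more compactly, and lazily, without materializing an intermediate List:

val result = rdd.mapPartitions(it => it.map(i => (i, i * 2)))
result.foreach(println)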

Turning each pair (i, j) into (i, j*j):
Java version:

	JavaRDD<Tuple2<Integer, Integer>> rdd1 = sc.parallelize(Arrays.asList(
                new Tuple2<Integer, Integer>(1, 2),
                new Tuple2<Integer, Integer>(2, 2),
                new Tuple2<Integer, Integer>(2, 3),
                new Tuple2<Integer, Integer>(3, 4)
        ));
        JavaPairRDD<Integer, Integer> partPair = JavaPairRDD.fromJavaRDD(rdd1);
        JavaRDD<Tuple2<Integer, Integer>> result = partPair.mapPartitions(new FlatMapFunction<Iterator<Tuple2<Integer, Integer>>, Tuple2<Integer, Integer>>() {
            @Override
            public Iterator<Tuple2<Integer, Integer>> call(Iterator<Tuple2<Integer, Integer>> tp2) throws Exception {
                List<Tuple2<Integer, Integer>> list = new ArrayList<>();
                while (tp2.hasNext()) {
                    Tuple2<Integer, Integer> next = tp2.next();
                    list.add(new Tuple2<Integer, Integer>(next._1, next._2 * next._2));
                }
                return list.iterator();
            }
        });
        result.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {
            @Override
            public void call(Tuple2<Integer, Integer> tuple2) throws Exception {
                System.out.println(tuple2);
            }
        });

Scala version

    val rdd2=sc.parallelize(List((1,2),(2,2),(2,3),(3,4)))
    def mapPart2(it:Iterator[(Int,Int)]):Iterator[(Int,Int)]={
      var list=List[(Int,Int)]()
      while (it.hasNext) {
        val tuple = it.next()
        list = (tuple._1, tuple._2 * tuple._2) :: list
      }
      list.iterator
    }
    val result2 = rdd2.mapPartitions(mapPart2)
    result2.foreach(x=>println(x))
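Because foreach runs on the executors, its print order is nondeterministic. For a deterministic check in a local test, the result can be collected instead; a small sketch:

rdd2.mapPartitions(mapPart2).collect()
// contains (1,4), (2,4), (2,9), (3,16), in a partition-dependent order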

mapPartitionsWithIndex

This also operates partition by partition, but the function takes one extra argument: the partition index. For example, tag each element with the partition it belongs to:
Scala version

 val rdd = sc.parallelize(List(1,2,3,4,5,6,7,8))
    def mapPairIndex(i:Int,it:Iterator[Int]):Iterator[(Int,Int)]={
      var list = List[(Int,Int)]()
      while (it.hasNext) {
        val next = it.next()
        list = (i, next) :: list
      }
      list.iterator
    }
    val result = rdd.mapPartitionsWithIndex(mapPairIndex)
    result.foreach(println)

Result:

(1,8)
(0,4)
(0,3)
(0,2)
(0,1)
(1,7)
(1,6)
(1,5)

Java version

JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8));
        JavaRDD<Tuple2<Integer, Integer>> tuple2 = rdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<Tuple2<Integer, Integer>>>() {
            @Override
            public Iterator<Tuple2<Integer, Integer>> call(Integer i, Iterator<Integer> it) throws Exception {
                List<Tuple2<Integer, Integer>> list = new ArrayList<>();
                while (it.hasNext()) {
                    Integer next = it.next();
                    //i is the partition index, next is an element of that partition
                    list.add(new Tuple2<Integer, Integer>(i, next));
                }
                return list.iterator();
            }
        }, false);
        tuple2.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {
            @Override
            public void call(Tuple2<Integer, Integer> tuple2) throws Exception {
                System.out.println(tuple2);
            }
        });

Result:

(1,5)
(0,1)
(0,2)
(0,3)
(0,4)
(1,6)
(1,7)
(1,8)

The trailing false specifies whether the partitioning is preserved; here it is set to false. The decompiled signature:

  def mapPartitionsWithIndex[R](
      f: Function2[Integer, Iterator[T], Iterator[R]],
      preservesPartitioning: Boolean = { /* compiled code */ }): org.apache.spark.api.java.JavaRDD[R] =
    { /* compiled code */ }

Function2 receives the partition index and an iterator over the partition's elements of type T, and returns an iterator of type R; the final Boolean tells Spark whether the result preserves the parent RDD's partitioning.
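For comparison, the Scala RDD API declares the same operator with a default of preservesPartitioning = false, so the flag can usually be omitted there (signature abridged, ClassTag bound dropped):

// def mapPartitionsWithIndex[U](f: (Int, Iterator[T]) => Iterator[U],
//                               preservesPartitioning: Boolean = false): RDD[U]
val indexed = rdd.mapPartitionsWithIndex((i, it) => it.map(n => (i, n)))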

Tagging the key-value elements in each partition
Scala version

val rdd2 = sc.parallelize(List((1,2),(1,3),(2,3),(3,5),(4,5),(4,7)))
    def mapPart2(i:Int,it:Iterator[(Int,Int)]):Iterator[(Int,(Int,Int))]={
      var list = List[(Int,(Int,Int))]()
      while (it.hasNext) {
        val next = it.next()
        list = (i, next) :: list
      }
      list.iterator
    }
    val result2 = rdd2.mapPartitionsWithIndex(mapPart2)
    result2.foreach(x=>println(x))

Result:

(0,(2,3))
(0,(1,3))
(0,(1,2))
(1,(4,7))
(1,(4,5))
(1,(3,5))

Java version

        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("partition2");
        JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<Tuple2<Integer, Integer>> rdd2 = sc.parallelize(Arrays.asList(
                new Tuple2<Integer, Integer>(1, 2), new Tuple2<Integer, Integer>(1, 3),
                new Tuple2<Integer, Integer>(2, 3), new Tuple2<Integer, Integer>(3, 5),
                new Tuple2<Integer, Integer>(4, 5), new Tuple2<Integer, Integer>(4, 7)));
        JavaRDD<Tuple2<Integer, Tuple2<Integer, Integer>>> tuple2 = rdd2.mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer, Integer>>, Iterator<Tuple2<Integer, Tuple2<Integer, Integer>>>>() {
            @Override
            public Iterator<Tuple2<Integer, Tuple2<Integer, Integer>>> call(Integer i, Iterator<Tuple2<Integer, Integer>> it) throws Exception {
                List<Tuple2<Integer, Tuple2<Integer, Integer>>> list = new ArrayList<>();
                while (it.hasNext()) {
                    //i is the partition index, it.next() is a key-value pair in that partition
                    list.add(new Tuple2<Integer, Tuple2<Integer, Integer>>(i, it.next()));
                }
                return list.iterator();
            }
        }, false);
        tuple2.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Integer, Integer>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<Integer, Integer>> t) throws Exception {
                System.out.println(t);
            }
        });

Using glom to print each partition
Scala version

 val result3 = rdd2.glom()
    result3.foreach(x=>println("["+x.mkString(",")+"]"))

Result:

[(1,2),(1,3),(2,3)]
[(3,5),(4,5),(4,7)]

Java version

JavaRDD<List<Integer>> glom = rdd.glom();
        glom.foreach(new VoidFunction<List<Integer>>() {
            @Override
            public void call(List<Integer> integers) throws Exception {
                System.out.println(integers);
            }
        });

Result:

[5, 6, 7, 8]
[1, 2, 3, 4]
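glom is also convenient for quick per-partition statistics, for example counting how many elements land in each partition; a minimal Scala sketch:

val sizes = rdd.glom().map(_.length).collect()
// e.g. Array(4, 4) for the eight integers spread over two partitions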

For a more detailed treatment, see this article:
RDD算子分区操作
