reduceByKey
reduceByKey takes a binary function and reduces the values that share the same key with it, similar to Scala's reduce operation on a collection.
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ReduceByKeyDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("reducebykey11").setMaster("local[*]")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd: RDD[String] = sc.textFile("in/sample.txt")
    // split each line into words
    val rdd2: RDD[String] = rdd.flatMap(x => x.split("\\s+"))
    // map each word to (word, 1)
    val rdd3: RDD[(String, Int)] = rdd2.map((_, 1))
    // sum the counts of each word
    val rdd4: RDD[(String, Int)] = rdd3.reduceByKey(_ + _)
    rdd4.collect.foreach(println)
  }
}
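reduceByKey also has overloads that take a target number of output partitions or an explicit Partitioner (the partitioner/parallelism mentioned in the source comment below). A minimal sketch reusing rdd3 from the example above, where 4 is just a hypothetical partition count:
import org.apache.spark.HashPartitioner

// the same word count, but explicitly asking for 4 output partitions
val byNum: RDD[(String, Int)] = rdd3.reduceByKey(_ + _, 4)

// or passing a Partitioner explicitly
val byPart: RDD[(String, Int)] = rdd3.reduceByKey(new HashPartitioner(4), _ + _)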
Java version:
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
* parallelism level.
*/
def reduceByKey(func: JFunction2[V, V, V]): JavaPairRDD[K, V] = {
  fromRDD(reduceByKey(defaultPartitioner(rdd), func))
}

def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = {
  new JavaPairRDD[K, V](rdd)
}
Code implementation:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/11/10 8:40
 * @Description:
 **/
public class ReduceByKeyJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("reduceByKey");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> stringJavaRDD = sc.textFile("in/sample.txt");
        // split each line into words and wrap each word as (word, 1)
        JavaRDD<Tuple2<String, Integer>> tuple2JavaRDD = stringJavaRDD.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
                String[] split = s.split("\\s+");
                ArrayList<Tuple2<String, Integer>> tuple2 = new ArrayList<>();
                for (int i = 0; i < split.length; i++) {
                    Tuple2<String, Integer> tp2 = new Tuple2<>(split[i], 1);
                    tuple2.add(tp2);
                }
                return tuple2.iterator();
            }
        });
        JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(tuple2JavaRDD);
        // sum the counts of each word
        JavaPairRDD<String, Integer> reduceByKeyRdd = javaPairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // first way to print: collect() returns a List of Tuple2 records
        List<Tuple2<String, Integer>> collect = reduceByKeyRdd.collect();
        for (Tuple2<String, Integer> stringIntegerTuple2 : collect) {
            System.out.println(stringIntegerTuple2);
        }
        // second way to print: since the records are key-value pairs, they can be collected into a Map
        System.out.println("--------------------------");
        Map<String, Integer> map = reduceByKeyRdd.collectAsMap();
        for (String s : map.keySet()) {
            System.out.println("(" + s + "," + map.get(s) + ")");
        }
    }
}
foldByKey:
This function folds (merges) the values of an RDD[(K, V)] by key K. The zeroValue parameter is the initial value of the fold: within each partition, the first value of a key is combined with zeroValue by the fold function, and the remaining values, and afterwards the per-partition results, are folded with the same function. Because zeroValue can therefore be folded in more than once (once per key per partition), it should be a neutral element of the function (0 for addition, 1 for multiplication, Nil for list concatenation); otherwise the result depends on the partitioning.
Unlike reduceByKey, the element foldByKey starts folding from is not the first element of the data set but the zeroValue that is passed in.
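A minimal sketch of that per-partition behavior, assuming a local SparkContext named sc; the comments show why a non-neutral zeroValue makes the result depend on the number of partitions:
val pairs = Seq(("A", 0), ("A", 2))

// one partition: zeroValue = 2 is folded in once for key "A": (2 + 0) + 2 = 4
sc.parallelize(pairs, 1).foldByKey(2)(_ + _).collect()   // Array((A,4))

// two partitions (one record each): zeroValue is folded in once per partition: (2 + 0) + (2 + 2) = 6
sc.parallelize(pairs, 2).foldByKey(2)(_ + _).collect()   // Array((A,6))

// with the neutral element 0, the number of partitions does not matter
sc.parallelize(pairs, 2).foldByKey(0)(_ + _).collect()   // Array((A,2))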
In Scala, in PairRDDFunctions:
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(
    zeroValue: V,
    partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  // Serialize the zero value to a byte array so that we can get a new clone of it on each key
  val zeroBuffer = SparkEnv.get.serializer.newInstance().serialize(zeroValue)
  val zeroArray = new Array[Byte](zeroBuffer.limit)
  zeroBuffer.get(zeroArray)
  // When deserializing, use a lazy val to create just one instance of the serializer per task
  lazy val cachedSerializer = SparkEnv.get.serializer.newInstance()
  val createZero = () => cachedSerializer.deserialize[V](ByteBuffer.wrap(zeroArray))
  val cleanedFunc = self.context.clean(func)
  combineByKeyWithClassTag[V]((v: V) => cleanedFunc(createZero(), v),
    cleanedFunc, cleanedFunc, partitioner)
}
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  foldByKey(zeroValue, new HashPartitioner(numPartitions))(func)
}
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  foldByKey(zeroValue, defaultPartitioner(self))(func)
}
// The method that is ultimately called: combineByKeyWithClassTag (the inline comments show which arguments foldByKey passes in)
def combineByKeyWithClassTag[C](
    createCombiner: V => C,      // (v: V) => cleanedFunc(createZero(), v)
    mergeValue: (C, V) => C,     // cleanedFunc = self.context.clean(func)
    mergeCombiners: (C, C) => C, // cleanedFunc = self.context.clean(func)
    partitioner: Partitioner,    // partitioner
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
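Substituting the arguments that foldByKey passes in, foldByKey is just a specialization of combineByKey: the combiner of a key is created by folding the zero value with the first value seen in a partition, and everything else is merged with the fold function itself. A minimal sketch of the same thing written against the public combineByKey API, assuming a local SparkContext named sc:
val pairs = sc.makeRDD(Array(("A", 0), ("A", 2), ("B", 1), ("B", 2), ("C", 1)))

// pairs.foldByKey(0)(_ + _) is equivalent to:
val folded = pairs.combineByKey(
  (v: Int) => 0 + v,             // createCombiner: fold the zero value with the first value of a key in a partition
  (c: Int, v: Int) => c + v,     // mergeValue: fold the remaining values within the partition
  (c1: Int, c2: Int) => c1 + c2  // mergeCombiners: merge the per-partition results
)
folded.collect()                  // same result as pairs.foldByKey(0)(_ + _): Array((A,2), (B,3), (C,1))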
Code:
scala> var rdd1 = sc.makeRDD(Array(("A",0),("A",2),("B",1),("B",2),("C",1)))
scala> rdd1.foldByKey(0)(_+_).collect
res75: Array[(String, Int)] = Array((A,2), (B,3), (C,1))
//Sum the values of each key in rdd1. With zeroValue=0 and addition as the fold function, the fold of each key starts from 0
//(once per partition): for ("A",0) and ("A",2) this gives 0+0 and 0+2, which are then merged into (A,0+2), i.e. (A,2).
//Because 0 is the neutral element of addition, the zero value has no effect on the result.
scala> rdd1.foldByKey(2)(_+_).collect
res76: Array[(String, Int)] = Array((A,6), (B,7), (C,3))
//With zeroValue=2 the result shows the zero value was folded in twice, once per partition holding an "A" record:
//("A",0+2) and ("A",2+2), i.e. ("A",2) and ("A",4), which merge into (A,2+4), i.e. (A,6).
//A non-neutral zero value therefore makes the result depend on how the data is partitioned.
scala> rdd1.foldByKey(0)(_*_).collect
res77: Array[(String, Int)] = Array((A,0), (B,0), (C,0))
//Here the fold function is multiplication but zeroValue is still 0, so every fold starts from 0: ("A",0*0) and ("A",2*0),
//i.e. ("A",0) and ("A",0), which merge into (A,0*0), i.e. (A,0).
//The same happens for every other key, so all values end up as 0.
scala> rdd1.foldByKey(1)(_*_).collect
res78: Array[(String, Int)] = Array((A,0), (B,2), (C,1))
//When the fold function is multiplication, zeroValue must be set to 1 (the neutral element of multiplication) to get the expected result.
In Java:
In JavaPairRDD:
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(zeroValue: V, partitioner: Partitioner, func: JFunction2[V, V, V])
  : JavaPairRDD[K, V] = fromRDD(rdd.foldByKey(zeroValue, partitioner)(func))
This overload simply delegates to the Scala PairRDDFunctions.foldByKey(zeroValue, partitioner)(func) implementation shown above.
Java code implementation:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Map;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/11/10 9:51
 * @Description:
 **/
public class FoldByKeyJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("foldbykey");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Tuple2<String, Integer>> rdd = sc.parallelize(Arrays.asList(
                new Tuple2<>("A", 2),
                new Tuple2<>("A", 3),
                new Tuple2<>("A", 4),
                new Tuple2<>("B", 5),
                new Tuple2<>("A", 6)
        ));
        JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(rdd);
        // fold the values of each key with addition; zeroValue = 1 is folded in once per key per partition
        JavaPairRDD<String, Integer> foldByKeyRDD = javaPairRDD.foldByKey(1, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                System.out.println("v1 = " + v1 + ", v2 = " + v2);
                return v1 + v2;
            }
        });
        Map<String, Integer> map = foldByKeyRDD.collectAsMap();
        for (String s : map.keySet()) {
            System.out.println("(" + s + "," + map.get(s) + ")");
        }
    }
}
sortByKey
sortByKey sorts a pair RDD by key. Its first parameter (ascending) can be set to true or false; the default is true, i.e. ascending order.
/**
* Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling
* `collect` or `save` on the resulting RDD will return or output an ordered list of records
* (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in
* order of the keys).
*/
// TODO: this currently doesn't work on P other than Tuple2!
def sortByKey(ascending: Boolean = true, numPartitions: Int = self.partitions.length)
    : RDD[(K, V)] = self.withScope
{
  val part = new RangePartitioner(numPartitions, self, ascending)
  new ShuffledRDD[K, V, V](self, part)
    .setKeyOrdering(if (ascending) ordering else ordering.reverse)
}
Scala code:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object SortByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sortbykey")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd: RDD[(String, Int)] = sc.parallelize(List(("A", 1), ("B", 2), ("C", 3), ("D", 4), ("E", 5)))
    // false = sort the keys in descending order
    val rdd2: RDD[(String, Int)] = rdd.sortByKey(false)
    rdd2.collect.foreach(println)
  }
}
Note: the result is a ShuffledRDD, so sortByKey is a transformation operator (although, strictly speaking, building its RangePartitioner samples the input keys, which already triggers a small job before any action is called).
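A minimal sketch of inspecting the result, reusing rdd2 from the example above (the exact partitioner string will differ):
// the sorted RDD carries a RangePartitioner
println(rdd2.partitioner)        // Some(org.apache.spark.RangePartitioner@...)

// collect() returns the records in key order
rdd2.collect.foreach(println)    // (E,5) (D,4) (C,3) (B,2) (A,1)

// collectAsMap() returns an unordered Map, so the sort order is lost
println(rdd2.collectAsMap())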
Java implementation:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/11/10 11:06
 * @Description:
 **/
public class SortByKeyJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("sortbykey");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Tuple2<String, Integer>> rdd = sc.parallelize(Arrays.asList(
                new Tuple2<>("A", 1),
                new Tuple2<>("C", 2),
                new Tuple2<>("D", 3),
                new Tuple2<>("E", 4)));
        JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(rdd);
        // sort by key in ascending order (the default)
        JavaPairRDD<String, Integer> sortByKeyRdd = javaPairRDD.sortByKey();
        // use collect() here: collectAsMap() would return an unordered Map and lose the sort order
        List<Tuple2<String, Integer>> collect = sortByKeyRdd.collect();
        for (Tuple2<String, Integer> t : collect) {
            System.out.println("(" + t._1() + "," + t._2() + ")");
        }
    }
}
Two ways to convert a JavaRDD into a JavaPairRDD:
The first, using mapToPair:
// here rdd is assumed to be a JavaRDD<Tuple2<String, Integer>>
JavaPairRDD<String, Integer> javaPairRDD = rdd.mapToPair(new PairFunction<Tuple2<String, Integer>, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
        return stringIntegerTuple2;
    }
});
The second, using JavaPairRDD.fromJavaRDD:
JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(rdd);
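For comparison, no explicit conversion is needed on the Scala side: the pair operations (reduceByKey, foldByKey, sortByKey, ...) are added to any RDD[(K, V)] by an implicit conversion to PairRDDFunctions (defined on the RDD companion object in current Spark versions). A minimal sketch, assuming a local SparkContext named sc:
import org.apache.spark.rdd.RDD

val pairs: RDD[(String, Int)] = sc.parallelize(Seq(("A", 1), ("B", 2), ("A", 3)))
// the pair-RDD operations are available directly through the implicit PairRDDFunctions
pairs.reduceByKey(_ + _).collect()   // e.g. Array((A,4), (B,2))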