reduceByKey
reduceByKey takes a binary function and reduces the values that share the same key with it, similar to Scala's reduce operation on a collection.
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ReduceByKeyDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("reducebykey11").setMaster("local[*]")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd: RDD[String] = sc.textFile("in/sample.txt")
    // split each line into words
    val rdd2: RDD[String] = rdd.flatMap(x => x.split("\\s+"))
    // map each word to (word, 1)
    val rdd3: RDD[(String, Int)] = rdd2.map((_, 1))
    // sum the counts of each word
    val rdd4: RDD[(String, Int)] = rdd3.reduceByKey(_ + _)
    rdd4.collect.foreach(println)
  }
}
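reduceByKey also has overloads that take a target number of output partitions or an explicit Partitioner (the partitioner/parallelism mentioned in the source comment below). A minimal sketch reusing rdd3 from the example above, where 4 is just a hypothetical partition count:
import org.apache.spark.HashPartitioner

// the same word count, but explicitly asking for 4 output partitions
val byNum: RDD[(String, Int)] = rdd3.reduceByKey(_ + _, 4)

// or passing a Partitioner explicitly
val byPart: RDD[(String, Int)] = rdd3.reduceByKey(new HashPartitioner(4), _ + _)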
Java version:
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
* parallelism level.
*/
def reduceByKey(func: JFunction2[V, V, V]): JavaPairRDD[K, V] = {
  fromRDD(reduceByKey(defaultPartitioner(rdd), func))
}

def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = {
  new JavaPairRDD[K, V](rdd)
}
Code implementation:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/11/10 8:40
 * @Description:
 **/
public class ReduceByKeyJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("reduceByKey");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> stringJavaRDD = sc.textFile("in/sample.txt");
        // split each line into words and wrap each word as (word, 1)
        JavaRDD<Tuple2<String, Integer>> tuple2JavaRDD = stringJavaRDD.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
                String[] split = s.split("\\s+");
                ArrayList<Tuple2<String, Integer>> tuple2 = new ArrayList<>();
                for (int i = 0; i < split.length; i++) {
                    Tuple2<String, Integer> tp2 = new Tuple2<>(split[i], 1);
                    tuple2.add(tp2);
                }
                return tuple2.iterator();
            }
        });
        JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(tuple2JavaRDD);
        // sum the counts of each word
        JavaPairRDD<String, Integer> reduceByKeyRdd = javaPairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // first way to print: collect() returns a List of Tuple2 records
        List<Tuple2<String, Integer>> collect = reduceByKeyRdd.collect();
        for (Tuple2<String, Integer> stringIntegerTuple2 : collect) {
            System.out.println(stringIntegerTuple2);
        }
        // second way to print: since the records are key-value pairs, they can be collected into a Map
        System.out.println("--------------------------");
        Map<String, Integer> map = reduceByKeyRdd.collectAsMap();
        for (String s : map.keySet()) {
            System.out.println("(" + s + "," + map.get(s) + ")");
        }
    }
}
foldByKey:
This function folds (merges) the values of an RDD[(K, V)] by key K. The zeroValue parameter is the initial value of the fold: within each partition, the first value of a key is combined with zeroValue by the fold function, and the remaining values, and afterwards the per-partition results, are folded with the same function. Because zeroValue can therefore be folded in more than once (once per key per partition), it should be a neutral element of the function (0 for addition, 1 for multiplication, Nil for list concatenation); otherwise the result depends on the partitioning.
Unlike reduceByKey, the element foldByKey starts folding from is not the first element of the data set but the zeroValue that is passed in.
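A minimal sketch of that per-partition behavior, assuming a local SparkContext named sc; the comments show why a non-neutral zeroValue makes the result depend on the number of partitions:
val pairs = Seq(("A", 0), ("A", 2))

// one partition: zeroValue = 2 is folded in once for key "A": (2 + 0) + 2 = 4
sc.parallelize(pairs, 1).foldByKey(2)(_ + _).collect()   // Array((A,4))

// two partitions (one record each): zeroValue is folded in once per partition: (2 + 0) + (2 + 2) = 6
sc.parallelize(pairs, 2).foldByKey(2)(_ + _).collect()   // Array((A,6))

// with the neutral element 0, the number of partitions does not matter
sc.parallelize(pairs, 2).foldByKey(0)(_ + _).collect()   // Array((A,2))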
In Scala, in PairRDDFunctions:
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(
    zeroValue: V,
    partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  // Serialize the zero value to a byte array so that we can get a new clone of it on each key
  val zeroBuffer = SparkEnv.get.serializer.newInstance().serialize(zeroValue)
  val zeroArray = new Array[Byte](zeroBuffer.limit)
  zeroBuffer.get(zeroArray)
  // When deserializing, use a lazy val to create just one instance of the serializer per task
  lazy val cachedSerializer = SparkEnv.get.serializer.newInstance()
  val createZero = () => cachedSerializer.deserialize[V](ByteBuffer.wrap(zeroArray))
  val cleanedFunc = self.context.clean(func)
  combineByKeyWithClassTag[V]((v: V) => cleanedFunc(createZero(), v),
    cleanedFunc, cleanedFunc, partitioner)
}
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  foldByKey(zeroValue, new HashPartitioner(numPartitions))(func)
}
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  foldByKey(zeroValue, defaultPartitioner(self))(func)
}
// The method that is ultimately called: combineByKeyWithClassTag (the inline comments show which arguments foldByKey passes in)
def combineByKeyWithClassTag[C](
    createCombiner: V => C,      // (v: V) => cleanedFunc(createZero(), v)
    mergeValue: (C, V) => C,     // cleanedFunc = self.context.clean(func)
    mergeCombiners: (C, C) => C, // cleanedFunc = self.context.clean(func)
    partitioner: Partitioner,    // partitioner
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
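Substituting the arguments that foldByKey passes in, foldByKey is just a specialization of combineByKey: the combiner of a key is created by folding the zero value with the first value seen in a partition, and everything else is merged with the fold function itself. A minimal sketch of the same thing written against the public combineByKey API, assuming a local SparkContext named sc:
val pairs = sc.makeRDD(Array(("A", 0), ("A", 2), ("B", 1), ("B", 2), ("C", 1)))

// pairs.foldByKey(0)(_ + _) is equivalent to:
val folded = pairs.combineByKey(
  (v: Int) => 0 + v,             // createCombiner: fold the zero value with the first value of a key in a partition
  (c: Int, v: Int) => c + v,     // mergeValue: fold the remaining values within the partition
  (c1: Int, c2: Int) => c1 + c2  // mergeCombiners: merge the per-partition results
)
folded.collect()                  // same result as pairs.foldByKey(0)(_ + _): Array((A,2), (B,3), (C,1))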
Code:
scala> var rdd1 = sc.makeRDD(Array(("A",0),("A",2),("B",1),("B",2),("C",1)))
scala> rdd1.foldByKey(0)(_+_).collect
res75: Array[(String, Int)] = Array((A,2), (B,3), (C,1))
//Sum the values of each key in rdd1. With zeroValue=0 and addition as the fold function, the fold of each key starts from 0
//(once per partition): for ("A",0) and ("A",2) this gives 0+0 and 0+2, which are then merged into (A,0+2), i.e. (A,2).
//Because 0 is the neutral element of addition, the zero value has no effect on the result.
scala> rdd1.foldByKey(2)(_+_).collect
res76: Array[(String, Int)] = Array((A,6), (B,7), (C,3))
//With zeroValue=2 the result shows the zero value was folded in twice, once per partition holding an "A" record:
//("A",0+2) and ("A",2+2), i.e. ("A",2) and ("A",4), which merge into (A,2+4), i.e. (A,6).
//A non-neutral zero value therefore makes the result depend on how the data is partitioned.
scala> rdd1.foldByKey(0)(_*_).collect
res77: Array[(String, Int)] = Array((A,0), (B,0), (C,0))
//Here the fold function is multiplication but zeroValue is still 0, so every fold starts from 0: ("A",0*0) and ("A",2*0),
//i.e. ("A",0) and ("A",0), which merge into (A,0*0), i.e. (A,0).
//The same happens for every other key, so all values end up as 0.
scala> rdd1.foldByKey(1)(_*_).collect
res78: Array[(String, Int)] = Array((A,0), (B,2), (C,1))
//When the fold function is multiplication, zeroValue must be set to 1 (the neutral element of multiplication) to get the expected result.
In Java:
In JavaPairRDD:
/**
* Merge the values for each key using an associative function and a neutral "zero value" which
* may be added to the result an arbitrary number of times, and must not change the result
* (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
*/
def foldByKey(zeroValue: V, partitioner: Partitioner, func: JFunction2[V, V, V])
  : JavaPairRDD[K, V] = fromRDD(rdd.foldByKey(zeroValue, partitioner)(func))
This overload simply delegates to the Scala PairRDDFunctions.foldByKey(zeroValue, partitioner)(func) implementation shown above.
Java code implementation:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Map;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/11/10 9:51
 * @Description:
 **/
public class FoldByKeyJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("foldbykey");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Tuple2<String, Integer>> rdd = sc.parallelize(Arrays.asList(
                new Tuple2<>("A", 2),
                new Tuple2<>("A", 3),
                new Tuple2<>("A", 4),
                new Tuple2<>("B", 5),
                new Tuple2<>("A", 6)
        ));
        JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(rdd);
        // fold the values of each key with addition; zeroValue = 1 is folded in once per key per partition
        JavaPairRDD<String, Integer> foldByKeyRDD = javaPairRDD.foldByKey(1, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                System.out.println("v1 = " + v1 + ", v2 = " + v2);
                return v1 + v2;
            }
        });
        Map<String, Integer> map = foldByKeyRDD.collectAsMap();
        for (String s : map.keySet()) {
            System.out.println("(" + s + "," + map.get(s) + ")");
        }
    }
}
sortByKey
sortByKey sorts a pair RDD by key. Its first parameter (ascending) can be set to true or false; the default is true, i.e. ascending order.
/**
* Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling
* `collect` or `save` on the resulting RDD will return or output an ordered list of records
* (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in
* order of the keys).
*/
// TODO: this currently doesn't work on P other than Tuple2!
def sortByKey(ascending: Boolean = true, numPartitions: Int = self.partitions.length)
    : RDD[(K, V)] = self.withScope
{
  val part = new RangePartitioner(numPartitions, self, ascending)
  new ShuffledRDD[K, V, V](self, part)
    .setKeyOrdering(if (ascending) ordering else ordering.reverse)
}
Scala code:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object SortByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sortbykey")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd: RDD[(String, Int)] = sc.parallelize(List(("A", 1), ("B", 2), ("C", 3), ("D", 4), ("E", 5)))
    // false = sort the keys in descending order
    val rdd2: RDD[(String, Int)] = rdd.sortByKey(false)
    rdd2.collect.foreach(println)
  }
}
Note: the result is a ShuffledRDD, so sortByKey is a transformation operator (although, strictly speaking, building its RangePartitioner samples the input keys, which already triggers a small job before any action is called).
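A minimal sketch of inspecting the result, reusing rdd2 from the example above (the exact partitioner string will differ):
// the sorted RDD carries a RangePartitioner
println(rdd2.partitioner)        // Some(org.apache.spark.RangePartitioner@...)

// collect() returns the records in key order
rdd2.collect.foreach(println)    // (E,5) (D,4) (C,3) (B,2) (A,1)

// collectAsMap() returns an unordered Map, so the sort order is lost
println(rdd2.collectAsMap())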
Java implementation:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/11/10 11:06
 * @Description:
 **/
public class SortByKeyJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("sortbykey");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Tuple2<String, Integer>> rdd = sc.parallelize(Arrays.asList(
                new Tuple2<>("A", 1),
                new Tuple2<>("C", 2),
                new Tuple2<>("D", 3),
                new Tuple2<>("E", 4)));
        JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(rdd);
        // sort by key in ascending order (the default)
        JavaPairRDD<String, Integer> sortByKeyRdd = javaPairRDD.sortByKey();
        // use collect() here: collectAsMap() would return an unordered Map and lose the sort order
        List<Tuple2<String, Integer>> collect = sortByKeyRdd.collect();
        for (Tuple2<String, Integer> t : collect) {
            System.out.println("(" + t._1() + "," + t._2() + ")");
        }
    }
}
Two ways to convert a JavaRDD into a JavaPairRDD:
The first, using mapToPair:
// here rdd is assumed to be a JavaRDD<Tuple2<String, Integer>>
JavaPairRDD<String, Integer> javaPairRDD = rdd.mapToPair(new PairFunction<Tuple2<String, Integer>, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
        return stringIntegerTuple2;
    }
});
The second, using JavaPairRDD.fromJavaRDD:
JavaPairRDD<String, Integer> javaPairRDD = JavaPairRDD.fromJavaRDD(rdd);
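For comparison, no explicit conversion is needed on the Scala side: the pair operations (reduceByKey, foldByKey, sortByKey, ...) are added to any RDD[(K, V)] by an implicit conversion to PairRDDFunctions (defined on the RDD companion object in current Spark versions). A minimal sketch, assuming a local SparkContext named sc:
import org.apache.spark.rdd.RDD

val pairs: RDD[(String, Int)] = sc.parallelize(Seq(("A", 1), ("B", 2), ("A", 3)))
// the pair-RDD operations are available directly through the implicit PairRDDFunctions
pairs.reduceByKey(_ + _).collect()   // e.g. Array((A,4), (B,2))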