Shuffle operator classification
- Repartition operators
  - repartition
  - coalesce
- ByKey operators
  - groupByKey
  - reduceByKey
  - aggregateByKey
  - combineByKey
  - sortByKey
  - sortBy
- Join operators
  - cogroup
  - join
  - leftOuterJoin
  - intersection
  - subtract
  - subtractByKey
- Deduplication operators
  - distinct
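To put the classification in context, here is a minimal sketch (assuming an active SparkContext named sc; the data is made up) that touches one operator from each group:

// Illustrative only: one operator from each category above.
val words = sc.parallelize(Seq("spark", "hive", "spark", "flink"), 4)
val pairs = words.map((_, 1))

val repartitioned = pairs.repartition(2)        // repartition operator
val counts        = pairs.reduceByKey(_ + _)    // ByKey operator
val joined        = counts.join(counts)         // join operator
val uniqueWords   = words.distinct()            // deduplication operator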
Why don't shuffle operators always produce a wide dependency?
groupByKey
- Source code
- The combineByKeyWithClassTag method
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  // Check whether the RDD's existing partitioner matches the one passed in;
  // if it does, combine within each partition via mapPartitions (no shuffle, narrow dependency)
  if (self.partitioner == Some(partitioner)) {
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // Partitioners differ: build a ShuffledRDD, which introduces a ShuffleDependency (wide dependency)
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
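To make the three functions above concrete, here is a minimal sketch (the pair RDD and the (sum, count) combiner type are assumptions, not from the original) that computes a per-key average with combineByKey, which delegates to combineByKeyWithClassTag:

// createCombiner / mergeValue / mergeCombiners correspond to the parameters above.
val pairs = sc.parallelize(Seq(("a", 1), ("a", 3), ("b", 2)), 2)

val sumAndCount = pairs.combineByKey(
  (v: Int) => (v, 1),                                           // createCombiner: first value seen for a key
  (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),        // mergeValue: fold a value into the combiner
  (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)  // mergeCombiners: merge combiners across partitions
)

val avgByKey = sumAndCount.mapValues { case (sum, count) => sum.toDouble / count }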
From the source above we can see that when the parent RDD's partitioner matches the one passed in, groupByKey takes the narrow-dependency path. How is that match established? Some operators accept a Partitioner argument; it can be one of Spark's built-in partitioners, such as RangePartitioner or HashPartitioner, or a custom Partitioner.
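As a rough sketch of the custom case (the class name and routing rule below are invented for illustration), a Partitioner only needs numPartitions and getPartition. Note that the narrow-dependency branch hinges on partitioner equality, so either reuse the same instance or override equals/hashCode:

import org.apache.spark.Partitioner

// Hypothetical partitioner: keys starting with "k" go to partition 0,
// everything else is hashed into the available partitions.
class FirstLetterPartitioner(parts: Int) extends Partitioner {
  override def numPartitions: Int = parts
  override def getPartition(key: Any): Int = key match {
    case s: String if s.startsWith("k") => 0
    case other =>
      val mod = other.hashCode % parts
      if (mod < 0) mod + parts else mod
  }
  // Equality matters: self.partitioner == Some(partitioner) in the source above
  // only holds if the two partitioners compare equal.
  override def equals(obj: Any): Boolean = obj match {
    case p: FirstLetterPartitioner => p.numPartitions == numPartitions
    case _ => false
  }
  override def hashCode(): Int = numPartitions
}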
Here is a demo:
val rdd1: RDD[String] = sc.parallelize(List(
  "spark", "hadoop", "hive", "spark",
  "spark", "flink", "hive", "spark",
  "kafka", "kafka", "kafka", "kafka",
  "hadoop", "flink", "hive", "flink"
), 4)

rdd1.map((_, 1))
  .partitionBy(new HashPartitioner(4))
  // filter preserves the partitioner; the always-false predicate means nothing
  // is printed, but the job still runs and the DAG is still built
  .filter(x => false)
  .groupByKey(new HashPartitioner(4))
  .foreach(println)
From the DAG in the Spark UI we can see that this groupByKey produces a narrow dependency: partitionBy already applied HashPartitioner(4), filter preserves it, and groupByKey asks for an equal HashPartitioner(4), so no shuffle is needed.
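Besides reading the DAG, you can also check the dependency type programmatically (a small sketch reusing rdd1 from the demo above; the expected output follows from the branch in combineByKeyWithClassTag):

// With matching partitioners, groupByKey goes down the mapPartitions branch,
// so the resulting RDD has a OneToOneDependency (narrow) on its parent.
val grouped = rdd1.map((_, 1))
  .partitionBy(new HashPartitioner(4))
  .groupByKey(new HashPartitioner(4))

grouped.dependencies.foreach(d => println(d.getClass.getSimpleName))
// Prints OneToOneDependency here; drop the partitionBy and it becomes ShuffleDependency.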
cartesian
def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)] = withScope {
  new CartesianRDD(sc, this, other)
}

class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[(T, U)](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  // ...

  // From this we can see that although the Cartesian product is a many-to-many
  // relationship between records, each output partition depends on exactly one
  // partition of rdd1 and one of rdd2, so what is returned is still a narrow dependency.
  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )
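To verify this, here is a small sketch (assuming an active SparkContext sc; the inputs are made up):

// Both dependencies of a cartesian result are NarrowDependency instances,
// so the cross product does not introduce a shuffle stage by itself.
val a = sc.parallelize(1 to 4, 2)
val b = sc.parallelize(Seq("x", "y"), 2)

val cart = a.cartesian(b)
println(cart.partitions.length)   // 2 * 2 = 4 partitions
cart.dependencies.foreach(d => println(d.isInstanceOf[org.apache.spark.NarrowDependency[_]]))
// Expected: true, true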