问题来源:join 操作何时是宽依赖,何时是窄依赖
测试代码:
/**
 * Demo driver that runs the same pair-RDD join twice on a local Spark context:
 * once directly on two RDDs built with `makeRDD`, and once after both sides have
 * been re-partitioned with `HashPartitioner(3)`. For each result it prints the
 * lineage (`toDebugString`) and the dependency list so the two can be compared.
 */
object JoinDemo {
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "D:\\hadoop-2.9.2")

    // `init` drops the last character of the canonical name
    // (presumably the trailing '$' of the object's class name).
    val appName = this.getClass.getCanonicalName.init
    val sparkConf: SparkConf = new SparkConf().setAppName(appName).setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")

    // Left side: 49 users, each tagged with a random key in [0, 10).
    val rng: Random.type = scala.util.Random
    val users: immutable.IndexedSeq[(Int, String)] =
      for (idx <- Range(1, 50)) yield (rng.nextInt(10), s"user$idx")

    // Right side: key -> city. Key 0 appears twice and key 9 does not appear.
    val cities = Array(
      (0, "BJ"), (1, "SH"), (2, "GZ"), (3, "SZ"), (4, "TJ"),
      (5, "CQ"), (6, "HZ"), (7, "NJ"), (8, "WH"), (0, "CD")
    )

    val rdd1: RDD[(Int, String)] = sc.makeRDD(users)
    val rdd2: RDD[(Int, String)] = sc.makeRDD(cities)

    // Case 1: plain join. The partitioner is chosen by defaultPartitioner.
    val plainJoin: RDD[(Int, (String, String))] = rdd1.join(rdd2)
    println(plainJoin.toDebugString)
    println(plainJoin.dependencies.toBuffer)
    plainJoin.count()

    // Case 2: both sides go through partitionBy (which itself shuffles) before the join.
    val prePartitionedJoin: RDD[(Int, (String, String))] =
      rdd1.partitionBy(new HashPartitioner(3))
        .join(rdd2.partitionBy(new HashPartitioner(3)))
    prePartitionedJoin.count()
    println(prePartitionedJoin.toDebugString)
    println(prePartitionedJoin.dependencies.mkString(","))

    // Long pause before shutting down — presumably to keep the application alive
    // so the stage graphs can be inspected in the Spark UI.
    Thread.sleep(5000000L)
    sc.stop()
  }
}
先运行代码看最后的结果 然后通过结果反推源码
这是 rdd1.join(rdd2) 的 stage 划分图:可以看到有明显的 stage 切分,也就是宽依赖。
这是 rdd1.partitionBy(new HashPartitioner(3)).join(rdd2.partitionBy(new HashPartitioner(3))) 的 stage 划分图:两边都先用 HashPartitioner(3) 做了 partitionBy(partitionBy 本身会产生 shuffle),可以看到之后的 join 变为窄依赖了。
进入源码:可以看到这里基本没有什么操作,只是通过 defaultPartitioner() 选出默认的分区器后调用 join 的重载方法。下面先进入 defaultPartitioner() 方法:
/**
 * Joins this pair RDD with `other` without an explicit partitioner.
 *
 * Delegates to the two-argument `join`, passing the partitioner returned by
 * `defaultPartitioner(self, other)`.
 *
 * @param other the RDD to join with, keyed by the same key type `K`
 * @return an RDD of `(key, (leftValue, rightValue))` pairs
 */
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))] = self.withScope {
// No partitioner is supplied by the caller, so one is selected from the two input RDDs.
join(other, defaultPartitioner(self, other))
}
/**
 * Chooses the partitioner to use for an operation over `rdd` and `others`.
 *
 * Either reuses the partitioner of the input RDD that has the most partitions
 * among those already carrying a partitioner, or falls back to a new
 * `HashPartitioner` sized by the default number of partitions.
 *
 * @param rdd    the first input RDD; its context's conf is consulted for
 *               `spark.default.parallelism`
 * @param others the remaining input RDDs
 * @return the existing partitioner when it qualifies, otherwise
 *         `new HashPartitioner(defaultNumPartitions)`
 */
def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
val rdds = (Seq(rdd) ++ others)
// Keep only the input RDDs that already have a partitioner with numPartitions > 0.
val hasPartitioner = rdds.filter(_.partitioner.exists(_.numPartitions > 0))
// Among those, pick the RDD with the largest number of partitions (None if there is none).
val hasMaxPartitioner: Option[RDD[_]] = if (hasPartitioner.nonEmpty) {
Some(hasPartitioner.maxBy(_.partitions.length))
} else {
None
}
// If spark.default.parallelism is present in the conf, use the context's defaultParallelism;
// otherwise use the largest partition count among all input RDDs.
val defaultNumPartitions = if (rdd.context.conf.contains("spark.default.parallelism")) {
rdd.context.defaultParallelism
} else {
rdds.map(_.partitions.length).max
}
// If the existing max partitioner is an eligible one, or its partitions number is larger
// than the default number of partitions, use the existing partitioner.
// In other words: some input RDD must already have a partitioner, and that RDD must either
// pass isEligiblePartitioner or have more partitions than defaultNumPartitions.
// When the condition holds, that RDD's partitioner is returned as-is;
// otherwise a new HashPartitioner with defaultNumPartitions partitions is created.
if (hasMaxPartitioner.nonEmpty && (isEligiblePartitioner(hasMaxPartitioner.get, rdds) ||
defaultNumPartitions < hasMaxPartitioner.get.getNumPartitions)) {
hasMaxPartitioner.get.partitioner.get
} else {
new HashPartitioner(defaultNumPartitions)
}
}
/**
 * Decides whether the partitioner of `hasMaxPartitioner` may be reused.
 *
 * Returns true when the largest partition count among `rdds` is less than one
 * order of magnitude above the partition count of `hasMaxPartitioner`
 * (the difference of their base-10 logarithms is below 1).
 *
 * @param hasMaxPartitioner the candidate RDD whose partitioner would be reused
 * @param rdds              all input RDDs of the operation
 */
private def isEligiblePartitioner(
hasMaxPartitioner: RDD[_],
rdds: Seq[RDD[_]]): Boolean = {
// Largest partition count over every input RDD, with or without a partitioner.
val maxPartitions = rdds.map(_.partitions.length).max
log10(maxPartitions) - log10(hasMaxPartitioner.getNumPartitions) < 1
}
}
源码继续往下走:获取默认的分区器后,进入 join 的重载方法。其中最重要的是 cogroup 方法里创建的 CoGroupedRDD 对象:new CoGroupedRDD[K](Seq(self, other), partitioner)
/**
 * Joins this pair RDD with `other` using the given partitioner.
 *
 * The two RDDs are first cogrouped by key; then, for each key, every
 * combination of a value from this RDD and a value from `other` is emitted.
 * A key whose value group is empty on either side therefore produces no output.
 *
 * @param other       the RDD to join with
 * @param partitioner the partitioner handed to `cogroup`
 * @return an RDD of `(key, (leftValue, rightValue))` pairs
 */
def join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))] = self.withScope {
this.cogroup(other, partitioner).flatMapValues( pair =>
// pair._1 holds this RDD's values for the key, pair._2 holds other's values.
for (v <- pair._1.iterator; w <- pair._2.iterator) yield (v, w)
)
}
/**
 * Groups the values of this RDD and `other` by key using the given partitioner.
 *
 * @param other       the second RDD, keyed by the same key type `K`
 * @param partitioner the partitioner passed on to the `CoGroupedRDD`
 * @return for each key, the values from this RDD and the values from `other`
 * @throws SparkException if `partitioner` is a `HashPartitioner` and the key class is an array
 */
def cogroup[W](other: RDD[(K, W)], partitioner: Partitioner)
: RDD[(K, (Iterable[V], Iterable[W]))] = self.withScope {
if (partitioner.isInstanceOf[HashPartitioner] && keyClass.isArray) {
throw new SparkException("HashPartitioner cannot partition array keys.")
}
// `partitioner` is whatever the caller passed in; on the join path shown in this article
// it is the partitioner selected by defaultPartitioner.
val cg = new CoGroupedRDD[K](Seq(self, other), partitioner)
// Each cogrouped value is a two-element array: [values from self, values from other].
cg.mapValues { case Array(vs, w1s) =>
(vs.asInstanceOf[Iterable[V]], w1s.asInstanceOf[Iterable[W]])
}
}
下面 CoGroupedRDD 的 getDependencies 方法,就是决定 join 是宽依赖还是窄依赖的地方:
/**
 * Builds one dependency per parent RDD in `rdds`.
 *
 * A parent whose partitioner equals this RDD's partitioner `part` gets a
 * `OneToOneDependency` (narrow); every other parent gets a `ShuffleDependency` (wide).
 */
override def getDependencies: Seq[Dependency[_]] = {
rdds.map { rdd: RDD[_] =>
// The check compares the parent's partitioner itself with `part` (Option equality),
// not merely the number of partitions. Equal => narrow dependency, otherwise => shuffle.
if (rdd.partitioner == Some(part)) {
logDebug("Adding one-to-one dependency with " + rdd)
new OneToOneDependency(rdd)
} else {
logDebug("Adding shuffle dependency with " + rdd)
new ShuffleDependency[K, Any, CoGroupCombiner](
rdd.asInstanceOf[RDD[_ <: Product2[K, _]]], part, serializer)
}
}
}
到此对问题的回答也就结束了