当调用reduceByKey()、groupByKey()等操作后,通常会伴随着ShuffledRDD的生成(如果原RDD的分区器与指定的分区器相同,则不会生成ShuffledRDD,见下文combineByKey),具体源码如下:
/**
 * Merges the values of each key with `func`, using `partitioner` for the result.
 *
 * Delegates to `combineByKey` with combiner type `V`: the first value seen for a key is used
 * unchanged as the initial combiner, and `func` is used both to merge a value into a
 * combiner and to merge two combiners.
 */
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = {
  // A value is already a valid combiner, so creating one is a pass-through.
  val passThrough = (value: V) => value
  combineByKey[V](passThrough, func, func, partitioner)
}
继续:
/**
 * Generic per-key aggregation from values of type `V` to combiners of type `C`.
 *
 * @param createCombiner  handed to the Aggregator as the function creating a combiner
 * @param mergeValue      handed to the Aggregator as the function merging a value into a combiner
 * @param mergeCombiners  handed to the Aggregator as the function merging two combiners;
 *                        must not be null
 * @param partitioner     partitioner of the result
 * @param mapSideCombine  whether to aggregate by key on the map side before the shuffle
 *                        (default true); forwarded to the ShuffledRDD
 * @param serializer      forwarded to the ShuffledRDD (default null)
 * @throws SparkException if the key class is an array and either map-side combining is
 *                        requested or `partitioner` is a HashPartitioner
 */
def combineByKey[C](createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null): RDD[(K, C)] = {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0

  // Array keys support neither map-side combining nor hash partitioning.
  val keyIsArray = keyClass.isArray
  if (keyIsArray && mapSideCombine) {
    throw new SparkException("Cannot use map-side combining with array keys.")
  }
  if (keyIsArray && partitioner.isInstanceOf[HashPartitioner]) {
    throw new SparkException("Default partitioner cannot partition array keys.")
  }

  val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
  val alreadyPartitioned = self.partitioner == Some(partitioner)
  if (!alreadyPartitioned) {
    // Different (or no) partitioner: create a ShuffledRDD using the requested partitioner.
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  } else {
    // The parent is already partitioned by `partitioner`, so the values of each partition are
    // combined in place and no ShuffledRDD is created.
    self.mapPartitions(partitionIter => {
      val taskContext = TaskContext.get()
      val combined = aggregator.combineValuesByKey(partitionIter, taskContext)
      new InterruptibleIterator(taskContext, combined)
    }, preservesPartitioning = true)
  }
}
ShuffledRDD中关键的代码为:
/** The only dependency of this RDD is a single ShuffleDependency on `prev`. */
override def getDependencies: Seq[Dependency[_]] = {
  val shuffleDep =
    new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)
  List(shuffleDep)
}
/**
 * Computes one output partition by reading it through the shuffle manager.
 *
 * The reader is requested for the partition range starting at `split.index` and ending at
 * `split.index + 1`, so each reduce task reads a single partition.
 */
override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
  val shuffleDep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
  val partitionIndex = split.index
  val reader = SparkEnv.get.shuffleManager
    .getReader(shuffleDep.shuffleHandle, partitionIndex, partitionIndex + 1, context)
  reader.read().asInstanceOf[Iterator[(K, C)]]
}
当action触发runJob时,会调用DAGScheduler里的runJob,具体调用过程为:runJob() → submitJob() → eventProcessActor ! JobSubmitted() → dagScheduler.handleJobSubmitted(),其中最关键的代码为(只关注和ShuffledRDD有关的操作):
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite) // build the final stage (passed None as shuffleDep); newStage also resolves its parent stages, which is where stages get split
继续查看newStage()的具体实现:
/**
 * Creates a Stage for `rdd`, resolving its parent stages first, and records it in
 * `stageIdToStage` and in the job-to-stage bookkeeping.
 *
 * When called from `handleJobSubmitted` for the final RDD, `shuffleDep` is None.
 *
 * @return the newly created stage
 */
private def newStage(
    rdd: RDD[_],
    numTasks: Int,
    shuffleDep: Option[ShuffleDependency[_, _, _]],
    jobId: Int,
    callSite: CallSite): Stage = {
  // Parent stages are resolved before this stage's id is drawn; keep this order.
  // NOTE(review): presumably this gives ancestor stages smaller ids -- confirm.
  val parents = getParentStages(rdd, jobId)
  val stageId = nextStageId.getAndIncrement()
  val created = new Stage(stageId, rdd, numTasks, shuffleDep, parents, jobId, callSite)
  stageIdToStage(stageId) = created
  updateJobIdStageIdMaps(jobId, created)
  created
}
查看 getParentStages()的具体实现:
/**
 * Collects the direct parent stages of `rdd`.
 *
 * Walks the dependency graph backwards from `rdd`. A ShuffleDependency marks a stage boundary:
 * its shuffle map stage becomes a parent and the walk does not continue past it. Any other
 * dependency stays inside the current stage, so its parent RDD is visited in turn.
 */
private def getParentStages(rdd: RDD[_], jobId: Int): List[Stage] = {
  val parentStages = new HashSet[Stage]
  val seen = new HashSet[RDD[_]]
  // An explicit stack is used instead of recursion to prevent StackOverflowError
  // on long lineages.
  val pending = new Stack[RDD[_]]
  pending.push(rdd)
  while (pending.nonEmpty) {
    val current = pending.pop()
    // `add` returns true only the first time an RDD is seen.
    if (seen.add(current)) {
      current.dependencies.foreach {
        case shufDep: ShuffleDependency[_, _, _] =>
          // Wide (shuffle) dependency: stage boundary.
          parentStages += getShuffleMapStage(shufDep, jobId)
        case narrowDep =>
          // Narrow dependency: same stage, keep walking through the parent RDD.
          pending.push(narrowDep.rdd)
      }
    }
  }
  parentStages.toList
}
可知,当依赖为 ShuffleDependency 时,就是Stage划分的边界。
继续查看 getShuffleMapStage()的具体实现:
/**
 * Returns the shuffle map stage registered in `shuffleToMapStage` for `shuffleDep`, creating
 * and registering it on first use.
 *
 * On a miss, the ancestor shuffle dependencies are registered first, then the stage for
 * `shuffleDep` itself is obtained from `newOrUsedStage` and stored under its shuffle id.
 */
private def getShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): Stage = {
  val shuffleId = shuffleDep.shuffleId
  shuffleToMapStage.get(shuffleId).getOrElse {
    // First request for this shuffle: register ancestor shuffle dependencies ...
    registerShuffleDependencies(shuffleDep, jobId)
    // ... then register the current shuffle dependency.
    val mapSideRdd = shuffleDep.rdd
    val mapStage = newOrUsedStage(
      mapSideRdd, mapSideRdd.partitions.size, shuffleDep, jobId, mapSideRdd.creationSite)
    shuffleToMapStage(shuffleId) = mapStage
    mapStage
  }
}
继续查看 registerShuffleDependencies() 方法:
/**
 * Registers a shuffle map stage in `shuffleToMapStage` for every ancestor shuffle dependency
 * of `shuffleDep.rdd` that does not have one yet.
 */
private def registerShuffleDependencies(
    shuffleDep: ShuffleDependency[_, _, _],
    jobId: Int): Unit = {
  // Ancestor shuffle dependencies that are not yet present in shuffleToMapStage.
  val unregistered = getAncestorShuffleDependencies(shuffleDep.rdd)
  while (unregistered.nonEmpty) {
    val ancestorDep = unregistered.pop()
    val ancestorRdd = ancestorDep.rdd
    val ancestorStage = newOrUsedStage(
      ancestorRdd, ancestorRdd.partitions.size, ancestorDep, jobId, ancestorRdd.creationSite)
    shuffleToMapStage(ancestorDep.shuffleId) = ancestorStage
  }
}
继续查看 getAncestorShuffleDependencies() 方法:
/**
 * Finds every shuffle dependency reachable from `rdd` whose shuffle id is not yet registered
 * in `shuffleToMapStage`.
 *
 * Unlike `getParentStages`, the walk continues through shuffle dependencies, so the whole
 * lineage is covered. Dependencies are pushed in discovery order.
 */
private def getAncestorShuffleDependencies(rdd: RDD[_]): Stack[ShuffleDependency[_, _, _]] = {
  val unregistered = new Stack[ShuffleDependency[_, _, _]]
  val seen = new HashSet[RDD[_]]
  // An explicit stack is used instead of recursion to prevent StackOverflowError
  // on long lineages.
  val pending = new Stack[RDD[_]]
  pending.push(rdd)
  while (pending.nonEmpty) {
    val current = pending.pop()
    // `add` returns true only the first time an RDD is seen.
    if (seen.add(current)) {
      current.dependencies.foreach { dep =>
        dep match {
          case shufDep: ShuffleDependency[_, _, _]
              if !shuffleToMapStage.contains(shufDep.shuffleId) =>
            unregistered.push(shufDep)
          case _ => // narrow dependency, or a shuffle that already has a stage
        }
        // Keep walking back through the parent RDD for every kind of dependency.
        pending.push(dep.rdd)
      }
    }
  }
  unregistered
}
此时,shuffleToMapStage 中已经登记了 finalStage 之前所有 shuffle 依赖对应的 stage 信息(finalStage 本身不在其中,它只记录在 stageIdToStage 里)。
接着,再回到之前的 finalStage 构建的阶段,完整源码如下:
/**
 * Handles a submitted job: builds the final stage for `finalRDD` (which also resolves its
 * parent stages), then either runs the job locally or registers it as active and submits the
 * final stage. `submitWaitingStages()` is called afterwards.
 *
 * If creating the final stage throws, the failure is logged and reported through
 * `listener.jobFailed`, and the method returns without doing anything else.
 */
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    allowLocal: Boolean,
    callSite: CallSite,
    listener: JobListener,
    properties: Properties = null): Unit = {
  val finalStage: Stage =
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.
      newStage(finalRDD, partitions.size, None, jobId, callSite)
    } catch {
      case e: Exception =>
        logWarning("Creating new stage failed due to exception - job: " + jobId, e)
        listener.jobFailed(e)
        return
    }
  if (finalStage != null) {
    val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
    clearCacheLocs()
    logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
      job.jobId, callSite.shortForm, partitions.length, allowLocal))
    logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
    logInfo("Parents of final stage: " + finalStage.parents)
    logInfo("Missing parents: " + getMissingParentStages(finalStage))
    // Local execution requires: the feature enabled, the caller allowing it, a final stage
    // without parent stages, and exactly one requested partition.
    val shouldRunLocally =
      localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
    if (!shouldRunLocally) {
      // Register the job as active, announce it together with its stages, then submit.
      jobIdToActiveJob(jobId) = job
      activeJobs += job
      finalStage.resultOfJob = Some(job)
      val stageIds = jobIdToStageIds(jobId).toArray
      val stageInfos =
        for (id <- stageIds; knownStage <- stageIdToStage.get(id)) yield knownStage.latestInfo
      listenerBus.post(SparkListenerJobStart(job.jobId, stageInfos, properties))
      submitStage(finalStage)
    } else {
      // Compute very short actions like first() or take() with no parent stages locally.
      listenerBus.post(SparkListenerJobStart(job.jobId, Seq.empty, properties))
      runLocally(job)
    }
  }
  submitWaitingStages()
}
接下来的过程就比较清晰了,当提交finalStage时,会调用:
/**
 * Submits `stage`, first recursively submitting any missing parent stages.
 *
 * A stage without an active job is aborted. A stage that is already waiting, running or
 * failed is left untouched. Otherwise, if no parent stage is missing, its tasks are submitted
 * right away; if some are missing, they are submitted in ascending stage-id order and this
 * stage is added to `waitingStages`.
 */
private def submitStage(stage: Stage): Unit = {
  activeJobForStage(stage) match {
    case None =>
      abortStage(stage, "No active job for stage " + stage.id)
    case Some(activeJobId) =>
      logDebug("submitStage(" + stage + ")")
      val alreadyTracked = waitingStages(stage) || runningStages(stage) || failedStages(stage)
      if (!alreadyTracked) {
        val missing = getMissingParentStages(stage).sortBy(_.id)
        logDebug("missing: " + missing)
        if (missing.isEmpty) {
          // No missing parents: this stage can be submitted now.
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          submitMissingTasks(stage, activeJobId)
        } else {
          // Submit the missing parents first (recursively); this stage waits for them.
          missing.foreach(submitStage)
          waitingStages += stage
        }
      }
  }
}
最后,会调用 submitMissingTasks() 中的如下代码 :
// Bundle the stage's tasks into a TaskSet (with a new attempt id) and submit it to the task scheduler.
taskScheduler.submitTasks(
new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
以TaskSet的方式将其提交给底层的调度器 TaskScheduler 进行处理。
************* The End *************