An action triggers SparkContext.runJob
dagScheduler.runJob
dagScheduler.submitJob
DAGSchedulerEventProcessLoop.onReceive -> case JobSubmitted -> handleJobSubmitted
handleJobSubmitted // the core entry point of DAGScheduler scheduling (a driver-side sketch of this call chain follows the list below)
- Use the last RDD of the job (the one that triggered it) to create the finalStage: build a Stage object and register it in DAGScheduler's internal in-memory caches
- Create a job from the finalStage, i.e. this job's last stage is the finalStage
- Register the job in the in-memory caches
- Submit the finalStage with submitStage (this call causes the first stage to be submitted and puts all other stages into the waitingStages queue): submitStage(finalStage)
- Submit the waiting stages: submitWaitingStages()
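A minimal driver-side sketch of this call chain, assuming only a SparkContext named sc and a hypothetical HDFS path; the comments summarize the path described above:

val lines = sc.textFile("hdfs://namenode:9000/input.txt")
// count() is an action: it calls SparkContext.runJob,
// which calls DAGScheduler.runJob -> DAGScheduler.submitJob.
// submitJob posts a JobSubmitted event to the DAGSchedulerEventProcessLoop,
// whose onReceive dispatches it to handleJobSubmitted on the event-loop thread.
val n = lines.count()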
submitStage:
Entry point of the stage-splitting algorithm, implemented jointly by getMissingParentStages and submitStage. Summary of the algorithm:
- 1. Work backwards from the finalStage
- 2. Split off a new stage at every wide (shuffle) dependency
- 3. Recurse so that parent stages are submitted first
- Call getMissingParentStages to get this stage's missing parent stages: getMissingParentStages(stage)
- Recursively call submitStage to submit each parent stage: submitStage(parent)
- Put the current stage into the waitingStages queue to wait for execution
getMissingParentStages:
- For a stage, if all dependencies reachable from its last RDD are narrow, no new stage is created
- But as soon as an RDD in this stage has a wide (shuffle) dependency on some RDD, a new stage is created from that shuffled RDD
- The new stages are then returned (a worked example follows below)
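A worked example of the stage split, assuming a SparkContext named sc (the RDD names are made up for illustration):

val pairs  = sc.parallelize(1 to 100, 4).map(x => (x % 10, x)) // narrow dependencies: stay in one stage
val summed = pairs.reduceByKey(_ + _)                          // ShuffleDependency: stage boundary
summed.collect()                                               // action: triggers the job

// Expected layout for this job:
//   Stage 0: ShuffleMap stage covering parallelize + map
//   Stage 1: the finalStage covering reduceByKey; while Stage 0 runs it sits in waitingStages
//            and is only submitted once Stage 0's shuffle output is available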
submitMissingTasks:
Submits the stage: creates a batch of tasks for it, one task per partition
getPreferredLocsInternal:
- Computes the best location for the partition each task will process
- Starting from the stage's last RDD, look for an RDD whose partition has been cached or checkpointed
- The task's best location is then the location of that cached/checkpointed partition
- That way the task runs on that node and the earlier RDDs do not need to be recomputed (a small cache example follows below)
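A minimal sketch of why caching changes task placement under this rule, assuming a SparkContext named sc and a hypothetical HDFS path:

val base = sc.textFile("hdfs://namenode:9000/logs").map(_.toLowerCase)
base.cache()
base.count()    // first job materializes base and caches its partitions on some executors

val errors = base.filter(_.contains("error"))
errors.count()  // second job: getPreferredLocsInternal walks back from errors to base,
                // finds base's partitions via getCacheLocs, and prefers to schedule each
                // task on the executor that already holds that cached partition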
handleJobSubmitted
/**
 * TODO
 * Core entry point of DAGScheduler scheduling: splits the job into stages
 */
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
allowLocal: Boolean,
callSite: CallSite,
listener: JobListener,
properties: Properties = null)
{
// 1) Use the last RDD of the job (the one that triggered it) to create the finalStage
var finalStage: Stage = null
try {
// New stage creation may throw an exception if, for example, jobs are run on a
// HadoopRDD whose underlying HDFS files have been deleted.
// TODO Stage splitting: create a Stage object and register it in DAGScheduler's internal in-memory caches
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
} catch {
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
if (finalStage != null) {
// 2) Create a job from the finalStage, i.e. this job's last stage is the finalStage
val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
job.jobId, callSite.shortForm, partitions.length, allowLocal))
logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val shouldRunLocally =
localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
val jobSubmissionTime = clock.getTimeMillis()
// Should the job run locally?
if (shouldRunLocally) {
// Compute very short actions like first() or take() with no parent stages locally.
/**
 * Local execution path
 */
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
runLocally(job)
} else {
/**
 * Cluster execution path
 */
// 3) Register the job in the in-memory caches
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
/**
 * 4) Submit the finalStage with submitStage.
 * This call causes the first stage to be submitted and all other stages to be put into the waitingStages queue -- per the original source comment
 *
 * TODO Everything up to this point ran on the driver; this call is where stages actually start being submitted
 */
submitStage(finalStage)
}
}
// Submit the waiting stages
submitWaitingStages()
}
-> newStage
private def newStage(
rdd: RDD[_],
numTasks: Int, // TODO partitions.size: one task processes one partition
shuffleDep: Option[ShuffleDependency[_, _, _]],
jobId: Int,
callSite: CallSite)
: Stage =
{
/**
 * TODO Compute the parent stages
 * rdd here is the finalRDD
 */
val parentStages = getParentStages(rdd, jobId)
val id = nextStageId.getAndIncrement()
val stage = new Stage(id, rdd, numTasks, shuffleDep, parentStages, jobId, callSite)
stageIdToStage(id) = stage
updateJobIdStageIdMaps(jobId, stage)
stage
}
--> getParentStages: get the parent stages
private def getParentStages(rdd: RDD[_], jobId: Int): List[Stage] = {
val parents = new HashSet[Stage]
val visited = new HashSet[RDD[_]]
// We are manually maintaining a stack here to prevent StackOverflowError
// caused by recursively visiting
val waitingForVisit = new Stack[RDD[_]]
def visit(r: RDD[_]) {
// visited(r) calls HashSet.apply, i.e. visited.contains(r)
if (!visited(r)) {
visited += r
// Kind of ugly: need to register RDDs with the cache here since
// we can't do it in its constructor because # of partitions is unknown
for (dep <- r.dependencies) {
dep match {
case shufDep: ShuffleDependency[_, _, _] =>
// TODO Pass the wide (shuffle) dependency in to get the parent stage
parents += getShuffleMapStage(shufDep, jobId)
case _ =>
waitingForVisit.push(dep.rdd)
}
}
}
}
// Push the finalRDD onto the stack
waitingForVisit.push(rdd)
while (!waitingForVisit.isEmpty) {
visit(waitingForVisit.pop())
}
parents.toList
}
-> submitStage(finalStage)
/**
 * Entry point of the stage-splitting algorithm,
 * implemented jointly by getMissingParentStages and submitStage.
 * Summary of the algorithm:
 * 1. Work backwards from the finalStage
 * 2. Split off a new stage at every wide (shuffle) dependency
 * 3. Recurse so that parent stages are submitted first
 *
 * Starting from the last stage, recursively submit stages
 */
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
// TODO Call getMissingParentStages to get this stage's missing parent stages
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
/**
 * The recursion continues until a stage has no missing parents,
 * so the earliest stage (stage 0) is submitted first,
 * while all other stages sit in waitingStages.
 *
 * Recursion exit:
 * if there are no missing parent stages, this stage can be submitted now
 */
if (missing == Nil) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
// TODO Submit the earliest runnable stage; stages are submitted front to back
submitMissingTasks(stage, jobId.get)
} else {
/**
 * Recursively call submitStage to submit the parent stages.
 * This recursion is the driving force and the essence of the stage-splitting algorithm.
 *
 * If there are missing parent stages, recurse into them first
 */
for (parent <- missing) {
submitStage(parent)
}
// Put the current stage into the waitingStages queue to wait for execution
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id)
}
}
--> getMissingParentStages
/**
 * For a stage, if all dependencies reachable from its last RDD are narrow, no new stage is created.
 * But as soon as an RDD in this stage has a wide (shuffle) dependency on some RDD,
 * a new stage is created from that shuffled RDD and returned.
 */
private def getMissingParentStages(stage: Stage): List[Stage] = {
val missing = new HashSet[Stage]
val visited = new HashSet[RDD[_]]
// We are manually maintaining a stack here to prevent StackOverflowError
// caused by recursively visiting
val waitingForVisit = new Stack[RDD[_]]
def visit(rdd: RDD[_]) {
if (!visited(rdd)) {
visited += rdd
if (getCacheLocs(rdd).contains(Nil)) {
/**
 * Walk the RDD's dependencies.
 * Every shuffle operation, e.g. groupByKey, reduceByKey, countByKey,
 * corresponds underneath to three RDDs: MapPartitionsRDD, ShuffledRDD, MapPartitionsRDD
 */
for (dep <- rdd.dependencies) {
dep match {
// If it is a wide (shuffle) dependency,
case shufDep: ShuffleDependency[_, _, _] =>
/**
 * TODO
 * use the RDD of that wide dependency to create a stage, with isShuffleMap set to true.
 * By default only the last stage is not a ShuffleMap stage;
 * every stage before the finalStage is a ShuffleMap stage.
 */
val mapStage = getShuffleMapStage(shufDep, stage.jobId)
if (!mapStage.isAvailable) {
missing += mapStage
}
// If it is a narrow dependency, push its RDD onto the stack
case narrowDep: NarrowDependency[_] =>
waitingForVisit.push(narrowDep.rdd)
}
}
}
}
}
// First push the stage's last RDD onto the stack
waitingForVisit.push(stage.rdd)
while (!waitingForVisit.isEmpty) {
// Call the locally defined visit method on each RDD popped off the stack
visit(waitingForVisit.pop())
}
missing.toList
}
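A small illustration, assuming a SparkContext named sc (hypothetical RDD names), of the two dependency kinds that visit() pattern-matches on:

val pairs   = sc.parallelize(Seq(("a", 1), ("b", 2)), 2)
val mapped  = pairs.mapValues(_ + 1)     // narrow: OneToOneDependency, stays in the same stage
val reduced = mapped.reduceByKey(_ + _)  // wide: ShuffleDependency, forces a parent ShuffleMap stage

mapped.dependencies.head   // OneToOneDependency -> narrowDep branch: the parent RDD is pushed onto the stack
reduced.dependencies.head  // ShuffleDependency  -> shufDep branch: getShuffleMapStage creates/looks up a parent stage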
--> submitMissingTasks // submit the earliest runnable stage; stages are submitted front to back
/**
 * Submit the stage: create a batch of tasks for it, one task per partition
 */
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
stage.pendingTasks.clear()
// First figure out the indexes of partition ids to compute.
// Determine which partitions still need to be computed (i.e. how many tasks to create)
val partitionsToCompute: Seq[Int] = {
if (stage.isShuffleMap) {
(0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)
} else {
val job = stage.resultOfJob.get
(0 until job.numPartitions).filter(id => !job.finished(id))
}
}
val properties = if (jobIdToActiveJob.contains(jobId)) {
jobIdToActiveJob(stage.jobId).properties
} else {
// this stage will be assigned to "default" pool
null
}
// Add the stage to the runningStages set
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
outputCommitCoordinator.stageStart(stage.id)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] =
if (stage.isShuffleMap) {
closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()
} else {
closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString)
runningStages -= stage
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
runningStages -= stage
return
}
/**
 * TODO How many tasks to create:
 * create the required number of tasks for the stage.
 * The key part is the task best-location (locality) computation
 */
val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {
partitionsToCompute.map { id =>
// Create one task per partition
// Task best-location computation
val locs = getPreferredLocs(stage.rdd, id)
val part = stage.rdd.partitions(id)
/**
 * Every stage other than the finalStage has isShuffleMap == true, so a ShuffleMapTask is created.
 * A ShuffleMapTask computes its partition and writes shuffle output for the next stage to fetch
 */
new ShuffleMapTask(stage.id, taskBinary, part, locs)
}
} else {
// If the stage is not a shuffle-map stage, it is the finalStage,
// and the finalStage creates ResultTasks
val job = stage.resultOfJob.get
partitionsToCompute.map { id =>
val p: Int = job.partitions(id)
val part = stage.rdd.partitions(p)
val locs = getPreferredLocs(stage.rdd, p)
/**
 * A ResultTask applies the job's function to its partition, e.g. returning the result
 * to the driver or writing it out to external storage such as HDFS or a NoSQL store
 */
new ResultTask(stage.id, taskBinary, part, locs, id)
}
}
if (tasks.size > 0) {
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
/**
 * Finally, wrap the stage's tasks in a TaskSet and call TaskScheduler's submitTasks method to submit it.
 * In standalone mode the default implementation is TaskSchedulerImpl; TaskScheduler itself is just a trait.
 *
 * TODO Call taskScheduler.submitTasks to submit the TaskSet
 */
taskScheduler.submitTasks(
new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should post
// SparkListenerStageCompleted here in case there are no tasks to run.
outputCommitCoordinator.stageEnd(stage.id)
listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
logDebug("Stage " + stage + " is actually done; %b %d %d".format(
stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
runningStages -= stage
}
}
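A hedged illustration of how many tasks this creates for a simple two-stage job, assuming a SparkContext named sc (the counts follow directly from the partition numbers chosen here):

val data   = sc.parallelize(1 to 100, 4)                       // 4 partitions
val summed = data.map(x => (x % 10, x)).reduceByKey(_ + _, 2)  // shuffle into 2 partitions
summed.collect()

// Stage 0 (isShuffleMap = true): submitMissingTasks creates 4 ShuffleMapTasks, one per input partition
// Stage 1 (the finalStage):      submitMissingTasks creates 2 ResultTasks, one per output partition
// Each batch is wrapped in a TaskSet and handed to taskScheduler.submitTasks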
===> getPreferredLocs: task best-location computation
/**
 * Computes the best location for the partition each task will process.
 * Starting from the stage's last RDD, look for an RDD whose partition has been cached or checkpointed;
 * the task's best location is then the location of that cached/checkpointed partition,
 * because the task can run on that node without recomputing the earlier RDDs
 */
private def getPreferredLocsInternal(
rdd: RDD[_],
partition: Int,
visited: HashSet[(RDD[_],Int)])
: Seq[TaskLocation] =
{
// If the partition has already been visited, no need to re-visit.
// This avoids exponential path exploration. SPARK-695
if (!visited.add((rdd,partition))) {
// Nil has already been returned for previously visited partitions.
return Nil
}
// If the partition is cached, return the cache locations
// Check whether this RDD's partition is cached
val cached = getCacheLocs(rdd)(partition)
if (!cached.isEmpty) {
return cached
}
// If the RDD has some placement preferences (as is the case for input RDDs), get those
// Check the RDD's own preferred locations (covers checkpointed RDDs as well as input RDDs such as HadoopRDD)
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (!rddPrefs.isEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// If the RDD has narrow dependencies, pick the first partition of the first narrow dep
// that has any placement preferences. Ideally we would choose based on transfer sizes,
// but this will do for now.
// Finally, recurse into the RDD's narrow-dependency parents to see whether the corresponding partition there is cached or checkpointed
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
// If, from the stage's last RDD all the way back to the first, no partition is cached or checkpointed,
// then the task has no preferred locations (preferredLocs is Nil)
Nil
}
taskScheduler.submitTasks is called to submit the TaskSet; see the TaskScheduler source code analysis for what happens next.