DAGScheduler Source Code Analysis
First, after the SparkContext has been initialized in the driver, we create a series of RDDs and eventually call an action method on one of them. Below we take the foreach() method as an example to see how the call is carried out.
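For orientation, a minimal driver program that exercises this path might look like the following sketch (the app name, master URL and data are made up for illustration); the foreach call at the end is the action that triggers a job:

import org.apache.spark.{SparkConf, SparkContext}

object ForeachDemo {
  def main(args: Array[String]): Unit = {
    // hypothetical local setup, only to show an action kicking off a job
    val conf = new SparkConf().setAppName("foreach-demo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val rdd = sc.parallelize(1 to 10)  // a simple RDD with a narrow lineage
    rdd.foreach(x => println(x))       // action: internally calls sc.runJob
    sc.stop()
  }
}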
RDD
def foreach(f: T => Unit): Unit = withScope {
val cleanF = sc.clean(f)
sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
}
The code above eventually calls SparkContext's runJob:
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
resultHandler: (Int, U) => Unit): Unit = {
if (stopped.get()) {
throw new IllegalStateException("SparkContext has been shutdown")
}
val callSite = getCallSite
val cleanedFunc = clean(func)
logInfo("Starting job: " + callSite.shortForm)
if (conf.getBoolean("spark.logLineage", false)) {
logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
}
// the dagScheduler created when the SparkContext was initialized
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
}
As you can see, when an action method is called, the work is ultimately handed to the DAGScheduler that was created during SparkContext initialization.
Next, let's analyze DAGScheduler's runJob():
// Mainly delegates to the submitJob method
def runJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): Unit = {
val start = System.nanoTime
/*
* Summary of the stage-splitting algorithm:
* 1. Work backwards from the finalStage
* 2. Split off a new stage at each wide (shuffle) dependency
* 3. Recursively submit parent stages first
* */
val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
waiter.completionFuture.value.get match {
case scala.util.Success(_) =>
logInfo("Job %d finished: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
case scala.util.Failure(exception) =>
logInfo("Job %d failed: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
// SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
val callerStackTrace = Thread.currentThread().getStackTrace.tail
exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
throw exception
}
}
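To make the stage-splitting summary above concrete, here is a small hedged sketch (assuming an existing SparkContext named sc; the data is illustrative). The reduceByKey introduces a wide (shuffle) dependency, so the lineage is split into a ShuffleMapStage plus a ResultStage (the finalStage), and the single action produces a single job:

val words  = sc.parallelize(Seq("a", "b", "a", "c"))
val pairs  = words.map(w => (w, 1))    // narrow dependency: stays in the same stage
val counts = pairs.reduceByKey(_ + _)  // wide dependency: new stage boundary
counts.foreach(println)                // one action => one job, two stages here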
def submitJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): JobWaiter[U] = {
// Check to make sure we are not launching a task on a partition that does not exist.
val maxPartitions = rdd.partitions.length
partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
throw new IllegalArgumentException(
"Attempting to access a non-existent partition: " + p + ". " +
"Total number of partitions: " + maxPartitions)
}
// Allocate a job id; as you can see here, each action operation corresponds to one job
val jobId = nextJobId.getAndIncrement()
if (partitions.size == 0) {
// Return immediately if the job is running 0 tasks
return new JobWaiter[U](this, jobId, 0, resultHandler)
}
assert(partitions.size > 0)
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
// Post a JobSubmitted event to the event processing loop
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, callSite, waiter,
SerializationUtils.clone(properties)))
waiter
}
Next, let's see how the DAGScheduler handles the JobSubmitted event.
/*
*
* The core entry point of the DAGScheduler
* */
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
callSite: CallSite,
listener: JobListener,
properties: Properties) {
// Create the finalStage from the last RDD, the one that triggered the job
var finalStage: ResultStage = null
try {
// Step 1: create a stage object and add it to the DAGScheduler's internal bookkeeping
// The stage created here is a ResultStage
// All upstream ShuffleMapStages are also created at this point; the split criterion is whether a dependency is a wide (shuffle) dependency
finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
} catch {
case e: BarrierJobSlotsNumberCheckFailed =>
logWarning(s"The job $jobId requires to run a barrier stage that requires more slots " +
"than the total number of slots in the cluster currently.")
// If jobId doesn't exist in the map, Scala converts its value null to 0: Int automatically.
val numCheckFailures = barrierJobIdToNumTasksCheckFailures.compute(jobId,
new BiFunction[Int, Int, Int] {
override def apply(key: Int, value: Int): Int = value + 1
})
if (numCheckFailures <= maxFailureNumTasksCheck) {
messageScheduler.schedule(
new Runnable {
override def run(): Unit = eventProcessLoop.post(JobSubmitted(jobId, finalRDD, func,
partitions, callSite, listener, properties))
},
timeIntervalNumTasksCheck,
TimeUnit.SECONDS
)
return
} else {
// Job failed, clear internal data.
barrierJobIdToNumTasksCheckFailures.remove(jobId)
listener.jobFailed(e)
return
}
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
// Job submitted, clear internal data.
barrierJobIdToNumTasksCheckFailures.remove(jobId)
// Step 2: create a job from the finalStage
// The last stage of this job is the finalStage
val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions".format(
job.jobId, callSite.shortForm, partitions.length))
logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val jobSubmissionTime = clock.getTimeMillis()
// Step 3: add the job to the in-memory bookkeeping
// (the set of currently active jobs)
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.setActiveJob(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
// Step 4: submit the finalStage via submitStage
// This triggers execution of the first runnable stage(s); the remaining stages are placed in the waiting queue
submitStage(finalStage)
}
Next, let's see how a ResultStage is created.
private def createResultStage(
rdd: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
jobId: Int,
callSite: CallSite): ResultStage = {
checkBarrierStageWithDynamicAllocation(rdd)
checkBarrierStageWithNumSlots(rdd)
checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size)
// Get the parent stages
// Only direct parent stages are returned, not more distant ancestor stages
val parents = getOrCreateParentStages(rdd, jobId)
// Get a unique id
// By this point all upstream stages have already been created, with increasing stage ids
val id = nextStageId.getAndIncrement()
// Create a new stage object from the parent stages and the id
val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
stageIdToStage(id) = stage
updateJobIdStageIdMaps(jobId, stage)
stage
}
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
// Get the nearest parent stages (one ShuffleMapStage per direct shuffle dependency)
getShuffleDependencies(rdd).map { shuffleDep =>
getOrCreateShuffleMapStage(shuffleDep, firstJobId)
}.toList
}
private[scheduler] def getShuffleDependencies(
rdd: RDD[_]): HashSet[ShuffleDependency[_, _, _]] = {
val parents = new HashSet[ShuffleDependency[_, _, _]]
val visited = new HashSet[RDD[_]]
val waitingForVisit = new ArrayStack[RDD[_]]
waitingForVisit.push(rdd)
// parents only contains the nearest shuffle dependencies; they may be one hop away, two hops away, and so on
// As the method's documentation says, if A <- B <- C are both shuffle dependencies, only the B <- C dependency ends up in the result, because it is the nearest one to C
while (waitingForVisit.nonEmpty) {
val toVisit = waitingForVisit.pop()
if (!visited(toVisit)) {
visited += toVisit
// If the current dependency is a wide (shuffle) dependency, we stop traversing backwards and add the dependency directly to parents
// If it is a narrow dependency, the dependency's RDD is pushed onto the stack so traversal continues backwards through its parent dependencies
toVisit.dependencies.foreach {
case shuffleDep: ShuffleDependency[_, _, _] =>
parents += shuffleDep
case dependency =>
waitingForVisit.push(dependency.rdd)
}
}
}
parents
}
private def getOrCreateShuffleMapStage(
shuffleDep: ShuffleDependency[_, _, _],
firstJobId: Int): ShuffleMapStage = {
// Check whether this shuffle has been seen before; if not, all of its missing ancestor shuffle dependencies must be found
// and ShuffleMapStages created for them
shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
case Some(stage) =>
stage
case None =>
// Create stages for all missing ancestor shuffle dependencies.
getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
// Even though getMissingAncestorShuffleDependencies only returns shuffle dependencies
// that were not already in shuffleIdToMapStage, it's possible that by the time we
// get to a particular dependency in the foreach loop, it's been added to
// shuffleIdToMapStage by the stage creation process for an earlier dependency. See
// SPARK-13902 for more information.
if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
createShuffleMapStage(dep, firstJobId)
}
}
// Finally, create a stage for the given shuffle dependency.
// Parent ShuffleMapStages are created before child ShuffleMapStages,
// so stage ids increase from upstream to downstream
createShuffleMapStage(shuffleDep, firstJobId)
}
}
Now let's look at submitStage().
/*
* Submits a stage, recursively submitting any missing parent stages first
* If all of the stage's parent stages have finished, the stage is split into tasks and executed
* If some parent stages have not finished, the stage is added to waitingStages and submitStage is called recursively on the parents
* */
private def submitStage(stage: Stage) {
// Find the id of the active job that uses this stage
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
// waitingStages holds stages whose parent stages have not finished yet
// runningStages holds stages that are currently running
// failedStages holds stages that failed and need to be rerun
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
// Get the missing parent stages of the current stage
// If there are none, the current stage can be split into tasks and computed
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
if (missing.isEmpty) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
// Once this stage's tasks have finished, the child stages that depend on it will in turn have their tasks submitted
submitMissingTasks(stage, jobId.get)
} else {
// Submit each missing parent stage
// Stages to run are added recursively and then executed
// The current stage is placed into the waiting queue while its parent stages are submitted,
// until a stage whose parents are all computed reaches the branch above and has its tasks executed
for (parent <- missing) {
submitStage(parent)
}
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id, None)
}
}
// Find the parent stages of this stage that have not finished yet
private def getMissingParentStages(stage: Stage): List[Stage] = {
val missing = new HashSet[Stage]
val visited = new HashSet[RDD[_]]
// We are manually maintaining a stack here to prevent StackOverflowError
// caused by recursively visiting
val waitingForVisit = new ArrayStack[RDD[_]]
def visit(rdd: RDD[_]) {
// Check whether this RDD has already been visited
if (!visited(rdd)) {
visited += rdd
val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
if (rddHasUncachedPartitions) {
// Iterate over this RDD's dependencies
for (dep <- rdd.dependencies) {
// Is it a wide (shuffle) dependency or a narrow dependency?
dep match {
// If it is a wide dependency,
// create a ShuffleMapStage from the wide dependency's RDD
// i.e. every wide dependency yields a new stage
// and, unlike the narrow case, the dependency's RDD is not pushed onto the stack
case shufDep: ShuffleDependency[_, _, _] =>
val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
// Check whether that ShuffleMapStage has already finished (all of its outputs are available)
if (!mapStage.isAvailable) {
missing += mapStage
}
// If it is a narrow dependency, push the dependency's RDD onto the stack
case narrowDep: NarrowDependency[_] =>
waitingForVisit.push(narrowDep.rdd)
}
}
}
}
}
// First push the stage's final RDD onto the stack,
// then run the while loop
waitingForVisit.push(stage.rdd)
while (waitingForVisit.nonEmpty) {
// Call visit on each RDD popped from the stack
visit(waitingForVisit.pop())
}
missing.toList
}
Next, let's continue with submitMissingTasks().
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// First figure out the indexes of partition ids to compute.
// Get the ids of the partitions that still need to be computed
val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
// Use the scheduling pool, job group, description, etc. from an ActiveJob associated
// with this Stage
val properties = jobIdToActiveJob(jobId).properties
// Add the stage to runningStages
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
stage match {
case s: ShuffleMapStage =>
outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
case s: ResultStage =>
outputCommitCoordinator.stageStart(
stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
}
// Compute the best location for each task, i.e. where each task should run
// The result is a map whose key is the partition id and whose value is a sequence of TaskLocations
val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
partitionsToCompute.map { id =>
val p = s.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
case NonFatal(e) =>
stage.makeNewStageAttempt(partitionsToCompute.size)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
// If there are tasks to execute, record the submission time of the stage. Otherwise,
// post the event without the submission time, which indicates that this stage was
// skipped.
if (partitionsToCompute.nonEmpty) {
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
}
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
var partitions: Array[Partition] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
var taskBinaryBytes: Array[Byte] = null
// taskBinaryBytes and partitions are both affected by the checkpoint status. We need
// this synchronization in case another concurrent job is checkpointing this RDD, so we get a
// consistent view of both variables.
// Before the DAGScheduler sends the TaskSet to the TaskScheduler, it serializes the stage's RDD together with its shuffleDep (or func) and broadcasts it,
// so that when an executor runs a task it can obtain the task's RDD and shuffle dependency from the broadcast variable
RDDCheckpointData.synchronized {
taskBinaryBytes = stage match {
case stage: ShuffleMapStage =>
JavaUtils.bufferToArray(
closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
case stage: ResultStage =>
JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
}
partitions = stage.rdd.partitions
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString, Some(e))
runningStages -= stage
// Abort execution
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
// Create the tasks for this stage, one per partition to compute
val tasks: Seq[Task[_]] = try {
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
stage match {
case stage: ShuffleMapStage =>
stage.pendingPartitions.clear()
partitionsToCompute.map { id =>
// For each partition that needs to be computed,
// look up the preferred locations computed above for the task
val locs = taskIdToLocations(id)
val part = partitions(id)
stage.pendingPartitions += id
// For a ShuffleMapStage, create ShuffleMapTasks
new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())
}
case stage: ResultStage =>
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = partitions(p)
val locs = taskIdToLocations(id)
// For a ResultStage, create ResultTasks
new ResultTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,
stage.rdd.isBarrier())
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
// Wrap the stage's tasks in a TaskSet and submit it via the taskScheduler's submitTasks method
// In standalone mode the default taskScheduler is TaskSchedulerImpl
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
stage match {
case stage: ShuffleMapStage =>
logDebug(s"Stage ${stage} is actually done; " +
s"(available: ${stage.isAvailable}," +
s"available outputs: ${stage.numAvailableOutputs}," +
s"partitions: ${stage.numPartitions})")
markMapStageJobsAsFinished(stage)
case stage : ResultStage =>
logDebug(s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})")
}
// Submit any waiting child stages of this stage
submitWaitingChildStages(stage)
}
}
Now let's see how the preferred locations for computing a partition are determined.
def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = {
getPreferredLocsInternal(rdd, partition, new HashSet)
}
/*
* Computes the preferred locations for each task
* In short, starting from the stage's last RDD and walking backwards, it checks whether any RDD's partition has been cached or checkpointed;
* if so, the task's preferred locations are the locations of the cached or checkpointed partition, so the earlier RDDs do not need to be recomputed
* The returned locations have the form host or (host, executorId)
* */
private def getPreferredLocsInternal(
rdd: RDD[_],
partition: Int,
visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
// If the partition has already been visited, no need to re-visit.
// This avoids exponential path exploration. SPARK-695
if (!visited.add((rdd, partition))) {
// Nil has already been returned for previously visited partitions.
return Nil
}
// If the partition is cached, return the cache locations
// Check whether this RDD's partition is cached
val cached = getCacheLocs(rdd)(partition)
if (cached.nonEmpty) {
return cached
}
// If the RDD has some placement preferences (as is the case for input RDDs), get those
// Check the RDD's own placement preferences (e.g. checkpointed data or input sources)
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (rddPrefs.nonEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// If the RDD has narrow dependencies, pick the first partition of the first narrow dependency
// that has any placement preferences. Ideally we would choose based on transfer sizes,
// but this will do for now.
// Walk up through the narrow dependencies and check whether a parent RDD's partition is cached or checkpointed
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
// If no RDD in this stage, from the last one back to the first, is cached or checkpointed, return an empty list
Nil
}
Now let's see how the TaskScheduler handles the TaskSet.
TaskSchedulerImpl
// Submit the tasks of a TaskSet
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// Create a TaskSetManager for each TaskSet
// The manager is responsible for running and tracking the TaskSet's tasks
val manager = createTaskSetManager(taskSet, maxTaskFailures)
// Add it to the in-memory bookkeeping
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
stageTaskSets(taskSet.stageAttemptId) = manager
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
/* As covered in the SparkContext analysis, the scheduler backend was created together with the taskScheduler
* during SparkContext initialization; the backend used here is that same instance
* */
backend.reviveOffers()
}
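For reference, the maxTaskFailures value passed to createTaskSetManager above comes from configuration: it corresponds to spark.task.maxFailures (default 4). A hedged configuration sketch with an illustrative value:

import org.apache.spark.SparkConf

val conf = new SparkConf().set("spark.task.maxFailures", "8") // allow up to 8 failures per task before aborting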
CoarseGrainedSchedulerBackend
// Send a message to the driver endpoint
override def reviveOffers() {
driverEndpoint.send(ReviveOffers)
}
DriverEndpoint
override def receive: PartialFunction[Any, Unit] = {
case ReviveOffers =>
makeOffers()
}
private def makeOffers() {
/*
* Step 1: call TaskSchedulerImpl's resourceOffers method to run the task assignment algorithm and assign tasks to executors
* Step 2: once tasks are assigned to executors, call launchTasks to send a LaunchTask message to each chosen executor
* */
// Make sure no executor is killed while some task is launching on it
val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
// Filter out executors under killing
// Get the currently available executors
// executorData contains each executor's address, remaining resources, and other information
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
// workOffers describes the available resources of each live executor
val workOffers = activeExecutors.map {
case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores,
Some(executorData.executorAddress.hostPort))
}.toIndexedSeq
// Pass in all current executors, wrapped as WorkerOffers that describe each executor's available resources
scheduler.resourceOffers(workOffers)
}
// A two-dimensional structure recording which tasks are to be launched on each executor
if (!taskDescs.isEmpty) {
launchTasks(taskDescs)
}
}
Next, let's see how the scheduler assigns tasks when it is offered spare executor resources.
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {
if (!hostToExecutors.contains(o.host)) {
hostToExecutors(o.host) = new HashSet[String]()
}
if (!executorIdToRunningTaskIds.contains(o.executorId)) {
hostToExecutors(o.host) += o.executorId
executorAdded(o.executorId, o.host)
executorIdToHost(o.executorId) = o.host
executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
// this here to avoid a separate thread and added synchronization overhead, and also because
// updating the blacklist is only relevant when task offers are being made.
blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())
val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
offers.filter { offer =>
!blacklistTracker.isNodeBlacklisted(offer.host) &&
!blacklistTracker.isExecutorBlacklisted(offer.executorId)
}
}.getOrElse(offers)
// First shuffle the offers so tasks are spread across executors for better load balancing
val shuffledOffers = shuffleOffers(filteredOffers)
// Build a list of tasks to assign to each worker.
// A set of data structures built from the WorkerOffers
// tasks is a two-dimensional structure
// Each inner buffer has a fixed capacity derived from the executor's CPU cores (cores / CPUS_PER_TASK)
// The first dimension corresponds to the available executors
// The second dimension is the maximum number of tasks each executor can run
// Together they record which tasks each executor should execute
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum
// Take the sorted TaskSets out of rootPool
// rootPool is a scheduling pool
// Every submitted TaskSet is first placed into this pool
// When the task assignment algorithm runs, the queued TaskSets are taken out of the pool in sorted order
// Multiple TaskSets may be scheduled in one round
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
// For each TaskSet, offer it to the nodes at increasing locality levels
// The locality levels are: same process, same node, no preference, same rack, anywhere
/*
* PROCESS_LOCAL: the RDD partition and the task are in the same executor process, the fastest case
* NODE_LOCAL: the partition and the task are not in the same executor, but on the same worker node
* NO_PREF: no locality preference
* RACK_LOCAL: the partition and the task are on the same rack
* ANY: anywhere
* */
for (taskSet <- sortedTaskSets) {
// Skip the barrier taskSet if the available slots are less than the number of pending tasks.
// If the total number of available slots is less than the number of tasks in a barrier TaskSet, skip it in this round
if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {
// Skip the launch process.
// TODO SPARK-24819 If the job requires more slots than available (both busy and free
// slots), fail the job on submit.
logInfo(s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
s"because the barrier taskSet requires ${taskSet.numTasks} slots, while the total " +
s"number of available slots is $availableSlots.")
} else {
var launchedAnyTask = false
// Record all the executor IDs assigned barrier tasks on.
val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()
// Assign tasks starting from the best locality level and gradually relaxing the constraint,
// so that tasks in the current TaskSet are launched on suitable nodes
for (currentMaxLocality <- taskSet.myLocalityLevels) {
var launchedTaskAtCurrentMaxLocality = false
do {
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,
currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
}
if (!launchedAnyTask) {
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
if (launchedAnyTask && taskSet.isBarrier) {
// Check whether the barrier tasks are partially launched.
// TODO SPARK-24818 handle the assert failure case (that can happen when some locality
// requirements are not fulfilled, and we should revert the launched tasks).
require(addressesWithDescs.size == taskSet.numTasks,
s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
s"because only ${addressesWithDescs.size} out of a total number of " +
s"${taskSet.numTasks} tasks got resource offers. The resource offers may have " +
"been blacklisted or cannot fulfill task locality requirements.")
// materialize the barrier coordinator.
maybeInitBarrierCoordinator()
// Update the taskInfos into all the barrier task properties.
val addressesStr = addressesWithDescs
// Addresses ordered by partitionId
.sortBy(_._2.partitionId)
.map(_._1)
.mkString(",")
addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr))
logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for barrier " +
s"stage ${taskSet.stageId}.")
}
}
}
// TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
// launched within a configured time.
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
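Related to the locality levels above: how long the scheduler waits at one locality level before relaxing to the next is governed by the spark.locality.wait settings. A hedged configuration sketch (the values are illustrative; the keys are standard Spark settings):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.locality.wait", "3s")          // default wait used by the per-level settings below
  .set("spark.locality.wait.process", "3s")  // PROCESS_LOCAL -> NODE_LOCAL fallback
  .set("spark.locality.wait.node", "3s")     // NODE_LOCAL -> RACK_LOCAL fallback
  .set("spark.locality.wait.rack", "3s")     // RACK_LOCAL -> ANY fallback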
Flow summary:
DAGScheduler
- Calling an action operator on an RDD ultimately invokes SparkContext's runJob method
- runJob() in turn calls submitJob, which posts a JobSubmitted event to the DAGScheduler
- When the DAGScheduler receives the JobSubmitted event, it calls handleJobSubmitted
- In handleJobSubmitted, a ShuffleMapStage is created for every upstream wide (shuffle) dependency, using the RDD on the map side of the shuffle as that stage's final RDD, and a ResultStage, also called the finalStage, is created from the RDD on which the action was called
- A job is created from the finalStage, its information is recorded in memory, and finally submitStage() is called
- submitStage() first checks whether the parent stages of the stage to execute have finished. If they have, the current stage is computed via submitMissingTasks; otherwise the stage is added to the waiting queue and submitStage() is called recursively on all of its parent stages
- submitMissingTasks() first determines which partitions are missing, then computes preferred locations for those partitions (a host, or a (host, executorId) pair); a preferred location is where the partition is cached or checkpointed. It then creates one task per partition; tasks come in two flavors, ShuffleMapTask and ResultTask
- The tasks are wrapped in a TaskSet and submitted via the taskScheduler's submitTasks method
- Once the current stage has finished, its waiting child stages start running
TaskScheduler
Having covered the DAGScheduler flow, here is how the TaskScheduler then handles the TaskSet:
- In the last DAGScheduler step above, the taskScheduler's submitTasks method is called to submit the tasks. In that method a TaskSetManager is first created to manage the TaskSet's execution, and then the schedulerBackend's reviveOffers is called to send a ReviveOffers message to the DriverEndpoint
- When the DriverEndpoint receives the message, it calls makeOffers, which invokes the taskScheduler's resourceOffers method to run the task assignment algorithm, assigning each task to an executor and then sending it to that executor for execution; the rule is to run each task at the best locality level achievable