Continuing from the previous post: DAGScheduler.submitMissingTasks -> taskScheduler.submitTasks
TaskSchedulerImpl
/**
* Entry point through which the TaskScheduler is handed a TaskSet to submit
*/
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// A TaskSetManager is created for every TaskSet that is submitted
// The TaskSetManager is then responsible for monitoring and managing the execution of the tasks in its TaskSet
val manager = createTaskSetManager(taskSet, maxTaskFailures)
// Put the manager into the in-memory map of active TaskSets
activeTaskSets(taskSet.id) = manager
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT, STARVATION_TIMEOUT)
}
hasReceivedTask = true
}
/**
* The backend here is the SparkDeploySchedulerBackend created earlier,
* which is responsible for creating the AppClient and registering the Application with the Master.
*
* SparkDeploySchedulerBackend is a subclass of CoarseGrainedSchedulerBackend.
* SparkDeploySchedulerBackend does not override reviveOffers(), so this call resolves to CoarseGrainedSchedulerBackend.reviveOffers()
*
* -> CoarseGrainedSchedulerBackend.reviveOffers()
*/
backend.reviveOffers()
}
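For context, the schedulableBuilder and scheduling pool used above are set up when TaskSchedulerImpl.initialize() is called during SparkContext startup; depending on spark.scheduler.mode a FIFO or FAIR builder is created. Roughly (a simplified reproduction from memory, treat it as a sketch rather than the verbatim source):
def initialize(backend: SchedulerBackend) {
  this.backend = backend
  // The root scheduling pool that every submitted TaskSetManager is placed into
  rootPool = new Pool("", schedulingMode, 0, 0)
  schedulableBuilder = schedulingMode match {
    case SchedulingMode.FIFO => new FIFOSchedulableBuilder(rootPool)
    case SchedulingMode.FAIR => new FairSchedulableBuilder(rootPool, conf)
  }
  schedulableBuilder.buildPools()
}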
=> createTaskSetManager -> new TaskSetManager
/**
* Schedules the tasks of a single TaskSet within TaskSchedulerImpl. This class tracks each task,
* retries tasks if they fail (up to the maximum allowed number of failures), and handles
* locality-aware scheduling for the TaskSet via delay scheduling. Its main interface is
* resourceOffer, which asks whether the TaskSet wants to run a task on a given node, and it also
* receives status-update messages so that it knows when one of its tasks has changed state
*/
private[spark] class TaskSetManager(
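The "delay scheduling" mentioned in the comment can be illustrated with a small self-contained sketch (this is not the real TaskSetManager code; the names localityWaits, currentIndex and lastLaunchTime are invented for illustration):
object DelaySchedulingSketch {
  // Locality levels ordered from best to worst (NO_PREF omitted for simplicity)
  val levels = Seq("PROCESS_LOCAL", "NODE_LOCAL", "RACK_LOCAL", "ANY")
  // How long (ms) we are willing to wait at each level before relaxing to a looser one
  val localityWaits = Map("PROCESS_LOCAL" -> 3000L, "NODE_LOCAL" -> 3000L, "RACK_LOCAL" -> 3000L, "ANY" -> 0L)

  var currentIndex = 0                              // tightest level we still insist on
  var lastLaunchTime = System.currentTimeMillis()   // last time a task was actually launched

  // Called on every resource offer: which locality level may we use right now?
  def allowedLevel(now: Long): String = {
    while (currentIndex < levels.length - 1 &&
           now - lastLaunchTime >= localityWaits(levels(currentIndex))) {
      // Waited long enough at this level without launching anything: relax one step
      lastLaunchTime += localityWaits(levels(currentIndex))
      currentIndex += 1
    }
    levels(currentIndex)
  }
}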
=> backend.reviveOffers()
-> CoarseGrainedSchedulerBackend.reviveOffers()
override def reviveOffers() {
// TODO Send the ReviveOffers message to the DriverActor
driverActor ! ReviveOffers
}
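ReviveOffers itself carries no payload; it is just a marker message defined among the CoarseGrainedClusterMessages, roughly (reproduced from memory):
case object ReviveOffers extends CoarseGrainedClusterMessage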
=> /**
* Handling ReviveOffers: call makeOffers() to offer resources and submit tasks to the executors
*/
case ReviveOffers =>
makeOffers()
def makeOffers() {
/**
* 1) Call TaskSchedulerImpl's resourceOffers() method to run the task-assignment algorithm,
* which assigns the tasks to the executors.
* 2) Once tasks have been assigned to executors, call launchTasks() to send each assigned task,
* via a LaunchTask message, to its executor, which then starts and runs it.
*
* resourceOffers() is passed all of this application's available executors, each wrapped in a
* WorkerOffer that records how many CPU cores that executor has free.
*
* TODO scheduler.resourceOffers -> TaskSchedulerImpl.resourceOffers
* TODO then launchTasks is called to send the tasks to the executors
*/
launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toSeq))
}
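For reference, a WorkerOffer is just a small value object describing the free resources on one executor; in this version of Spark it is essentially the following case class (reproduced from memory, treat as a sketch):
private[spark] case class WorkerOffer(executorId: String, host: String, cores: Int)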
==> TaskSchedulerImpl.resourceOffers
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {
executorIdToHost(o.executorId) = o.host
activeExecutorIds += o.executorId
if (!executorsByHost.contains(o.host)) {
executorsByHost(o.host) = new HashSet[String]()
executorAdded(o.executorId, o.host)
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Randomly shuffle offers to avoid always placing tasks on the same set of workers.
// 1) Shuffle the available executors, i.e. scatter them, so that tasks are not always placed on the same workers and the load is balanced
val shuffledOffers = Random.shuffle(offers)
/**
* Build a list of tasks to assign to each worker.
* 2) For the WorkerOffers, build the bookkeeping structures that will be needed:
* e.g. tasks, which can be thought of as a two-dimensional structure - an ArrayBuffer whose
* elements are ArrayBuffers, one per executor, each sized to that executor's number of free cores
*/
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
/**
* 3) Take the sorted TaskSets out of rootPool.
* As covered when discussing TaskScheduler initialization: after TaskSchedulerImpl and
* SparkDeploySchedulerBackend are created, initialize() is called, which creates a scheduling pool.
* Every submitted TaskSet is first placed into this pool, and when the task-assignment algorithm
* runs, the TaskSets are taken back out of the pool in scheduling order
*/
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
/**
* Take each TaskSet in our scheduling order, and then offer it each node in increasing order
* of locality levels so that it gets a chance to launch local tasks on all of them.
* NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
*
* The core of the task-assignment algorithm:
* a double for loop over all TaskSets and over each locality level.
* Locality levels:
* PROCESS_LOCAL: process-local - the RDD partition and the task are in the same executor (same JVM process), which is of course the fastest
* NODE_LOCAL: the RDD partition and the task are not in the same executor (not the same process), but are on the same worker node
* NO_PREF: no preference - there is no meaningful locality level
* RACK_LOCAL: rack-local - the RDD partition and the task are at least on the same rack
* ANY: any locality level
*
* 4) For each TaskSet, iterate starting from the best locality level
*/
var launchedTask = false
for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
do {
/**
* For the current TaskSet:
* try to launch its tasks on the executors at the tightest (best) locality level first.
* If nothing can be launched at that level, break out of this do-while loop and move on to the
* next, looser locality level, and so on, until the TaskSet's tasks have been launched on
* executors at some locality level
*/
launchedTask = resourceOfferSingleTaskSet(
taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
} while (launchedTask)
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
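The locality levels iterated over above come from the TaskLocality enumeration, whose declaration order defines "best to worst"; roughly (reproduced from memory):
object TaskLocality extends Enumeration {
  // Ordered from most local (best) to least local (worst)
  val PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY = Value
}
Note that taskSet.myLocalityLevels only contains the levels that are actually relevant to that TaskSet's preferred locations, so the outer loop does not necessarily try all five.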
====> resourceOfferSingleTaskSet
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
var launchedTask = false
// Iterate over all executors (the shuffled WorkerOffers)
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
// If this executor still has at least as many free CPUs as one task needs (CPUS_PER_TASK, default 1)
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
/**
* Call the TaskSetManager's resourceOffer method to find out whether one of the TaskSet's tasks
* can be launched on this executor at the current locality level
* (resourceOffer returns an Option, so the for comprehension below runs for at most one task per offer)
*/
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
// Record the task in the tasks 2D structure, i.e. add it to the list of tasks to launch on this executor
tasks(i) += task
/**
* This is the essence of the task-assignment algorithm:
* use the locality-level model to optimize task placement and launch, preferring to start each
* task at its best locality, and then assign the task to the executor.
*
* Record the assignment in the in-memory bookkeeping maps:
*/
val tid = task.taskId
taskIdToTaskSetId(tid) = taskSet.taskSet.id
taskIdToExecutorId(tid) = execId
executorsByHost(host) += execId
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
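The CPUS_PER_TASK used above is configurable; in TaskSchedulerImpl it is read from spark.task.cpus with a default of 1, roughly:
val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1)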
Back to CoarseGrainedSchedulerBackend.makeOffers -> launchTasks
/**
* Launch tasks returned by a set of resource offers
* Based on the assignments just computed, launch the corresponding tasks on the executors
*/
def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// 1) First serialize the task that is about to be sent to its executor
val ser = SparkEnv.get.closureSerializer.newInstance()
val serializedTask = ser.serialize(task)
if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
val taskSetId = scheduler.taskIdToTaskSetId(task.taskId)
scheduler.activeTaskSets.get(taskSetId).foreach { taskSet =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
"spark.akka.frameSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
AkkaUtils.reservedSizeBytes)
taskSet.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
// 2) Look up the data for the target executor
val executorData = executorDataMap(task.executorId)
// 3) Subtract the CPUs this task will use from the executor's free cores
executorData.freeCores -= scheduler.CPUS_PER_TASK
/**
* TODO 4) Send a LaunchTask message to the executor so that the task is started there
* -> CoarseGrainedExecutorBackend.receiveWithLogging.LaunchTask
*/
executorData.executorActor ! LaunchTask(new SerializableBuffer(serializedTask))
}
}
}
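The akkaFrameSize checked above is derived from the spark.akka.frameSize setting (specified in MB and converted to bytes); roughly, in CoarseGrainedSchedulerBackend (reproduced from memory, the default value depends on the Spark version):
private val akkaFrameSize = AkkaUtils.maxFrameSizeBytes(conf)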
At this point the LaunchTask message has been sent to start the task on the executor; for what happens next, see the Executor source-code analysis post.