接上submitTasks
submitTasks
//TaskScheduler的入口
// Entry point of TaskSchedulerImpl: the DAGScheduler calls this to submit one stage's TaskSet.
// Registers a TaskSetManager for the set, adds it to the scheduling pool, then asks the
// backend to revive offers so the new tasks can be assigned to executors.
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// Each TaskSet gets its own TaskSetManager, which monitors and manages the
// execution of that TaskSet's tasks (retries on failure, locality, zombie state).
val manager = createTaskSetManager(taskSet, maxTaskFailures)
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
// Cache the manager in memory, keyed by (stageId, stageAttemptId).
stageTaskSets(taskSet.stageAttemptId) = manager
// Sanity check: at most one non-zombie TaskSet may be active per stage at a time.
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
// Register the manager with the scheduling pool (FIFO or FAIR builder).
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
// On a real cluster, periodically warn until some task launches, so the user
// notices when no workers are registered or resources are insufficient.
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
// A task has launched; stop the repeating starvation warning.
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
// Ask the backend for resource offers, which drives task assignment
// (in CoarseGrainedSchedulerBackend this ends up in makeOffers).
backend.reviveOffers()
}
TaskSetManager
在TaskSchedulerImpl中的单个任务集中调度任务。此类跟踪每个任务,在任务失败时重试(次数有限),并通过延迟调度处理此TaskSet的位置感知调度。它的主要接口是ResourceOffer,它询问任务集是否要在一个节点上运行任务,以及StatusUpdate,告诉它的一个任务更改了状态(例如,完成)。
backend.reviveOffers()实际调用CoarseGrainedSchedulerBackend中的makeOffers
// Offer the free resources of all alive executors to the scheduler, then launch
// whatever tasks it assigns.
private def makeOffers() {
  // Hold the backend lock so no executor is killed while a task is being launched on it.
  val assignedTasks = CoarseGrainedSchedulerBackend.this.synchronized {
    // Drop executors that are in the process of being killed.
    val aliveExecutors = executorDataMap.filterKeys(executorIsAlive)
    // Wrap each live executor as a WorkerOffer describing its free CPU cores.
    val resourceOffers = aliveExecutors.map { case (execId, data) =>
      new WorkerOffer(execId, data.executorHost, data.freeCores)
    }.toIndexedSeq
    // Step 1: run TaskSchedulerImpl's assignment algorithm over these offers,
    // mapping tasks onto executors.
    scheduler.resourceOffers(resourceOffers)
  }
  // Step 2: outside the lock, send LaunchTask messages to the chosen executors,
  // which start and run the assigned tasks.
  if (assignedTasks.nonEmpty) {
    launchTasks(assignedTasks)
  }
}
scheduler.resourceOffers(workOffers)
尝试用本地化级别这种模型,去优化task的分配和启动,优先希望在最佳本地化的地方启动task,然后将task分配给executor
// Called by the cluster-manager backend with the free resources on workers.
// Builds, per offer, the list of tasks to launch: registers any new executors,
// filters blacklisted nodes, shuffles offers for load balancing, then walks the
// sorted TaskSets from the best locality level downward.
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {
if (!hostToExecutors.contains(o.host)) {
hostToExecutors(o.host) = new HashSet[String]()
}
if (!executorIdToRunningTaskIds.contains(o.executorId)) {
hostToExecutors(o.host) += o.executorId
executorAdded(o.executorId, o.host)
executorIdToHost(o.executorId) = o.host
executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
// this here to avoid a separate thread and added synchronization overhead, and also because
// updating the blacklist is only relevant when task offers are being made.
blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())
val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
offers.filter { offer =>
!blacklistTracker.isNodeBlacklisted(offer.host) &&
!blacklistTracker.isExecutorBlacklisted(offer.executorId)
}
}.getOrElse(offers)
// Shuffle the available executors so assignments are spread across them
// instead of piling onto the first offers (simple load balancing).
val shuffledOffers = shuffleOffers(filteredOffers)
// Build a list of tasks to assign to each worker.
// tasks(i) collects the TaskDescriptions for shuffledOffers(i);
// availableCpus(i) tracks that offer's remaining cores.
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
// All submitted TaskSets sit in the scheduling pool; pull them out already
// ordered by the pool's scheduling algorithm (FIFO or FAIR).
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
// Let the TaskSetManager recompute its locality levels for the new executor.
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
// Nested loop: every TaskSet, and for each, every locality level starting from the best.
for (taskSet <- sortedTaskSets) {
var launchedAnyTask = false
var launchedTaskAtCurrentMaxLocality = false
for (currentMaxLocality <- taskSet.myLocalityLevels) {
do {
// Keep offering at this locality level until no more tasks of the TaskSet
// can be launched on the executors at this level.
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(
taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
}
if (!launchedAnyTask) {
// Nothing could be launched anywhere: abort if the TaskSet is fully blacklisted.
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
resourceOfferSingleTaskSet
// Offer every (shuffled) executor's remaining CPUs to a single TaskSet at one
// locality level, recording each accepted task in the shared bookkeeping
// structures. Returns true if at least one task was launched.
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = {
var launchedTask = false
// nodes and executors that are blacklisted for the entire application have already been
// filtered out by this point
// Walk every executor offer.
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
// Only offer executors with at least CPUS_PER_TASK (default 1) free cores left.
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
// Ask the TaskSetManager which of its tasks (if any) can run on this
// executor at the given locality level, and record each accepted task.
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
// Queue the task for this executor in the per-offer 2-D task array.
tasks(i) += task
// Record the assignment in the in-memory bookkeeping maps.
val tid = task.taskId
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorIdToRunningTaskIds(execId).add(tid)
// Deduct the task's CPUs from this offer's remaining capacity.
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
taskSet.resourceOffer(execId, host, maxLocality)
通过查找任务来响应调度程序提供的单个执行器
*
*注意:此函数要么使用将由延迟调度算法调整的maxlocality调用,要么使用不可修改的特殊no_pref locality调用。…
方法说明:判断这个executor在这个本地化级别,之前的等待时间是多少,如果本地化级别的等待时间在一定范围内,那么就认为task使用本地化级别可以在executor上启动
launchTasks
根据分配好的,在executor上启动相应的task
// Send each assigned task to its executor for execution. A task whose serialized
// form exceeds the RPC message size limit aborts its TaskSet with an explanatory
// message instead of being sent.
// Fix: the pasted source used curly "smart quotes" around two string literals
// (the maxSize hint and the logError message), which is invalid Scala; restored
// proper double quotes.
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
  for (task <- tasks.flatten) {
    // Serialize the per-executor task info into a binary byte buffer.
    val serializedTask = TaskDescription.encode(task)
    if (serializedTask.limit() >= maxRpcMessageSize) {
      // Too large to ship over RPC: abort the whole TaskSet with a helpful message.
      scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
        try {
          var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
            "spark.rpc.message.maxSize (%d bytes). Consider increasing " +
            "spark.rpc.message.maxSize or using broadcast variables for large values."
          msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
          taskSetMgr.abort(msg)
        } catch {
          case e: Exception => logError("Exception in error callback", e)
        }
      }
    } else {
      // Look up the executor chosen for this task.
      val executorData = executorDataMap(task.executorId)
      // Deduct the CPUs this task will occupy from the executor's free cores.
      executorData.freeCores -= scheduler.CPUS_PER_TASK
      logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
        s"${executorData.executorHost}.")
      // Ask the executor endpoint to start the task by sending a LaunchTask message.
      executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
    }
  }
}
**注:本地化级别,有几种:
1:PROCESS_LOCAL,进程本地化,rdd的partition和task,进入一个executor内,那么速度当然快。
2:NODE_LOCAL,也就是说,rdd的partition和task不在一个executor中,不在一个进程,但是在一个worker节点上。
3:NO_PREF,没有所谓的本地化级别。
4:RACK_LOCAL,机架本地化,至少rdd的partition和task,在一个机架上。 5:ANY,任意的本地化级别。**