In DAGScheduler.scala, the tasks of a stage are wrapped into a TaskSet and submitted through the TaskScheduler. Below we walk through the source to see how the TaskScheduler assigns each task to an executor and with which locality level.
TaskSchedulerImpl.scala
/**
* Entry point where the TaskScheduler accepts a submitted TaskSet
*/
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// Create a TaskSetManager for this TaskSet
// From here on, the TaskSetManager is responsible for tracking and monitoring the execution of the TaskSet's tasks
val manager = createTaskSetManager(taskSet, maxTaskFailures)
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
// Cache the manager in memory, keyed by stage and attempt
stageTaskSets(taskSet.stageAttemptId) = manager
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
// When the TaskScheduler was created in SparkContext, a SparkDeploySchedulerBackend (StandaloneSchedulerBackend in newer versions) was created for it
// That is the backend referenced here; it is responsible for creating the AppClient and registering the Application with the Master
backend.reviveOffers()
}
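For context, two settings drive what submitTasks does here: the maxTaskFailures passed to createTaskSetManager comes from spark.task.maxFailures, and the schedulableBuilder that addTaskSetManager hands the manager to is built according to spark.scheduler.mode. A minimal sketch of setting them on the application side (the values are illustrative, not recommendations):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("scheduler-config-sketch")
  .set("spark.task.maxFailures", "4")   // retries per task before the whole TaskSet is aborted
  .set("spark.scheduler.mode", "FAIR")  // build a FAIR scheduling pool instead of the default FIFO queue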
backend.reviveOffers() ends up calling the reviveOffers() method of CoarseGrainedSchedulerBackend.scala.
CoarseGrainedSchedulerBackend.scala
override def reviveOffers() {
driverEndpoint.send(ReviveOffers)
}
class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
extends ThreadSafeRpcEndpoint with Logging {
...
case ReviveOffers =>
makeOffers()
case KillTask(taskId, executorId, interruptThread) =>
...
}
// Make fake resource offers on all executors
private def makeOffers() {
// Filter out executors under killing
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
// Build a WorkerOffer for each alive executor, describing its host and free cores
val workOffers = activeExecutors.map { case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toIndexedSeq
/**
* 1. Call TaskSchedulerImpl's resourceOffers() method to run the task scheduling algorithm and assign each task to an executor.
* 2. Once tasks have been assigned to executors, call this backend's own launchTasks() method, which sends a LaunchTask
* message to the corresponding executor, where the task is started and executed.
*/
launchTasks(scheduler.resourceOffers(workOffers))
}
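For reference, WorkerOffer itself is just a small case class describing one executor's free resources; roughly the following (simplified, exact fields may differ between Spark versions):

case class WorkerOffer(executorId: String, host: String, cores: Int)

// e.g. makeOffers() produces one such offer per alive executor:
val offer = WorkerOffer("executor-1", "worker-node-1", 4)  // 4 free cores on worker-node-1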
TaskSchedulerImpl.scala
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {
if (!hostToExecutors.contains(o.host)) {
hostToExecutors(o.host) = new HashSet[String]()
}
if (!executorIdToRunningTaskIds.contains(o.executorId)) {
hostToExecutors(o.host) += o.executorId
executorAdded(o.executorId, o.host)
executorIdToHost(o.executorId) = o.host
executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Randomly shuffle offers to avoid always placing tasks on the same set of workers.
// Shuffle the available executors so tasks are spread across workers (load balancing)
val shuffledOffers = Random.shuffle(offers)
// Build a list of tasks to assign to each worker.
// For each WorkerOffer, create the per-executor buffer that will hold its assigned tasks; each ArrayBuffer is sized to the executor's available CPU cores
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
// Take the sorted TaskSets from rootPool: when the task assignment algorithm runs, TaskSets are pulled from this scheduling pool already ordered (FIFO or FAIR)
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
for (taskSet <- sortedTaskSets) {
var launchedAnyTask = false
var launchedTaskAtCurrentMaxLocality = false
// Nested loops: iterate over every TaskSet and, within each, over every locality level
/*
Locality levels:
PROCESS_LOCAL - process-local: the RDD partition and the task run in the same executor (same process)
NODE_LOCAL    - the RDD partition and the task are not in the same executor/process, but are on the same worker node
NO_PREF       - no locality preference
RACK_LOCAL    - rack-local: the RDD partition and the task are at least on the same rack
ANY           - any locality level
*/
// For each TaskSet, start from its best locality level and work down
for (currentMaxLocality <- taskSet.myLocalityLevels) {
do {
// Try to launch tasks on executors at the current, best-available locality level;
// if nothing can be launched, fall back to the next (less local) level.
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(
taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
}
if (!launchedAnyTask) {
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
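The locality levels that appear in taskSet.myLocalityLevels, and how long the TaskSetManager keeps insisting on a level before allowing tasks to launch at a less local one, are governed by the spark.locality.wait* settings. A hedged sketch of tuning them (example values only):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.locality.wait", "3s")          // base wait before degrading one locality level
  .set("spark.locality.wait.process", "3s")  // how long to insist on PROCESS_LOCAL
  .set("spark.locality.wait.node", "3s")     // how long to insist on NODE_LOCAL
  .set("spark.locality.wait.rack", "3s")     // how long to insist on RACK_LOCAL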
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = {
var launchedTask = false
// Iterate over all executors (one offer per executor)
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
// If this executor still has at least as many free CPU cores as one task requires (CPUS_PER_TASK, default 1)
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
// Call TaskSetManager's resourceOffer() to pick a task that can run on this executor at the given locality level,
// preferring to launch the task at the best locality level possible
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
// Record the task to be launched on this executor
tasks(i) += task
// Register the task's bookkeeping info in the in-memory maps
val tid = task.taskId
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorIdToRunningTaskIds(execId).add(tid)
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
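The CPUS_PER_TASK used in the check above comes from spark.task.cpus (default 1). A small sketch of how it interacts with the executor's core count (example values only):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.executor.cores", "4")  // cores per executor, i.e. the ceiling for a WorkerOffer's free cores
  .set("spark.task.cpus", "2")       // cores reserved per task -> at most 4 / 2 = 2 concurrent tasks per executor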
Back in CoarseGrainedSchedulerBackend.scala
// Launch tasks returned by a set of resource offers
// Launch each assigned task on its chosen executor
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// Serialize the TaskDescription that the executor will run
val serializedTask = ser.serialize(task)
if (serializedTask.limit >= maxRpcMessageSize) {
scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.rpc.message.maxSize (%d bytes). Consider increasing " +
"spark.rpc.message.maxSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit, maxRpcMessageSize)
taskSetMgr.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
// Look up the data for the target executor
val executorData = executorDataMap(task.executorId)
// Deduct the cores this task will consume from the executor's free cores
executorData.freeCores -= scheduler.CPUS_PER_TASK
logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
s"${executorData.executorHost}.")
// Send a LaunchTask message to the executor so that it starts the task
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
}
}
}
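As the error message above suggests, when a serialized task exceeds spark.rpc.message.maxSize the usual fix is to move large values out of the task closure into a broadcast variable, raising the limit only as a last resort. A minimal sketch (illustrative values and data):

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("rpc-size-sketch")
  .setMaster("local[2]")
  .set("spark.rpc.message.maxSize", "256")  // in MB; default is 128
val sc = new SparkContext(conf)

// Broadcast large lookup data instead of capturing it in each task closure,
// so the serialized TaskDescription stays small.
val bigLookup = (1 to 100000).map(i => i -> i.toString).toMap
val bc = sc.broadcast(bigLookup)
val resolved = sc.parallelize(1 to 10).map(i => bc.value.getOrElse(i, "")).collect()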
TaskSetManager.scala
/**
* Schedules the tasks of a single TaskSet and tracks each task. If a task fails, it is
* retried until the maximum number of attempts is exceeded, and locality-aware scheduling
* for the TaskSet is handled via delay scheduling.
* The main interface is resourceOffer, through which the TaskSet is asked to run one of
* its tasks on a given node; it also receives task status-change messages so it knows
* when the state of one of its tasks has changed.
*/
private[spark] class TaskSetManager(
...
/*
Check how long this TaskSet has been waiting at the current locality level; if the wait
for that level is still within its configured limit, the task is considered launchable
on this executor at that locality level (delay scheduling).
*/
@throws[TaskNotSerializableException]
def resourceOffer(
...
....
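To make the delay-scheduling idea concrete, here is a small self-contained sketch of the mechanism (my own simplification, not the real TaskSetManager code): keep insisting on the best locality level until the configured wait for that level has elapsed since the last task launch, then relax to the next, less local level.

object DelaySchedulingSketch {
  // Simplified locality levels, ordered best to worst.
  val levels = Seq("PROCESS_LOCAL", "NODE_LOCAL", "RACK_LOCAL", "ANY")
  // Hypothetical per-level waits, mirroring the spark.locality.wait.* settings (milliseconds).
  val waitMs = Map("PROCESS_LOCAL" -> 3000L, "NODE_LOCAL" -> 3000L, "RACK_LOCAL" -> 3000L)

  /** Return the most relaxed level allowed after waiting `waitedMs` since the last launch. */
  def allowedLevel(waitedMs: Long): String = {
    var remaining = waitedMs
    for (level <- levels.init) {
      if (remaining < waitMs(level)) return level
      remaining -= waitMs(level)
    }
    "ANY" // all waits exhausted: accept any executor
  }
}

// e.g. 2s after the last launch we still insist on PROCESS_LOCAL,
// after 4s we would accept NODE_LOCAL, and after 10s any executor:
// DelaySchedulingSketch.allowedLevel(2000)   // "PROCESS_LOCAL"
// DelaySchedulingSketch.allowedLevel(4000)   // "NODE_LOCAL"
// DelaySchedulingSketch.allowedLevel(10000)  // "ANY"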