Spark TaskScheduler Task Assignment: A Source Code Walkthrough

In DAGScheduler.scala, a stage's tasks are wrapped into a TaskSet and submitted through the TaskScheduler. The walkthrough below follows the source code to show how the TaskScheduler assigns each task to an executor and chooses its locality level.

TaskSchedulerImpl.scala

/**
    * Entry point where the TaskScheduler receives a TaskSet submitted by the DAGScheduler.
    */
  override def submitTasks(taskSet: TaskSet) {
    val tasks = taskSet.tasks
    logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
    this.synchronized {

      // Create a TaskSetManager for this TaskSet.
      // From here on, the TaskSetManager is responsible for tracking and monitoring the execution of the TaskSet's tasks.
      val manager = createTaskSetManager(taskSet, maxTaskFailures)
      val stage = taskSet.stageId
      val stageTaskSets =
        taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])

      // Cache the manager in memory, keyed by stage id and attempt id
      stageTaskSets(taskSet.stageAttemptId) = manager

      val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
        ts.taskSet != taskSet && !ts.isZombie
      }
      if (conflictingTaskSet) {
        throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
          s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
      }
      schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

      if (!isLocal && !hasReceivedTask) {
        starvationTimer.scheduleAtFixedRate(new TimerTask() {
          override def run() {
            if (!hasLaunchedTask) {
              logWarning("Initial job has not accepted any resources; " +
                "check your cluster UI to ensure that workers are registered " +
                "and have sufficient resources")
            } else {
              this.cancel()
            }
          }
        }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
      }
      hasReceivedTask = true
    }

    // When the TaskScheduler was created in SparkContext, a SparkDeploySchedulerBackend (StandaloneSchedulerBackend in newer versions) was created for it.
    // That backend is the `backend` used here; it is also what creates the AppClient and registers the Application with the Master.
    backend.reviveOffers()
  }
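
submitTasks() only queues the TaskSetManager; the order in which queued TaskSets are later offered resources is decided by the scheduling pool that schedulableBuilder built (FIFO by default, FAIR when spark.scheduler.mode is set to FAIR). As a rough illustration of the FIFO ordering idea, here is a minimal, self-contained sketch; ToyTaskSet and FifoPoolSketch are invented names and this is not Spark's actual pool implementation:

// Toy model of FIFO pool ordering: earlier jobs first, and within a job, earlier stages first.
// This mirrors the idea behind rootPool.getSortedTaskSetQueue, not Spark's real code.
case class ToyTaskSet(jobId: Int, stageId: Int)

object FifoPoolSketch {
  def sortedQueue(taskSets: Seq[ToyTaskSet]): Seq[ToyTaskSet] =
    taskSets.sortBy(ts => (ts.jobId, ts.stageId))

  def main(args: Array[String]): Unit = {
    val queued = Seq(ToyTaskSet(2, 5), ToyTaskSet(1, 3), ToyTaskSet(1, 1))
    println(sortedQueue(queued))  // List(ToyTaskSet(1,1), ToyTaskSet(1,3), ToyTaskSet(2,5))
  }
}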

backend.reviveOffers() ends up calling the reviveOffers() method of CoarseGrainedSchedulerBackend.scala.

CoarseGrainedSchedulerBackend.scala

  override def reviveOffers() {
    driverEndpoint.send(ReviveOffers)
  }
 class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
    extends ThreadSafeRpcEndpoint with Logging {

    ...

    override def receive: PartialFunction[Any, Unit] = {
      case ReviveOffers =>
        makeOffers()

      case KillTask(taskId, executorId, interruptThread) =>
        ...
    }
    // Make fake resource offers on all executors
    private def makeOffers() {
      // Filter out executors under killing
      val activeExecutors = executorDataMap.filterKeys(executorIsAlive)

      // Build a WorkerOffer (executor id, host, free CPU cores) for each alive executor
      val workOffers = activeExecutors.map { case (id, executorData) =>
        new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
      }.toIndexedSeq

      /**
        * 1. Call TaskSchedulerImpl.resourceOffers() to run the task scheduling algorithm and assign the tasks to executors.
        * 2. Once tasks are assigned, call launchTasks() to send a LaunchTask message for each assigned task to
        *    its executor, which then starts and runs the task.
        */
      launchTasks(scheduler.resourceOffers(workOffers))
    }
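
To make the makeOffers() flow concrete: it filters out dead executors, wraps each remaining executor's free cores into a WorkerOffer, and hands the offers to the scheduler. A self-contained sketch of that step (ToyExecutorData and ToyOffer are invented names, not Spark classes):

import scala.collection.mutable

case class ToyExecutorData(host: String, freeCores: Int, alive: Boolean)
case class ToyOffer(executorId: String, host: String, cores: Int)

object MakeOffersSketch {
  // Keep only alive executors and advertise their free cores as offers.
  def makeOffers(executors: mutable.Map[String, ToyExecutorData]): IndexedSeq[ToyOffer] =
    executors.collect {
      case (id, data) if data.alive => ToyOffer(id, data.host, data.freeCores)
    }.toIndexedSeq

  def main(args: Array[String]): Unit = {
    val executors = mutable.Map(
      "exec-1" -> ToyExecutorData("host-a", 4, alive = true),
      "exec-2" -> ToyExecutorData("host-b", 2, alive = false))
    println(makeOffers(executors))  // only exec-1 is offered
  }
}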

TaskSchedulerImpl.scala

def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
    // Mark each slave as alive and remember its hostname
    // Also track if new executor is added
    var newExecAvail = false
    for (o <- offers) {
      if (!hostToExecutors.contains(o.host)) {
        hostToExecutors(o.host) = new HashSet[String]()
      }
      if (!executorIdToRunningTaskIds.contains(o.executorId)) {
        hostToExecutors(o.host) += o.executorId
        executorAdded(o.executorId, o.host)
        executorIdToHost(o.executorId) = o.host
        executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
        newExecAvail = true
      }
      for (rack <- getRackForHost(o.host)) {
        hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
      }
    }

    // Randomly shuffle offers to avoid always placing tasks on the same set of workers.
    // Shuffle the available executors so tasks are spread across workers (load balancing)
    val shuffledOffers = Random.shuffle(offers)

    // Build a list of tasks to assign to each worker.
    // For each WorkerOffer, pre-allocate a buffer for the tasks assigned to it; the buffer capacity is fixed to the number of CPU cores available on that executor
    val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
    val availableCpus = shuffledOffers.map(o => o.cores).toArray

    // Pull the sorted TaskSets out of rootPool: the task assignment algorithm takes queued TaskSets from this scheduling pool
    val sortedTaskSets = rootPool.getSortedTaskSetQueue
    for (taskSet <- sortedTaskSets) {
      logDebug("parentName: %s, name: %s, runningTasks: %s".format(
        taskSet.parent.name, taskSet.name, taskSet.runningTasks))
      if (newExecAvail) {
        taskSet.executorAdded()
      }
    }

    // Take each TaskSet in our scheduling order, and then offer it each node in increasing order
    // of locality levels so that it gets a chance to launch local tasks on all of them.
    // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
    for (taskSet <- sortedTaskSets) {
      var launchedAnyTask = false
      var launchedTaskAtCurrentMaxLocality = false

      // Nested loops: iterate over every TaskSet and, for each one, over its locality levels
      /*
      Locality levels:
        PROCESS_LOCAL - process-local: the RDD partition and the task are in the same executor (same process)
        NODE_LOCAL    - the partition and the task are not in the same executor/process, but are on the same worker node
        NO_PREF       - no locality preference
        RACK_LOCAL    - rack-local: the partition and the task are at least on the same rack
        ANY           - any locality level
      */
      // For each TaskSet, start from its best locality level and fall back from there
      for (currentMaxLocality <- taskSet.myLocalityLevels) {
        do {

          // Try to launch tasks on executors at the current best locality level;
          // if nothing can be launched, fall back to the next locality level.
          launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(
            taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks)

          launchedAnyTask |= launchedTaskAtCurrentMaxLocality
        } while (launchedTaskAtCurrentMaxLocality)
      }

      if (!launchedAnyTask) {
        taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
      }
    }

    if (tasks.size > 0) {
      hasLaunchedTask = true
    }
    return tasks
  }
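
The nested loop above is the core of locality-aware assignment: for every TaskSet it keeps offering resources at the best locality level until nothing more can be launched there, then relaxes to the next level. A stripped-down, self-contained version of that control flow (tryLaunchAt stands in for resourceOfferSingleTaskSet; this is an illustration, not Spark's code):

object LocalityFallbackSketch {
  // Locality levels in preference order, mirroring TaskLocality
  val levels = Seq("PROCESS_LOCAL", "NODE_LOCAL", "NO_PREF", "RACK_LOCAL", "ANY")

  // tryLaunchAt returns true if at least one task was launched at the given level in this round.
  def assign(tryLaunchAt: String => Boolean): Boolean = {
    var launchedAnyTask = false
    for (level <- levels) {
      var launchedAtLevel = false
      do {
        launchedAtLevel = tryLaunchAt(level)   // keep offering at this level...
        launchedAnyTask |= launchedAtLevel
      } while (launchedAtLevel)                // ...until nothing more fits, then relax
    }
    launchedAnyTask
  }

  def main(args: Array[String]): Unit = {
    var remaining = 3  // pretend three tasks can only run NODE_LOCAL
    val launched = assign(level =>
      if (level == "NODE_LOCAL" && remaining > 0) { remaining -= 1; true } else false)
    println(launched)  // true: all three tasks were placed at NODE_LOCAL
  }
}

resourceOfferSingleTaskSet(), shown next, does the per-executor placement for one TaskSet at one locality level.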
private def resourceOfferSingleTaskSet(
      taskSet: TaskSetManager,
      maxLocality: TaskLocality,
      shuffledOffers: Seq[WorkerOffer],
      availableCpus: Array[Int],
      tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = {
    var launchedTask = false

    // Iterate over all executors (in shuffled order)
    for (i <- 0 until shuffledOffers.size) {
      val execId = shuffledOffers(i).executorId
      val host = shuffledOffers(i).host

      // Only consider this executor if its remaining CPU count is at least the number of CPUs required per task (CPUS_PER_TASK, 1 by default)
      if (availableCpus(i) >= CPUS_PER_TASK) {
        try {
          // Call TaskSetManager.resourceOffer() to pick the tasks that can start on this executor,
          // launching them at the best locality level currently allowed (maxLocality)
          for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
            // Record the task to be launched on this executor
            tasks(i) += task

            // Update the in-memory bookkeeping maps and the remaining CPU count
            val tid = task.taskId
            taskIdToTaskSetManager(tid) = taskSet
            taskIdToExecutorId(tid) = execId
            executorIdToRunningTaskIds(execId).add(tid)
            availableCpus(i) -= CPUS_PER_TASK
            assert(availableCpus(i) >= 0)

            launchedTask = true
          }
        } catch {
          case e: TaskNotSerializableException =>
            logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
            // Do not offer resources for this task, but don't throw an error to allow other
            // task sets to be submitted.
            return launchedTask
        }
      }
    }
    return launchedTask
  }
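
One detail worth calling out in resourceOfferSingleTaskSet(): availableCpus(i) starts at the executor's free cores and is decremented by CPUS_PER_TASK (spark.task.cpus, 1 by default) for every task placed there, so repeated offer rounds can pack several tasks onto one executor until its cores run out. A minimal sketch of that accounting; names are invented, and the packing order here is simplified rather than Spark's round-robin:

object CpuAccountingSketch {
  val CPUS_PER_TASK = 1  // corresponds to spark.task.cpus, which defaults to 1

  // Greedily place up to `pendingTasks` tasks onto executors with the given free cores,
  // returning how many tasks each executor received.
  def pack(freeCores: Array[Int], pendingTasks: Int): Array[Int] = {
    val assigned = Array.fill(freeCores.length)(0)
    var remaining = pendingTasks
    for (i <- freeCores.indices if remaining > 0) {
      while (freeCores(i) >= CPUS_PER_TASK && remaining > 0) {
        freeCores(i) -= CPUS_PER_TASK
        assigned(i) += 1
        remaining -= 1
      }
    }
    assigned
  }

  def main(args: Array[String]): Unit = {
    println(pack(Array(4, 2, 1), 5).mkString(","))  // 4,1,0
  }
}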

Back in CoarseGrainedSchedulerBackend.scala:

// Launch tasks returned by a set of resource offers
    // Launch each assigned task on the executor it was assigned to
    private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
      for (task <- tasks.flatten) {
        // Serialize each task before sending it to its executor
        val serializedTask = ser.serialize(task)
        if (serializedTask.limit >= maxRpcMessageSize) {
          scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
            try {
              var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
                "spark.rpc.message.maxSize (%d bytes). Consider increasing " +
                "spark.rpc.message.maxSize or using broadcast variables for large values."
              msg = msg.format(task.taskId, task.index, serializedTask.limit, maxRpcMessageSize)
              taskSetMgr.abort(msg)
            } catch {
              case e: Exception => logError("Exception in error callback", e)
            }
          }
        }
        else {

          // Look up the executor this task was assigned to
          val executorData = executorDataMap(task.executorId)

          // Deduct the CPU cores this task will use from the executor's free cores
          executorData.freeCores -= scheduler.CPUS_PER_TASK

          logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
            s"${executorData.executorHost}.")

          // Send a LaunchTask message to the executor, which starts and runs the task
          executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
        }
      }
    }
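
As the error path above shows, a TaskSet is aborted when a single serialized task exceeds spark.rpc.message.maxSize. The usual fixes are to broadcast large read-only values instead of capturing them in the task closure, or (less preferably) to raise the limit. A hedged example of both; the app name and sizes are illustrative only:

import org.apache.spark.{SparkConf, SparkContext}

object LargeTaskExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("large-task-example")
      // Raise the RPC message limit (value in MB); only do this if broadcasting is not an option
      .set("spark.rpc.message.maxSize", "256")
    val sc = new SparkContext(conf)

    // Preferred fix: broadcast large read-only data instead of capturing it in the task closure
    val lookup = (1 to 100000).map(i => i -> s"value-$i").toMap
    val bcLookup = sc.broadcast(lookup)

    val hits = sc.parallelize(1 to 100).map(i => bcLookup.value.getOrElse(i, "missing")).count()
    println(hits)

    sc.stop()
  }
}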

TaskSetManager.scala

/**
  * Schedules the tasks in a single TaskSet and tracks each of them: if a task fails it is
  * retried until the maximum number of attempts is exceeded, and delay scheduling is used to
  * handle locality-aware scheduling for the TaskSet.
  * The main interface is resourceOffer(), through which the TaskSet is asked to run a task on
  * one node; it also receives task status change notifications so it knows when the state of
  * one of its tasks has changed.
  */
private[spark] class TaskSetManager(
  ...
  /*
    Check how long the TaskSet has been waiting at the current locality level; as long as that
    wait is within the configured threshold for the level, a task is considered eligible to
    launch on this executor at that locality level.
   */
  @throws[TaskNotSerializableException]
  def resourceOffer(
     ...
....
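
resourceOffer() implements delay scheduling: for each locality level the TaskSetManager is willing to wait a configurable amount of time (spark.locality.wait, with optional per-level overrides spark.locality.wait.process / .node / .rack) before accepting a worse level. The idea, reduced to a self-contained sketch (not Spark's actual implementation):

object DelaySchedulingSketch {
  // Locality levels in preference order
  val levels = Seq("PROCESS_LOCAL", "NODE_LOCAL", "RACK_LOCAL", "ANY")

  // How long we are willing to stay at each level before relaxing (milliseconds);
  // in Spark this comes from spark.locality.wait and its per-level variants (3s by default).
  val waitMs = Map("PROCESS_LOCAL" -> 3000L, "NODE_LOCAL" -> 3000L, "RACK_LOCAL" -> 3000L)

  // Given how long we have waited since the last task launch, return the most relaxed
  // locality level we are currently allowed to use.
  def allowedLevel(waitedMs: Long): String = {
    var budget = 0L
    for (level <- levels.init) {
      budget += waitMs(level)
      if (waitedMs < budget) return level
    }
    "ANY"
  }

  def main(args: Array[String]): Unit = {
    println(allowedLevel(1000L))   // PROCESS_LOCAL
    println(allowedLevel(4000L))   // NODE_LOCAL
    println(allowedLevel(10000L))  // ANY
  }
}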



