Spark Master Resource Scheduling Algorithm: A Source Code Analysis

This post digs into the resource scheduling algorithm the Spark Master uses when allocating Drivers and applications. By walking through the key code in Master.scala, it shows how the scheduler works internally.

The core method in Master.scala

private def schedule(): Unit = {
    // Only an ALIVE Master schedules resources; a STANDBY Master never does.
    if (state != RecoveryState.ALIVE) {
      return
    }
    // Drivers take strict precedence over executors

    // Random.shuffle randomly reorders the elements of the collection passed in.
    // Take the registered workers, keep only those whose state is ALIVE,
    // and shuffle that filtered list with Random.shuffle.
    val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
    val numWorkersAlive = shuffledAliveWorkers.size
    var curPos = 0

    /**
      * This loop only matters for drivers submitted to the standalone Master in cluster
      * deploy mode (spark-submit --deploy-mode cluster): only then is a driver registered
      * with the Master and queued in waitingDrivers. In client mode the driver is launched
      * locally by the submitting process and never shows up here.
      */
    for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
      // We assign workers to each waiting driver in a round-robin fashion. For each driver, we
      // start from the last worker that was assigned a driver, and continue onwards until we have
      // explored all alive workers.
      var launched = false
      var numWorkersVisited = 0

      // Keep going while there are alive workers we have not visited yet
      // and this driver has not been launched.
      while (numWorkersVisited < numWorkersAlive && !launched) {
        val worker = shuffledAliveWorkers(curPos)
        numWorkersVisited += 1

        // If the current worker's free memory is at least what the driver needs, and its free
        // CPU cores are at least what the driver needs, launch the driver on this worker
        // and remove the driver from the waitingDrivers queue.
        if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) {
          launchDriver(worker, driver)
          waitingDrivers -= driver
          launched = true
        }
        // Move on to the next worker for the next placement attempt.
        curPos = (curPos + 1) % numWorkersAlive
      }
    }
    startExecutorsOnWorkers()
  }
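To make the round-robin placement easier to follow, here is a minimal runnable sketch of the same loop. The Worker and DriverReq case classes, the ids and the resource numbers are made up for illustration only (they are not Spark's WorkerInfo/DriverInfo), and the bookkeeping that worker.addDriver performs is inlined as plain field updates.

import scala.util.Random
import scala.collection.mutable

// Hypothetical stand-ins for Spark's WorkerInfo / DriverInfo.
case class Worker(id: String, var coresFree: Int, var memoryFree: Int)
case class DriverReq(id: String, cores: Int, mem: Int)

object DriverPlacementSketch extends App {
  // Shuffle the "alive" workers, just as schedule() does.
  val aliveWorkers = Random.shuffle(Seq(
    Worker("w1", coresFree = 2, memoryFree = 2048),
    Worker("w2", coresFree = 8, memoryFree = 8192),
    Worker("w3", coresFree = 4, memoryFree = 4096)))
  val waitingDrivers = mutable.ArrayBuffer(
    DriverReq("d1", cores = 4, mem = 4096),
    DriverReq("d2", cores = 1, mem = 1024))

  var curPos = 0
  for (driver <- waitingDrivers.toList) {   // iterate over a copy, as in the real code
    var launched = false
    var visited = 0
    while (visited < aliveWorkers.size && !launched) {
      val w = aliveWorkers(curPos)
      visited += 1
      if (w.memoryFree >= driver.mem && w.coresFree >= driver.cores) {
        // roughly what worker.addDriver(driver) does: reserve the driver's resources
        w.coresFree -= driver.cores
        w.memoryFree -= driver.mem
        waitingDrivers -= driver
        launched = true
        println(s"launched ${driver.id} on ${w.id}")
      }
      curPos = (curPos + 1) % aliveWorkers.size
    }
  }
}

Each driver scans at most one full round of the shuffled workers starting at curPos, so a driver that no worker can currently fit simply stays in waitingDrivers until the next scheduling round.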

① Driver allocation

  // Launch a driver on a given worker.
  private def launchDriver(worker: WorkerInfo, driver: DriverInfo) {
    logInfo("Launching driver " + driver.id + " on worker " + worker.id)
    // Add the driver to the worker's in-memory bookkeeping:
    // the worker's used memory and used cores grow by what the driver needs.
    worker.addDriver(driver)

    // Also record the worker inside the driver's state, so each side can find the other.
    driver.worker = Some(worker)

    // Send a LaunchDriver message telling the worker to start the driver.
    worker.endpoint.send(LaunchDriver(driver.id, driver.desc))

    // Mark the driver as RUNNING.
    driver.state = DriverState.RUNNING
  }
  private def removeDriver(
      driverId: String,
      finalState: DriverState,
      exception: Option[Exception]) {
    // Use a higher-order function to find the driver with the given id.
    drivers.find(d => d.id == driverId) match {
      case Some(driver) =>
        logInfo(s"Removing driver: $driverId")
        drivers -= driver // remove the driver from the in-memory set of active drivers
        if (completedDrivers.size >= RETAINED_DRIVERS) {
          val toRemove = math.max(RETAINED_DRIVERS / 10, 1)
          completedDrivers.trimStart(toRemove)
        }

        completedDrivers += driver // add it to the completed drivers
        persistenceEngine.removeDriver(driver) // remove the driver from the recovery persistence engine
        driver.state = finalState
        driver.exception = exception
        driver.worker.foreach(w => w.removeDriver(driver)) // detach the driver from its worker
        schedule() // resources were freed, so run another scheduling round
      case None =>
        logWarning(s"Asked to remove unknown driver: $driverId")
    }
  }
}
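One detail worth pausing on in removeDriver is the bounded completedDrivers buffer: once it reaches RETAINED_DRIVERS (backed by spark.deploy.retainedDrivers, 200 by default), the oldest tenth is dropped before the newly finished driver is appended, so the Master's memory for finished drivers stays bounded. A standalone sketch of just this trimming, using illustrative values:

import scala.collection.mutable.ArrayBuffer

object RetentionSketch extends App {
  val RETAINED_DRIVERS = 200   // mirrors the spark.deploy.retainedDrivers default
  val completedDrivers = ArrayBuffer.tabulate(200)(i => s"driver-$i")

  // Same trimming as removeDriver: drop the oldest ~10% (at least one entry)
  // before appending the newly completed driver.
  if (completedDrivers.size >= RETAINED_DRIVERS) {
    val toRemove = math.max(RETAINED_DRIVERS / 10, 1)   // 20 here
    completedDrivers.trimStart(toRemove)
  }
  completedDrivers += "driver-new"

  println(completedDrivers.size)   // 181 = 200 - 20 + 1
  println(completedDrivers.head)   // driver-20, the oldest retained entry
}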

② Application allocation

  /**
   * Schedule and launch executors on workers
   */
  /**
    * Application scheduling has two modes:
    * ① spreadOut: spread the cores an app needs across as many workers as possible
    * ② non-spreadOut: pack the app onto as few workers as possible, leaving the others untouched
    */
  private def startExecutorsOnWorkers(): Unit = {
    // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
    // in the queue, then the second app, etc.

    // Iterate over the apps waiting to be scheduled, keeping only those that still need cores.
    for (app <- waitingApps if app.coresLeft > 0) {
      val coresPerExecutor: Option[Int] = app.desc.coresPerExecutor

      // Filter out workers that don't have enough resources to launch an executor
      // Keep only ALIVE workers this app can use: enough free memory and enough free cores
      // to start at least one executor. Sort them by free cores, descending.
      val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
        .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
          worker.coresFree >= coresPerExecutor.getOrElse(1))
        .sortBy(_.coresFree).reverse
      val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)

      // Now that we've decided how many cores to allocate on each worker, let's allocate them
      for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
        allocateWorkerResourceToExecutors(
          app, assignedCores(pos), coresPerExecutor, usableWorkers(pos))
      }
    }
  }
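As a quick illustration of that filter and ordering, the sketch below applies the same predicate and sort to a handful of made-up workers. The W case class and all of the numbers are hypothetical; they are not Spark types.

// Hypothetical, simplified worker -- not Spark's WorkerInfo.
case class W(id: String, alive: Boolean, coresFree: Int, memoryFree: Int)

object UsableWorkersSketch extends App {
  val memoryPerExecutorMB = 1024
  val coresPerExecutor: Option[Int] = Some(2)

  val workers = Seq(
    W("w1", alive = true,  coresFree = 1, memoryFree = 4096),  // too few free cores
    W("w2", alive = true,  coresFree = 6, memoryFree = 512),   // too little free memory
    W("w3", alive = false, coresFree = 8, memoryFree = 8192),  // not ALIVE
    W("w4", alive = true,  coresFree = 4, memoryFree = 2048),
    W("w5", alive = true,  coresFree = 8, memoryFree = 8192))

  // Same shape as the filter + sort in startExecutorsOnWorkers.
  val usableWorkers = workers
    .filter(_.alive)
    .filter(w => w.memoryFree >= memoryPerExecutorMB &&
                 w.coresFree >= coresPerExecutor.getOrElse(1))
    .sortBy(_.coresFree).reverse

  println(usableWorkers.map(_.id))  // List(w5, w4): most free cores first
}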
  private def scheduleExecutorsOnWorkers(
      app: ApplicationInfo,
      usableWorkers: Array[WorkerInfo],
      spreadOutApps: Boolean): Array[Int] = {
    val coresPerExecutor = app.desc.coresPerExecutor
    val minCoresPerExecutor = coresPerExecutor.getOrElse(1)
    val oneExecutorPerWorker = coresPerExecutor.isEmpty
    val memoryPerExecutor = app.desc.memoryPerExecutorMB
    val numUsable = usableWorkers.length
    val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
    val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
    // Cores to hand out: the minimum of what the app still needs (app.coresLeft)
    // and the total free cores across all usable workers.
    var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)

    ... // elided here: the canLaunchExecutor helper and the initial freeWorkers list
    while (freeWorkers.nonEmpty) {
      freeWorkers.foreach { pos =>
        var keepScheduling = true
        while (keepScheduling && canLaunchExecutor(pos)) {
          coresToAssign -= minCoresPerExecutor
          assignedCores(pos) += minCoresPerExecutor

          // If we are launching one executor per worker, then every iteration assigns 1 core
          // to the executor. Otherwise, every iteration assigns cores to a new executor.
          // With spreadOut the cores are spread evenly across the workers. For example, with
          // 10 usable workers sharing 20 cores (one core per step), the outer loop makes two
          // passes: each worker gets one core on the first pass and a second core on the next;
          // any further cores keep accumulating one pass at a time.
          if (oneExecutorPerWorker) {
            assignedExecutors(pos) = 1
          } else {
            assignedExecutors(pos) += 1
          }

          // Spreading out an application means spreading out its executors across as
          // many workers as possible. If we are not spreading out, then we should keep
          // scheduling executors on this worker until we use all of its resources.
          // Otherwise, just move on to the next worker.
          if (spreadOutApps) {
            keepScheduling = false
          }
        }
      }
      freeWorkers = freeWorkers.filter(canLaunchExecutor)
    }
    assignedCores
  }
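The interaction between spreadOut and the two nested loops is easiest to see with a runnable simulation. The sketch below reimplements the assignment loop over simplified workers and approximates the elided canLaunchExecutor check with core and memory constraints only (the real method also enforces the app's executor limit). All types and numbers are illustrative, not Spark's.

object SpreadOutSketch extends App {
  case class W(coresFree: Int, memoryFree: Int)   // hypothetical, simplified worker

  def assign(workers: Array[W], coresLeft: Int, coresPerExecutor: Option[Int],
             memoryPerExecutor: Int, spreadOut: Boolean): Array[Int] = {
    val minCores = coresPerExecutor.getOrElse(1)
    val oneExecutorPerWorker = coresPerExecutor.isEmpty
    val assignedCores = new Array[Int](workers.length)
    val assignedExecutors = new Array[Int](workers.length)
    var coresToAssign = math.min(coresLeft, workers.map(_.coresFree).sum)

    // Simplified stand-in for canLaunchExecutor: enough cores left to assign, enough
    // unassigned cores on the worker, and enough free memory if a new executor is needed.
    def canLaunch(pos: Int): Boolean = {
      val enoughCores = coresToAssign >= minCores &&
        workers(pos).coresFree - assignedCores(pos) >= minCores
      val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutors(pos) == 0
      val enoughMemory = !launchingNewExecutor ||
        workers(pos).memoryFree - assignedExecutors(pos) * memoryPerExecutor >= memoryPerExecutor
      enoughCores && enoughMemory
    }

    var freeWorkers = (0 until workers.length).filter(canLaunch)
    while (freeWorkers.nonEmpty) {
      freeWorkers.foreach { pos =>
        var keepScheduling = true
        while (keepScheduling && canLaunch(pos)) {
          coresToAssign -= minCores
          assignedCores(pos) += minCores
          if (oneExecutorPerWorker) assignedExecutors(pos) = 1
          else assignedExecutors(pos) += 1
          if (spreadOut) keepScheduling = false   // one step per worker per pass
        }
      }
      freeWorkers = freeWorkers.filter(canLaunch)
    }
    assignedCores
  }

  // Three 8-core / 8 GB workers, 12 cores requested, 2 cores and 1024 MB per executor.
  val workers = Array(W(8, 8192), W(8, 8192), W(8, 8192))
  println(assign(workers, 12, Some(2), 1024, spreadOut = true).toList)   // List(4, 4, 4)
  println(assign(workers, 12, Some(2), 1024, spreadOut = false).toList)  // List(8, 4, 0)
}

With spreadOut the 12 cores end up evenly spread (4 per worker); without it the first worker is filled to its 8 free cores, the second takes the remaining 4, and the third gets nothing.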
  private def allocateWorkerResourceToExecutors(
      app: ApplicationInfo,
      assignedCores: Int,
      coresPerExecutor: Option[Int],
      worker: WorkerInfo): Unit = {
    // If the number of cores per executor is specified, we divide the cores assigned
    // to this worker evenly among the executors with no remainder.
    // Otherwise, we launch a single executor that grabs all the assignedCores on this worker.
    val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
    val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
    for (i <- 1 to numExecutors) {
      val exec = app.addExecutor(worker, coresToAssign)
      launchExecutor(worker, exec)
      app.state = ApplicationState.RUNNING
    }
  }
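The split at the top of allocateWorkerResourceToExecutors is easy to sanity-check with plain numbers. A tiny sketch with hypothetical values:

// Plain-Scala check of the executor split above, hypothetical numbers only.
object ExecutorSplitSketch extends App {
  def split(assignedCores: Int, coresPerExecutor: Option[Int]): (Int, Int) = {
    val numExecutors = coresPerExecutor.map(assignedCores / _).getOrElse(1)
    val coresPerExec = coresPerExecutor.getOrElse(assignedCores)
    (numExecutors, coresPerExec)
  }
  println(split(8, Some(2)))  // (4,2): four executors with 2 cores each
  println(split(8, None))     // (1,8): one executor grabs all 8 assigned cores
}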
Different Spark versions differ in small details, but the overall structure and ideas are the same.


