In the Spark source code, three kinds of entities register with the Master: workers, drivers (when an application is submitted in cluster deploy mode against a standalone master), and applications.
Worker registration is a good entry point into the source, so let's start there.
Worker registration is handled on the Master side, so we go into the spark-core module, to the class org/apache/spark/deploy/master/Master.scala. The Master handles registration in its receive() method, a PartialFunction[Any, Unit]: it takes any incoming message, returns nothing, and pattern-matches on case classes such as RegisterWorker and RegisterApplication.
case RegisterWorker(
    id, workerHost, workerPort, workerRef, cores, memory, workerWebUiUrl, masterAddress) =>
  logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
    workerHost, workerPort, cores, Utils.megabytesToString(memory)))
  // If this master is in STANDBY state, reply MasterInStandby and do not register the worker
  if (state == RecoveryState.STANDBY) {
    workerRef.send(MasterInStandby)
  // If this worker ID is already registered, reply that the worker already exists
  } else if (idToWorker.contains(id)) {
    workerRef.send(RegisterWorkerFailed("Duplicate worker ID"))
  } else {
    // Build a WorkerInfo for the new worker
    val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory,
      workerRef, workerWebUiUrl)
    // registerWorker() decides whether this worker may register; it mainly weeds out
    // DEAD workers and handles re-registration from the same address
    if (registerWorker(worker)) {
      // Persist the worker via the persistence engine and send the master's address
      // back to the worker
      persistenceEngine.addWorker(worker)
      workerRef.send(RegisteredWorker(self, masterWebUiUrl, masterAddress))
      // With the worker registered, trigger a scheduling round
      schedule()
    } else {
      val workerAddress = worker.endpoint.address
      logWarning("Worker registration failed. Attempted to re-register worker at same " +
        "address: " + workerAddress)
      workerRef.send(RegisterWorkerFailed("Attempted to re-register worker at same address: "
        + workerAddress))
    }
  }
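The other half of this handshake lives in org.apache.spark.deploy.worker.Worker, which sends the RegisterWorker message and reacts to whichever reply comes back. Below is a minimal, self-contained sketch of that reply protocol; the case classes are simplified stand-ins for the real Spark messages, not the actual definitions:

// Simplified sketch of the registration replies; not the real Spark classes.
sealed trait RegisterResponse
case object MasterInStandby extends RegisterResponse
case class RegisteredWorker(masterWebUiUrl: String) extends RegisterResponse
case class RegisterWorkerFailed(message: String) extends RegisterResponse

object WorkerSideSketch {
  // The worker side reacts to whatever the Master replied with.
  def handleRegisterResponse(msg: RegisterResponse): Unit = msg match {
    case RegisteredWorker(url)       => println(s"Successfully registered with master at $url")
    case RegisterWorkerFailed(cause) => println(s"Worker registration failed: $cause")
    case MasterInStandby             => println("Master is standby; waiting for the active master")
  }

  def main(args: Array[String]): Unit = {
    handleRegisterResponse(RegisteredWorker("http://master:8080"))
    handleRegisterResponse(RegisterWorkerFailed("Duplicate worker ID"))
  }
}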
registerWorker() validates the worker that is trying to register:
private def registerWorker(worker: WorkerInfo): Boolean = {
  // There may be one or more refs to dead workers on this same node (w/ different ID's),
  // remove them.
  workers.filter { w =>
    (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
  }.foreach { w =>
    workers -= w
  }
  val workerAddress = worker.endpoint.address
  if (addressToWorker.contains(workerAddress)) {
    val oldWorker = addressToWorker(workerAddress)
    if (oldWorker.state == WorkerState.UNKNOWN) {
      // A worker registering from UNKNOWN implies that the worker was restarted during recovery.
      // The old worker must thus be dead, so we will remove it and accept the new worker.
      removeWorker(oldWorker, "Worker replaced by a new worker with same address")
    } else {
      logInfo("Attempted to re-register worker at same address: " + workerAddress)
      return false
    }
  }
  // Record the accepted worker in all three bookkeeping structures
  workers += worker
  idToWorker(worker.id) = worker
  addressToWorker(workerAddress) = worker
  true
}
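Note that the method keeps three views of the same data in sync: the workers set, the idToWorker map, and the addressToWorker map. A toy model of that invariant (WorkerRecord and the duplicate-address rule below are simplifications for illustration, not the real WorkerInfo):

import scala.collection.mutable

// Toy model of the Master's three synchronized views of registered workers.
case class WorkerRecord(id: String, host: String, port: Int)

object WorkerRegistry {
  val workers = mutable.HashSet[WorkerRecord]()
  val idToWorker = mutable.HashMap[String, WorkerRecord]()
  val addressToWorker = mutable.HashMap[(String, Int), WorkerRecord]()

  def register(w: WorkerRecord): Boolean = {
    val address = (w.host, w.port)
    if (addressToWorker.contains(address)) return false // same address already registered
    // Accepting a worker must update all three structures, or lookups diverge.
    workers += w
    idToWorker(w.id) = w
    addressToWorker(address) = w
    true
  }

  def main(args: Array[String]): Unit = {
    println(register(WorkerRecord("worker-1", "node1", 7078))) // true
    println(register(WorkerRecord("worker-2", "node1", 7078))) // false: duplicate address
  }
}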
Tearing down a stale worker, such as the old UNKNOWN instance above, is done by removeWorker():
private def removeWorker(worker: WorkerInfo, msg: String) {
  logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port)
  // Mark the worker DEAD and drop it from the id and address indexes
  worker.setState(WorkerState.DEAD)
  idToWorker -= worker.id
  addressToWorker -= worker.endpoint.address
  // Mark every executor on this worker as LOST and remove it from its application
  for (exec <- worker.executors.values) {
    logInfo("Telling app of lost executor: " + exec.id)
    exec.application.driver.send(ExecutorUpdated(
      exec.id, ExecutorState.LOST, Some("worker lost"), None, workerLost = true))
    exec.state = ExecutorState.LOST
    exec.application.removeExecutor(exec)
  }
  // Relaunch each driver on this worker that was submitted with supervise enabled;
  // otherwise remove the driver outright
  for (driver <- worker.drivers.values) {
    if (driver.desc.supervise) {
      logInfo(s"Re-launching ${driver.id}")
      relaunchDriver(driver)
    } else {
      logInfo(s"Not re-launching ${driver.id} because it was not supervised")
      removeDriver(driver.id, DriverState.ERROR, None)
    }
  }
  // Tell the driver of every non-completed app that this worker has been removed
  logInfo(s"Telling app of lost worker: " + worker.id)
  apps.filterNot(completedApps.contains(_)).foreach { app =>
    app.driver.send(WorkerRemoved(worker.id, worker.host, msg))
  }
  // Remove the worker from the persistence engine
  persistenceEngine.removeWorker(worker)
}
Relaunching a driver is handled by relaunchDriver():
private def relaunchDriver(driver: DriverInfo) {
  // We must setup a new driver with a new driver id here, because the original driver may
  // be still running. Consider this scenario: a worker is network partitioned with master,
  // the master then relaunches driver driverID1 with a driver id driverID2, then the worker
  // reconnects to master. From this point on, if driverID2 is equal to driverID1, then master
  // can not distinguish the statusUpdate of the original driver and the newly relaunched one,
  // for example, when DriverStateChanged(driverID1, KILLED) arrives at master, master will
  // remove driverID1, so the newly relaunched driver disappears too. See SPARK-19900 for details.
  // Remove the old driver with state RELAUNCHING, create and persist a new driver from its
  // description, then put the new driver on the waiting queue
  removeDriver(driver.id, DriverState.RELAUNCHING, None)
  val newDriver = createDriver(driver.desc)
  persistenceEngine.addDriver(newDriver)
  drivers.add(newDriver)
  waitingDrivers += newDriver
  // schedule() will place the new driver on a worker
  schedule()
}
Back to worker registration: once a worker registers successfully, the Master can schedule drivers on it. This is done by schedule():
/**
 * Schedule the currently available resources among waiting apps. This method will be called
 * every time a new app joins or resource availability changes.
 */
private def schedule(): Unit = {
  // If this master is not ALIVE, do nothing and return
  if (state != RecoveryState.ALIVE) {
    return
  }
  // Drivers take strict precedence over executors
  // Collect all previously registered workers that are ALIVE and shuffle them
  val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
  val numWorkersAlive = shuffledAliveWorkers.size
  var curPos = 0
  // Schedule drivers first. Only drivers submitted in standalone cluster deploy mode sit in
  // this queue; in client deploy mode the driver starts locally and never registers with
  // the master.
  for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
    // We assign workers to each waiting driver in a round-robin fashion. For each driver, we
    // start from the last worker that was assigned a driver, and continue onwards until we have
    // explored all alive workers.
    var launched = false
    var numWorkersVisited = 0
    // Keep going while some alive worker has not been visited and the driver is not yet launched
    while (numWorkersVisited < numWorkersAlive && !launched) {
      // Take the next alive worker and count it as visited
      val worker = shuffledAliveWorkers(curPos)
      numWorkersVisited += 1
      // If this worker's free memory and free cores both cover what the driver needs,
      // launch the driver there and drop it from the waiting queue
      if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) {
        launchDriver(worker, driver)
        waitingDrivers -= driver
        launched = true
      }
      // Advance the cursor to the next worker
      curPos = (curPos + 1) % numWorkersAlive
    }
  }
  // After placing drivers, launch executors on the workers
  startExecutorsOnWorkers()
}
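The round-robin placement is easier to see in isolation. The sketch below mirrors the driver loop above with made-up Node and DriverReq types and invented resource numbers; it is an illustration, not the real Master code:

import scala.util.Random

// Toy round-robin driver placement, mirroring the loop in schedule().
case class Node(name: String, var memFree: Int, var coresFree: Int)
case class DriverReq(id: String, mem: Int, cores: Int)

object RoundRobinSketch {
  def place(workers: Seq[Node], drivers: Seq[DriverReq]): Unit = {
    val shuffled = Random.shuffle(workers) // shuffle once, like shuffledAliveWorkers
    var curPos = 0
    for (d <- drivers) {
      var launched = false
      var visited = 0
      while (visited < shuffled.size && !launched) {
        val w = shuffled(curPos)
        visited += 1
        if (w.memFree >= d.mem && w.coresFree >= d.cores) {
          w.memFree -= d.mem; w.coresFree -= d.cores
          println(s"Driver ${d.id} -> ${w.name}")
          launched = true
        }
        curPos = (curPos + 1) % shuffled.size // cursor keeps advancing across drivers
      }
      if (!launched) println(s"Driver ${d.id} stays in waitingDrivers")
    }
  }

  def main(args: Array[String]): Unit = {
    val nodes = Seq(Node("w1", 4096, 4), Node("w2", 2048, 2))
    place(nodes, Seq(DriverReq("driver-1", 1024, 1), DriverReq("driver-2", 8192, 8)))
  }
}

Because the cursor curPos is not reset between drivers, consecutive drivers land on different workers instead of piling onto the first one that fits.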
Once the drivers are placed, the Master launches the executors that each application needs on the workers. This is done by startExecutorsOnWorkers():
/**
 * Schedule and launch executors on workers
 */
private def startExecutorsOnWorkers(): Unit = {
  // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
  // in the queue, then the second app, etc.
  for (app <- waitingApps) {
    // Cores each executor of this app needs; default to 1 if not configured
    val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
    // If the cores left is less than the coresPerExecutor, the cores left will not be allocated
    if (app.coresLeft >= coresPerExecutor) {
      // Filter out workers that don't have enough resources to launch an executor
      val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
        // keep workers whose free memory and free cores cover one executor
        .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
          worker.coresFree >= coresPerExecutor)
        // sort by free cores, descending
        .sortBy(_.coresFree).reverse
      // Decide how many cores each usable worker gets for this app. With the default
      // spreadOutApps algorithm, cores are assigned first, spread across as many
      // workers as possible; executors are launched from those assignments afterwards
      val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)
      // Now that we've decided how many cores to allocate on each worker, let's allocate them
      for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
        allocateWorkerResourceToExecutors(
          app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos))
      }
    }
  }
}
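The usable-worker selection above is a plain filter-and-sort pipeline. A toy version with invented workers (the W case class and all resource numbers are assumptions for illustration):

// Toy version of the usableWorkers pipeline in startExecutorsOnWorkers().
case class W(name: String, alive: Boolean, memFree: Int, coresFree: Int)

object UsableWorkersSketch {
  def main(args: Array[String]): Unit = {
    val memoryPerExecutorMB = 1024 // assumed app requirement
    val coresPerExecutor = 2       // assumed app requirement
    val workers = Seq(
      W("w1", alive = true,  memFree = 4096, coresFree = 8),
      W("w2", alive = true,  memFree = 512,  coresFree = 8),  // too little memory
      W("w3", alive = false, memFree = 8192, coresFree = 16), // not alive
      W("w4", alive = true,  memFree = 2048, coresFree = 4))
    val usable = workers
      .filter(_.alive)
      .filter(w => w.memFree >= memoryPerExecutorMB && w.coresFree >= coresPerExecutor)
      .sortBy(_.coresFree).reverse // most free cores first
    println(usable.map(_.name).mkString(", ")) // w1, w4
  }
}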
Scheduling executors means dividing CPU cores among the usable workers; this is done by scheduleExecutorsOnWorkers():
/**
 * Schedule executors to be launched on the workers.
 * Returns an array containing number of cores assigned to each worker.
 *
 * There are two modes of launching executors. The first attempts to spread out an application's
 * executors on as many workers as possible, while the second does the opposite (i.e. launch them
 * on as few workers as possible). The former is usually better for data locality purposes and is
 * the default.
 *
 * The number of cores assigned to each executor is configurable. When this is explicitly set,
 * multiple executors from the same application may be launched on the same worker if the worker
 * has enough cores and memory. Otherwise, each executor grabs all the cores available on the
 * worker by default, in which case only one executor per application may be launched on each
 * worker during one single schedule iteration.
 *
 * Note that when `spark.executor.cores` is not set, we may still launch multiple executors from
 * the same application on the same worker. Consider appA and appB both have one executor running
 * on worker1, and appA.coresLeft > 0, then appB is finished and release all its cores on worker1,
 * thus for the next schedule iteration, appA launches a new executor that grabs all the free
 * cores on worker1, therefore we get multiple executors from appA running on worker1.
 *
 * It is important to allocate coresPerExecutor on each worker at a time (instead of 1 core
 * at a time). Consider the following example: cluster has 4 workers with 16 cores each.
 * User requests 3 executors (spark.cores.max = 48, spark.executor.cores = 16). If 1 core is
 * allocated at a time, 12 cores from each worker would be assigned to each executor.
 * Since 12 < 16, no executors would launch [SPARK-8881].
 */
// Takes an app, the usable workers, and the scheduling mode; returns an array of
// cores assigned per worker
private def scheduleExecutorsOnWorkers(
    app: ApplicationInfo,
    usableWorkers: Array[WorkerInfo],
    spreadOutApps: Boolean): Array[Int] = {
  val coresPerExecutor = app.desc.coresPerExecutor // cores each executor needs, if configured
  val minCoresPerExecutor = coresPerExecutor.getOrElse(1) // minimum cores per executor; 1 if unset
  // If spark.executor.cores is not set, launch at most one executor per worker,
  // and that executor grabs all the cores assigned to the worker
  val oneExecutorPerWorker = coresPerExecutor.isEmpty
  val memoryPerExecutor = app.desc.memoryPerExecutorMB // memory each executor needs
  val numUsable = usableWorkers.length // number of usable workers
  val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
  val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
  // Cores to assign: the smaller of what the app still needs and what the workers offer
  var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)

  /** Return whether the specified worker can launch an executor for this app. */
  // pos is the index of the worker in the usableWorkers array
  def canLaunchExecutor(pos: Int): Boolean = {
    val keepScheduling = coresToAssign >= minCoresPerExecutor
    // The worker's free cores, minus what this round has already assigned to it,
    // must still cover one executor's minimum cores
    val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor
    // If we allow multiple executors per worker, then we can always launch new executors.
    // Otherwise, if there is already an executor on this worker, just give it more cores.
    val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutors(pos) == 0
    if (launchingNewExecutor) {
      // Memory already assigned = executors assigned to this worker * memory per executor
      val assignedMemory = assignedExecutors(pos) * memoryPerExecutor
      // The worker must still have enough free memory for one more executor
      val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor
      // And the app must stay under its executor limit
      val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit
      keepScheduling && enoughCores && enoughMemory && underLimit
    } else {
      // We're adding cores to an existing executor, so no need
      // to check memory and executor limits
      keepScheduling && enoughCores
    }
  }
  // Keep launching executors until no more workers can accommodate any
  // more executors, or if we have reached this application's limits
  var freeWorkers = (0 until numUsable).filter(canLaunchExecutor)
  while (freeWorkers.nonEmpty) { // while some worker can still take an executor
    freeWorkers.foreach { pos =>
      var keepScheduling = true
      while (keepScheduling && canLaunchExecutor(pos)) {
        // Each round moves minCoresPerExecutor cores from the pool to this worker
        coresToAssign -= minCoresPerExecutor
        assignedCores(pos) += minCoresPerExecutor
        // If we are launching one executor per worker, then every iteration assigns 1 core
        // to the executor. Otherwise, every iteration assigns cores to a new executor.
        if (oneExecutorPerWorker) {
          assignedExecutors(pos) = 1
        } else {
          assignedExecutors(pos) += 1
        }
        // Spreading out an application means spreading out its executors across as
        // many workers as possible. If we are not spreading out, then we should keep
        // scheduling executors on this worker until we use all of its resources.
        // Otherwise, just move on to the next worker.
        // Note where the inner while loop starts and ends: under spreadOutApps we assign
        // once per worker per pass; otherwise we saturate the current worker first
        if (spreadOutApps) {
          keepScheduling = false
        }
      }
    }
    // Re-filter for the workers that can still launch an executor; the set shrinks
    // as resources are consumed
    freeWorkers = freeWorkers.filter(canLaunchExecutor)
  }
  // Return the array of how many cores each worker was assigned
  assignedCores
}
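The SPARK-8881 example from the scaladoc can be replayed with a stripped-down version of this loop: 4 workers with 16 free cores each, spark.cores.max = 48, spark.executor.cores = 16. The sketch below is a simplification of scheduleExecutorsOnWorkers, not the real method; it shows why allocating coresPerExecutor at a time launches three full executors, while allocating 1 core at a time strands 12 cores on every worker so no 16-core executor ever fits:

// Simplified spread-out allocation: each round gives `step` cores to every worker
// that still has at least `coresPerExecutor` headroom, until the pool runs dry.
object Spark8881Sketch {
  def allocate(workerCores: Array[Int], coresToAssign0: Int,
               coresPerExecutor: Int, step: Int): Array[Int] = {
    val assigned = Array.fill(workerCores.length)(0)
    var coresToAssign = coresToAssign0
    var progress = true
    while (coresToAssign >= step && progress) {
      progress = false
      for (pos <- workerCores.indices
           if coresToAssign >= step &&
              workerCores(pos) - assigned(pos) >= coresPerExecutor) {
        assigned(pos) += step
        coresToAssign -= step
        progress = true
      }
    }
    assigned
  }

  def main(args: Array[String]): Unit = {
    val workers = Array(16, 16, 16, 16)
    // Allocate coresPerExecutor (16) at a time: three workers each get a full executor.
    println(allocate(workers, 48, 16, 16).mkString(" ")) // 16 16 16 0
    // Allocate 1 core at a time: 48 cores spread as 12 per worker, and since
    // 12 < 16 no worker ever accumulates enough cores for a single executor.
    println(allocate(workers, 48, 1, 1).mkString(" "))   // 12 12 12 12
  }
}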