YarnClusterSchedulerBackend
Mainly responsible for communicating with executors; it runs in the driver's user thread. Tasks from TaskSchedulerImpl are dispatched to executors through this class and its parent classes.
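As a minimal sketch of that startup chain (toy classes under assumed names, not Spark's real signatures): SparkContext starts the TaskSchedulerImpl, whose start() calls the backend's start(), which is how the override below is reached.
object StartupChainSketch extends App {
  trait SchedulerBackendSketch { def start(): Unit }
  class FakeYarnBackend extends SchedulerBackendSketch {
    def start(): Unit = println("backend.start(): bind to YARN, register endpoints")
  }
  class FakeTaskScheduler(backend: SchedulerBackendSketch) {
    def start(): Unit = backend.start() // TaskSchedulerImpl.start() delegates to the backend
  }
  new FakeTaskScheduler(new FakeYarnBackend).start()
}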
private[spark] class YarnClusterSchedulerBackend(
scheduler: TaskSchedulerImpl, // holds the TaskSchedulerImpl instance
sc: SparkContext)
extends YarnSchedulerBackend(scheduler, sc) {
// start() is invoked from TaskSchedulerImpl.start()
override def start() {
val attemptId = ApplicationMaster.getAttemptId // fetch the YARN attempt ID
bindToYarn(attemptId.getApplicationId(), Some(attemptId))
super.start()
totalExpectedExecutors = SchedulerBackendUtils.getInitialTargetExecutorNumber(sc.conf) // number of executors from spark-submit --num-executors; defaults to 2 if unset
}
// URLs of the driver's logs on the YARN NodeManager
override def getDriverLogUrls: Option[Map[String, String]] = {
var driverLogs: Option[Map[String, String]] = None
try {
val yarnConf = new YarnConfiguration(sc.hadoopConfiguration)
val containerId = YarnSparkHadoopUtil.getContainerId
val httpAddress = System.getenv(Environment.NM_HOST.name()) +
":" + System.getenv(Environment.NM_HTTP_PORT.name())
// lookup appropriate http scheme for container log urls
val yarnHttpPolicy = yarnConf.get(
YarnConfiguration.YARN_HTTP_POLICY_KEY,
YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
)
val user = Utils.getCurrentUserName()
val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
logDebug(s"Base URL for logs: $baseUrl")
driverLogs = Some(Map(
"stdout" -> s"$baseUrl/stdout?start=-4096",
"stderr" -> s"$baseUrl/stderr?start=-4096"))
} catch {
case e: Exception =>
logInfo("Error while building AM log links, so AM" +
" logs link will not appear in application UI", e)
}
driverLogs
}
}
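For illustration, a self-contained sketch of the URL shape getDriverLogUrls produces; the host, port, container ID, and user below are made-up values:
object DriverLogUrlSketch extends App {
  val httpScheme  = "http://"                                // yarn.http.policy != HTTPS_ONLY
  val httpAddress = "nm-host.example.com:8042"               // NM_HOST + ":" + NM_HTTP_PORT
  val containerId = "container_1555555555555_0001_01_000001" // YarnSparkHadoopUtil.getContainerId
  val user        = "spark"                                  // Utils.getCurrentUserName()
  val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
  println(s"$baseUrl/stdout?start=-4096") // last 4 KB of driver stdout
  println(s"$baseUrl/stderr?start=-4096") // last 4 KB of driver stderr
}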
YarnSchedulerBackend
/**
 * Note:
 * The subclasses of this class are YarnClusterSchedulerBackend and YarnClientSchedulerBackend,
 * so constructing either of them also initializes YarnSchedulerBackend.
 * The driver registers the yarnSchedulerEndpointRef endpoint here.
 * @param scheduler
 * @param sc
 */
private[spark] abstract class YarnSchedulerBackend(
scheduler: TaskSchedulerImpl,
sc: SparkContext)
extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) {
private val stopped = new AtomicBoolean(false)
override val minRegisteredRatio = // minimum ratio of executors that must register with the driver; defaults to 0.8 here
if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) {
0.8
} else {
super.minRegisteredRatio
}
protected var totalExpectedExecutors = 0 // number of executors from spark-submit --num-executors; defaults to 2 if unset
private val yarnSchedulerEndpoint = new YarnSchedulerEndpoint(rpcEnv) // mediates between the driver's user thread and the ApplicationMaster (Spark AM) thread
// the driver registers the YarnSchedulerBackend endpoint
private val yarnSchedulerEndpointRef = rpcEnv.setupEndpoint(
YarnSchedulerBackend.ENDPOINT_NAME, yarnSchedulerEndpoint)
private implicit val askTimeout = RpcUtils.askRpcTimeout(sc.conf)
/** Application ID. */
protected var appId: Option[ApplicationId] = None
/** Attempt ID. This is unset for client-mode schedulers */
private var attemptId: Option[ApplicationAttemptId] = None
/** Scheduler extension services. */
private val services: SchedulerExtensionServices = new SchedulerExtensionServices() // SchedulerExtensionService hooks; users can plug in custom services
/**
* Bind to YARN. This *must* be done before calling [[start()]].
*
* @param appId YARN application ID
* @param attemptId Optional YARN attempt ID
*/
protected def bindToYarn(appId: ApplicationId, attemptId: Option[ApplicationAttemptId]): Unit = {
this.appId = Some(appId)
this.attemptId = attemptId
}
override def start() {
require(appId.isDefined, "application ID unset")
val binding = SchedulerExtensionServiceBinding(sc, appId.get, attemptId)
services.start(binding) // start the user-defined SchedulerExtensionServices
super.start()
}
override def stop(): Unit = {
try {
// SPARK-12009: To prevent Yarn allocator from requesting backup for the executors which
// were stopped by SchedulerBackend.
requestTotalExecutors(0, 0, Map.empty)
super.stop()
} finally {
stopped.set(true)
services.stop()
}
}
/**
* Get the attempt ID for this run, if the cluster manager supports multiple
* attempts. Applications run in client mode will not have attempt IDs.
* This attempt ID only includes attempt counter, like "1", "2".
*
* @return The application attempt id, if available.
*/
override def applicationAttemptId(): Option[String] = {
attemptId.map(_.getAttemptId.toString)
}
/**
* Get an application ID associated with the job.
* This returns the string value of [[appId]] if set, otherwise
* the locally-generated ID from the superclass.
* @return The application ID
*/
override def applicationId(): String = {
appId.map(_.toString).getOrElse {
logWarning("Application ID is not initialized yet.")
super.applicationId
}
}
// builds a request for a target number of executors; used by dynamic resource allocation
private[cluster] def prepareRequestExecutors(requestedTotal: Int): RequestExecutors = {
val nodeBlacklist: Set[String] = scheduler.nodeBlacklist() // Spark's node blacklist mechanism
// For locality preferences, ignore preferences for nodes that are blacklisted
val filteredHostToLocalTaskCount =
hostToLocalTaskCount.filter { case (k, v) => !nodeBlacklist.contains(k) }
RequestExecutors(requestedTotal, localityAwareTasks, filteredHostToLocalTaskCount,
nodeBlacklist)
}
/**
* Request executors from the ApplicationMaster by specifying the total number desired.
* This includes executors already pending or running.
*/
// asks the AM, via the YarnSchedulerEndpoint, to request new executor containers from YARN
override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = {
yarnSchedulerEndpointRef.ask[Boolean](prepareRequestExecutors(requestedTotal))
}
/**
* Request that the ApplicationMaster kill the specified executors.
*/
// asks YARN (via the AM) to kill the given executors
override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = {
yarnSchedulerEndpointRef.ask[Boolean](KillExecutors(executorIds))
}
override def sufficientResourcesRegistered(): Boolean = {
//totalRegisteredExecutors = the number of executors currently registered with the driver
totalRegisteredExecutors.get() >= totalExpectedExecutors * minRegisteredRatio // i.e. spark-submit --num-executors * 0.8
}
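// Worked example (assuming defaults): with spark-submit --num-executors 10 and
// minRegisteredRatio = 0.8, this returns true once totalRegisteredExecutors >= 10 * 0.8 = 8.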
/**
* Add filters to the SparkUI.
*/
private def addWebUIFilter(
filterName: String,
filterParams: Map[String, String],
proxyBase: String): Unit = {
if (proxyBase != null && proxyBase.nonEmpty) {
System.setProperty("spark.ui.proxyBase", proxyBase)
}
val hasFilter =
filterName != null && filterName.nonEmpty &&
filterParams != null && filterParams.nonEmpty
if (hasFilter) {
logInfo(s"Add WebUI Filter. $filterName, $filterParams, $proxyBase")
conf.set("spark.ui.filters", filterName)
filterParams.foreach { case (k, v) => conf.set(s"spark.$filterName.param.$k", v) }
scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf) }
}
}
// overridden to create a YarnDriverEndpoint instead of the base DriverEndpoint
override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
new YarnDriverEndpoint(rpcEnv, properties)
}
/**
* Reset the state of SchedulerBackend to the initial state. This happens when the AM fails
* and re-registers itself with the driver after a failure. The stale state in the driver should be
* cleaned.
*/
override protected def reset(): Unit = {
super.reset()
sc.executorAllocationManager.foreach(_.reset())
}
/**
* Override the DriverEndpoint to add extra logic for the case when an executor is disconnected.
* This endpoint communicates with the executors and queries the AM for an executor's exit
* status when the executor is disconnected.
*/
// subclasses the inner DriverEndpoint of the parent class CoarseGrainedSchedulerBackend
private class YarnDriverEndpoint(rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
extends DriverEndpoint(rpcEnv, sparkProperties) {
/**
* When onDisconnected is received at the driver endpoint, the superclass DriverEndpoint
* handles it by assuming the Executor was lost for a bad reason and removes the executor
* immediately.
*
* In YARN's case however it is crucial to talk to the application master and ask why the
* executor had exited. If the executor exited for some reason unrelated to the running tasks
* (e.g., preemption), according to the application master, then we pass that information down
* to the TaskSetManager to inform the TaskSetManager that tasks on that lost executor should
* not count towards a job failure.
*/
override def onDisconnected(rpcAddress: RpcAddress): Unit = {
addressToExecutorId.get(rpcAddress).foreach { executorId =>
if (!stopped.get) {
if (disableExecutor(executorId)) {
yarnSchedulerEndpoint.handleExecutorDisconnectedFromDriver(executorId, rpcAddress)
}
}
}
}
}
/**
* An [[RpcEndpoint]] that communicates with the ApplicationMaster.
*/
// mediates between the driver's user thread and the ApplicationMaster (Spark AM) thread
private class YarnSchedulerEndpoint(override val rpcEnv: RpcEnv)
extends ThreadSafeRpcEndpoint with Logging {
private var amEndpoint: Option[RpcEndpointRef] = None
private[YarnSchedulerBackend] def handleExecutorDisconnectedFromDriver(
executorId: String,
executorRpcAddress: RpcAddress): Unit = {
// ask the AM why the executor was lost
val removeExecutorMessage = amEndpoint match {
case Some(am) =>
val lossReasonRequest = GetExecutorLossReason(executorId)
am.ask[ExecutorLossReason](lossReasonRequest, askTimeout)
.map { reason => RemoveExecutor(executorId, reason) }(ThreadUtils.sameThread)
.recover {
case NonFatal(e) =>
logWarning(s"Attempted to get executor loss reason" +
s" for executor id ${executorId} at RPC address ${executorRpcAddress}," +
s" but got no response. Marking as slave lost.", e)
RemoveExecutor(executorId, SlaveLost())
}(ThreadUtils.sameThread)
case None =>
logWarning("Attempted to check for an executor loss reason" +
" before the AM has registered!")
Future.successful(RemoveExecutor(executorId, SlaveLost("AM is not yet registered.")))
}
removeExecutorMessage.foreach { message => driverEndpoint.send(message) }
}
override def receive: PartialFunction[Any, Unit] = {
case RegisterClusterManager(am) => // sent from ApplicationMaster's AMEndpoint.onStart(), which calls driver.send(RegisterClusterManager(self))
logInfo(s"ApplicationMaster registered as $am")
amEndpoint = Option(am) // hold a reference to the AM's AMEndpoint from here on
reset()
case AddWebUIFilter(filterName, filterParams, proxyBase) =>
addWebUIFilter(filterName, filterParams, proxyBase)
case r @ RemoveExecutor(executorId, reason) =>
if (!stopped.get) {
logWarning(s"Requesting driver to remove executor $executorId for reason $reason")
driverEndpoint.send(r)
}
}
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
case r: RequestExecutors => // container request destined for YARN; handled by the AM-side AMEndpoint. Sent by doRequestTotalExecutors in this class.
amEndpoint match {
case Some(am) =>
am.ask[Boolean](r).andThen {
case Success(b) => context.reply(b)
case Failure(NonFatal(e)) =>
logError(s"Sending $r to AM was unsuccessful", e)
context.sendFailure(e)
}(ThreadUtils.sameThread)
case None =>
logWarning("Attempted to request executors before the AM has registered!")
context.reply(false)
}
case k: KillExecutors =>
amEndpoint match {
case Some(am) =>
am.ask[Boolean](k).andThen { // forward the KillExecutors message to the AM; handled by the AM-side AMEndpoint
case Success(b) => context.reply(b)
case Failure(NonFatal(e)) =>
logError(s"Sending $k to AM was unsuccessful", e)
context.sendFailure(e)
}(ThreadUtils.sameThread)
case None =>
logWarning("Attempted to kill executors before the AM has registered!")
context.reply(false)
}
// answers YarnAllocator's query for the last allocated executor ID
case RetrieveLastAllocatedExecutorId =>
context.reply(currentExecutorIdCounter) // currentExecutorIdCounter starts at 0
}
override def onDisconnected(remoteAddress: RpcAddress): Unit = {
if (amEndpoint.exists(_.address == remoteAddress)) {
logWarning(s"ApplicationMaster has disassociated: $remoteAddress")
amEndpoint = None
}
}
}
}
private[spark] object YarnSchedulerBackend {
val ENDPOINT_NAME = "YarnScheduler"
}
CoarseGrainedSchedulerBackend
This class is the parent of YarnSchedulerBackend, which is why YarnSchedulerBackend's start() calls this class's start() method.
It contains an inner DriverEndpoint responsible for the communication between the driver-side backend and the executors: dispatching tasks, receiving task status updates from executors, and registering executors.
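The send (fire-and-forget) versus ask (reply expected) pattern the DriverEndpoint relies on can be sketched in plain Scala. The toy loop below only mimics the shape of the protocol; it is not Spark's RpcEnv, and the names are made up:
object DriverEndpointSketch extends App {
  case class RegisterExecutor(id: String, cores: Int) // ask: the sender awaits a Boolean reply
  case class LaunchTask(taskId: Long)                 // send: fire-and-forget

  var freeCores = Map.empty[String, Int]

  def receiveAndReply(msg: Any): Any = msg match {
    case RegisterExecutor(id, cores) =>
      freeCores += id -> cores
      true // stands in for context.reply(true)
  }

  def receive(msg: Any): Unit = msg match {
    case LaunchTask(taskId) => println(s"executor asked to run task $taskId")
  }

  println(receiveAndReply(RegisterExecutor("1", 4))) // prints: true
  receive(LaunchTask(0L))                            // prints: executor asked to run task 0
}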
private[spark]
class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: RpcEnv)
extends ExecutorAllocationClient with SchedulerBackend with Logging {
// Use an atomic variable to track total number of cores in the cluster for simplicity and speed
protected val totalCoreCount = new AtomicInteger(0) // total core count; updated on executor registration and removal
// Total number of executors that are currently registered
protected val totalRegisteredExecutors = new AtomicInteger(0) // number of registered executors; updated on registration and removal
protected val conf = scheduler.sc.conf
private val maxRpcMessageSize = RpcUtils.maxMessageSizeBytes(conf) // 128 MB by default
private val defaultAskTimeout = RpcUtils.askRpcTimeout(conf) // spark.rpc.askTimeout, 120s by default
// Submit tasks only after (registered resources / total expected resources)
// is equal to at least this value, which is a double between 0 and 1.
private val _minRegisteredRatio =
math.min(1, conf.getDouble("spark.scheduler.minRegisteredResourcesRatio", 0))
// Submit tasks after maxRegisteredWaitingTime milliseconds
// if minRegisteredRatio has not yet been reached
private val maxRegisteredWaitingTimeMs = // maximum time to wait for executors to register
conf.getTimeAsMs("spark.scheduler.maxRegisteredResourcesWaitingTime", "30s")
private val createTime = System.currentTimeMillis()
// Accessing `executorDataMap` in `DriverEndpoint.receive/receiveAndReply` doesn't need any
// protection. But accessing `executorDataMap` out of `DriverEndpoint.receive/receiveAndReply`
// must be protected by `CoarseGrainedSchedulerBackend.this`. Besides, `executorDataMap` should
// only be modified in `DriverEndpoint.receive/receiveAndReply` with protection by
// `CoarseGrainedSchedulerBackend.this`.
// maps the executor IDs already registered with the driver to their ExecutorData
private val executorDataMap = new HashMap[String, ExecutorData] // ExecutorData holds the RPC ref for talking to that executor
// Number of executors requested by the cluster manager, [[ExecutorAllocationManager]]
@GuardedBy("CoarseGrainedSchedulerBackend.this")
private var requestedTotalExecutors = 0
// Number of executors requested from the cluster manager that have not registered yet
@GuardedBy("CoarseGrainedSchedulerBackend.this")
private var numPendingExecutors = 0
private val listenerBus = scheduler.sc.listenerBus
// Executors we have requested the cluster manager to kill that have not died yet; maps
// the executor ID to whether it was explicitly killed by the driver (and thus shouldn't
// be considered an app-related failure).
@GuardedBy("CoarseGrainedSchedulerBackend.this")
private val executorsPendingToRemove = new HashMap[String, Boolean]
// A map to store hostname with its possible task number running on it
@GuardedBy("CoarseGrainedSchedulerBackend.this")
protected var hostToLocalTaskCount: Map[String, Int] = Map.empty
// The number of pending tasks which is locality required
@GuardedBy("CoarseGrainedSchedulerBackend.this")
protected var localityAwareTasks = 0
// The num of current max ExecutorId used to re-register appMaster
@volatile protected var currentExecutorIdCounter = 0
private val reviveThread = // single-threaded scheduler that periodically sends ReviveOffers to this backend's DriverEndpoint
ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-revive-thread")
// the driver's RPC endpoint
class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
extends ThreadSafeRpcEndpoint with Logging {
// Executors that have been lost, but for which we don't yet know the real exit reason.
protected val executorsPendingLossReason = new HashSet[String]
// maps each registered executor's RpcAddress to its executor ID
protected val addressToExecutorId = new HashMap[RpcAddress, String]
override def onStart() {
// Periodically revive offers to allow delay scheduling to work
val reviveIntervalMs = conf.getTimeAsMs("spark.scheduler.revive.interval", "1s")
// periodically send ReviveOffers to self; handled by case ReviveOffers in receive below
reviveThread.scheduleAtFixedRate(new Runnable {
override def run(): Unit = Utils.tryLogNonFatalError {
Option(self).foreach(_.send(ReviveOffers))
}
}, 0, reviveIntervalMs, TimeUnit.MILLISECONDS)
}
// one-way messages (no reply expected)
override def receive: PartialFunction[Any, Unit] = {
case StatusUpdate(executorId, taskId, state, data) => // an executor reports a task state change (e.g. RUNNING, FINISHED)
scheduler.statusUpdate(taskId, state, data.value) // update the task state on the driver via TaskSchedulerImpl.statusUpdate
if (TaskState.isFinished(state)) {
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.freeCores += scheduler.CPUS_PER_TASK
makeOffers(executorId)
case None =>
// Ignoring the update since we don't know about the executor.
logWarning(s"Ignored task status update ($taskId state $state) " +
s"from unknown executor with ID $executorId")
}
}
case ReviveOffers => // handles the periodic ReviveOffers sent by reviveThread
makeOffers() // i.e. periodically check for runnable tasks and launch them
// the driver kills a given taskId on a given executorId; this responds to the
// KillTask message sent by the killTask method further down in this class
case KillTask(taskId, executorId, interruptThread, reason) =>
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.executorEndpoint.send(
KillTask(taskId, executorId, interruptThread, reason))
case None =>
// Ignoring the task kill since the executor is not registered.
logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.")
}
case KillExecutorsOnHost(host) => // request to kill all executors on a given host
scheduler.getExecutorsAliveOnHost(host).foreach { exec =>
killExecutors(exec.toSeq, adjustTargetNumExecutors = false, countFailures = false,
force = true) // killExecutors calls doKillExecutors, which YarnSchedulerBackend overrides
// to forward a KillExecutors message to the AM's AMEndpoint, asking YARN to kill those containers
}
case UpdateDelegationTokens(newDelegationTokens) =>
executorDataMap.values.foreach { ed =>
ed.executorEndpoint.send(UpdateDelegationTokens(newDelegationTokens))
}
case RemoveExecutor(executorId, reason) =>
// We will remove the executor's state and cannot restore it. However, the connection
// between the driver and the executor may be still alive so that the executor won't exit
// automatically, so try to tell the executor to stop itself. See SPARK-13519.
executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor))
removeExecutor(executorId, reason)
}
// messages that require a reply
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
/**
 * Handles the RegisterExecutor message sent from the executor side
 * (CoarseGrainedExecutorBackend's onStart method); note how executorRef
 * (a NettyRpcEndpointRef) gets deserialized on the driver.
 */
case RegisterExecutor(executorId, executorRef, hostname, cores, logUrls) =>
if (executorDataMap.contains(executorId)) { // check whether this executor has already registered
// executorRef can already talk to the executor: during deserialization its NettyRpcEnv
// was swapped for the driver-side one, so the ref is directly usable on the driver
// send a registration-failure message to this executor; handled by case RegisterExecutorFailed
// in CoarseGrainedExecutorBackend.receive (fire-and-forget)
executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
// finally reply to the executor's ask, since this handler is receiveAndReply
context.reply(true)
} else if (scheduler.nodeBlacklist != null &&
scheduler.nodeBlacklist.contains(hostname)) { // node blacklist check
// If the cluster manager gives us an executor on a blacklisted node (because it
// already started allocating those resources before we informed it of our blacklist,
// or if it ignored our blacklist), then we reject that executor immediately.
logInfo(s"Rejecting $executorId as it has been blacklisted.")
executorRef.send(RegisterExecutorFailed(s"Executor is blacklisted: $executorId"))
context.reply(true)
} else {
// If the executor's rpc env is not listening for incoming connections, `hostPort`
// will be null, and the client connection should be used to contact the executor.
val executorAddress = if (executorRef.address != null) {
executorRef.address
} else {
context.senderAddress
}
logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId")
addressToExecutorId(executorAddress) = executorId // record the executor's RpcAddress -> ID mapping
totalCoreCount.addAndGet(cores) // update the total core count
totalRegisteredExecutors.addAndGet(1) // update the registered executor count
val data = new ExecutorData(executorRef, executorAddress, hostname,
cores, cores, logUrls) // build the ExecutorData for this executor
// This must be synchronized because variables mutated
// in this block are read when requesting executors
CoarseGrainedSchedulerBackend.this.synchronized {
executorDataMap.put(executorId, data)
if (currentExecutorIdCounter < executorId.toInt) {
currentExecutorIdCounter = executorId.toInt
}
if (numPendingExecutors > 0) {
numPendingExecutors -= 1
logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
}
}
executorRef.send(RegisteredExecutor) // one-way registration-success message; handled in CoarseGrainedExecutorBackend.receive
// Note: some tests expect the reply to come after we put the executor in the map
// finally reply to the executor's ask, since this handler is receiveAndReply
context.reply(true)
listenerBus.post(
SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data))
makeOffers()
}
case StopDriver =>
context.reply(true)
stop()
case StopExecutors =>
logInfo("Asking each executor to shut down")
for ((_, executorData) <- executorDataMap) {
executorData.executorEndpoint.send(StopExecutor) // tell each executor to stop
}
context.reply(true)
case RemoveWorker(workerId, host, message) =>
removeWorker(workerId, host, message)
context.reply(true)
// answers the SparkAppConfig fetch in object CoarseGrainedExecutorBackend's run method at executor startup
case RetrieveSparkAppConfig =>
val reply: SparkAppConfig = SparkAppConfig(
sparkProperties,
SparkEnv.get.securityManager.getIOEncryptionKey(),
fetchHadoopDelegationTokens())
context.reply(reply)
}
// Make fake resource offers on all executors
// collects the free cores of all live executors; this is the entry point for task dispatch
private def makeOffers() {
// Make sure no executor is killed while some task is launching on it
val taskDescs: Seq[Seq[TaskDescription]] = withLock {
// Filter out executors under killing
val activeExecutors: collection.Map[String, ExecutorData] = executorDataMap.filterKeys(executorIsAlive) // keep only live executors
val workOffers = activeExecutors.map { // build WorkerOffers from the live executors' free cores
case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toIndexedSeq
scheduler.resourceOffers(workOffers) // hand the offers to TaskSchedulerImpl.resourceOffers, which returns the TaskDescriptions to run
}
if (!taskDescs.isEmpty) { // if any tasks were scheduled, launch them
launchTasks(taskDescs) // tell the target executors to run these TaskDescriptions
}
}
override def onDisconnected(remoteAddress: RpcAddress): Unit = {
addressToExecutorId
.get(remoteAddress)
.foreach(removeExecutor(_, SlaveLost("Remote RPC client disassociated. Likely due to " +
"containers exceeding thresholds, or network issues. Check driver logs for WARN " +
"messages.")))
}
// Make fake resource offers on just one executor
private def makeOffers(executorId: String) {
// Make sure no executor is killed while some task is launching on it
val taskDescs = withLock {
// Filter out executors under killing
if (executorIsAlive(executorId)) {
val executorData = executorDataMap(executorId)
val workOffers = IndexedSeq(
new WorkerOffer(executorId, executorData.executorHost, executorData.freeCores))
scheduler.resourceOffers(workOffers)
} else {
Seq.empty
}
}
if (!taskDescs.isEmpty) {
launchTasks(taskDescs)
}
}
private def executorIsAlive(executorId: String): Boolean = synchronized {
!executorsPendingToRemove.contains(executorId) &&
!executorsPendingLossReason.contains(executorId)
}
// Launch tasks returned by a set of resource offers
// tell the target executors to run these TaskDescriptions
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) { // flatten the Seq[Seq[TaskDescription]]
val serializedTask = TaskDescription.encode(task) // serialize the TaskDescription
if (serializedTask.limit() >= maxRpcMessageSize) { // if it exceeds the RPC message size limit, abort the task set
Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.rpc.message.maxSize (%d bytes). Consider increasing " +
"spark.rpc.message.maxSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
taskSetMgr.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
val executorData = executorDataMap(task.executorId) // look up the target executor's ExecutorData
executorData.freeCores -= scheduler.CPUS_PER_TASK // decrement the target executor's free cores
logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
s"${executorData.executorHost}.")
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask))) // tell this executor to run the task via a LaunchTask message
}
}
}
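// Worked example: with the default spark.rpc.message.maxSize = 128 (MB), a task whose
// encoded TaskDescription reaches 128 * 1024 * 1024 bytes aborts its TaskSetManager;
// anything smaller is shipped to the executor inside a LaunchTask message.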
// Remove a disconnected slave from the cluster
private def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
logDebug(s"Asked to remove executor $executorId with reason $reason")
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
// This must be synchronized because variables mutated
// in this block are read when requesting executors
val killed = CoarseGrainedSchedulerBackend.this.synchronized {
addressToExecutorId -= executorInfo.executorAddress
executorDataMap -= executorId
executorsPendingLossReason -= executorId
executorsPendingToRemove.remove(executorId).getOrElse(false)
}
totalCoreCount.addAndGet(-executorInfo.totalCores)
totalRegisteredExecutors.addAndGet(-1)
scheduler.executorLost(executorId, if (killed) ExecutorKilled else reason)
listenerBus.post(
SparkListenerExecutorRemoved(System.currentTimeMillis(), executorId, reason.toString))
case None =>
// SPARK-15262: If an executor is still alive even after the scheduler has removed
// its metadata, we may receive a heartbeat from that executor and tell its block
// manager to reregister itself. If that happens, the block manager master will know
// about the executor, but the scheduler will not. Therefore, we should remove the
// executor from the block manager when we hit this case.
scheduler.sc.env.blockManager.master.removeExecutorAsync(executorId)
logInfo(s"Asked to remove non-existent executor $executorId")
}
}
// Remove a lost worker from the cluster
private def removeWorker(workerId: String, host: String, message: String): Unit = {
logDebug(s"Asked to remove worker $workerId with reason $message")
scheduler.workerRemoved(workerId, host, message)
}
/**
* Stop making resource offers for the given executor. The executor is marked as lost with
* the loss reason still pending.
*
* @return Whether executor should be disabled
*/
protected def disableExecutor(executorId: String): Boolean = {
val shouldDisable = CoarseGrainedSchedulerBackend.this.synchronized {
if (executorIsAlive(executorId)) {
executorsPendingLossReason += executorId
true
} else {
// Returns true for explicitly killed executors, we also need to get pending loss reasons;
// For others return false.
executorsPendingToRemove.contains(executorId)
}
}
if (shouldDisable) {
logInfo(s"Disabling executor $executorId.")
scheduler.executorLost(executorId, LossReasonPending)
}
shouldDisable
}
}
var driverEndpoint: RpcEndpointRef = null // the endpoint registered under the name "CoarseGrainedScheduler"
protected def minRegisteredRatio: Double = _minRegisteredRatio // defaults to 0
override def start() { // ultimately invoked from SparkContext via _taskScheduler.start()
val properties = new ArrayBuffer[(String, String)]
for ((key, value) <- scheduler.sc.conf.getAll) {
if (key.startsWith("spark.")) {
properties += ((key, value))
}
}
// TODO (prashant) send conf instead of properties
// the driver registers the DriverEndpoint under the name "CoarseGrainedScheduler"
driverEndpoint = createDriverEndpointRef(properties)
}
// the driver registers the DriverEndpoint under the name "CoarseGrainedScheduler"
protected def createDriverEndpointRef(
properties: ArrayBuffer[(String, String)]): RpcEndpointRef = {
//ENDPOINT_NAME = CoarseGrainedScheduler
// executors look up this ref in object CoarseGrainedExecutorBackend's run method
rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
}
// instantiate the inner DriverEndpoint class
protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
new DriverEndpoint(rpcEnv, properties)
}
// stop all executors via the driver's registered driverEndpoint (DriverEndpoint)
def stopExecutors() {
try {
if (driverEndpoint != null) { // send StopExecutors to the driverEndpoint
logInfo("Shutting down all executors")
driverEndpoint.askSync[Boolean](StopExecutors) // handled in the inner DriverEndpoint's receiveAndReply
}
} catch {
case e: Exception =>
throw new SparkException("Error asking standalone scheduler to shut down executors", e)
}
}
override def stop() {
reviveThread.shutdownNow()
stopExecutors()
try {
if (driverEndpoint != null) {
driverEndpoint.askSync[Boolean](StopDriver)
}
} catch {
case e: Exception =>
throw new SparkException("Error stopping standalone scheduler's driver endpoint", e)
}
}
/**
* Reset the state of CoarseGrainedSchedulerBackend to the initial state. Currently it will only
* be called in the yarn-client mode when AM re-registers after a failure.
* */
protected def reset(): Unit = {
val executors: Set[String] = synchronized {
requestedTotalExecutors = 0
numPendingExecutors = 0
executorsPendingToRemove.clear()
executorDataMap.keys.toSet
}
// Remove all the lingering executors that should be removed but not yet. The reason might be
// because (1) disconnected event is not yet received; (2) executors die silently.
executors.foreach { eid =>
removeExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered."))
}
}
// called by TaskScheduler's periodic job when it finds speculatable tasks to run,
// and also after TaskScheduler submits tasks
override def reviveOffers() {
driverEndpoint.send(ReviveOffers) // the DriverEndpoint then resolves the TaskDescriptions and tells executors to run them
}
// the driver sends a KillTask message, handled by the DriverEndpoint's receive
// called from TaskSchedulerImpl's cancelTasks and killTaskAttempt methods to kill a task on an executor
override def killTask(
taskId: Long, executorId: String, interruptThread: Boolean, reason: String) {
driverEndpoint.send(KillTask(taskId, executorId, interruptThread, reason))
}
override def defaultParallelism(): Int = {
conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2))
}
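// Worked example: with spark.default.parallelism unset and 3 registered executors of
// 4 cores each, totalCoreCount = 12, so defaultParallelism = max(12, 2) = 12.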
/**
* Called by subclasses when notified of a lost worker. It just fires the message and returns
* at once.
*/
// used by the reset method in this class
protected def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
driverEndpoint.send(RemoveExecutor(executorId, reason))
}
protected def removeWorker(workerId: String, host: String, message: String): Unit = {
driverEndpoint.ask[Boolean](RemoveWorker(workerId, host, message)).failed.foreach(t =>
logError(t.getMessage, t))(ThreadUtils.sameThread)
}
def sufficientResourcesRegistered(): Boolean = true
// ready once the number of registered executors reaches spark-submit --num-executors * 0.8,
// or once maxRegisteredWaitingTimeMs has elapsed; used by TaskSchedulerImpl to decide whether this backend is ready
override def isReady(): Boolean = {
if (sufficientResourcesRegistered) { // overridden in the subclass YarnSchedulerBackend: checks whether registered executors have reached --num-executors * 0.8
logInfo("SchedulerBackend is ready for scheduling beginning after " +
s"reached minRegisteredResourcesRatio: $minRegisteredRatio")
return true
}
if ((System.currentTimeMillis() - createTime) >= maxRegisteredWaitingTimeMs) {
logInfo("SchedulerBackend is ready for scheduling beginning after waiting " +
s"maxRegisteredResourcesWaitingTime: $maxRegisteredWaitingTimeMs(ms)")
return true
}
false
}
/**
* Return the number of executors currently registered with this backend.
*/
private def numExistingExecutors: Int = executorDataMap.size
override def getExecutorIds(): Seq[String] = {
executorDataMap.keySet.toSeq
}
/**
* Request an additional number of executors from the cluster manager.
* @return whether the request is acknowledged.
*/
// request additional executors; used by SparkContext
final override def requestExecutors(numAdditionalExecutors: Int): Boolean = {
if (numAdditionalExecutors < 0) {
throw new IllegalArgumentException(
"Attempted to request a negative number of additional executor(s) " +
s"$numAdditionalExecutors from the cluster manager. Please specify a positive number!")
}
logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager")
val response = synchronized {
requestedTotalExecutors += numAdditionalExecutors
numPendingExecutors += numAdditionalExecutors
logDebug(s"Number of pending executors is now $numPendingExecutors")
if (requestedTotalExecutors !=
(numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) {
logDebug(
s"""requestExecutors($numAdditionalExecutors): Executor request doesn't match:
|requestedTotalExecutors = $requestedTotalExecutors
|numExistingExecutors = $numExistingExecutors
|numPendingExecutors = $numPendingExecutors
|executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin)
}
// Account for executors pending to be added or removed
doRequestTotalExecutors(requestedTotalExecutors)
}
defaultAskTimeout.awaitResult(response)
}
/**
* Update the cluster manager on our scheduling needs. Three bits of information are included
* to help it make decisions.
* @param numExecutors The total number of executors we'd like to have. The cluster manager
* shouldn't kill any running executor to reach this number, but,
* if all existing executors were to die, this is the number of executors
* we'd want to be allocated.
* @param localityAwareTasks The number of tasks in all active stages that have a locality
* preferences. This includes running, pending, and completed tasks.
* @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages
* that would like to run on that host.
* This includes running, pending, and completed tasks.
* @return whether the request is acknowledged by the cluster manager.
*/
// request a total number of executors from YARN
final override def requestTotalExecutors(
numExecutors: Int,
localityAwareTasks: Int,
hostToLocalTaskCount: Map[String, Int]
): Boolean = {
if (numExecutors < 0) {
throw new IllegalArgumentException(
"Attempted to request a negative number of executor(s) " +
s"$numExecutors from the cluster manager. Please specify a positive number!")
}
val response = synchronized {
this.requestedTotalExecutors = numExecutors
this.localityAwareTasks = localityAwareTasks
this.hostToLocalTaskCount = hostToLocalTaskCount
numPendingExecutors =
math.max(numExecutors - numExistingExecutors + executorsPendingToRemove.size, 0)
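// Worked example: asking for numExecutors = 10 with 6 registered executors and 1 executor
// pending removal gives numPendingExecutors = max(10 - 6 + 1, 0) = 5.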
doRequestTotalExecutors(numExecutors)
}
defaultAskTimeout.awaitResult(response)
}
/**
* Request executors from the cluster manager by specifying the total number desired,
* including existing pending and running executors.
*
* The semantics here guarantee that we do not over-allocate executors for this application,
* since a later request overrides the value of any prior request. The alternative interface
* of requesting a delta of executors risks double counting new executors when there are
* insufficient resources to satisfy the first request. We make the assumption here that the
* cluster manager will eventually fulfill all requests when resources free up.
*
* @return a future whose evaluation indicates whether the request is acknowledged.
*/
protected def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] =
Future.successful(false)
/**
* Request that the cluster manager kill the specified executors.
*
* @param executorIds identifiers of executors to kill
* @param adjustTargetNumExecutors whether the target number of executors be adjusted down
* after these executors have been killed
* @param countFailures if there are tasks running on the executors when they are killed, whether
* those failures be counted to task failure limits?
* @param force whether to force kill busy executors, default false
* @return the ids of the executors acknowledged by the cluster manager to be removed.
*/
final override def killExecutors(
executorIds: Seq[String],
adjustTargetNumExecutors: Boolean,
countFailures: Boolean,
force: Boolean): Seq[String] = {
logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}")
val response = withLock {
val (knownExecutors, unknownExecutors) = executorIds.partition(executorDataMap.contains)
unknownExecutors.foreach { id =>
logWarning(s"Executor to kill $id does not exist!")
}
// If an executor is already pending to be removed, do not kill it again (SPARK-9795)
// If this executor is busy, do not kill it unless we are told to force kill it (SPARK-9552)
val executorsToKill = knownExecutors
.filter { id => !executorsPendingToRemove.contains(id) }
.filter { id => force || !scheduler.isExecutorBusy(id) }
executorsToKill.foreach { id => executorsPendingToRemove(id) = !countFailures }
logInfo(s"Actual list of executor(s) to be killed is ${executorsToKill.mkString(", ")}")
// If we do not wish to replace the executors we kill, sync the target number of executors
// with the cluster manager to avoid allocating new ones. When computing the new target,
// take into account executors that are pending to be added or removed.
val adjustTotalExecutors =
if (adjustTargetNumExecutors) {
requestedTotalExecutors = math.max(requestedTotalExecutors - executorsToKill.size, 0)
if (requestedTotalExecutors !=
(numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) {
logDebug(
s"""killExecutors($executorIds, $adjustTargetNumExecutors, $countFailures, $force):
|Executor counts do not match:
|requestedTotalExecutors = $requestedTotalExecutors
|numExistingExecutors = $numExistingExecutors
|numPendingExecutors = $numPendingExecutors
|executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin)
}
doRequestTotalExecutors(requestedTotalExecutors)
} else {
numPendingExecutors += knownExecutors.size
Future.successful(true)
}
val killExecutors: Boolean => Future[Boolean] =
if (!executorsToKill.isEmpty) {
_ => doKillExecutors(executorsToKill)
} else {
_ => Future.successful(false)
}
val killResponse = adjustTotalExecutors.flatMap(killExecutors)(ThreadUtils.sameThread)
killResponse.flatMap(killSuccessful =>
Future.successful (if (killSuccessful) executorsToKill else Seq.empty[String])
)(ThreadUtils.sameThread)
}
defaultAskTimeout.awaitResult(response)
}
/**
* Kill the given list of executors through the cluster manager.
* @return whether the kill request is acknowledged.
*/
protected def doKillExecutors(executorIds: Seq[String]): Future[Boolean] =
Future.successful(false)
/**
* Request that the cluster manager kill all executors on a given host.
* @return whether the kill request is acknowledged.
*/
final override def killExecutorsOnHost(host: String): Boolean = {
logInfo(s"Requesting to kill any and all executors on host ${host}")
// A potential race exists if a new executor attempts to register on a host
// that is on the blacklist and is no longer valid. To avoid this race,
// all executor registration and killing happens in the event loop. This way, either
// an executor will fail to register, or will be killed when all executors on a host
// are killed.
// Kill all the executors on this host in an event loop to ensure serialization.
driverEndpoint.send(KillExecutorsOnHost(host))
true
}
protected def fetchHadoopDelegationTokens(): Option[Array[Byte]] = { None }
// SPARK-27112: We need to ensure that there is ordering of lock acquisition
// between TaskSchedulerImpl and CoarseGrainedSchedulerBackend objects in order to fix
// the deadlock issue exposed in SPARK-27112
private def withLock[T](fn: => T): T = scheduler.synchronized {
CoarseGrainedSchedulerBackend.this.synchronized { fn }
}
}
private[spark] object CoarseGrainedSchedulerBackend {
val ENDPOINT_NAME = "CoarseGrainedScheduler"
}
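As a closing reference, a minimal SparkConf sketch collecting the configuration keys this walkthrough touched; the values are illustrative examples, not recommendations, and SchedulerConfSketch is a made-up name:
import org.apache.spark.SparkConf

object SchedulerConfSketch extends App {
  val conf = new SparkConf()
    .set("spark.scheduler.minRegisteredResourcesRatio", "0.8")       // executor-ready threshold
    .set("spark.scheduler.maxRegisteredResourcesWaitingTime", "30s") // or start after this wait
    .set("spark.rpc.message.maxSize", "128")                         // MB; cap on serialized tasks
    .set("spark.default.parallelism", "8")                           // fallback parallelism
  conf.getAll.foreach { case (k, v) => println(s"$k = $v") }
}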