Spark 2.3 Source Code Analysis: CoarseGrainedSchedulerBackend

Overview

CoarseGrainedSchedulerBackend is a SchedulerBackend that waits for coarse-grained executors to connect. Instead of relinquishing executors whenever a task finishes and asking the scheduler for a new executor for each new task, this backend holds onto each executor for the entire duration of the Spark job. Executors can be launched in a variety of ways; the concrete mechanism is implemented by the subclasses of CoarseGrainedSchedulerBackend.

The main messages that CoarseGrainedSchedulerBackend sends to the ExecutorBackend side are the following (a sketch of how the executor side handles them appears after the list):

  • RegisteredExecutor: tells the ExecutorBackend that registration succeeded; upon receiving it, the ExecutorBackend creates an Executor.
  • LaunchTask: tells the Executor to launch a task; the message carries the serialized task, from which the Executor launches the task.
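
For context, here is a simplified sketch of how the executor side reacts to these two messages, based on CoarseGrainedExecutorBackend.receive in Spark 2.3 (some branches and error handling are omitted):

    case RegisteredExecutor =>
      logInfo("Successfully registered with driver")
      // Registration succeeded: create the Executor that will actually run tasks
      executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)

    case LaunchTask(data) =>
      if (executor == null) {
        exitExecutor(1, "Received LaunchTask command but executor was null")
      } else {
        // Decode the serialized TaskDescription carried in the message and launch the task
        val taskDesc = TaskDescription.decode(data.value)
        logInfo("Got assigned task " + taskDesc.taskId)
        executor.launchTask(this, taskDesc)
      }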

Member variables related to executor registration

  // Total number of executors that are currently registered
  protected val totalRegisteredExecutors = new AtomicInteger(0)

  // Accessing `executorDataMap` in `DriverEndpoint.receive/receiveAndReply` doesn't need any
  // protection. But accessing `executorDataMap` out of `DriverEndpoint.receive/receiveAndReply`
  // must be protected by `CoarseGrainedSchedulerBackend.this`. Besides, `executorDataMap` should
  // only be modified in `DriverEndpoint.receive/receiveAndReply` with protection by
  // `CoarseGrainedSchedulerBackend.this`.
  private val executorDataMap = new HashMap[String, ExecutorData]

  // Number of executors requested by the cluster manager, [[ExecutorAllocationManager]]
  @GuardedBy("CoarseGrainedSchedulerBackend.this")
  private var requestedTotalExecutors = 0

  // Number of executors requested from the cluster manager that have not registered yet
  @GuardedBy("CoarseGrainedSchedulerBackend.this")
  private var numPendingExecutors = 0
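
Each value stored in executorDataMap is an ExecutorData instance. For reference, in Spark 2.3 the class looks roughly like this (freeCores is mutable and is decremented whenever a task is launched on that executor):

private[cluster] class ExecutorData(
    val executorEndpoint: RpcEndpointRef,          // used to send messages to this executor
    val executorAddress: RpcAddress,
    override val executorHost: String,
    var freeCores: Int,                            // cores currently available for new tasks
    override val totalCores: Int,
    override val logUrlMap: Map[String, String]
) extends ExecutorInfo(executorHost, totalCores, logUrlMap)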

Creating the DriverEndpointRef

override def start() {
    val properties = new ArrayBuffer[(String, String)]
    for ((key, value) <- scheduler.sc.conf.getAll) {
      if (key.startsWith("spark.")) {
        properties += ((key, value))
      }
    }

    // TODO (prashant) send conf instead of properties
    driverEndpoint = createDriverEndpointRef(properties)
  }

  protected def createDriverEndpointRef(
      properties: ArrayBuffer[(String, String)]): RpcEndpointRef = {
    rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
  }

  protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
    new DriverEndpoint(rpcEnv, properties)
  }
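
ENDPOINT_NAME used above is defined in the companion object; in Spark 2.3 it is simply:

private[spark] object CoarseGrainedSchedulerBackend {
  val ENDPOINT_NAME = "CoarseGrainedScheduler"
}

so the executor side reaches the driver endpoint via a URL of the form spark://CoarseGrainedScheduler@<driverHost>:<driverPort>.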

Receiving the executor's registration and replying with the RegisteredExecutor message

override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {

      case RegisterExecutor(executorId, executorRef, hostname, cores, logUrls) =>
        if (executorDataMap.contains(executorId)) {
          // If this executor ID has already been registered, send a RegisterExecutorFailed message to the ExecutorBackend
          executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
          context.reply(true)
        } else if (scheduler.nodeBlacklist.contains(hostname)) {
          // If the cluster manager gives us an executor on a blacklisted node (because it
          // already started allocating those resources before we informed it of our blacklist,
          // or if it ignored our blacklist), then we reject that executor immediately.
          logInfo(s"Rejecting $executorId as it has been blacklisted.")
          // If the executor's host is blacklisted, send a RegisterExecutorFailed message to the ExecutorBackend
          executorRef.send(RegisterExecutorFailed(s"Executor is blacklisted: $executorId"))
          context.reply(true)
        } else {
          // If the executor's rpc env is not listening for incoming connections, `hostPort`
          // will be null, and the client connection should be used to contact the executor.
          val executorAddress = if (executorRef.address != null) {
              executorRef.address
            } else {
              context.senderAddress
            }
          // Log that the executor has registered successfully
          logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId")
          addressToExecutorId(executorAddress) = executorId
          totalCoreCount.addAndGet(cores)
          totalRegisteredExecutors.addAndGet(1)
          val data = new ExecutorData(executorRef, executorAddress, hostname,
            cores, cores, logUrls)
          // This must be synchronized because variables mutated
          // in this block are read when requesting executors
          CoarseGrainedSchedulerBackend.this.synchronized {
            executorDataMap.put(executorId, data)
            if (currentExecutorIdCounter < executorId.toInt) {
              currentExecutorIdCounter = executorId.toInt
            }
            if (numPendingExecutors > 0) {
              numPendingExecutors -= 1
              logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
            }
          }
          // Send the RegisteredExecutor message to the ExecutorBackend
          executorRef.send(RegisteredExecutor)
          // Note: some tests expect the reply to come after we put the executor in the map
          context.reply(true)
          listenerBus.post(
            SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data))
          makeOffers()
        }

      case StopDriver =>
        context.reply(true)
        stop()

      case StopExecutors =>
        logInfo("Asking each executor to shut down")
        for ((_, executorData) <- executorDataMap) {
          executorData.executorEndpoint.send(StopExecutor)
        }
        context.reply(true)

      case RemoveWorker(workerId, host, message) =>
        removeWorker(workerId, host, message)
        context.reply(true)

      case RetrieveSparkAppConfig =>
        val reply = SparkAppConfig(
          sparkProperties,
          SparkEnv.get.securityManager.getIOEncryptionKey(),
          fetchHadoopDelegationTokens())
        context.reply(reply)
    }
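
The RegisterExecutor message handled above is sent by the executor side when it starts up. A simplified sketch, based on CoarseGrainedExecutorBackend.onStart in Spark 2.3, of how the executor registers itself with the driver endpoint:

override def onStart() {
  logInfo("Connecting to driver: " + driverUrl)
  rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref =>
    driver = Some(ref)
    // Ask the driver to register this executor; the driver answers with a Boolean reply
    ref.ask[Boolean](RegisterExecutor(executorId, self, hostname, cores, extractLogUrls))
  }(ThreadUtils.sameThread).onComplete {
    case Success(msg) =>
      // The reply is always `true`; nothing more to do here
    case Failure(e) =>
      exitExecutor(1, s"Cannot register with driver: $driverUrl", e, notifyDriver = false)
  }(ThreadUtils.sameThread)
}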

When a Spark job runs, the driver logs successful executor registrations like this:

2019-06-28 19:48:52 [ INFO] [dispatcher-event-loop-2] scheduler.cluster.YarnClientSchedulerBackend:58 Registered executor NettyRpcEndpointRef(null) (hadoop8:56828) with ID 2
2019-06-28 19:48:53 [ INFO] [dispatcher-event-loop-8] scheduler.cluster.YarnClientSchedulerBackend:58 Registered executor NettyRpcEndpointRef(null) (hadoop22:53906) with ID 6
2019-06-28 19:48:56 [ INFO] [dispatcher-event-loop-0] scheduler.cluster.YarnClientSchedulerBackend:58 Registered executor NettyRpcEndpointRef(null) (hadoop13:51644) with ID 8
2019-06-28 19:48:57 [ INFO] [dispatcher-event-loop-10] scheduler.cluster.YarnClientSchedulerBackend:58 Registered executor NettyRpcEndpointRef(null) (hadoop12:45570) with ID 3

Sending the LaunchTask message to the ExecutorBackend

// Launch tasks returned by a set of resource offers
    private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
      for (task <- tasks.flatten) {
        val serializedTask = TaskDescription.encode(task)
        if (serializedTask.limit() >= maxRpcMessageSize) {
          Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>
            try {
              var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
                "spark.rpc.message.maxSize (%d bytes). Consider increasing " +
                "spark.rpc.message.maxSize or using broadcast variables for large values."
              msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
              taskSetMgr.abort(msg)
            } catch {
              case e: Exception => logError("Exception in error callback", e)
            }
          }
        }
        else {
          val executorData = executorDataMap(task.executorId)
          executorData.freeCores -= scheduler.CPUS_PER_TASK

          logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
            s"${executorData.executorHost}.")

          executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
        }
      }
    }
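
launchTasks is called from makeOffers, which builds a WorkerOffer for every alive registered executor and asks the TaskScheduler which tasks to run on them. A simplified sketch of makeOffers in Spark 2.3:

private def makeOffers() {
  // Make sure no executor is killed while some task is launching on it
  val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
    // Filter out executors that are being killed
    val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
    val workOffers = activeExecutors.map { case (id, executorData) =>
      new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
    }.toIndexedSeq
    // Ask the TaskScheduler to assign tasks to these offers
    scheduler.resourceOffers(workOffers)
  }
  if (!taskDescs.isEmpty) {
    launchTasks(taskDescs)
  }
}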
