Spark源码分析（五）：Worker工作原理

最新推荐文章于 2023-03-07 17:52:53 发布

lxlneversettle

最新推荐文章于 2023-03-07 17:52:53 发布

阅读量261

点赞数

分类专栏： spark core 文章标签： spark

本文链接：https://blog.csdn.net/lxlneversettle/article/details/88873881

版权

spark core 专栏收录该内容

10 篇文章 1 订阅

订阅专栏

Worker工作原理

在之前关于Master调度中我们知道，Master会向Worker发送启动Executor的消息
接下来看一下worker收到启动executor的消息后会怎样处理

// Handles the Master's request to launch an executor on this Worker.
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  if (masterUrl != activeMasterUrl) {
    logWarning(s"Invalid Master ($masterUrl) attempted to launch executor.")
  } else {
    // Key under which this executor's runner is tracked in `executors`; the same
    // relative path is used for its working directory below.
    val executorKey = appId + "/" + execId
    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

      // Create the executor's working directory under workDir. A false result from
      // mkdirs() is treated as a launch failure.
      val executorDir = new File(workDir, executorKey)
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }

      // Creates one "executor"-prefixed directory under every local root dir, skipping
      // roots where creation fails, and fails if none could be created.
      def createAppLocalDirs() = {
        val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
        val createdDirs = localRootDirs.flatMap { rootDir =>
          try {
            val appDir = Utils.createDirectory(rootDir, namePrefix = "executor")
            Utils.chmod700(appDir)
            Some(appDir.getAbsolutePath())
          } catch {
            case e: IOException =>
              logWarning(s"${e.getMessage}. Ignoring this directory.")
              None
          }
        }.toSeq
        if (createdDirs.isEmpty) {
          throw new IOException("No subfolder can be created in " +
            s"${localRootDirs.mkString(",")}.")
        }
        createdDirs
      }

      // Local dirs for the executor. These are passed to the executor via the
      // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the
      // application finishes. They are only created when the application has no
      // entry in appDirectories yet (the default argument is evaluated lazily).
      val appLocalDirs = appDirectories.getOrElse(appId, createAppLocalDirs())
      appDirectories(appId) = appLocalDirs

      // Create the ExecutorRunner that manages this executor.
      val manager = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        workerUri,
        conf,
        appLocalDirs,
        ExecutorState.RUNNING)

      // Track the runner before starting it so the failure path below can find it.
      executors(executorKey) = manager
      manager.start()

      // Account for the resources now in use on this Worker.
      coresUsed += cores_
      memoryUsed += memory_

      // Tell the Master about the executor's current state.
      sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None))
    } catch {
      case e: Exception =>
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        // If the runner was already registered, kill it and stop tracking it.
        executors.get(executorKey).foreach { runner =>
          runner.kill()
          executors -= executorKey
        }
        sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None))
    }
  }

/**
 * Starts this runner: spawns the thread that runs `fetchAndRunExecutor()` and
 * registers a shutdown hook that calls `killProcess`.
 */
private[worker] def start(): Unit = {
  // The executor is launched and awaited on its own thread.
  workerThread = new Thread(s"ExecutorRunner for $fullId") {
    override def run(): Unit = fetchAndRunExecutor()
  }
  workerThread.start()

  // Shutdown hook that kills the launched process on shutdown.
  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
    // be `ExecutorState.RUNNING`. In this case, we should set `state` to `FAILED`.
    if (state == ExecutorState.RUNNING) state = ExecutorState.FAILED
    killProcess(Some("Worker shutting down"))
  }
}

/**
 * Builds the executor launch command, starts it as a child process, redirects the
 * process's stdout/stderr into files under `executorDir`, waits for the process to
 * exit and then sends an `ExecutorStateChanged` message to the worker.
 *
 * An `InterruptedException` marks the executor as KILLED; any other exception marks
 * it as FAILED. In both cases `killProcess` is called.
 */
private def fetchAndRunExecutor() {
 try {
   // Launch the process
   // Substitute the app id and executor id into the configured Java options.
   val subsOpts = appDesc.command.javaOpts.map {
     Utils.substituteAppNExecIds(_, appId, execId.toString)
   }
   val subsCommand = appDesc.command.copy(javaOpts = subsOpts)
   // Build the ProcessBuilder for the executor command. The process itself is only
   // started further down by builder.start().
   val builder = CommandUtils.buildProcessBuilder(subsCommand, new SecurityManager(conf),
     memory, sparkHome.getAbsolutePath, substituteVariables)
   val command = builder.command()
   // Quote every argument so the logged command line is unambiguous.
   val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"")
   logInfo(s"Launch command: $formattedCommand")

   // Set the child process's working directory and environment variables.
   builder.directory(executorDir)
   builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
   // In case we are running this from within the Spark Shell, avoid creating a "scala"
   // parent process for the executor command
   builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

   // Add webUI log urls
   val baseUrl =
     if (conf.getBoolean("spark.ui.reverseProxy", false)) {
       s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
     } else {
       s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
     }
   // Expose the stderr/stdout log page URLs to the executor through environment
   // variables. (The actual redirection of the output streams happens below.)
   builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
   builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

   // Start the executor process.
   process = builder.start()
   val header = "Spark Executor Command: %s\n%s\n\n".format(
     formattedCommand, "=" * 40)

   // Redirect its stdout and stderr to files
   // (the "stdout" and "stderr" files inside the executor's working directory).
   val stdout = new File(executorDir, "stdout")
   stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

   val stderr = new File(executorDir, "stderr")
   // The launch command header is written to the stderr file first.
   Files.write(header, stderr, StandardCharsets.UTF_8)
   stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

   // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
   // or with nonzero exit code
   // Block until the process exits and capture its exit code.
   val exitCode = process.waitFor()
   state = ExecutorState.EXITED
   val message = "Command exited with code " + exitCode
   worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
 } catch {
   case interrupted: InterruptedException =>
     logInfo("Runner thread for executor " + fullId + " interrupted")
     state = ExecutorState.KILLED
     killProcess(None)
   case e: Exception =>
     logError("Error running executor", e)
     state = ExecutorState.FAILED
     killProcess(Some(e.toString))
 }
}

从上面可以看出，不管是在Worker上启动Driver还是Executor，都是先创建一个XXXRunner，然后调用该对象的start()方法，在该方法中会启动一个线程，在该线程中创建并且启动一个进程，在该进程中来执行Driver或者Executor（对于Executor，工作目录在Worker处理LaunchExecutor消息时就已经创建好了，而不是在该线程中创建）
在上面启动进程时实际上会执行CoarseGrainedExecutorBackend的main()
下面来看一下该类的main()

/**
 * Entry point of the executor backend process. Parses the command-line options,
 * prints the usage and exits when an option is unrecognized or a required value is
 * missing, and otherwise hands the parsed values to `run` before exiting with 0.
 *
 * @param args command-line arguments given as "--option value" pairs
 */
def main(args: Array[String]): Unit = {
  var driverUrl: String = null
  var executorId: String = null
  var hostname: String = null
  var cores: Int = 0
  var appId: String = null
  var workerUrl: Option[String] = None
  val userClassPath = new mutable.ListBuffer[URL]()

  // Consume the arguments two at a time: an option name followed by its value.
  var remaining = args.toList
  while (remaining.nonEmpty) {
    remaining match {
      case "--driver-url" :: v :: rest =>
        driverUrl = v
        remaining = rest
      case "--executor-id" :: v :: rest =>
        executorId = v
        remaining = rest
      case "--hostname" :: v :: rest =>
        hostname = v
        remaining = rest
      case "--cores" :: v :: rest =>
        cores = v.toInt
        remaining = rest
      case "--app-id" :: v :: rest =>
        appId = v
        remaining = rest
      case "--worker-url" :: v :: rest =>
        // Worker url is used in spark standalone mode to enforce fate-sharing with worker
        workerUrl = Some(v)
        remaining = rest
      case "--user-class-path" :: v :: rest =>
        userClassPath += new URL(v)
        remaining = rest
      case Nil =>
        // Not reachable while the loop condition holds; kept for an exhaustive match.
      case unrecognized =>
        // scalastyle:off println
        System.err.println(s"Unrecognized options: ${unrecognized.mkString(" ")}")
        // scalastyle:on println
        printUsageAndExit()
    }
  }

  // Every string option below is mandatory, and the core count must be positive.
  val missingRequiredArg = Seq(driverUrl, executorId, hostname, appId).contains(null)
  if (missingRequiredArg || cores <= 0) {
    printUsageAndExit()
  }

  run(driverUrl, executorId, hostname, cores, appId, workerUrl, userClassPath)
  System.exit(0)
}

该函数主要会解析命令行参数中的一些信息，比如driver地址、executor id等等，最后会调用run()
下面看一下run()

/**
 * Bootstraps the executor backend: fetches the application's Spark properties from
 * the driver, builds the executor-side SparkEnv from them, registers the
 * `CoarseGrainedExecutorBackend` endpoint (plus a `WorkerWatcher` endpoint when a
 * worker URL is given) and then blocks until the RPC environment terminates.
 *
 * @param driverUrl     URI of the driver endpoint to fetch the configuration from
 * @param executorId    id of this executor
 * @param hostname      host name used for the RPC environments created here
 * @param cores         number of cores passed to the executor env and backend
 * @param appId         application id, stored as "spark.app.id" in the properties
 * @param workerUrl     optional worker URL; when defined a WorkerWatcher is set up
 * @param userClassPath user class path entries handed to the backend
 */
private def run(
      driverUrl: String,
      executorId: String,
      hostname: String,
      cores: Int,
      appId: String,
      workerUrl: Option[String],
      userClassPath: Seq[URL]) {

 Utils.initDaemon(log)

 SparkHadoopUtil.get.runAsSparkUser { () =>
   // Debug code
   Utils.checkHost(hostname)

   // Bootstrap to fetch the driver's Spark properties.
   // A temporary client-mode RpcEnv is created only for this request and is shut
   // down as soon as the configuration has been retrieved.
   val executorConf = new SparkConf
   val fetcher = RpcEnv.create(
     "driverPropsFetcher",
     hostname,
     -1,
     executorConf,
     new SecurityManager(executorConf),
     clientMode = true)
   val driver = fetcher.setupEndpointRefByURI(driverUrl)
   val cfg = driver.askSync[SparkAppConfig](RetrieveSparkAppConfig)
   // Append the application id to the properties received from the driver.
   val props = cfg.sparkProperties ++ Seq[(String, String)](("spark.app.id", appId))
   fetcher.shutdown()

   // Create SparkEnv using properties we fetched from the driver.
   val driverConf = new SparkConf()
   for ((key, value) <- props) {
     // this is required for SSL in standalone mode
     // Executor startup settings are only filled in when absent; every other
     // property from the driver overrides the local value.
     if (SparkConf.isExecutorStartupConf(key)) {
       driverConf.setIfMissing(key, value)
     } else {
       driverConf.set(key, value)
     }
   }

   // Register the delegation tokens supplied by the driver, if any.
   cfg.hadoopDelegationCreds.foreach { tokens =>
     SparkHadoopUtil.get.addDelegationTokens(tokens, driverConf)
   }

   val env = SparkEnv.createExecutorEnv(
     driverConf, executorId, hostname, cores, cfg.ioEncryptionKey, isLocal = false)

   // Register the backend under the endpoint name "Executor".
   env.rpcEnv.setupEndpoint("Executor", new CoarseGrainedExecutorBackend(
     env.rpcEnv, driverUrl, executorId, hostname, cores, userClassPath, env))
   // Only when a worker URL was supplied: register a WorkerWatcher endpoint for it.
   workerUrl.foreach { url =>
     env.rpcEnv.setupEndpoint("WorkerWatcher", new WorkerWatcher(env.rpcEnv, url))
   }
   // Block until the RPC environment terminates.
   env.rpcEnv.awaitTermination()
 }
}

该方法最主要会启动一个CoarseGrainedExecutorBackend
下面看一下CoarseGrainedExecutorBackend的onStart()

/**
 * Resolves the driver endpoint from `driverUrl`, remembers it in `driver` and asks
 * it to register this executor. If the lookup or the registration request fails,
 * the executor exits with code 1 without notifying the driver.
 */
override def onStart(): Unit = {
  logInfo(s"Connecting to driver: $driverUrl")

  // Look up the driver's endpoint reference, then send the registration request.
  val registration = rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { driverRef =>
    driver = Some(driverRef)
    driverRef.ask[Boolean](RegisterExecutor(executorId, self, hostname, cores, extractLogUrls))
    // This is a very fast action so we can use "ThreadUtils.sameThread"
  }(ThreadUtils.sameThread)

  registration.onComplete {
    case Success(_) =>
      // Always receive `true`. Just ignore it
    case Failure(e) =>
      exitExecutor(1, s"Cannot register with driver: $driverUrl", e, notifyDriver = false)
    // This is a very fast action so we can use "ThreadUtils.sameThread"
  }(ThreadUtils.sameThread)
}

该方法最主要的就是向Driver进行注册
下面看一下Driver如何处理
CoarseGrainedSchedulerBackend

/**
 * Handles messages that expect a reply. Only the `RegisterExecutor` case is shown
 * here: a registration is rejected when the executor id is already present or the
 * host is blacklisted; otherwise the executor is recorded, told that it is
 * registered, announced on the listener bus, and `makeOffers()` is triggered.
 * In every branch the caller receives `true` as the reply.
 */
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {

  case RegisterExecutor(executorId, executorRef, hostname, cores, logUrls) =>
     // First check whether this executor id is already registered.
     if (executorDataMap.contains(executorId)) {
       executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
       context.reply(true)
     } else if (scheduler.nodeBlacklist.contains(hostname)) {
     // Next check whether the executor's host is blacklisted, and if so tell the
     // executor that its registration was rejected for that reason.
       // If the cluster manager gives us an executor on a blacklisted node (because it
       // already started allocating those resources before we informed it of our blacklist,
       // or if it ignored our blacklist), then we reject that executor immediately.
       logInfo(s"Rejecting $executorId as it has been blacklisted.")
       executorRef.send(RegisterExecutorFailed(s"Executor is blacklisted: $executorId"))
       context.reply(true)
     } else {
       // If the executor's rpc env is not listening for incoming connections, `hostPort`
       // will be null, and the client connection should be used to contact the executor.
       val executorAddress = if (executorRef.address != null) {
           executorRef.address
         } else {
           context.senderAddress
         }
       logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId")
       addressToExecutorId(executorAddress) = executorId
       totalCoreCount.addAndGet(cores)
       totalRegisteredExecutors.addAndGet(1)
       val data = new ExecutorData(executorRef, executorAddress, hostname,
         cores, cores, logUrls)
       // This must be synchronized because variables mutated
       // in this block are read when requesting executors
       // Record the executor's data.
       CoarseGrainedSchedulerBackend.this.synchronized {
         executorDataMap.put(executorId, data)
         // Keep the counter at the largest numeric executor id seen so far.
         if (currentExecutorIdCounter < executorId.toInt) {
           currentExecutorIdCounter = executorId.toInt
         }
         if (numPendingExecutors > 0) {
           numPendingExecutors -= 1
           logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
         }
       }
       // Tell the executor that it has been registered successfully.
       executorRef.send(RegisteredExecutor)
       // Note: some tests expect the reply to come after we put the executor in the map
       context.reply(true)
       listenerBus.post(
         SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data))
       // Per the original note this assigns tasks to executors; makeOffers() itself
       // is not shown here.
       makeOffers()
     }
}

上面函数中最重要的部分就是会触发makeOffers()，这个函数下一篇再讲
接下来看一下CoarseGrainedExecutorBackend接收到响应之后会做什么

// The driver confirmed the registration: create the Executor held by this backend.
case RegisteredExecutor =>
  logInfo("Successfully registered with driver")
  try {
    // Per the original note, `executor` serves as the handle for handling later events.
    executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)
  } catch {
    case NonFatal(cause) =>
      // Creation failed with a non-fatal error: exit with code 1 and the reason.
      val reason = s"Unable to create executor due to ${cause.getMessage}"
      exitExecutor(1, reason, cause)
  }

当CoarseGrainedExecutorBackend接收到来自Driver的注册成功响应之后，会创建一个executor对象，executor会作为后续事件的执行句柄

下面就是Worker的工作原理图
另外我们从源码中也可以知道，每个组件都对应着一个叫做CoarseGrainedXXXBackend的对象，该对象主要就是用来和其他组件进行通信，比如这里的CoarseGrainedExecutorBackend，就是Executor用来和其他组件进行通信的
又比如CoarseGrainedSchedulerBackend就是Driver用来和其他组件进行通信的
Worker原理图

lxlneversettle

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Spark源码分析（五）：Worker工作原理

Worker工作原理在之前关于Master调度中我们知道，Master会向Worker发送启动Executor的消息接下来看一下worker收到启动executor的消息后会怎样处理// 启动executorcase LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) => if (masterUr...
复制链接

扫一扫

专栏目录