Spark2.2 Worker原理剖析图解及源码剖析

Worker原理剖析图解

(图:Worker原理剖析图解)


LaunchExecutor()源码剖析

(图:LaunchExecutor源码剖析流程图)

LaunchExecutor

    /**
     * Handles a LaunchExecutor request received by the Worker.
     *
     * A request whose masterUrl differs from activeMasterUrl is only logged and otherwise
     * ignored. For a request from the active master the handler:
     *  1. creates the executor's working directory under workDir,
     *  2. looks up (or creates) the application's local directories and records them in
     *     appDirectories,
     *  3. builds an ExecutorRunner, registers it in `executors` and starts it,
     *  4. adds the requested cores and memory to coresUsed / memoryUsed,
     *  5. reports the runner's current state through sendToMaster.
     *
     * Any Exception raised along the way is logged; a runner already registered under the
     * same key is killed and unregistered, and ExecutorState.FAILED is reported through
     * sendToMaster.
     */
    case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
      if (masterUrl == activeMasterUrl) {
        // Identifies this executor in the `executors` map and doubles as its directory
        // path relative to workDir.
        val executorKey = appId + "/" + execId
        try {
          logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

          // Working directory of the executor; the launch is aborted if it cannot be created.
          val executorDir = new File(workDir, executorKey)
          if (!executorDir.mkdirs()) {
            throw new IOException("Failed to create directory " + executorDir)
          }

          // Reuse the local directories already recorded for this application; otherwise try
          // to create one "executor"-prefixed directory under every local root directory.
          // Roots that fail with an IOException are skipped, but at least one must succeed.
          val appLocalDirs = appDirectories.getOrElse(appId, {
            val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
            val createdDirs = localRootDirs.flatMap { rootDir =>
              try {
                val appDir = Utils.createDirectory(rootDir, namePrefix = "executor")
                Utils.chmod700(appDir)
                Some(appDir.getAbsolutePath())
              } catch {
                case e: IOException =>
                  logWarning(s"${e.getMessage}. Ignoring this directory.")
                  None
              }
            }.toSeq
            if (createdDirs.isEmpty) {
              throw new IOException("No subfolder can be created in " +
                s"${localRootDirs.mkString(",")}.")
            }
            createdDirs
          })
          appDirectories(appId) = appLocalDirs

          // Build the runner that will launch the executor.
          val runner = new ExecutorRunner(
            appId,
            execId,
            appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
            cores_,
            memory_,
            self,
            workerId,
            host,
            webUi.boundPort,
            publicAddress,
            sparkHome,
            executorDir,
            workerUri,
            conf,
            appLocalDirs,
            ExecutorState.RUNNING)
          executors(executorKey) = runner
          runner.start()

          // Account for the resources handed to this executor.
          coresUsed += cores_
          memoryUsed += memory_

          // Tell the master which state the runner is in.
          sendToMaster(ExecutorStateChanged(appId, execId, runner.state, None, None))
        } catch {
          case e: Exception =>
            logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
            // Kill and unregister the runner if it was registered before the failure.
            executors.get(executorKey).foreach { registered =>
              registered.kill()
              executors -= executorKey
            }
            sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
              Some(e.toString), None))
        }
      } else {
        logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
      }

ExecutorRunner的start方法

  /**
   * Starts this runner: spawns the thread that executes fetchAndRunExecutor() and registers
   * a shutdown hook that kills the executor process.
   */
  private[worker] def start(): Unit = {
    // Dedicated thread whose whole body is fetchAndRunExecutor().
    workerThread = new Thread("ExecutorRunner for " + fullId) {
      override def run(): Unit = fetchAndRunExecutor()
    }
    workerThread.start()

    shutdownHook = ShutdownHookManager.addShutdownHook { () =>
      // The hook may fire before fetchAndRunExecutor has been called; in that case the
      // state is still RUNNING and is switched to FAILED here.
      if (state == ExecutorState.RUNNING) state = ExecutorState.FAILED
      // Kill the executor process and notify the worker of the resulting state.
      killProcess(Some("Worker shutting down"))
    }
  }

fetchAndRunExecutor

  /**
   * Builds the launch command from appDesc.command, starts it as a child process and blocks
   * until that process exits.
   *
   * The child runs in executorDir with SPARK_EXECUTOR_DIRS, SPARK_LAUNCH_WITH_SCALA and the
   * SPARK_LOG_URL_STDERR / SPARK_LOG_URL_STDOUT variables added to its environment. Its
   * stdout and stderr are redirected to the files "stdout" and "stderr" in executorDir.
   *
   * When the process exits, the state becomes EXITED and the exit code is sent to the worker.
   * An InterruptedException sets the state to KILLED and any other Exception sets it to
   * FAILED; in both cases killProcess is invoked.
   */
  private def fetchAndRunExecutor(): Unit = {
    try {
      // Process builder for the executor command.
      val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf),
        memory, sparkHome.getAbsolutePath, substituteVariables)
      // The command with every argument wrapped in double quotes; used for the log line and
      // for the header written to the stderr file.
      val formattedCommand = builder.command().asScala.mkString("\"", "\" \"", "\"")
      logInfo(s"Launch command: $formattedCommand")

      builder.directory(executorDir)
      val childEnv = builder.environment
      childEnv.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
      childEnv.put("SPARK_LAUNCH_WITH_SCALA", "0")

      // Prefix of the web UI log page URLs; the log type is appended below.
      val logUrlPrefix =
        if (conf.getBoolean("spark.ui.reverseProxy", false)) {
          s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
        } else {
          s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
        }
      childEnv.put("SPARK_LOG_URL_STDERR", s"${logUrlPrefix}stderr")
      childEnv.put("SPARK_LOG_URL_STDOUT", s"${logUrlPrefix}stdout")

      // Launch the executor process.
      process = builder.start()
      val header = "Spark Executor Command: %s\n%s\n\n".format(
        formattedCommand, "=" * 40)

      // Redirect the process's stdout to a file.
      val stdoutFile = new File(executorDir, "stdout")
      stdoutAppender = FileAppender(process.getInputStream, stdoutFile, conf)

      // Write the command header to the stderr file, then redirect the process's stderr to it.
      val stderrFile = new File(executorDir, "stderr")
      Files.write(header, stderrFile, StandardCharsets.UTF_8)
      stderrAppender = FileAppender(process.getErrorStream, stderrFile, conf)

      // Block until the executor process exits, then report the exit code to the worker.
      val exitCode = process.waitFor()
      state = ExecutorState.EXITED
      worker.send(ExecutorStateChanged(appId, execId, state,
        Some("Command exited with code " + exitCode), Some(exitCode)))
    } catch {
      case _: InterruptedException =>
        logInfo("Runner thread for executor " + fullId + " interrupted")
        state = ExecutorState.KILLED
        killProcess(None)
      case e: Exception =>
        logError("Error running executor", e)
        state = ExecutorState.FAILED
        killProcess(Some(e.toString))
    }
  }

killProcess

  /**
   * Terminates the executor process (if one was started) and sends the current state to the
   * worker.
   *
   * @param message optional text forwarded in the ExecutorStateChanged notification
   */
  private def killProcess(message: Option[String]): Unit = {
    // Exit code returned by the termination attempt; None when no process exists or the
    // attempt returned no exit code.
    val exitCode: Option[Int] =
      if (process == null) {
        None
      } else {
        logInfo("Killing process!")
        // Stop the output appenders before terminating the process.
        Option(stdoutAppender).foreach(_.stop())
        Option(stderrAppender).foreach(_.stop())
        val terminated = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS)
        if (terminated.isEmpty) {
          logWarning("Failed to terminate process: " + process +
            ". This process will likely be orphaned.")
        }
        terminated
      }

    // An IllegalStateException raised while notifying the worker is logged, not propagated.
    try {
      worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode))
    } catch {
      case e: IllegalStateException => logWarning(e.getMessage(), e)
    }
  }

ExecutorStateChanged

/** Worker side: an incoming ExecutorStateChanged message is passed to handleExecutorStateChanged(). */
case stateChange @ ExecutorStateChanged(_, _, _, _, _) =>
  handleExecutorStateChanged(stateChange)

handleExecutorStateChanged

  /**
   * Processes an executor state change on the Worker.
   *
   * The message is always forwarded through sendToMaster. If the new state is a finished
   * state according to ExecutorState.isFinished (not only a successful completion), a known
   * executor is moved from `executors` to `finishedExecutors` and its cores and memory are
   * released; afterwards maybeCleanupApplication is called for the application.
   *
   * @param executorStateChanged the state-change message to process
   */
  private[worker] def handleExecutorStateChanged(
      executorStateChanged: ExecutorStateChanged): Unit = {
    // Forward the message to the master first.
    sendToMaster(executorStateChanged)

    val state = executorStateChanged.state
    if (ExecutorState.isFinished(state)) {
      val appId = executorStateChanged.appId
      val fullId = appId + "/" + executorStateChanged.execId
      // Tail shared by both log lines: state plus optional message and exit status.
      val outcome = " finished with state " + state +
        executorStateChanged.message.map(" message " + _).getOrElse("") +
        executorStateChanged.exitStatus.map(" exitStatus " + _).getOrElse("")

      executors.get(fullId) match {
        case None =>
          logInfo("Unknown Executor " + fullId + outcome)
        case Some(finished) =>
          logInfo("Executor " + fullId + outcome)
          // Move the runner to the finished set and release its resources.
          executors -= fullId
          finishedExecutors(fullId) = finished
          trimFinishedExecutorsIfNecessary()
          coresUsed -= finished.cores
          memoryUsed -= finished.memory
      }

      // Let the worker decide whether the application as a whole can be cleaned up.
      maybeCleanupApplication(appId)
    }
  }

Master端对ExecutorStateChanged消息的处理

    /**
     * Master side: handles an ExecutorStateChanged message.
     *
     * Looks up the executor through idToApp, records the new state, forwards it to the
     * application's driver and, when the state is a finished one, removes the executor and
     * possibly the whole application. schedule() is invoked afterwards. A message for an
     * unknown executor is only logged.
     */
    case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
      // 1. Find the application by appId, then the executor by execId in that
      //    application's `executors` map.
      val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
      execOption match {
        // 2. The executor is known.
        case Some(exec) =>
          val appInfo = idToApp(appId)
          // 2.1 Remember the previous state, then store the new one.
          val oldState = exec.state
          exec.state = state
          // 2.2 A transition to RUNNING is only legal from LAUNCHING; it also resets the
          //     application's retry counter.
          if (state == ExecutorState.RUNNING) {
            assert(oldState == ExecutorState.LAUNCHING,
              s"executor $execId state transfer from $oldState to RUNNING is illegal")
            appInfo.resetRetryCount()
          }
          // 2.3 Forward the new state to the application's driver.
          //     NOTE(review): the meaning of the trailing `false` argument is not visible
          //     here — confirm against the ExecutorUpdated definition.
          exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus, false))

          // 2.4 The executor reached a finished state (the original author's note lists
          //     KILLED, FAILED, LOST, EXITED — confirm against ExecutorState.isFinished).
          if (ExecutorState.isFinished(state)) {
            // Remove this executor from the worker and from the application.
            logInfo(s"Removing executor ${exec.fullId} because it is $state")
            // If the application has already finished, keep its executor entry so its
            // information can still be shown in the UI; otherwise drop the executor from
            // the application.
            if (!appInfo.isFinished) {
              appInfo.removeExecutor(exec)
            }
            // Remove the executor from the worker it was running on.
            exec.worker.removeExecutor(exec)

            val normalExit = exitStatus == Some(0)
            // Only retry a limited number of times so we do not loop forever.
            // On an abnormal exit the retry counter is incremented; once it reaches
            // MAX_EXECUTOR_RETRIES (value defined elsewhere) the application may be removed.
            // The counter is incremented before the `>= 0` check because of the operand order.
            if (!normalExit
              && appInfo.incrementRetryCount() >= MAX_EXECUTOR_RETRIES
              && MAX_EXECUTOR_RETRIES >= 0) {
              // < 0 disables this application-killing path
              val execs = appInfo.executors.values
              // Remove the application only if none of its executors is still RUNNING.
              if (!execs.exists(_.state == ExecutorState.RUNNING)) {
                logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
                  s"${appInfo.retryCount} times; removing it")
                removeApplication(appInfo, ApplicationState.FAILED)
              }
            }
          }
          // 3. Invoke schedule() now that the executor state has changed.
          schedule()
        case None =>
          logWarning(s"Got status update for unknown executor $appId/$execId")
      }
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

生命不息丶折腾不止

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值