这一篇分析worker的工作原理
上一篇我们说到Master会给worker发出LaunchDriver和LaunchExecutor的消息
先来一张总体的流程图
这边我们先说launchDriver
worker.scala
// Worker's handler for the LaunchDriver message sent by the Master:
// it creates a DriverRunner, registers it, starts it, and books the resources.
case LaunchDriver(driverId, driverDesc) => {
  logInfo(s"Asked to launch driver $driverId")
  // Build a DriverRunner that manages the whole lifecycle of the driver process.
  // The command is copied so SSL settings can be injected before launch.
  val driver = new DriverRunner(
    conf,
    driverId,
    workDir,
    sparkHome,
    driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf)),
    self,        // the Worker endpoint, so the runner can report state changes back
    workerUri,
    securityMgr)
  // Track the runner in the worker's driverId -> DriverRunner map
  drivers(driverId) = driver
  // Spawns the "DriverRunner for <id>" thread that actually launches the process
  driver.start()
  // Account for the cores and memory this driver consumes on the worker
  coresUsed += driverDesc.cores
  memoryUsed += driverDesc.mem
}
首先会创建一个DriverRunner
然后加入到worker的容器中
然后调用DriverRunner#start()方法
// Body of DriverRunner.start(): launch the driver asynchronously on a dedicated
// thread so the Worker's message loop is never blocked by process management.
new Thread("DriverRunner for " + driverId) {
  override def run() {
    try {
      // Create the driver's local working directory and fetch the user jar into it
      val driverDir = createWorkingDirectory()
      val localJarFilename = downloadUserJar(driverDir)

      // Substitute placeholder arguments in the driver command with concrete
      // values known only on this worker (worker URL, local jar path)
      def substituteVariables(argument: String): String = argument match {
        case "{{WORKER_URL}}" => workerUrl
        case "{{USER_JAR}}" => localJarFilename
        case other => other
      }

      // TODO: If we add ability to submit multiple jars they should also be added here
      // Assemble the ProcessBuilder for the driver JVM, then start (and, if
      // supervise is set, restart) the process
      val builder = CommandUtils.buildProcessBuilder(driverDesc.command, securityManager,
        driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables)
      launchDriver(builder, driverDir, driverDesc.supervise)
    }
    catch {
      // Remember the failure so it can be reported in the final state below
      case e: Exception => finalException = Some(e)
    }

    // Derive the terminal DriverState from how the process ended:
    // killed by request, failed with an exception, or exited with a code
    val state =
      if (killed) {
        DriverState.KILLED
      } else if (finalException.isDefined) {
        DriverState.ERROR
      } else {
        finalExitCode match {
          case Some(0) => DriverState.FINISHED
          case _ => DriverState.FAILED
        }
      }

    finalState = Some(state)

    // Notify the Worker that this driver reached a terminal state
    worker.send(DriverStateChanged(driverId, state, finalException))
  }
}.start()
首先是启动了一个线程,然后我们具体看线程中的run方法
val driverDir = createWorkingDirectory() val localJarFilename = downloadUserJar(driverDir)
首先会创建一个本地的工作目录,然后下载jar包
val builder = CommandUtils.buildProcessBuilder(driverDesc.command, securityManager, driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables)
这里会创建一个processBuilder,这个类是专门用来启动进程的,然后调用launchDriver
进入launchDriver方法,再进入runCommandWithRetry方法,找到如下代码
// Inside runCommandWithRetry: start the driver process under the runner's lock
// so a concurrent kill() cannot race with process creation.
synchronized {
  // If the driver was killed before we got here, do not start the process at all
  if (killed) { return }
  // command.start() forks the actual driver JVM process
  process = Some(command.start())
  // Wire up stdout/stderr redirection for the new process
  initialize(process.get)
}
process = Some(command.start()) 这句话将会启动driver进程
// Redirects the driver process's stdout/stderr into files under its working
// directory, and writes a header recording the exact launch command to stderr.
def initialize(process: Process): Unit = {
  // Redirect stdout and stderr to files
  val stdout = new File(baseDir, "stdout")
  CommandUtils.redirectStream(process.getInputStream, stdout)

  val stderr = new File(baseDir, "stderr")
  // Quote each command token so the logged launch command is unambiguous
  val formattedCommand = builder.command.asScala.mkString("\"", "\" \"", "\"")
  val header = "Launch Command: %s\n%s\n\n".format(formattedCommand, "=" * 40)
  Files.append(header, stderr, UTF_8)
  CommandUtils.redirectStream(process.getErrorStream, stderr)
}
方法中会进行一些输出的重定向
回到driver#start()方法,
worker.send(DriverStateChanged(driverId, state, finalException))最后会向worker发送driver状态改变的消息
接着来看下launchExecutor
val manager = new ExecutorRunner(
首先会创建一个new ExecutorRunner
然后调用他的start方法
// ExecutorRunner.start(): launch the executor on a dedicated thread and register
// a shutdown hook so a dying worker takes its executor process down with it.
private[worker] def start() {
  // The worker thread does the heavy lifting: download files and run the executor
  workerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run() { fetchAndRunExecutor() }
  }
  workerThread.start()
  // Shutdown hook that kills actors on shutdown.
  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
    // be `ExecutorState.RUNNING`. In this case, we should set `state` to `FAILED`.
    if (state == ExecutorState.RUNNING) {
      state = ExecutorState.FAILED
    }
    killProcess(Some("Worker shutting down")) }
}
这里会启动workerThread线程,看该线程的run方法
fetchAndRunExecutor
private def fetchAndRunExecutor() { try { // Launch the process val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf), memory, sparkHome.getAbsolutePath, substituteVariables) val command = builder.command() val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"") logInfo(s"Launch command: $formattedCommand") builder.directory(executorDir) builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator)) // In case we are running this from within the Spark Shell, avoid creating a "scala" // parent process for the executor command builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0") // Add webUI log urls val baseUrl = s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType=" builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr") builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout") process = builder.start() val header = "Spark Executor Command: %s\n%s\n\n".format( formattedCommand, "=" * 40) // Redirect its stdout and stderr to files val stdout = new File(executorDir, "stdout") stdoutAppender = FileAppender(process.getInputStream, stdout, conf) val stderr = new File(executorDir, "stderr") Files.write(header, stderr, UTF_8) stderrAppender = FileAppender(process.getErrorStream, stderr, conf) // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown) // or with nonzero exit code val exitCode = process.waitFor() state = ExecutorState.EXITED val message = "Command exited with code " + exitCode worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
我们可以看到这里和launchDriver特别类似
val builder = CommandUtils.buildProcessBuilder这里会创建一个processBuilder
builder.directory(executorDir) 设置executor工作目录
process = builder.start()启动executor进程
stderrAppender = FileAppender(process.getErrorStream, stderr, conf) 文件重定向
worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
这里会发送给worker executor状态改变的消息
回到LaunchExecutor方法 找到最后一行
sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
会给master发送一个ExecutorStateChanged消息
找到Master.scala的ExecutorStateChanged方法
appInfo.removeExecutor(exec) exec.worker.removeExecutor(exec)
这里会移除内存中的executor
if (!normalExit) { if (appInfo.incrementRetryCount() < ApplicationState.MAX_NUM_RETRY) { schedule() } else { val execs = appInfo.executors.values if (!execs.exists(_.state == ExecutorState.RUNNING)) { logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " + s"${appInfo.retryCount} times; removing it") removeApplication(appInfo, ApplicationState.FAILED) } }
注意看这里,如果是非正常退出,会重试并重新调度,直到超过最大重试次数,默认是10次