第三十二课 Spark worker原理与源码
1. Spark worker的原理
2. Worker 启动Driver
3. Worker 启动Executor
4. Worker与Master交互
Worker的核心作用是管理当前机器的内存和CPU资源,但实际上,Worker是通过接收Master的指令来启动Driver或者Executor的。所以,要想了解清楚Worker,我们的关注点必然在Worker如何启动Driver和如何启动Executor这两方面。
在Driver或者Executor的工作过程中,有时会出现Driver或是Executor挂掉的情况,对此Worker内部有一些处理机制;如果确实处理不了(Executor fail掉或者出现了Exception),Worker就会与Master进行沟通,通知Master Executor挂掉了。而作为整个集群的资源调度器,Master可以根据集群的资源情况,调用Scheduler重新为应用程序调度资源。
/**
 * Worker endpoint of the deploy module (only the constructor signature is shown in this excerpt).
 *
 * Per the accompanying text, a Worker manages the memory and CPU resources of its machine and
 * launches Drivers and Executors when instructed to by the Master.
 *
 * @param rpcEnv             RPC environment this endpoint is bound to
 * @param webUiPort          port for the worker web UI (presumably; usage is not visible here)
 * @param cores              number of cores given to this worker
 * @param memory             amount of memory given to this worker (unit not visible here -- TODO confirm)
 * @param masterRpcAddresses RPC addresses of the master(s) (presumably the ones to register with)
 * @param workDirPath        work directory path; defaults to null
 * @param conf               Spark configuration
 * @param securityMgr        security manager
 */
private[deploy] class Worker(
override val rpcEnv: RpcEnv,
webUiPort: Int,
cores: Int,
memory: Int,
masterRpcAddresses: Array[RpcAddress],
systemName: String,
endpointName: String,
workDirPath: String = null,
val conf: SparkConf,
val securityMgr: SecurityManager)
extends ThreadSafeRpcEndpoint with Logging
在启动的时候,Master会最先向Worker端发送一个LaunchDriver类型的消息。LaunchDriver是一个case class,它的参数包括Driver的Id和Driver的元数据信息DriverDescription。如下:
/**
 * Deploy message asking for a driver to be launched.
 *
 * @param driverId   id of the driver to launch
 * @param driverDesc description (metadata) of the driver to launch
 */
case class LaunchDriver(
    driverId: String,
    driverDesc: DriverDescription)
  extends DeployMessage
/** Metadata describing a driver to launch; carried as `driverDesc` by `LaunchDriver`. */
private[deploy] case class DriverDescription(
jarUrl: String, // URL of the user jar
mem: Int, // memory requested for the driver (unit not visible here -- TODO confirm)
cores: Int, // number of cores requested for the driver
supervise: Boolean, // not a priority: when true, runCommandWithRetry re-launches the driver after a non-zero exit
command: Command // command used to start the driver
) {
// Only the main class of the command is included in the string form.
override def toString: String = s"DriverDescription (${command.mainClass})"
(2) Worker接收LaunchDriver
private[deploy] class DriverRunner(
extends Logging {
/** Starts a thread to run and manage the driver. */
private[worker] def start() = {
new Thread("DriverRunner for " + driverId) {
override def run() {
try {
val driverDir = createWorkingDirectory()
val localJarFilename = downloadUserJar(driverDir)
def substituteVariables(argument: String): String = argument match {
case "{{WORKER_URL}}" => workerUrl
case "{{USER_JAR}}" => localJarFilename
case other => other
}
(4) 自己写的代码打成Jar包。
/**
* Download the user jar into the supplied directory and return its local path.
* Will throw an exception if there are errors downloading the jar.
*/
private def downloadUserJar(driverDir: File): String = {
val jarPath = new Path(driverDesc.jarUrl)
val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
val destPath = new File(driverDir.getAbsolutePath, jarPath.getName)
val jarFileName = jarPath.getName
val localJarFile = new File(driverDir, jarFileName)
val localJarFilename = localJarFile.getAbsolutePath
if (!localJarFile.exists()) { // May already exist if running multiple workers on one node
logInfo(s"Copying user jar $jarPath to $destPath")
useCache = false)
if (!localJarFile.exists()) { // Verify copy succeeded
throw new Exception(s"Did not see expected jar $jarFileName in $driverDir")
val builder = CommandUtils.buildProcessBuilder(driverDesc.command , securityManager,
driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables)
launchDriver(builder, driverDir, driverDesc.supervise)
private def launchDriver(builder: ProcessBuilder, baseDir: File, supervise: Boolean) {
def initialize(process: Process): Unit = {
// Redirect stdout and stderr to files
val stdout = new File(baseDir, "stdout")
CommandUtils.redirectStream(process.getInputStream, stdout)
val stderr = new File(baseDir, "stderr")
val formattedCommand = builder.command.asScala.mkString("\"", "\" \"", "\"")
val header = "Launch Command: %s\n%s\n\n".format(formattedCommand, "=" * 40)
Files.append(header, stderr, UTF_8)
CommandUtils.redirectStream(process.getErrorStream, stderr)
runCommandWithRetry(ProcessBuilderLike(builder), initialize, supervise)
private[deploy] object ProcessBuilderLike {
def apply(processBuilder: ProcessBuilder): ProcessBuilderLike = new
ProcessBuilderLike {
override def start(): Process = processBuilder.start() //启动builder
override def command: Seq[String] = processBuilder.command().asScala
// Needed because ProcessBuilder is a final class and cannot be mocked
private[deploy] trait ProcessBuilderLike { //根据具体的builder来启动不同的任务调度模式
def start(): Process
def command: Seq[String]
private def launchDriver(….){
runCommandWithRetry(ProcessBuilderLike(builder), initialize, supervise)
def runCommandWithRetry(
command: ProcessBuilderLike, initialize: Process => Unit, supervise: Boolean): Unit = {
// Time to wait between submission retries.
var waitSeconds = 1
// A run of this many seconds resets the exponential back-off.
val successfulRunDuration = 5
var keepTrying = !killed
while (keepTrying) {
logInfo("Launch Command: " + command.command.mkString("\"", "\" \"", "\""))
synchronized {
if (killed) { return }
process = Some(command.start())
val processStart = clock.getTimeMillis()
val exitCode = process.get.waitFor()
if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000) {
waitSeconds = 1
if (supervise && exitCode != 0 && !killed) {
logInfo(s"Command exited with status $exitCode, re-launching after $waitSeconds s.")
waitSeconds = waitSeconds * 2 // exponential back-off
keepTrying = supervise && exitCode != 0 && !killed
finalExitCode = Some(exitCode)
private[deploy] class DriverRunner(
extends Logging {
/** Starts a thread to run and manage the driver. */
private[worker] def start() = {
new Thread("DriverRunner for " + driverId) {
override def run() {
worker.send(DriverStateChanged(driverId, state, finalException))
private[deploy] class Worker(
extends ThreadSafeRpcEndpoint with Logging {
override def receive: PartialFunction[Any, Unit] = synchronized {
case driverStateChanged @ DriverStateChanged(driverId, state, exception) => {
/**
 * Handles a state-change notification for a driver tracked by this worker.
 *
 * Logs a message chosen by the new state, forwards the notification to the master, moves the
 * driver from `drivers` to `finishedDrivers`, and subtracts the driver's memory and cores from
 * the worker's usage counters.
 *
 * @param driverStateChanged notification carrying the driver id, the new state and the
 *                           exception field (read with `.get` only in the ERROR case)
 */
private[worker] def handleDriverStateChanged(driverStateChanged: DriverStateChanged): Unit = {
  val state = driverStateChanged.state
  val driverId = driverStateChanged.driverId
  val exception = driverStateChanged.exception

  state match {
    case DriverState.ERROR =>
      logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
    case DriverState.FAILED =>
      logWarning(s"Driver $driverId exited with failure")
    case DriverState.FINISHED =>
      logInfo(s"Driver $driverId exited successfully")
    case DriverState.KILLED =>
      logInfo(s"Driver $driverId was killed by user")
    case _ =>
      logDebug(s"Driver $driverId changed state to $state")
  }

  // Notify the master that the driver's state has changed.
  sendToMaster(driverStateChanged)

  // The driver is expected to be tracked; `.get` fails loudly if it is not.
  val finishedDriver = drivers.remove(driverId).get
  finishedDrivers.update(driverId, finishedDriver)
  trimFinishedDriversIfNecessary()

  // Release the resources that were accounted to this driver.
  val description = finishedDriver.driverDesc
  memoryUsed -= description.mem
  coresUsed -= description.cores
}
(1) Master发送LaunchExecutor过程
launchExecutor(worker, exec) ->
exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory))

case class LaunchExecutor(
    masterUrl: String, //Master的URL
    appId: String, //应用程序的ID
    execId: Int, //Executor的ID
    appDesc: ApplicationDescription, //应用程序的元信息
    cores: Int, //每个Executor中的Cores
    memory: Int) //内存信息
  extends DeployMessage
同样LaunchExecutor也是一个case class,其中ApplicationDescription保存了应用程序的元信息。
private[deploy] class Worker( ……) extends ThreadSafeRpcEndpoint with Logging {
  ……
  override def receive: PartialFunction[Any, Unit] = synchronized {
    ……
    case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
      if (masterUrl != activeMasterUrl) {
        logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
      } else {
        try {
          logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

          // Create the executor's working directory
          val executorDir = new File(workDir, appId + "/" + execId)
          if (!executorDir.mkdirs()) {
            throw new IOException("Failed to create directory " + executorDir)
          }

          // Create local dirs for the executor. These are passed to the executor via the
          // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the
          // application finishes.
          val appLocalDirs = appDirectories.get(appId).getOrElse {
            Utils.getOrCreateLocalRootDirs(conf).map { dir =>
              val appDir = Utils.createDirectory(dir, namePrefix = "executor")
              Utils.chmod700(appDir)
              appDir.getAbsolutePath()
            }.toSeq
          }
          appDirectories(appId) = appLocalDirs
          //创建ExecutorRunner
          val manager = new ExecutorRunner(
            appId,
            execId,
            appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
            cores_,
            memory_,
            self,
            workerId,
            host,
            webUi.boundPort,
            publicAddress,
            sparkHome,
            executorDir,
            workerUri,
            conf,
            appLocalDirs, ExecutorState.RUNNING)
          executors(appId + "/" + execId) = manager
          //启动ExecutorRunner
          manager.start()
          coresUsed += cores_
          memoryUsed += memory_
          sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None))
        } catch {
          case e: Exception => {
            logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
            if (executors.contains(appId + "/" + execId)) {
              executors(appId + "/" + execId).kill()
              executors -= appId + "/" + execId
            }
            sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
              Some(e.toString), None))
          }
        }
      }
    ……
  }
private[deploy] class Worker( ……) extends ThreadSafeRpcEndpoint with Logging {
  ……
  override def receive: PartialFunction[Any, Unit] = synchronized {
    ……
    //收到ExecutorStateChanged消息
    case executorStateChanged @ ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
      handleExecutorStateChanged(executorStateChanged)
  }
}
/**
 * Handles a state-change notification for an executor.
 *
 * The notification is always forwarded to the master. When the new state is a finished one,
 * the executor (if it is tracked in `executors`) is moved to `finishedExecutors`, its cores and
 * memory are subtracted from the usage counters, and application cleanup is attempted.
 *
 * @param executorStateChanged notification carrying the application id, executor id, new state,
 *                             optional message and optional exit status
 */
private[worker] def handleExecutorStateChanged(executorStateChanged: ExecutorStateChanged): Unit = {
  // Send the notification on to the master.
  sendToMaster(executorStateChanged)

  val state = executorStateChanged.state
  if (ExecutorState.isFinished(state)) {
    val appId = executorStateChanged.appId
    val fullId = appId + "/" + executorStateChanged.execId
    val message = executorStateChanged.message
    val exitStatus = executorStateChanged.exitStatus

    // Common tail of both log lines; a def so it is only built when a log call evaluates it.
    def outcome: String =
      " finished with state " + state +
        message.map(" message " + _).getOrElse("") +
        exitStatus.map(" exitStatus " + _).getOrElse("")

    executors.get(fullId) match {
      case None =>
        logInfo("Unknown Executor " + fullId + outcome)
      case Some(executor) =>
        logInfo("Executor " + fullId + outcome)
        executors -= fullId
        finishedExecutors(fullId) = executor
        trimFinishedExecutorsIfNecessary()
        coresUsed -= executor.cores
        memoryUsed -= executor.memory
    }

    maybeCleanupApplication(appId)
  }
}
private[deploy] class Master( ……) extends ThreadSafeRpcEndpoint with Logging with LeaderElectable {
  ……
  override def receive: PartialFunction[Any, Unit] = {
    ……
    //Master接收到Executor消息
    case ExecutorStateChanged(appId, execId, state, message, exitStatus) => {
      val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
      execOption match {
        case Some(exec) => {
          val appInfo = idToApp(appId)
          val oldState = exec.state
          exec.state = state
          if (state == ExecutorState.RUNNING) {
            assert(oldState == ExecutorState.LAUNCHING,
              s"executor $execId state transfer from $oldState to RUNNING is illegal")
            appInfo.resetRetryCount()
          }
          //给Driver发送消息告诉Driver,Executor状态发生改变了。
          exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus))
          ……
        }
        case None =>
          logWarning(s"Got status update for unknown executor $appId/$execId")
      }
    }