SparkContext 启动 Executor 源码解读

最新推荐文章于 2022-06-19 10:20:48 发布
MC_DAChun
最新推荐文章于 2022-06-19 10:20:48 发布
阅读量254
点赞数 1
分类专栏： Spark 文章标签： spark 大数据 scala
本文链接：https://blog.csdn.net/MC_DAChun/article/details/104304452
版权
Spark 专栏收录该内容
1 篇文章 0 订阅
订阅专栏
近来学习 Spark 执行流程，追踪并解读 Spark 启动 Executor 的源码。Scala 版本为 2.12.10，Spark 版本为 spark-3.0.0-preview2-bin-hadoop2.7。
SparkContext.scala   
// start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
// constructor
558 row
_taskScheduler.start()

TaskScheduler 的实现类为 TaskSchedulerImpl，跳转到 TaskSchedulerImpl.scala
196 row 
backend.start()
backend对象的实现类为 StandaloneSchedulerBackend
跳转到StandaloneSchedulerBackend.scala
63 row 
super.start()
//启动 SchedulerBackend 中的 DriverEndpoint（即 Driver 端的通信端点）
CoarseGrainedSchedulerBackend.scala 423 row 
  /** Creates and returns a new DriverEndpoint instance. */
  protected def createDriverEndpoint(): DriverEndpoint = new DriverEndpoint()
  //接下来启动 SchedulerBackend 中的 ClientActor（即下面创建的 StandaloneAppClient）
118 row -125 row 
val appDesc = ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
webUrl, sc.eventLogDir, sc.eventLogCodec, coresPerExecutor, initialExecutorLimit,
resourceReqsPerExecutor = executorResourceReqs)

转到 121 row
client = new StandaloneAppClient(sc.env.rpcEnv, masters, appDesc, this, conf)

StandaloneAppClient.scala
转到 84 row
/**
 * Start-up hook of this endpoint: begins registration with the master.
 * If registration throws, a warning is logged, the client is marked as
 * disconnected and then stopped.
 */
override def onStart(): Unit = {
  try {
	// Begin registering with the master; 1 is presumably the attempt number — confirm
	// against registerWithMaster
    registerWithMaster(1)
  } catch {
    case e: Exception =>
      logWarning("Failed to connect to master", e)
      markDisconnected()
      stop()
  }
}

client.start()

转到	141 row 
 registerMasterFutures.set(tryRegisterAllMasters())
 
RpcEndpoint.scala 执行流程
 * {@code constructor -> onStart -> receive* -> onStop}
 
 
转到 107 行，开始真正注册。
其中发送的 appDescription 对象，就是 Master 端接收到的 description 对象。
  masterRef.send(RegisterApplication(appDescription, self))
  
  
  
  转到Master.scala 272
  //该description对象来自StandaloneAppClient 发送的appDescription
   case RegisterApplication(description, driver) =>
  // TODO Prevent repeated registrations from some driver
  if (state == RecoveryState.STANDBY) {
    // ignore, don't send response
  } else {
    logInfo("Registering app " + description.name)
    val app = createApplication(description, driver)
	
    registerApplication(app)
	
    logInfo("Registered app " + description.name + " with ID " + app.id)
    persistenceEngine.addApplication(app)
	
	//注册完成后，向 client（driver）回复 RegisteredApplication 消息，告知注册成功
    driver.send(RegisteredApplication(app.id, self))
	
	//schedule() 的作用是调度资源，然后在 Worker 中启动 Executor
	//（跳转到 777 行）
    schedule()
  }
  
  
  
	跳转到 907 row 
	/**
	 * Records an application in the master's bookkeeping collections and queues it for
	 * scheduling. A repeated registration from an already-known driver address is logged
	 * and ignored.
	 *
	 * @param app the application to register
	 */
	private def registerApplication(app: ApplicationInfo): Unit = {
		val appAddress = app.driver.address
		// Bail out early if an application from the same driver address is already registered
		if (addressToApp.contains(appAddress)) {
		  logInfo("Attempted to re-register application at same address: " + appAddress)
		  return
		}

		applicationMetricsSystem.registerSource(app.appSource)
		// Index the application by id, by driver endpoint and by driver address
		apps += app
		idToApp(app.id) = app
		endpointToApp(app.driver) = app
		addressToApp(appAddress) = app
		// Append the registered application to the waitingApps ArrayBuffer, where it waits to be
		// scheduled in arrival (FIFO) order
		waitingApps += app
	  }
	  
	// Applications waiting to be scheduled: appended to by registerApplication and iterated in
	// order by startExecutorsOnWorkers
	private val waitingApps = new ArrayBuffer[ApplicationInfo]
	
	
	  跳转 777行		
	 /**
	   * Schedule the currently available resources among waiting apps. This method will be called
	   * every time a new app joins or resource availability changes.
	   *
	   * Does nothing unless the master state is ALIVE. Waiting drivers are placed on workers
	   * first; afterwards startExecutorsOnWorkers() is called to place executors.
	   */
	  private def schedule(): Unit = {
		if (state != RecoveryState.ALIVE) {
		  return
		}
		// Drivers take strict precedence over executors
		// Shuffle the alive workers so drivers are not always placed on the same ones
		val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
		val numWorkersAlive = shuffledAliveWorkers.size
		// Round-robin cursor into shuffledAliveWorkers, shared across all waiting drivers
		var curPos = 0
		for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
		  // We assign workers to each waiting driver in a round-robin fashion. For each driver, we
		  // start from the last worker that was assigned a driver, and continue onwards until we have
		  // explored all alive workers.
		  var launched = false
		  var isClusterIdle = true
		  var numWorkersVisited = 0
		  while (numWorkersVisited < numWorkersAlive && !launched) {
			val worker = shuffledAliveWorkers(curPos)
			// Overwritten on every iteration, so it reflects only the most recently visited worker
			isClusterIdle = worker.drivers.isEmpty && worker.executors.isEmpty
			numWorkersVisited += 1
			if (canLaunchDriver(worker, driver.desc)) {
			  val allocated = worker.acquireResources(driver.desc.resourceReqs)
			  driver.withResources(allocated)
			  
			  // Launch the driver on this worker (executors are started later, by
			  // startExecutorsOnWorkers() below)
			  launchDriver(worker, driver)
			  waitingDrivers -= driver
			  launched = true
			}
			curPos = (curPos + 1) % numWorkersAlive
		  }
		  // No worker could host the driver even though the last visited worker was idle
		  if (!launched && isClusterIdle) {
			logWarning(s"Driver ${driver.id} requires more resource than any of Workers could have.")
		  }
		}
		startExecutorsOnWorkers()
	  }
	  
	  
	  
	  
	   696行
	   /**
	   * Schedule and launch executors on workers.
	   *
	   * Walks waitingApps in order. For every app that still needs at least coresPerExecutor
	   * cores, it picks the alive workers that can host an executor (most free cores first),
	   * decides how many cores each worker contributes, and then allocates those cores to
	   * executors on each contributing worker.
	   */
	  private def startExecutorsOnWorkers(): Unit = {
		// Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
		// in the queue, then the second app, etc.
		for (app <- waitingApps) {
		  // Default to 1 core per executor when the app did not specify a value
		  val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
		  // If the cores left is less than the coresPerExecutor,the cores left will not be allocated
		  if (app.coresLeft >= coresPerExecutor) {
			// Filter out workers that don't have enough resources to launch an executor
			val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
			  .filter(canLaunchExecutor(_, app.desc))
			  .sortBy(_.coresFree).reverse
			if (waitingApps.length == 1 && usableWorkers.isEmpty) {
			  logWarning(s"App ${app.id} requires more resource than any of Workers could have.")
			}
			// assignedCores(pos) is the number of cores granted on usableWorkers(pos)
			val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)

			// Now that we've decided how many cores to allocate on each worker, let's allocate them
			for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
			
			  // Allocate this worker's resources to executors and launch them
			  allocateWorkerResourceToExecutors(
				app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos))
			}
		  }
		}
	  }
	  
	  
		721 行
	    /**
		   * Allocate a worker's resources to one or more executors.
		   * @param app the info of the application which the executors belong to
		   * @param assignedCores number of cores on this worker for this application
		   * @param coresPerExecutor number of cores per executor
		   * @param worker the worker info
		   */
		  private def allocateWorkerResourceToExecutors(
			  app: ApplicationInfo,
			  assignedCores: Int,
			  coresPerExecutor: Option[Int],
			  worker: WorkerInfo): Unit = {
			// If the number of cores per executor is specified, we divide the cores assigned
			// to this worker evenly among the executors with no remainder.
			// Otherwise, we launch a single executor that grabs all the assignedCores on this worker.
			val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
			val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
			for (i <- 1 to numExecutors) {
			  val allocated = worker.acquireResources(app.desc.resourceReqsPerExecutor)
			  val exec = app.addExecutor(worker, coresToAssign, allocated)
			  
			  // Launch the executor on the worker
			  launchExecutor(worker, exec)
			  // The application is marked RUNNING as soon as an executor has been launched for it
			  app.state = ApplicationState.RUNNING
			}
		  }
		  
		  816行
		    /**
		     * Records the executor on the worker, asks the worker to launch it, and tells the
		     * application's driver that an executor was added.
		     *
		     * @param worker the worker that will host the executor
		     * @param exec   description of the executor to launch
		     */
		    private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
				logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
				
				worker.addExecutor(exec)
				// Send a LaunchExecutor message to the worker endpoint (handled in Worker.scala)
				// so that the worker starts the executor
				worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id,
				  exec.application.desc, exec.cores, exec.memory, exec.resources))
				  
				// Notify the application's driver about the newly added executor
				exec.application.driver.send(
				  ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory))
			  }
			  
		Worker.scala  549 行  
		// Worker 中真正启动 Executor 的逻辑代码
		 case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_, resources_) =>
			  if (masterUrl != activeMasterUrl) {
				logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
			  } else {
				try {
				  logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

				  // Create the executor's working directory
				  val executorDir = new File(workDir, appId + "/" + execId)
				  if (!executorDir.mkdirs()) {
					throw new IOException("Failed to create directory " + executorDir)
				  }

				  // Create local dirs for the executor. These are passed to the executor via the
				  // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the
				  // application finishes.
				  val appLocalDirs = appDirectories.getOrElse(appId, {
					val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
					val dirs = localRootDirs.flatMap { dir =>
					  try {
						val appDir = Utils.createDirectory(dir, namePrefix = "executor")
						Utils.chmod700(appDir)
						Some(appDir.getAbsolutePath())
					  } catch {
						case e: IOException =>
						  logWarning(s"${e.getMessage}. Ignoring this directory.")
						  None
					  }
					}.toSeq
					if (dirs.isEmpty) {
					  throw new IOException("No subfolder can be created in " +
						s"${localRootDirs.mkString(",")}.")
					}
					dirs
				  })
				  appDirectories(appId) = appLocalDirs
				  
				  //Master 发送 LaunchExecutor 消息给 Worker 以启动 Executor，Executor 最终由 ExecutorRunner 负责创建
				  val manager = new ExecutorRunner(
					appId,
					execId,
					appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
					cores_,
					memory_,
					self,
					workerId,
					webUi.scheme,
					host,
					webUi.boundPort,
					publicAddress,
					sparkHome,
					executorDir,
					workerUri,
					conf,
					appLocalDirs,
					ExecutorState.LAUNCHING,
					resources_)
				  executors(appId + "/" + execId) = manager
				  
				  //调用 manager.start()，这才是真正启动 Executor 的方法
				  manager.start()
				  coresUsed += cores_
				  memoryUsed += memory_
				  addResourcesUsed(resources_)
				} catch {
				  case e: Exception =>
					logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
					if (executors.contains(appId + "/" + execId)) {
					  executors(appId + "/" + execId).kill()
					  executors -= appId + "/" + execId
					}
					
					//启动失败时（进入 catch 分支）通知 Master：该 Executor 的状态为 FAILED
					sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
					  Some(e.toString), None))
				}
		}	

		ExecutorRunner.scala
		   /**
		    * Starts a dedicated thread that runs fetchAndRunExecutor(), then registers a shutdown
		    * hook that kills the executor process.
		    */
		   private[worker] def start(): Unit = {
		   // Create the worker thread; its run() simply calls fetchAndRunExecutor()
			workerThread = new Thread("ExecutorRunner for " + fullId) {
			
			  override def run(): Unit = { fetchAndRunExecutor() }
			  
			}
			workerThread.start()
			// Shutdown hook that kills actors on shutdown.
			shutdownHook = ShutdownHookManager.addShutdownHook { () =>
			  // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
			  // be `ExecutorState.LAUNCHING`. In this case, we should set `state` to `FAILED`.
			  if (state == ExecutorState.LAUNCHING) {
				state = ExecutorState.FAILED
			  }
			  killProcess(Some("Worker shutting down")) }
		  }
		  
		  148行
		   /**
		   * Download and run the executor described in our ApplicationDescription
		   *
		   * Builds the launch command, starts the process, redirects its stdout/stderr to files in
		   * executorDir, reports RUNNING to the worker, blocks until the process exits and then
		   * reports EXITED together with the exit code. On interruption the state becomes KILLED,
		   * on any other exception FAILED; in both cases killProcess is called.
		   */
		  private def fetchAndRunExecutor(): Unit = {
			try {
			  val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)
			  // Launch the process
			  // Append "--resourcesFile <path>" only when a resources file was prepared
			  val arguments = appDesc.command.arguments ++ resourceFileOpt.map(f =>
				Seq("--resourcesFile", f.getAbsolutePath)).getOrElse(Seq.empty)
			  // Substitute the app id and executor id into each Java option
			  val subsOpts = appDesc.command.javaOpts.map {
				Utils.substituteAppNExecIds(_, appId, execId.toString)
			  }
			  val subsCommand = appDesc.command.copy(arguments = arguments, javaOpts = subsOpts)
			  val builder = CommandUtils.buildProcessBuilder(subsCommand, new SecurityManager(conf),
				memory, sparkHome.getAbsolutePath, substituteVariables)
			  val command = builder.command()
			  // Only the redacted form of the command line is logged and written to the stderr header
			  val redactedCommand = Utils.redactCommandLineArgs(conf, command.asScala)
				.mkString("\"", "\" \"", "\"")
			  logInfo(s"Launch command: $redactedCommand")

			  builder.directory(executorDir)
			  builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
			  // In case we are running this from within the Spark Shell, avoid creating a "scala"
			  // parent process for the executor command
			  builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

			  // Add webUI log urls
			  val baseUrl =
				if (conf.get(UI_REVERSE_PROXY)) {
				  s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
				} else {
				  s"$webUiScheme$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
				}
			  builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
			  builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")
				// Start the executor process
			  process = builder.start()
			  val header = "Spark Executor Command: %s\n%s\n\n".format(
				redactedCommand, "=" * 40)

			  // Redirect its stdout and stderr to files
			  val stdout = new File(executorDir, "stdout")
			  stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

			  val stderr = new File(executorDir, "stderr")
			  // The stderr file starts with the header describing the launch command
			  Files.write(header, stderr, StandardCharsets.UTF_8)
			  stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

			  state = ExecutorState.RUNNING
			  worker.send(ExecutorStateChanged(appId, execId, state, None, None))
			  // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
			  // or with nonzero exit code
			  val exitCode = process.waitFor()
			  state = ExecutorState.EXITED
			  val message = "Command exited with code " + exitCode
			  worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
			} catch {
			  case interrupted: InterruptedException =>
				logInfo("Runner thread for executor " + fullId + " interrupted")
				state = ExecutorState.KILLED
				killProcess(None)
			  case e: Exception =>
				logError("Error running executor", e)
				state = ExecutorState.FAILED
				killProcess(Some(e.toString))
			}
		  }
		  
		  
		  CommandUtils.scala
		    /**
		   * Build a ProcessBuilder based on the given parameters.
		   * The `env` argument is exposed for testing.
		   *
		   * The command is first turned into a local command, then into the full argument
		   * sequence; the local command's environment entries are copied into the builder.
		   *
		   * @return a ProcessBuilder for the command sequence, not yet started
		   */
		  def buildProcessBuilder(
			  command: Command,
			  securityMgr: SecurityManager,
			  memory: Int,
			  sparkHome: String,
			  substituteArguments: String => String,
			  classPaths: Seq[String] = Seq.empty,
			  env: Map[String, String] = sys.env): ProcessBuilder = {
			val localCommand = buildLocalCommand(
			  command, securityMgr, substituteArguments, classPaths, env)
			val commandSeq = buildCommandSeq(localCommand, memory, sparkHome)
			val builder = new ProcessBuilder(commandSeq: _*)
			val environment = builder.environment()
			// Copy every environment entry of the local command into the process environment
			for ((key, value) <- localCommand.environment) {
			  environment.put(key, value)
			}
			builder
		  }
				60行
		    /**
		     * Assembles the full command line: the command produced by WorkerCommandBuilder,
		     * followed by the main class and then the command's arguments.
		     */
		    private def buildCommandSeq(command: Command, memory: Int, sparkHome: String): Seq[String] = {
				// SPARK-698: do not call the run.cmd script, as process.destroy()
				// fails to kill a process tree on Windows
				// Delegate to WorkerCommandBuilder's no-argument buildCommand()
				val cmd = new WorkerCommandBuilder(sparkHome, memory, command).buildCommand()
				cmd.asScala ++ Seq(command.mainClass) ++ command.arguments
			  }
			  
		  /**
			 * This class is used by CommandUtils. It uses some package-private APIs in SparkLauncher, and since
			 * Java doesn't have a feature similar to `private[spark]`, and we don't want that class to be
			 * public, needs to live in the same package as the rest of the library.
			 *
			 * @param sparkHome value stored in the child environment under ENV_SPARK_HOME
			 * @param memoryMb  heap size in megabytes, emitted as the -Xmx option
			 * @param command   supplies the environment, class path entries and Java options
			 */
			private[spark] class WorkerCommandBuilder(sparkHome: String, memoryMb: Int, command: Command)
				extends AbstractCommandBuilder {

			  // Seed the child environment with the command's own environment plus the Spark home
			  childEnv.putAll(command.environment.asJava)
			  childEnv.put(CommandBuilderUtils.ENV_SPARK_HOME, sparkHome)

			  /** Builds the Java command: class path, -Xmx setting, then the command's Java options. */
			  override def buildCommand(env: JMap[String, String]): JList[String] = {
				val cmd = buildJavaCommand(command.classPathEntries.mkString(File.pathSeparator))
				cmd.add(s"-Xmx${memoryMb}M")
				command.javaOpts.foreach(cmd.add)
				cmd
			  }

			  /** Convenience overload that calls buildCommand with an empty environment map. */
			  def buildCommand(): JList[String] = buildCommand(new JHashMap[String, String]())

			}
			
			至此，Executor 启动流程结束。
MC_DAChun
关注
1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
SparkContext启动Executer 源码解读

SparkContext.scala // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's// constructor558 row_taskScheduler.start()跳转到TaskSchedulerImpl.scala196 row backen...
复制链接

扫一扫