Continuing from the previous post (spark-core_24: AppClient's ClientEndpoint registers RegisterApplication).
As mentioned there, the master calls launchExecutor() { worker.endpoint.send(LaunchExecutor(masterUrl,
exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory)) } to make the worker launch CoarseGrainedExecutorBackend.
19. The Worker uses the JDK's ProcessBuilder.start() to launch the CoarseGrainedExecutorBackend process
override def receive: PartialFunction[Any, Unit] = synchronized {
...
/**
* appDesc contains the command info, including the launch class CoarseGrainedExecutorBackend.
* It was put there during SparkContext initialization: after TaskSchedulerImpl.start(), the AppClient's RpcEndpoint in SparkDeploySchedulerBackend called registerApplication;
* the master then went startExecutorsOnWorkers => allocateWorkerResourceToExecutors => launchExecutor(worker, exec) => worker.endpoint.send(LaunchExecutor(masterUrl, ...))
*
* masterUrl: spark://luyl152:7077
* appId: app-20180404172558-0000
* execId: an auto-incremented number, starting from 0 by default
* cores_: the value of --executor-cores or SparkConf's "spark.executor.cores"; if unset, only one CoarseGrainedExecutorBackend is launched and it gets all of the worker's available cores
* memory_: corresponds to sc.executorMemory, 1024 MB by default
*/
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
/** On LaunchExecutor the Worker creates an ExecutorRunner and calls its start() method, which
* in turn calls fetchAndRunExecutor(). That method contains the following code:
*   val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf), memory, sparkHome.getAbsolutePath, substituteVariables)
*   process = builder.start()
*/
if (masterUrl != activeMasterUrl) {
logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
} else {
try {
logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))
// Create the executor's working directory for CoarseGrainedExecutorBackend.
// workDir is initialized from SPARK_WORKER_DIR in WorkerArguments; if that variable is not set,
// a "work" directory is created under SPARK_HOME when the worker starts.
// executorDir: /data/spark-1.6.0-bin-hadoop2.6/work/app-20180508234845-0000/4
val executorDir = new File(workDir, appId + "/" + execId)
if (!executorDir.mkdirs()) {
throw new IOException("Failed to create directory " + executorDir)
}
// Create local dirs for the executor. These are passed to the executor via the
// SPARK_EXECUTOR_DIRS environment variable (initialized in WorkerArguments), and deleted
// by the Worker when the application finishes.
// appDirectories: HashMap[String, Seq[String]]; on the first LaunchExecutor for an app there is no entry yet
// appLocalDirs ends up like: Seq("/tmp/spark-b7c124be-813a-4c06-8f8e-1e04fd2b5056/executor-ed6c2e1e-c448-4883-8f34-5efdde76521b")
val appLocalDirs = appDirectories.get(appId).getOrElse {
// getOrCreateLocalRootDirs() returns e.g.: Array(/tmp/spark-e72251ed-96b6-4fe6-b704-1772b5fc5a8b)
Utils.getOrCreateLocalRootDirs(conf).map { dir =>
// returns e.g.: /tmp/spark-e72251ed-96b6-4fe6-b704-1772b5fc5a8b/executor-7ab80469-4222-40c9-87cf-a6f2f00e30c6
val appDir = Utils.createDirectory(dir, namePrefix = "executor")
Utils.chmod700(appDir)
appDir.getAbsolutePath()
}.toSeq
}
// appDirectories: HashMap["app-20180404172558-0000", Seq("/tmp/spark-e72251ed-96b6-4fe6-b704-1772b5fc5a8b/executor-7ab80469-4222-40c9-87cf-a6f2f00e30c6")]
appDirectories(appId) = appLocalDirs
// ExecutorRunner is what actually launches the CoarseGrainedExecutorBackend process on the Worker node,
// using the JDK's ProcessBuilder.start()
val manager = new ExecutorRunner(
appId, //app-20180404172558-0000
execId,
appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
cores_,
memory_,
self,
workerId, // worker-20180321165947-luyl153-<RpcAddress.port>
host, // the worker's host
webUi.boundPort, // the worker's WebUI port is 8081; the master's is 8080
publicAddress, // the current worker's hostname
sparkHome,
executorDir, // /data/spark-1.6.0-bin-hadoop2.6/work/app-20180508234845-0000/4
workerUri, // spark://sparkWorker@luyl153:<RpcAddress.port>
conf,
appLocalDirs, ExecutorState.RUNNING)
//executors: HashMap[String, ExecutorRunner]
executors(appId + "/" + execId) = manager
manager.start()
coresUsed += cores_
memoryUsed += memory_
sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None))
} catch {
case e: Exception => {
logError(s"Failed to launch executor $appId/$execId for${appDesc.name}.", e)
if (executors.contains(appId + "/" + execId)) {
executors(appId + "/" + execId).kill()
executors -= appId + "/" + execId
}
sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
Some(e.toString), None))
}
}
}
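Before diving into ExecutorRunner, it helps to see the underlying JDK mechanism in isolation. Below is a minimal, self-contained sketch (not Spark code; the child main class, paths, and env var are made up) of how one JVM launches another via java.lang.ProcessBuilder, the same API the Worker relies on:

import java.io.File

object LaunchChildJvm {
  def main(args: Array[String]): Unit = {
    // Assemble "java -cp <classpath> <mainClass> <args...>", the same shape of command
    // that CommandUtils.buildProcessBuilder produces for CoarseGrainedExecutorBackend.
    val javaBin   = new File(new File(System.getProperty("java.home"), "bin"), "java").getAbsolutePath
    val classpath = System.getProperty("java.class.path")
    val command   = Seq(javaBin, "-cp", classpath, "-Xmx64m",
      "some.hypothetical.MainClass", "--flag", "value") // hypothetical child main class

    val builder = new ProcessBuilder(command: _*)
    builder.directory(new File("/tmp"))            // working directory, like executorDir
    builder.environment().put("SOME_ENV", "value") // extra env vars, like SPARK_EXECUTOR_DIRS
    builder.redirectErrorStream(true)              // merge stderr into stdout for simplicity

    val process  = builder.start()                 // fork the child JVM
    val exitCode = process.waitFor()               // block until the child exits
    println(s"child exited with code $exitCode")
  }
}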
20. Now for the most interesting code: Spark uses ProcessBuilder to launch CoarseGrainedExecutorBackend
private[deploy] class ExecutorRunner(
val appId: String, // app-20180404172558-0000
val execId: Int,
val appDesc: ApplicationDescription,
val cores: Int,
val memory: Int, // corresponds to sc.executorMemory, 1024 MB by default
val worker: RpcEndpointRef,
val workerId: String,
val host: String,
val webUiPort: Int, // the worker's WebUI port is 8081; the master's is 8080
val publicAddress: String, // the current worker's hostname
val sparkHome: File,
val executorDir: File, // $SPARK_HOME/work/<appId>/<execId>
val workerUrl: String, // spark://sparkWorker@luyl153:<RpcAddress.port>
conf: SparkConf,
val appLocalDirs: Seq[String], // Seq("/tmp/spark-e72251ed-96b6-4fe6-b704-1772b5fc5a8b/executor-7ab80469-4222-40c9-87cf-a6f2f00e30c6")
@volatile var state: ExecutorState.Value)
extends Logging {
private val fullId = appId + "/" + execId
private var workerThread: Thread = null
private var process: Process = null
private var stdoutAppender: FileAppender = null
private var stderrAppender: FileAppender = null
// NOTE: This is now redundant with the automated shut-down enforced by the Executor. It might make sense to remove this in the future.
private var shutdownHook: AnyRef = null
private[worker] def start() {
workerThread = new Thread("ExecutorRunner for " + fullId) {
override def run() { fetchAndRunExecutor() }
}
workerThread.start()
// Shutdown hook that kills actors on shutdown.
// This is the JDK's exit hook for the main class; analyzed later, it is well worth knowing about
shutdownHook = ShutdownHookManager.addShutdownHook { () =>
// It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
// be `ExecutorState.RUNNING`. In this case, we should set `state` to `FAILED`.
if (state == ExecutorState.RUNNING) {
state = ExecutorState.FAILED
}
killProcess(Some("Worker shutting down")) }
}
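Spark's ShutdownHookManager.addShutdownHook is essentially a prioritized wrapper around the plain JDK shutdown-hook facility. A minimal sketch of the underlying JDK mechanism (not the Spark wrapper itself):

object ShutdownHookDemo {
  def main(args: Array[String]): Unit = {
    // The hook thread runs when the JVM exits: on normal return from main, on System.exit,
    // or on SIGTERM/SIGINT. This is how the Worker gets a chance to kill its child
    // executor processes on shutdown.
    Runtime.getRuntime.addShutdownHook(new Thread("cleanup-hook") {
      override def run(): Unit = println("JVM shutting down; kill child processes here")
    })
    println("main finished; the hook fires next")
  }
}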
===》Look at the fetchAndRunExecutor method
/**
* Download and run the executor described in our ApplicationDescription.
* Uses the JDK's ProcessBuilder.start() to launch CoarseGrainedExecutorBackend.
* https://blog.csdn.net/u013256816/article/details/54603910
*/
private def fetchAndRunExecutor() {
try {
// Launch the process; this returns a JDK ProcessBuilder
val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf), memory, sparkHome.getAbsolutePath, substituteVariables)
...}
===》First, see how CommandUtils.buildProcessBuilder puts the java command that runs the main class into a ProcessBuilder
object CommandUtils extends Logging {
/**
* Build a ProcessBuilder based on the given parameters.
* The `env` argument is exposed for testing.
*
* command: Command(org.apache.spark.executor.CoarseGrainedExecutorBackend,
*   List(--driver-url, spark://CoarseGrainedScheduler@192.168.1.152:49972,
*        --executor-id, {{EXECUTOR_ID}},
*        --hostname, {{HOSTNAME}},
*        --cores, {{CORES}}, --app-id, {{APP_ID}}, --worker-url, {{WORKER_URL}}),
*   Map(SPARK_USER -> root, SPARK_EXECUTOR_MEMORY -> 1024m),
*   List(), List(), ArraySeq(-Dspark.driver.port=49972, -XX:+PrintGCDetails, -Dkey=value, -Dnumbers=one two three))
*/
def buildProcessBuilder(
command: Command,
securityMgr: SecurityManager,
memory: Int, // 1024 MB
sparkHome: String,
substituteArguments: String => String, // replaces placeholder arguments such as {{EXECUTOR_ID}} and {{CORES}} with concrete values
classPaths: Seq[String] = Seq[String](),
env: Map[String, String] = sys.env): ProcessBuilder = {
val localCommand = buildLocalCommand(
command, securityMgr, substituteArguments, classPaths, env)
val commandSeq = buildCommandSeq(localCommand, memory, sparkHome)
/** The following command is handed to the ProcessBuilder constructor; as you can see it is
* simply a "java -cp <jars> <mainClass> <args>" launch command:
* "/usr/local/java/jdk1.8.0_91/bin/java" "-cp" "/data/spark-1.6.0-bin-hadoop2.6/conf/:/data/spark-1.6.0-bin-hadoop2.6/lib/spark-assembly-1.6.0-hadoop2.6.0.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/data/hadoop-2.6.5/etc/hadoop/" "-Xms1024M" "-Xmx1024M" "-Dspark.driver.port=47218" "-XX:+PrintGCDetails" "-Dkey=value" "-Dnumbers=one two three" "org.apache.spark.executor.CoarseGrainedExecutorBackend"
* "--driver-url" "spark://CoarseGrainedScheduler@192.168.1.152:47218" "--executor-id" "0" "--hostname" "192.168.1.153" "--cores" "4" "--app-id" "app-20180503193934-0000" "--worker-url" "spark://Worker@192.168.1.153:44713"
*/
val builder = new ProcessBuilder(commandSeq: _*)
// environment() returns the process builder's environment as a mutable Map, so environment variables can be modified here
val environment = builder.environment()
for ((key, value) <- localCommand.environment) {
environment.put(key, value)
}
builder
}
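The substituteArguments function passed in above replaces the {{...}} placeholders with concrete per-executor values. In ExecutorRunner this is a straightforward pattern match; the sketch below reproduces that shape from memory (treat the exact cases as illustrative, and note the currying, which yields the String => String the parameter expects):

// Sketch of the placeholder substitution done by ExecutorRunner.substituteVariables:
// map each {{...}} placeholder argument to its concrete per-executor value; anything
// that is not a placeholder passes through unchanged.
def substituteVariables(execId: Int, host: String, cores: Int,
                        appId: String, workerUrl: String)(argument: String): String =
  argument match {
    case "{{WORKER_URL}}"  => workerUrl
    case "{{EXECUTOR_ID}}" => execId.toString
    case "{{HOSTNAME}}"    => host
    case "{{CORES}}"       => cores.toString
    case "{{APP_ID}}"      => appId
    case other             => other
  }

// List("--executor-id", "{{EXECUTOR_ID}}", "--cores", "{{CORES}}")
//   .map(substituteVariables(0, "192.168.1.153", 4, "app-20180503193934-0000",
//                            "spark://Worker@192.168.1.153:44713"))
// => List("--executor-id", "0", "--cores", "4")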
===》With the ProcessBuilder in hand, return to fetchAndRunExecutor() and continue
private def fetchAndRunExecutor() {
try {
...
// Returns this process builder's operating system program and arguments
val command = builder.command()
val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"")
/**
* 18/05/03 19:44:17 INFO worker.ExecutorRunner: Launch command: "/usr/local/java/jdk1.8.0_91/bin/java" "-cp" "/data/spark-1.6.0-bin-hadoop2.6/conf/:/data/spark-1.6.0-bin-hadoop2.6/lib/spark-assembly-1.6.0-hadoop2.6.0.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/data/hadoop-2.6.5/etc/hadoop/"
* "-Xms1024M" "-Xmx1024M" "-Dspark.driver.port=47218" "-XX:+PrintGCDetails" "-Dkey=value"
* "-Dnumbers=one two three" "org.apache.spark.executor.CoarseGrainedExecutorBackend"
* "--driver-url" "spark://CoarseGrainedScheduler@192.168.1.152:47218" "--executor-id" "0"
* "--hostname" "192.168.1.153" "--cores" "4" "--app-id" "app-20180503193934-0000"
* "--worker-url" "spark://Worker@192.168.1.153:44713"
*/
logInfo(s"Launch command: $formattedCommand")
// Set the child process's working directory: $SPARK_HOME/work/<appId>/<execId>
builder.directory(executorDir)
// appLocalDirs: Seq("/tmp/spark-e72251ed-96b6-4fe6-b704-1772b5fc5a8b/executor-7ab80469-4222-40c9-87cf-a6f2f00e30c6")
// builder.environment returns a mutable Map, so environment variables can be modified here
builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
// In case we are running this from within the Spark Shell, avoid creating a "scala"
// parent process for the executor command
builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")
// Add webUI log urls
// The worker's WebUI port is 8081 (the master's is 8080); these URLs expose the executor's stdout and stderr on the worker's web page
val baseUrl =
s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")
// After this long detour, here is where CoarseGrainedExecutorBackend is actually started
process = builder.start()
val header = "Spark Executor Command: %s\n%s\n\n".format(
formattedCommand, "=" * 40)
// Redirect its stdout and stderr to files
// standard output goes to $SPARK_HOME/work/<appId>/<execId>/stdout
val stdout = new File(executorDir, "stdout")
stdoutAppender = FileAppender(process.getInputStream, stdout, conf)
// error output goes to $SPARK_HOME/work/<appId>/<execId>/stderr
val stderr = new File(executorDir, "stderr")
Files.write(header, stderr, UTF_8)
stderrAppender = FileAppender(process.getErrorStream, stderr, conf)
// Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown) or with nonzero exit code
// process.waitFor() blocks the current thread until the process finishes (barring exceptions)
val exitCode = process.waitFor()
state = ExecutorState.EXITED
val message = "Command exited with code " + exitCode
worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
} catch {
case interrupted: InterruptedException => {
logInfo("Runner thread for executor " + fullId + " interrupted")
state = ExecutorState.KILLED
killProcess(None)
}
case e: Exception => {
logError("Error running executor", e)
state = ExecutorState.FAILED
killProcess(Some(e.toString))
}
}
}
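The FileAppender used above handles the stream plumbing: a child process's stdout and stderr must be drained continuously, or the OS pipe buffer fills and the child blocks. A minimal sketch of that pattern with plain JDK I/O (not Spark's FileAppender, which additionally supports rolling files):

import java.io.{File, FileOutputStream, InputStream}

// Drain `in` (e.g. process.getInputStream) to `file` on a daemon background thread,
// so the child process never blocks on a full pipe buffer.
def appendToFile(in: InputStream, file: File): Thread = {
  val t = new Thread(s"appender-${file.getName}") {
    override def run(): Unit = {
      val out = new FileOutputStream(file, true) // append mode
      try {
        val buf = new Array[Byte](8192)
        var n = in.read(buf)
        while (n != -1) {
          out.write(buf, 0, n)
          n = in.read(buf)
        }
      } finally out.close()
    }
  }
  t.setDaemon(true)
  t.start()
  t
}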
21. CoarseGrainedExecutorBackend's main starts up; it parses the arguments passed over from the Worker and then calls run()
private[spark] object CoarseGrainedExecutorBackend extends Logging {
.....
def main(args: Array[String]) {
var driverUrl: String = null // the DriverEndpointRef of CoarseGrainedSchedulerBackend
var executorId: String = null // the ExecutorDesc id, an auto-incremented number, one per executor
var hostname: String = null // the worker's ip
var cores: Int = 0 // taken from SparkConf's "spark.executor.cores" (set to 1 here, so it is 1); if unset, only one CoarseGrainedExecutorBackend is launched and it gets all of the worker's available cores
var appId: String = null // app-20180503193934-0000
var workerUrl: Option[String] = None // spark://Worker@192.168.1.153:44713
val userClassPath = new mutable.ListBuffer[URL]()
var argv = args.toList
// Parse the arguments into the variables above
while (!argv.isEmpty) {
argv match {
case ("--driver-url") :: value :: tail =>
driverUrl = value
argv = tail
case ("--executor-id") :: value :: tail =>
executorId = value
argv = tail
....
printUsageAndExit()
}
}
// If any required value is missing, print usage and exit main
if (driverUrl == null || executorId == null || hostname == null || cores <= 0 ||
appId == null) {
printUsageAndExit()
}
run(driverUrl, executorId, hostname, cores, appId, workerUrl, userClassPath)
}
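The `("--flag") :: value :: tail` pattern consumes the argument list two elements at a time. The same idiom in a self-contained form (the `parse` helper is hypothetical, purely for illustration):

// Recursively peel "--flag value" pairs off the argument list, the same way main() does
// with its while loop and pattern match.
def parse(args: List[String], acc: Map[String, String] = Map.empty): Map[String, String] =
  args match {
    case flag :: value :: tail if flag.startsWith("--") => parse(tail, acc + (flag -> value))
    case Nil                                            => acc
    case bad                                            => sys.error(s"Unparseable arguments: $bad")
  }

// parse(List("--executor-id", "4", "--cores", "1"))
// => Map("--executor-id" -> "4", "--cores" -> "1")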
22. What does the run method initialize?
private[spark] object CoarseGrainedExecutorBackend extends Logging {
/**
* "--driver-url" "spark://CoarseGrainedScheduler@192.168.1.152:56522", the DriverEndpointRef of CoarseGrainedSchedulerBackend
* "--executor-id" "4", the ExecutorDesc id, an auto-incremented number, one per executor
* "--hostname" "192.168.1.153"
* "--cores" "1", taken from SparkConf's "spark.executor.cores" (set to 1 here, so it is 1);
*   if unset, only one CoarseGrainedExecutorBackend is launched and it gets all of the worker's available cores
* "--app-id" "app-20180508234845-0000"
* "--worker-url" "spark://Worker@192.168.1.153:53403"
*/
private def run(
driverUrl: String,
executorId: String,
hostname: String,
cores: Int,
appId: String,
workerUrl: Option[String],
userClassPath: Seq[URL]) {
// Logs Linux signals, e.g. the INT signal from Ctrl+C, which would also terminate the CoarseGrainedExecutorBackend process
SignalLogger.register(log)
SparkHadoopUtil.get.runAsSparkUser { () =>
// Debug code
Utils.checkHost(hostname)
// Bootstrap to fetch the driver's Spark properties.
val executorConf = new SparkConf
val port = executorConf.getInt("spark.executor.port", 0)
// Create an RpcEnv (analogous to creating an ActorSystem), named "driverPropsFetcher"
val fetcher = RpcEnv.create(
"driverPropsFetcher",
hostname,
port,
executorConf,
new SecurityManager(executorConf),
clientMode = true)
// Get the DriverEndpointRef of CoarseGrainedSchedulerBackend
val driver = fetcher.setupEndpointRefByURI(driverUrl)
// The driver replies with a Seq[(String, String)] of every SparkConf property whose key
// starts with "spark"; ("spark.app.id", "app-20180508234845-0000") is appended to that Seq
val props = driver.askWithRetry[Seq[(String, String)]](RetrieveSparkProps) ++
Seq[(String, String)](("spark.app.id", appId))
// Shut the fetcher's RpcEnv back down
fetcher.shutdown()
// Create SparkEnv using properties we fetched from the driver.
// new up a default SparkConf() and copy the Seq[(String, String)] fetched from the DriverEndpointRef into it
val driverConf = new SparkConf()
for ((key, value) <- props) {
// this is required for SSL in standalone mode
if (SparkConf.isExecutorStartupConf(key)) {
driverConf.setIfMissing(key, value)
} else {
driverConf.set(key, value)
}
}
if (driverConf.contains("spark.yarn.credentials.file")) {
logInfo("Will periodically update credentials from: " +
driverConf.get("spark.yarn.credentials.file"))
SparkHadoopUtil.get.startExecutorDelegationTokenRenewer(driverConf)
}
// Create the SparkEnv for this CoarseGrainedExecutorBackend; since the RpcEnv is created
// in client mode, rpcEnv.address has no value
val env = SparkEnv.createExecutorEnv(
driverConf, executorId, hostname, port, cores, isLocal = false)
// SparkEnv will set spark.executor.port if the rpc env is listening for incoming
// connections (e.g., if it's using akka). Otherwise, the executor is running in
// client mode only, and does not accept incoming connections.
val sparkHostPort = env.conf.getOption("spark.executor.port").map { port =>
hostname + ":" + port
}.orNull
/**
* Register the CoarseGrainedExecutorBackend endpoint with the executor-side rpcEnv:
* driverUrl: "spark://CoarseGrainedScheduler@192.168.1.152:56522", the DriverEndpointRef of CoarseGrainedSchedulerBackend
* executorId: "4", the ExecutorDesc id, an auto-incremented number, one per executor
* sparkHostPort: null, because in client mode rpcEnv.address has no value
* cores: "1", taken from SparkConf's "spark.executor.cores" (1 here); if unset, only one
*   CoarseGrainedExecutorBackend is launched and it gets all of the worker's available cores
* userClassPath: an empty collection
* env: the SparkEnv handed to the CoarseGrainedExecutorBackend instance
*/
env.rpcEnv.setupEndpoint("Executor", new CoarseGrainedExecutorBackend(
env.rpcEnv, driverUrl, executorId, sparkHostPort, cores, userClassPath, env))
// Also construct a WorkerWatcher, url: "spark://Worker@192.168.1.153:53403"
workerUrl.foreach { url =>
env.rpcEnv.setupEndpoint("WorkerWatcher", new WorkerWatcher(env.rpcEnv, url))
}
env.rpcEnv.awaitTermination()
SparkHadoopUtil.get.stopExecutorDelegationTokenRenewer()
}
}
23. CoarseGrainedExecutorBackend is instantiated; it, too, is an RpcEndpoint
/** This instance is created by CoarseGrainedExecutorBackend's main:
* rpcEnv: the executor-side RpcEnv
* driverUrl: "spark://CoarseGrainedScheduler@192.168.1.152:56522", the DriverEndpointRef of CoarseGrainedSchedulerBackend
* executorId: "4", the ExecutorDesc id, an auto-incremented number, one per executor
* hostPort: null, because the RpcEnv is created in client mode so rpcEnv.address has no value
* cores: "1", taken from SparkConf's "spark.executor.cores" (1 here); if unset, only one CoarseGrainedExecutorBackend is launched and it gets all of the worker's available cores
* userClassPath: an empty collection
* env: the SparkEnv handed to the CoarseGrainedExecutorBackend instance
*/
private[spark] class CoarseGrainedExecutorBackend(
override val rpcEnv: RpcEnv,
driverUrl: String,
executorId: String,
hostPort: String,
cores: Int,
userClassPath: Seq[URL],
env: SparkEnv)
extends ThreadSafeRpcEndpoint with ExecutorBackend with Logging {
var executor: Executor = null
// The reference to the DriverEndpoint
@volatile var driver: Option[RpcEndpointRef] = None
// If this CoarseGrainedExecutorBackend is changed to support multiple threads, then this may need
// to be changed so that we don't share the serializer instance across threads
private[this] val ser: SerializerInstance = env.closureSerializer.newInstance()
override def onStart() {
// driverUrl: spark://CoarseGrainedScheduler@192.168.1.152:49972; the RpcEndpointRef obtained here is the reference to the DriverEndpoint
logInfo("Connecting to driver: " + driverUrl)
rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref =>
// This is a very fast action so we can use "ThreadUtils.sameThread"
driver = Some(ref)
// Notify the DriverEndpoint; it will reply RegisteredExecutor to this CoarseGrainedExecutorBackend, telling it to create the Executor.
/* executorId: "4", the ExecutorDesc id, an auto-incremented number, one per executor
self: this CoarseGrainedExecutorBackend
hostPort: null, because in client mode rpcEnv.address has no value
cores: taken from SparkConf's "spark.executor.cores" (1 here); if unset, only one CoarseGrainedExecutorBackend is launched and it gets all of the worker's available cores
extractLogUrls: filters environment variables whose key contains "SPARK_LOG_URL_", re-keying each entry by the part left after stripping that prefix, values unchanged
*/
ref.ask[RegisterExecutorResponse](
RegisterExecutor(executorId, self, hostPort, cores, extractLogUrls))
}(ThreadUtils.sameThread).onComplete {
// This is a very fast action so we can use "ThreadUtils.sameThread"
case Success(msg) => Utils.tryLogNonFatalError {
Option(self).foreach(_.send(msg)) // msg must be RegisterExecutorResponse
}
}
case Failure(e) => {
logError(s"Cannot register with driver: $driverUrl", e)
System.exit(1)
}
}(ThreadUtils.sameThread)
}
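Stripped of the RPC types, onStart() is just: resolve a remote ref, ask it to register, and react to the success or failure of the reply. The same control flow with plain Scala Futures (all names below are stand-ins, not Spark API):

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.{Await, Future}
import scala.concurrent.duration._
import scala.util.{Failure, Success}

object RegistrationHandshakeSketch {
  case class Registered(host: String) // plays the role of RegisteredExecutor

  // Stand-ins for asyncSetupEndpointRefByURI and ref.ask[...] (not Spark API):
  def resolveDriverRef(url: String): Future[String] = Future.successful(url)
  def ask(ref: String, msg: String): Future[Registered] =
    Future.successful(Registered("192.168.1.152"))

  def main(args: Array[String]): Unit = {
    // Resolve the driver ref, then ask it to register: the same flatMap chaining as onStart()
    val handshake = resolveDriverRef("spark://CoarseGrainedScheduler@192.168.1.152:56522")
      .flatMap(ref => ask(ref, "RegisterExecutor"))
    // onStart() reacts via onComplete; here we await so the demo is deterministic
    Await.ready(handshake, 5.seconds).value.get match {
      case Success(reply) => println(s"forward to self: $reply") // ~ self.send(msg)
      case Failure(e)     => println(s"Cannot register with driver: $e")
    }
  }
}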
24. The backend communicates with CoarseGrainedSchedulerBackend's DriverEndpoint, sending RegisterExecutor so that an Executor can be created
class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
extends ThreadSafeRpcEndpoint with Logging {
.....
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
// This case fires while CoarseGrainedExecutorBackend is initializing
/* executorId: "4", the ExecutorDesc id, an auto-incremented number, one per executor
executorRef: the CoarseGrainedExecutorBackend endpoint ref
hostPort: null, because the executor's RpcEnv was created in client mode so rpcEnv.address has no value
cores: taken from SparkConf's "spark.executor.cores" (1 here); if unset, only one CoarseGrainedExecutorBackend is launched and it gets all of the worker's available cores
logUrls: environment variables whose key contains "SPARK_LOG_URL_", re-keyed by stripping that prefix, values unchanged, e.g.:
(stdout, http://192.168.1.154:8081/logPage/?appId=app-20180516150725-0000&executorId=1&logType=stdout)
(stderr, http://192.168.1.154:8081/logPage/?appId=app-20180516150725-0000&executorId=1&logType=stderr)
*/
case RegisterExecutor(executorId, executorRef, hostPort, cores, logUrls) =>
// executorDataMap: HashMap[String, ExecutorData]; empty at the start
if (executorDataMap.contains(executorId)) {
context.reply(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
} else {
// If the executor's rpc env is not listening for incoming connections, `hostPort`
// will be null, and the client connection should be used to contact the executor.
val executorAddress = if (executorRef.address != null) {
executorRef.address
} else {
// Standalone client mode takes this branch: the executor's RpcAddress is taken from the sender, e.g. "luyl155:53561"
context.senderAddress
}
// 17/11/12 20:31:22 INFO cluster.SparkDeploySchedulerBackend: Registered executor NettyRpcEndpointRef(null) (luyl155:53561) with ID 2
logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId")
// addressToExecutorId: HashMap[RpcAddress, String], keyed by the executor's RpcAddress, with the executor's own id as the value
addressToExecutorId(executorAddress) = executorId
// totalCoreCount: AtomicInteger(0), the total number of cores across all executors
totalCoreCount.addAndGet(cores)
// totalRegisteredExecutors: AtomicInteger(0), counts how many executors have registered
totalRegisteredExecutors.addAndGet(1)
/* executorRef: the CoarseGrainedExecutorBackend endpoint ref
executorRef.address: null, because the executor's RpcEnv was created in client mode
executorAddress.host: the ip of the worker hosting the CoarseGrainedExecutorBackend
cores: the number of cores this executor owns
logUrls: the SPARK_LOG_URL_* map described above
*/
val data = new ExecutorData(executorRef, executorRef.address, executorAddress.host,
cores, cores, logUrls)
// This must be synchronized because variables mutated in this block are read when requesting executors
CoarseGrainedSchedulerBackend.this.synchronized {
// executorDataMap: HashMap[String, ExecutorData]; store the executor id together with its ExecutorData (endpoint ref, address, core count, ...)
executorDataMap.put(executorId, data)
// numPendingExecutors starts at 0
if (numPendingExecutors > 0) {
numPendingExecutors -= 1
logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
}
}
// Note: some tests expect the reply to come after we put the executor in the map
// This reply tells the CoarseGrainedExecutorBackend to initialize its Executor thread pool; afterwards makeOffers() runs
// executorAddress.host: the ip where the CoarseGrainedExecutorBackend runs
context.reply(RegisteredExecutor(executorAddress.host))
listenerBus.post(
SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data))
// makeOffers() is analyzed later; it is what dispatches tasks
makeOffers()
}
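Condensed, the driver-side bookkeeping on RegisterExecutor amounts to: reject duplicates, then record the executor in a map and bump two counters under a lock. A simplified sketch (field names modeled on the real ones, types trimmed down):

import java.util.concurrent.atomic.AtomicInteger
import scala.collection.mutable

case class ExecutorData(host: String, totalCores: Int) // trimmed-down stand-in

object DriverBookkeepingSketch {
  val executorDataMap          = mutable.HashMap.empty[String, ExecutorData]
  val totalCoreCount           = new AtomicInteger(0)
  val totalRegisteredExecutors = new AtomicInteger(0)

  // Returns false for a duplicate executor ID, mirroring RegisterExecutorFailed.
  def registerExecutor(execId: String, host: String, cores: Int): Boolean = synchronized {
    if (executorDataMap.contains(execId)) false
    else {
      executorDataMap(execId) = ExecutorData(host, cores)
      totalCoreCount.addAndGet(cores)       // total cores across all executors
      totalRegisteredExecutors.incrementAndGet()
      true
    }
  }
}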
25. The DriverEndpoint replies to CoarseGrainedExecutorBackend with a RegisteredExecutor message, and the backend instantiates the Executor
private[spark] class CoarseGrainedExecutorBackend(
override val rpcEnv: RpcEnv,
driverUrl: String,
executorId: String,
hostPort: String,
cores: Int,
userClassPath: Seq[URL],
env: SparkEnv)
extends ThreadSafeRpcEndpoint with ExecutorBackend with Logging {
...
override def receive: PartialFunction[Any, Unit] = {
// hostname here is executorAddress.host: the ip where this CoarseGrainedExecutorBackend runs
case RegisteredExecutor(hostname) =>
logInfo("Successfully registered with driver")
/** executorId: the ExecutorDesc id, an auto-incremented number, one per executor
* hostname: the ip where this CoarseGrainedExecutorBackend runs
* env: the SparkEnv handed to this CoarseGrainedExecutorBackend instance
* userClassPath: an empty collection
*/
executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)
26. The Executor's initialization calls BlockManager's initialize method
private[spark] class Executor(
executorId: String,
executorHostname: String,
env: SparkEnv,
userClassPath: Seq[URL] = Nil,
isLocal: Boolean = false)
extends Logging {
logInfo(s"Starting executor ID $executorId on host $executorHostname")
// Application dependencies (added through SparkContext) that we've fetched so far on this node. Each map holds the master's timestamp for the version of that file or JAR we got.
// These are the dependencies handed to the Executor via --jars and --files
private val currentFiles: HashMap[String, Long] = new HashMap[String, Long]()
private val currentJars: HashMap[String, Long] = new HashMap[String, Long]()
// An empty ByteBuffer: its limit and capacity equal the length of the (empty) wrapped array, position is 0, and no mark is set
private val EMPTY_BYTE_BUFFER = ByteBuffer.wrap(new Array[Byte](0))
// When CoarseGrainedExecutorBackend's main ran, only properties whose key starts with "spark" were put into this conf
private val conf = env.conf
// No ip or host:port - just hostname
Utils.checkHost(executorHostname, "Expected executed slave to be a hostname")
// must not have port specified.
assert (0 == Utils.parseHostPort(executorHostname)._2)
// Make sure the local hostname we report matches the cluster scheduler's name for this host
Utils.setCustomHostname(executorHostname)
// isLocal defaults to false
if (!isLocal) {
// Setup an uncaught exception handler for non-local mode.
// Make any thread terminations due to uncaught exceptions kill the entire
// executor process to avoid surprising stalls.
Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler)
}
// Start worker thread pool: a cached daemon thread pool for running tasks
private val threadPool = ThreadUtils.newDaemonCachedThreadPool("Executor task launch worker")
// ExecutorSource feeds the metrics system
private val executorSource = new ExecutorSource(threadPool, executorId)
if (!isLocal) {
env.metricsSystem.registerSource(executorSource)
// Called once when the Executor initializes; the driver also calls it during SparkContext initialization.
// conf.getAppId was set when CoarseGrainedExecutorBackend's main ran, e.g. app-20180508234845-0000
// (see spark-core_28: Executor initialization, env.blockManager.initialize(conf.getAppId) - NettyBlockTransferService.init() source analysis)
env.blockManager.initialize(conf.getAppId)
}
===》Next, the Executor starts sending heartbeats and metrics to the driver
// must be initialized before running startDriverHeartbeat()
// HeartbeatReceiver is created during SparkContext initialization; ENDPOINT_NAME is "HeartbeatReceiver"
private val heartbeatReceiverRef =
RpcUtils.makeDriverRef(HeartbeatReceiver.ENDPOINT_NAME, conf, env.rpcEnv)
startDriverHeartbeater()
Now let's look more closely at the heartbeat the Executor fires every 10 seconds.
/**
* Schedules a task to report heartbeat and partial metrics for active tasks to driver.
*/
private def startDriverHeartbeater(): Unit = {
// Defaults to 10 seconds, converted to a millisecond value
val intervalMs = conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s")
// Wait a random interval so the heartbeats don't end up in sync,
// i.e. a random initial delay between 10 and 20 seconds
val initialDelay = intervalMs + (math.random * intervalMs).asInstanceOf[Int]
val heartbeatTask = new Runnable() {
override def run(): Unit = Utils.logUncaughtExceptions(reportHeartBeat())
}
// First wait out the random 10-20 s initial delay, then run every 10 s
heartbeater.scheduleAtFixedRate(heartbeatTask, initialDelay, intervalMs, TimeUnit.MILLISECONDS)
}
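The scheduling pattern itself is plain java.util.concurrent. A minimal sketch of it (assuming the default 10 s interval above; the randomized initial delay keeps a fleet of executors from heartbeating in lockstep):

import java.util.concurrent.{Executors, TimeUnit}

object HeartbeatSketch {
  def main(args: Array[String]): Unit = {
    val heartbeater  = Executors.newSingleThreadScheduledExecutor()
    val intervalMs   = 10000L
    // Random initial delay in [10 s, 20 s) so executors do not all report at once.
    val initialDelay = intervalMs + (math.random * intervalMs).toLong

    val task = new Runnable {
      override def run(): Unit = println("reporting heartbeat and task metrics to driver")
    }
    // Fires once after the initial delay, then every 10 s until the JVM is killed.
    heartbeater.scheduleAtFixedRate(task, initialDelay, intervalMs, TimeUnit.MILLISECONDS)
  }
}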