Last time we saw how YarnClient submits the launch command and container context to the ResourceManager, which then starts the ApplicationMaster (AM). Let's start with the AM's main method:
def main(args: Array[String]): Unit = {
  SignalUtils.registerLogger(log)
  val amArgs = new ApplicationMasterArguments(args)
  // ...
  SparkHadoopUtil.get.runAsSparkUser { () =>
    // YarnRMClient handles all communication with the ResourceManager
    master = new ApplicationMaster(amArgs, new YarnRMClient)
    // The interesting work happens in master.run()
    System.exit(master.run())
  }
}
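Before runAsSparkUser fires, ApplicationMasterArguments scans the argv that the client baked into the container launch command. As a rough illustration of that flag-scanning style, here is a minimal standalone sketch; the flag names (--jar, --class, --arg) mirror what the client passes, but treat the exact set and the error handling as assumptions, not Spark's actual implementation:

// Minimal sketch of tail-recursive flag parsing, modeled on the style of
// ApplicationMasterArguments; the flag set here is an assumption.
object AmArgsSketch {
  case class Parsed(userJar: String = null, userClass: String = null,
                    userArgs: List[String] = Nil)

  @annotation.tailrec
  def parse(args: List[String], acc: Parsed): Parsed = args match {
    case "--jar" :: value :: tail   => parse(tail, acc.copy(userJar = value))
    case "--class" :: value :: tail => parse(tail, acc.copy(userClass = value))
    case "--arg" :: value :: tail   => parse(tail, acc.copy(userArgs = acc.userArgs :+ value))
    case Nil                        => acc
    case unknown :: _               => sys.error(s"Unknown argument: $unknown")
  }

  def main(args: Array[String]): Unit = {
    val parsed = parse(args.toList, Parsed())
    println(parsed)
  }
}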
The run method:
final def run(): Int = {
  try {
    val appAttemptId = client.getAttemptId()
    var attemptID: Option[String] = None
    if (isClusterMode) {
      // ... set cluster-mode specific configuration
    }
    new CallerContext("APPMASTER",
      Option(appAttemptId.getApplicationId.toString), attemptID).setCurrentContext()
    logInfo("ApplicationAttemptId: " + appAttemptId)
    // Handle to the Hadoop file system, used below to clean up the staging dir
    val fs = FileSystem.get(yarnConf)
    // This shutdown hook should run *after* the SparkContext is shut down.
    val priority = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1
    ShutdownHookManager.addShutdownHook(priority) { () =>
      val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf)
      val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts
      if (!finished) {
        // The default state of ApplicationMaster is failed if it is invoked by shutdown hook.
        // This behavior is different compared to 1.x version.
        // If user application is exited ahead of time by calling System.exit(N), here mark
        // this application as failed with EXIT_EARLY. For a good shutdown, user shouldn't call
        // System.exit(0) to terminate the application.
        finish(finalStatus,
          ApplicationMaster.EXIT_EARLY,
          "Shutdown hook called before final status was reported.")
      }
      if (!unregistered) {
        // We only want to unregister if we don't want the RM to retry
        if (finalStatus == FinalApplicationStatus.SUCCEEDED || isLastAttempt) {
          unregister(finalStatus, finalMsg)
          cleanupStagingDir(fs)
        }
      }
    }
    // ...
    if (isClusterMode) {
      // Cluster mode: the AM runs the driver itself
      runDriver(securityMgr)
    } else {
      // Client mode: the driver already runs on the client, so the AM
      // only launches executors
      runExecutorLauncher(securityMgr)
    }
  } catch {
    case e: Exception =>
      // Catch everything else if not specifically handled
      logError("Uncaught exception: ", e)
      finish(FinalApplicationStatus.FAILED,
        ApplicationMaster.EXIT_UNCAUGHT_EXCEPTION,
        "Uncaught exception: " + e)
  }
  exitCode
}
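One detail worth pausing on is the priority arithmetic: Spark's ShutdownHookManager runs hooks from highest priority to lowest, so registering at SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1 guarantees the AM's hook fires after the SparkContext's own shutdown hook has run. A minimal sketch of that ordering idea, using plain JVM shutdown hooks rather than Spark's internal API (the priority value is illustrative):

import scala.collection.mutable

// Minimal sketch: hooks registered with a priority, run highest-first on
// JVM shutdown, mimicking Spark's ShutdownHookManager ordering.
object PriorityHooksSketch {
  private val hooks = mutable.ArrayBuffer.empty[(Int, () => Unit)]

  def addShutdownHook(priority: Int)(body: => Unit): Unit =
    hooks.synchronized { hooks += ((priority, () => body)) }

  sys.addShutdownHook {
    // Higher priority runs first, so priority - 1 runs *after* priority.
    hooks.synchronized(hooks.sortBy(-_._1)).foreach { case (_, h) => h() }
  }

  def main(args: Array[String]): Unit = {
    val SPARK_CONTEXT_SHUTDOWN_PRIORITY = 50 // illustrative value
    addShutdownHook(SPARK_CONTEXT_SHUTDOWN_PRIORITY) { println("stop SparkContext") }
    addShutdownHook(SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1) { println("report AM final status / cleanup") }
  }
}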
ApplicationMaster's runDriver() method:
private def runDriver(securityMgr: SecurityManager): Unit = {
  addAmIpFilter()
  // Start a thread named "Driver" that runs the user class's main method.
  // The code below keeps running concurrently with it: waiting for the
  // SparkContext, registering with the RM, requesting resources, etc.
  userClassThread = startUserApplication()

  // This a bit hacky, but we need to wait until the spark.driver.port property has
  // been set by the Thread executing the user class.
  logInfo("Waiting for spark context initialization...")
  val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME)
  try {
    val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
      Duration(totalWaitTime, TimeUnit.MILLISECONDS))
    if (sc != null) {
      rpcEnv = sc.env.rpcEnv
      val driverRef = runAMEndpoint(
        sc.getConf.get("spark.driver.host"),
        sc.getConf.get("spark.driver.port"),
        isClusterMode = true)
      // Register the application with the RM and request resources.
      // The RM answers with containers, the AM picks suitable ones, and
      // the NMClient launches them with the prepared command:
      // java -cp ... CoarseGrainedExecutorBackend
      registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.appUIAddress).getOrElse(""),
        securityMgr)
    } else {
      // Sanity check; should never happen in normal operation, since sc should only be null
      // if the user app did not create a SparkContext.
      if (!finished) {
        throw new IllegalStateException("SparkContext is null but app is still running!")
      }
    }
    // The user class still has plenty to do (initialize the SparkContext,
    // split jobs into tasks, ...), so the ApplicationMaster has to wait
    // for that thread to finish.
    userClassThread.join()
  } catch {
    case e: SparkException if e.getCause().isInstanceOf[TimeoutException] =>
      logError(
        s"SparkContext did not initialize after waiting for $totalWaitTime ms. " +
        "Please check earlier log output for errors. Failing the application.")
      finish(FinalApplicationStatus.FAILED,
        ApplicationMaster.EXIT_SC_NOT_INITED,
        "Timed out waiting for SparkContext.")
  }
}
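The coordination between the two threads is worth spelling out: the "Driver" thread completes sparkContextPromise once the user code has built a SparkContext (or completes it with null, or with a failure), while the AM thread blocks on the future with a timeout. Here is a self-contained sketch of the same Promise handshake; the names and the String stand-in for the SparkContext are illustrative only:

import java.util.concurrent.TimeUnit
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, Promise}

// Minimal sketch of the AM <-> Driver-thread handshake via a Promise.
object ScHandshakeSketch {
  // Stands in for sparkContextPromise; completed by the user-class thread.
  private val scPromise = Promise[String]()

  def main(args: Array[String]): Unit = {
    val userThread = new Thread {
      override def run(): Unit = {
        try {
          Thread.sleep(500)                 // pretend: user main() builds a SparkContext
          scPromise.trySuccess("sc-ready")  // hand the "SparkContext" back to the AM
        } catch {
          case e: Exception => scPromise.tryFailure(e)
        } finally {
          scPromise.trySuccess(null)        // no-op if already completed
        }
      }
    }
    userThread.setName("Driver")
    userThread.start()

    // The AM side: wait up to a deadline for the promise to resolve.
    val sc = Await.result(scPromise.future, Duration(10, TimeUnit.SECONDS))
    println(s"AM got: $sc")
    userThread.join() // mirror runDriver: AM waits for the user class to finish
  }
}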
ApplicationMaster's startUserApplication() method. It starts a separate thread to run the user class's main method and renames that thread to "Driver". That thread initializes the SparkContext, splits jobs into tasks, and so on, while the ApplicationMaster thread concurrently carries on with the steps that follow, such as requesting resources.
/**
 * Start the user class, which contains the spark driver, in a separate Thread.
 * If the main routine exits cleanly or exits with System.exit(N) for any N
 * we assume it was successful, for all other cases we assume failure.
 *
 * Returns the user thread that was started.
 */
private def startUserApplication(): Thread = {
  logInfo("Starting the user application in a separate Thread")

  val classpath = Client.getUserClasspath(sparkConf)
  val urls = classpath.map { entry =>
    new URL("file:" + new File(entry.getPath()).getAbsolutePath())
  }
  val userClassLoader =
    if (Client.isUserClassPathFirst(sparkConf, isDriver = true)) {
      new ChildFirstURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
    } else {
      new MutableURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
    }

  var userArgs = args.userArgs
  if (args.primaryPyFile != null && args.primaryPyFile.endsWith(".py")) {
    // When running pyspark, the app is run using PythonRunner. The second argument is the list
    // of files to add to PYTHONPATH, which Client.scala already handles, so it's empty.
    userArgs = Seq(args.primaryPyFile, "") ++ userArgs
  }
  if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
    // TODO(davies): add R dependencies here
  }
  // userClass is the user's own class, which initializes the SparkContext etc.
  val mainMethod = userClassLoader.loadClass(args.userClass)
    .getMethod("main", classOf[Array[String]])

  // Create a thread to execute the user's main method
  val userThread = new Thread {
    override def run() {
      try {
        // Invoke main via reflection
        mainMethod.invoke(null, userArgs.toArray)
        finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
        logDebug("Done running users class")
      } catch {
        case e: InvocationTargetException =>
          e.getCause match {
            case _: InterruptedException =>
              // Reporter thread can interrupt to stop user class
            case SparkUserAppException(exitCode) =>
              val msg = s"User application exited with status $exitCode"
              logError(msg)
              finish(FinalApplicationStatus.FAILED, exitCode, msg)
            case cause: Throwable =>
              logError("User class threw exception: " + cause, cause)
              finish(FinalApplicationStatus.FAILED,
                ApplicationMaster.EXIT_EXCEPTION_USER_CLASS,
                "User class threw exception: " + cause)
          }
          sparkContextPromise.tryFailure(e.getCause())
      } finally {
        // Notify the thread waiting for the SparkContext, in case the application did not
        // instantiate one. This will do nothing when the user code instantiates a SparkContext
        // (with the correct master), or when the user code throws an exception (due to the
        // tryFailure above).
        sparkContextPromise.trySuccess(null)
      }
    }
  }
  userThread.setContextClassLoader(userClassLoader)
  // Name the thread "Driver" before starting it: the "driver" in cluster
  // mode is just this thread running the user class's main method
  userThread.setName("Driver")
  userThread.start()
  userThread
}
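Stripped of Spark's classloader options and error handling, the core of startUserApplication is plain reflection: load the user class through a URLClassLoader and invoke its static main on a thread named "Driver". A minimal sketch of that pattern (the jar path and class name are placeholders):

import java.io.File
import java.net.{URL, URLClassLoader}

// Minimal sketch: run an arbitrary class's main() on a thread named
// "Driver", the same reflective pattern startUserApplication uses.
// The jar path and class name below are placeholders.
object ReflectiveMainSketch {
  def main(args: Array[String]): Unit = {
    val urls = Array(new URL("file:" + new File("/path/to/user.jar").getAbsolutePath))
    val loader = new URLClassLoader(urls, getClass.getClassLoader)
    val mainMethod = loader.loadClass("com.example.UserApp")
      .getMethod("main", classOf[Array[String]])

    val userThread = new Thread {
      override def run(): Unit =
        mainMethod.invoke(null, Array.empty[String]) // static method: receiver is null
    }
    userThread.setContextClassLoader(loader)
    userThread.setName("Driver")
    userThread.start()
    userThread.join()
  }
}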
To recap the whole ApplicationMaster flow:

main()
  SparkHadoopUtil.get.runAsSparkUser { () =>
    // Holds the YarnRMClient used to talk to the RM
    master = new ApplicationMaster(amArgs, new YarnRMClient)
    System.exit(master.run())
  }

run()
  // Cluster mode: what runs is the user-defined class (userClass)
  runDriver(securityMgr)

runDriver()
  // Start a separate thread, renamed "Driver", to run userClass:
  // it initializes the SparkContext, splits jobs into tasks, and so on
  userClassThread = startUserApplication()
  // Meanwhile the ApplicationMaster process requests resources;
  // registering with the RM is what triggers the requests. The RM
  // answers with containers, and the NMClient starts each container
  // with the prepared command, launching CoarseGrainedExecutorBackend
  // inside it.
  registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr)
  // Once the resource requests are in flight, wait for the userClass
  // thread to finish
  userClassThread.join()
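What registerAM kicks off can be pictured with a couple of hypothetical stand-in traits (the real code goes through YarnRMClient and a YarnAllocator; the traits, method names, and command-line flags below are simplifications for illustration, not the actual YARN API):

// Hypothetical stand-ins for the AM's two client-side roles:
// talk to the RM for containers, talk to NMs to launch them.
trait RmClientSketch {
  def register(appUiAddress: String): Unit
  def allocate(numExecutors: Int): Seq[String]   // returns container ids
}
trait NmClientSketch {
  def startContainer(containerId: String, command: String): Unit
}

class AllocatorSketch(rm: RmClientSketch, nm: NmClientSketch) {
  def run(numExecutors: Int, driverUrl: String): Unit = {
    // Registration is what makes the RM start honoring requests
    rm.register(appUiAddress = "http://driver-host:4040") // hypothetical address
    val containers = rm.allocate(numExecutors)
    containers.foreach { id =>
      // The same kind of command line the AM hands to the NodeManager;
      // the exact flags are an assumption here.
      val cmd = "java -cp <classpath> " +
        "org.apache.spark.executor.CoarseGrainedExecutorBackend " +
        s"--driver-url $driverUrl --executor-id $id"
      nm.startContainer(id, cmd)
    }
  }
}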