YarnClusterApplication
1. Overview
As analyzed in [Spark source code: the job submission flow in SparkSubmit], during execution SparkSubmit picks a SparkApplication implementation according to the deploy mode, instantiates it, and starts the instance.
In yarn-cluster mode the instance it constructs is org.apache.spark.deploy.yarn.YarnClusterApplication; this post follows the submission flow through YarnClusterApplication.
2. YarnClusterApplication
YarnClusterApplication's start method strips a couple of conf entries, parses the arguments, constructs a ResourceManager (RM) client, and invokes the client's run method:
private[spark] class YarnClusterApplication extends SparkApplication {

  override def start(args: Array[String], conf: SparkConf): Unit = {
    // In YARN mode, files and jars are distributed through the YARN distributed
    // cache, so remove the jars/files entries from the conf
    conf.remove("spark.jars")
    conf.remove("spark.files")

    // parse the arguments into a ClientArguments object,
    // construct the RM client, and invoke its run method
    new Client(new ClientArguments(args), conf).run()
  }
}
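For context, this is roughly how SparkSubmit hands control over (a minimal sketch of the runMain step covered in the previous post; childArgs and sparkConf are the values SparkSubmit has already prepared):

  // Sketch: SparkSubmit resolves the child main class for yarn-cluster mode and,
  // since YarnClusterApplication implements SparkApplication, starts it in-process
  // instead of invoking a main() method
  val klass = Utils.classForName("org.apache.spark.deploy.yarn.YarnClusterApplication")
  val app = klass.getConstructor().newInstance().asInstanceOf[SparkApplication]
  app.start(childArgs.toArray, sparkConf)  // childArgs carries --jar/--class/--arg entries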
3. ClientArguments: parsing the command-line arguments
The arguments handed over by SparkSubmit are parsed by org.apache.spark.deploy.yarn.ClientArguments, which extracts the user jar, the main class, and the user arguments:
private[spark] class ClientArguments(args: Array[String]) {

  var userJar: String = null        // path to the user's application jar
  var userClass: String = null      // fully qualified name of the application's main class
  var primaryPyFile: String = null  // primary Python file, for PySpark applications
  var primaryRFile: String = null   // primary R file, for SparkR applications
  var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()  // arguments for the user class

  // parse the arguments
  parseArgs(args.toList)

  private def parseArgs(inputArgs: List[String]): Unit = {
    var args = inputArgs
    while (!args.isEmpty) {
      args match {
        case ("--jar") :: value :: tail =>
          userJar = value
          args = tail

        case ("--class") :: value :: tail =>
          userClass = value
          args = tail

        case ("--primary-py-file") :: value :: tail =>
          primaryPyFile = value
          args = tail

        case ("--primary-r-file") :: value :: tail =>
          primaryRFile = value
          args = tail

        case ("--arg") :: value :: tail =>
          userArgs += value
          args = tail

        ...
      }
    }
  }
}
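A hypothetical example of what this parsing yields (the jar path, class name, and arguments below are made up):

  // Hypothetical argument vector, shaped like what SparkSubmit builds for yarn-cluster
  val argv = Array(
    "--jar", "hdfs:///apps/my-app.jar",
    "--class", "com.example.MyApp",
    "--arg", "input.txt",
    "--arg", "output")
  val parsed = new ClientArguments(argv)
  // parsed.userJar   == "hdfs:///apps/my-app.jar"
  // parsed.userClass == "com.example.MyApp"
  // parsed.userArgs  == ArrayBuffer("input.txt", "output")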
3.1. Default memory
In cluster mode the AM hosts the driver, so Client (section 4 below) reads the AM memory from spark.driver.memory, i.e. the DRIVER_MEMORY config entry, whose default is 1g (1024 MB); in client mode it reads spark.yarn.am.memory instead, which defaults to 512m:
private[spark] val DRIVER_MEMORY = ConfigBuilder("spark.driver.memory")
  .bytesConf(ByteUnit.MiB)
  .createWithDefaultString("1g")
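bytesConf(ByteUnit.MiB) normalizes whatever unit the user supplies to MiB, so sparkConf.get(DRIVER_MEMORY) below yields a size in MB. A quick sketch of the unit handling (the ConfigEntry getter is private[spark], so this only compiles inside the org.apache.spark package):

  val conf = new SparkConf()
  conf.set("spark.driver.memory", "2g")
  conf.get(DRIVER_MEMORY)             // 2048: "2g" normalized to MiB
  new SparkConf().get(DRIVER_MEMORY)  // 1024: the "1g" default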
4. Client: constructing the RM client object
During Client instantiation, the YarnClient, the distributed cache manager, and the launcher communication component are created, and the memory, memory overhead, and CPU cores for both the AM and the executors are determined:
private[spark] class Client(
    val args: ClientArguments,
    val sparkConf: SparkConf)
  extends Logging {

  // create the YarnClient
  private val yarnClient = YarnClient.createYarnClient
  // build the Hadoop/YARN configuration
  private val hadoopConf = new YarnConfiguration(SparkHadoopUtil.newConfiguration(sparkConf))

  // whether this is a cluster-mode deployment
  private val isClusterMode = sparkConf.get("spark.submit.deployMode", "client") == "cluster"

  // AM memory: in cluster mode the AM hosts the driver, so use the driver memory
  // setting; otherwise use the dedicated AM memory setting
  private val amMemory = if (isClusterMode) {
    sparkConf.get(DRIVER_MEMORY).toInt
  } else {
    sparkConf.get(AM_MEMORY).toInt
  }
  // AM memory overhead: the configured value if present, otherwise
  // max(MEMORY_OVERHEAD_FACTOR * amMemory, MEMORY_OVERHEAD_MIN)
  private val amMemoryOverhead = {
    val amMemoryOverheadEntry = if (isClusterMode) DRIVER_MEMORY_OVERHEAD else AM_MEMORY_OVERHEAD
    sparkConf.get(amMemoryOverheadEntry).getOrElse(
      math.max((MEMORY_OVERHEAD_FACTOR * amMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt
  }
  // AM CPU cores
  private val amCores = if (isClusterMode) {
    sparkConf.get(DRIVER_CORES)
  } else {
    sparkConf.get(AM_CORES)
  }

  // executor memory
  private val executorMemory = sparkConf.get(EXECUTOR_MEMORY)
  // executor memory overhead, computed the same way as for the AM
  private val executorMemoryOverhead = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
    math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt

  private val isPython = sparkConf.get(IS_PYTHON_APP)
  private val pysparkWorkerMemory: Int = if (isPython) {
    sparkConf.get(PYSPARK_EXECUTOR_MEMORY).map(_.toInt).getOrElse(0)
  } else {
    0
  }

  // create the distributed cache manager
  private val distCacheMgr = new ClientDistributedCacheManager()

  // principal used to log in to the KDC while running on secure HDFS
  private val principal = sparkConf.get(PRINCIPAL).orNull
  // full path to the keytab file for the principal above; it is copied to the node
  // running the YARN ApplicationMaster via the distributed cache, so that login
  // tickets and delegation tokens can be renewed periodically
  private val keytab = sparkConf.get(KEYTAB).orNull
  private val loginFromKeytab = principal != null
  private val amKeytabFileName: String = {
    require((principal == null) == (keytab == null),
      "Both principal and keytab must be defined, or neither.")
    if (loginFromKeytab) {
      logInfo(s"Kerberos credentials: principal = $principal, keytab = $keytab")
      // Generate a file name that can be used for the keytab file, that does not conflict
      // with any user file.
      new File(keytab).getName() + "-" + UUID.randomUUID().toString
    } else {
      null
    }
  }

  // create the component that communicates with the LauncherServer
  private val launcherBackend = new LauncherBackend() {
    override protected def conf: SparkConf = sparkConf

    override def onStopRequest(): Unit = {
      if (isClusterMode && appId != null) {
        yarnClient.killApplication(appId)
      } else {
        setState(SparkAppHandle.State.KILLED)
        stop()
      }
    }
  }
  private val fireAndForget = isClusterMode && !sparkConf.get(WAIT_FOR_APP_COMPLETION)

  private var appId: ApplicationId = null

  // base directory for the application's staging files
  private val appStagingBaseDir = sparkConf.get(STAGING_DIR).map { new Path(_) }
    .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory())
}
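The overhead formula is worth spelling out: with the Spark defaults MEMORY_OVERHEAD_FACTOR = 0.10 and MEMORY_OVERHEAD_MIN = 384 (MB), and no explicit spark.driver.memoryOverhead, a 4 GB driver in cluster mode gets:

  // the same arithmetic as amMemoryOverhead above, with illustrative numbers
  val amMemory = 4096                                            // e.g. --driver-memory 4g
  val overhead = math.max((0.10 * amMemory).toLong, 384L).toInt  // 409 MB
  // so the AM container requested from YARN is 4096 + 409 = 4505 MB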
5. Executing the client's run method
run submits the application, obtains the application id, and then monitors the application's state under that id:
private[spark] class Client(...) {

  def run(): Unit = {
    // submit the application and get back its application id
    this.appId = submitApplication()
    if (!launcherBackend.isConnected() && fireAndForget) {
      // fire-and-forget mode: log a single report and fail only on FAILED/KILLED
      val report = getApplicationReport(appId)
      val state = report.getYarnApplicationState
      logInfo(s"Application report for $appId (state: $state)")
      logInfo(formatReportDetails(report))
      if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
        throw new SparkException(s"Application $appId finished with status: $state")
      }
    } else {
      // monitor the application state until it reaches a terminal state
      val YarnAppReport(appState, finalState, diags) = monitorApplication(appId)
      if (appState == YarnApplicationState.FAILED || finalState == FinalApplicationStatus.FAILED) {
        diags.foreach { err =>
          logError(s"Application diagnostics message: $err")
        }
        throw new SparkException(s"Application $appId finished with failed status")
      }
      if (appState == YarnApplicationState.KILLED || finalState == FinalApplicationStatus.KILLED) {
        throw new SparkException(s"Application $appId is killed")
      }
      if (finalState == FinalApplicationStatus.UNDEFINED) {
        throw new SparkException(s"The final status of application $appId is undefined")
      }
    }
  }
}
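monitorApplication is essentially a polling loop against the RM. A simplified sketch of its behavior (the real method also logs state changes and cleans up the staging directory):

  // simplified sketch, not the actual implementation: poll until a terminal state
  def monitorSketch(appId: ApplicationId): YarnApplicationState = {
    var state = yarnClient.getApplicationReport(appId).getYarnApplicationState
    while (state != YarnApplicationState.FINISHED &&
        state != YarnApplicationState.FAILED &&
        state != YarnApplicationState.KILLED) {
      Thread.sleep(sparkConf.get(REPORT_INTERVAL))  // spark.yarn.report.interval, default 1s
      state = yarnClient.getApplicationReport(appId).getYarnApplicationState
    }
    state
  }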
6. Executing the client's submitApplication method
submitApplication connects to YARN, requests a new application from the RM, verifies that the cluster has enough resources for the AM, builds the container launch context, and submits the application:
private[spark] class Client(...) {

  def submitApplication(): ApplicationId = {
    var appId: ApplicationId = null
    try {
      launcherBackend.connect()
      // initialize the YARN client with the Hadoop configuration
      yarnClient.init(hadoopConf)
      // start the YARN client and connect to YARN
      yarnClient.start()

      logInfo("Requesting a new application from cluster with %d NodeManagers"
        .format(yarnClient.getYarnClusterMetrics.getNumNodeManagers))

      // ask the RM for a new application
      val newApp = yarnClient.createApplication()
      // the response to the request carries the new application id
      val newAppResponse = newApp.getNewApplicationResponse()
      appId = newAppResponse.getApplicationId()

      new CallerContext("CLIENT", sparkConf.get(APP_CALLER_CONTEXT),
        Option(appId.toString)).setCurrentContext()

      // verify that the cluster has enough resources for the AM
      verifyClusterResources(newAppResponse)

      // build the AM launch context; internally this assembles the AM command:
      // [cluster mode] bin/java ... org.apache.spark.deploy.yarn.ApplicationMaster
      // [client mode]  bin/java ... org.apache.spark.deploy.yarn.ExecutorLauncher
      val containerContext = createContainerLaunchContext(newAppResponse)
      val appContext = createApplicationSubmissionContext(newApp, containerContext)

      // Finally, submit and monitor the application
      logInfo(s"Submitting application $appId to ResourceManager")
      // submit the YARN application to the RM
      yarnClient.submitApplication(appContext)
      launcherBackend.setAppId(appId.toString)
      reportLauncherState(SparkAppHandle.State.SUBMITTED)

      appId
    } catch {
      case e: Throwable =>
        if (appId != null) {
          cleanupStagingDir(appId)
        }
        throw e
    }
  }
}
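verifyClusterResources compares the requested container sizes against the maximum container size the RM advertises; a simplified sketch of its AM-side check (the real method performs the same check for the executors):

  // simplified sketch of the AM memory check inside verifyClusterResources
  val maxMem = newAppResponse.getMaximumResourceCapability().getMemory()
  val amMem = amMemory + amMemoryOverhead
  if (amMem > maxMem) {
    throw new IllegalArgumentException(s"Required AM memory ($amMemory" +
      s"+$amMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster!")
  }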
6.1. createContainerLaunchContext: building the AM container launch context
This method determines the AM's main class:
[cluster mode] amClass = org.apache.spark.deploy.yarn.ApplicationMaster
[client mode] amClass = org.apache.spark.deploy.yarn.ExecutorLauncher
and assembles the java command that YARN will run in the AM container:
private[spark] class Client(...) {

  private def createContainerLaunchContext(newAppResponse: GetNewApplicationResponse)
      : ContainerLaunchContext = {
    ...
    val javaOpts = ListBuffer[String]()
    ...
    // Add Xmx for AM memory
    javaOpts += "-Xmx" + amMemory + "m"
    ...
    javaOpts += "-Djava.io.tmpdir=" + tmpDir
    ...
    // only applied when SPARK_USE_CONC_INCR_GC is enabled in the launch environment
    if (useConcurrentAndIncrementalGC) {
      // In our expts, using (default) throughput collector has severe perf ramifications in
      // multi-tenant machines
      javaOpts += "-XX:+UseConcMarkSweepGC"
      javaOpts += "-XX:MaxTenuringThreshold=31"
      javaOpts += "-XX:SurvivorRatio=8"
      javaOpts += "-XX:+CMSIncrementalMode"
      javaOpts += "-XX:+CMSIncrementalPacing"
      javaOpts += "-XX:CMSIncrementalDutyCycleMin=0"
      javaOpts += "-XX:CMSIncrementalDutyCycle=10"
    }
    ...
    // the AM's main class
    val amClass =
      if (isClusterMode) {
        Utils.classForName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
      } else {
        Utils.classForName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
      }
    if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
      args.userArgs = ArrayBuffer(args.primaryRFile) ++ args.userArgs
    }
    // prefix every user argument with --arg
    val userArgs = args.userArgs.flatMap { arg =>
      Seq("--arg", YarnSparkHadoopUtil.escapeForShell(arg))
    }
    val amArgs =
      Seq(amClass) ++ userClass ++ userJar ++ primaryPyFile ++ primaryRFile ++ userArgs ++
        Seq("--properties-file",
          buildPath(Environment.PWD.$$(), LOCALIZED_CONF_DIR, SPARK_CONF_FILE))

    // Command for the ApplicationMaster
    val commands = prefixEnv ++
      Seq(Environment.JAVA_HOME.$$() + "/bin/java", "-server") ++
      javaOpts ++ amArgs ++
      Seq(
        "1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout",
        "2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr")

    // TODO: it would be nicer to just make sure there are no null commands here
    val printableCommands = commands.map(s => if (s == null) "null" else s).toList
    amContainer.setCommands(printableCommands.asJava)
  }
}
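Put together, the AM launch command for a cluster-mode submission looks roughly like this (the memory, class, jar, and argument values are illustrative):

  {{JAVA_HOME}}/bin/java -server -Xmx1024m -Djava.io.tmpdir={{PWD}}/tmp \
    org.apache.spark.deploy.yarn.ApplicationMaster \
    --class com.example.MyApp --jar hdfs:///apps/my-app.jar --arg input.txt \
    --properties-file {{PWD}}/__spark_conf__/__spark_conf__.properties \
    1> <LOG_DIR>/stdout 2> <LOG_DIR>/stderr

YARN expands the {{...}} environment references and <LOG_DIR> on the node that runs the container; in client mode only the main class changes, to org.apache.spark.deploy.yarn.ExecutorLauncher.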
}