1. Spark task submission entry point:
Whether the job is submitted via the spark-submit script or by invoking a Java class directly, the main method of org.apache.spark.deploy.SparkSubmit is called:
2. The doSubmit method is called:
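A simplified sketch of doSubmit (logging initialization and verbose output omitted; the exact shape varies slightly across Spark versions):
// Simplified sketch of SparkSubmit.doSubmit
def doSubmit(args: Array[String]): Unit = {
  val appArgs = parseArguments(args)   // builds the SparkSubmitArguments object (step 3)
  appArgs.action match {
    case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog) // uninitLog comes from the omitted logging setup
    case SparkSubmitAction.KILL => kill(appArgs)
    case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
    case SparkSubmitAction.PRINT_VERSION => printVersion()
  }
}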
3. Build the SparkSubmitArguments object (parsing of the submission parameters)
Runtime parameters of the Spark job, held as fields on SparkSubmitArguments:
var master: String = null
var deployMode: String = null
var executorMemory: String = null
var executorCores: String = null
var totalExecutorCores: String = null
var propertiesFile: String = null
var driverMemory: String = null
var driverExtraClassPath: String = null
var driverExtraLibraryPath: String = null
var driverExtraJavaOptions: String = null
var queue: String = null
var numExecutors: String = null
var files: String = null
var archives: String = null
var mainClass: String = null
var primaryResource: String = null
var name: String = null
var childArgs: ArrayBuffer[String] = new ArrayBuffer[String]()
var jars: String = null
var packages: String = null
var repositories: String = null
var ivyRepoPath: String = null
var ivySettingsPath: Option[String] = None
var packagesExclusions: String = null
var verbose: Boolean = false
var isPython: Boolean = false
var pyFiles: String = null
var isR: Boolean = false
var action: SparkSubmitAction = null
val sparkProperties: HashMap[String, String] = new HashMap[String, String]()
var proxyUser: String = null
var principal: String = null
var keytab: String = null
private var dynamicAllocationEnabled: Boolean = false
// Standalone cluster mode only
var supervise: Boolean = false
var driverCores: String = null
var submissionToKill: String = null
var submissionToRequestStatusFor: String = null
var useRest: Boolean = false // used internally
4. Parse the arguments:
5. After parsing, the submission fields are assigned:
Option-name mapping: these constants define the parameter keys that can be passed when submitting via the script:
// The following constants define the "main" name for the available options. They're defined
// to avoid copy & paste of the raw strings where they're needed.
//
// The fields are not static so that they're exposed to Scala code that uses this class. See
// SparkSubmitArguments.scala. That is also why this class is not abstract - to allow code to
// easily use these constants without having to create dummy implementations of this class.
protected final String CLASS = "--class";
protected final String CONF = "--conf";
protected final String DEPLOY_MODE = "--deploy-mode";
protected final String DRIVER_CLASS_PATH = "--driver-class-path";
protected final String DRIVER_CORES = "--driver-cores";
protected final String DRIVER_JAVA_OPTIONS = "--driver-java-options";
protected final String DRIVER_LIBRARY_PATH = "--driver-library-path";
protected final String DRIVER_MEMORY = "--driver-memory";
protected final String EXECUTOR_MEMORY = "--executor-memory";
protected final String FILES = "--files";
protected final String JARS = "--jars";
protected final String KILL_SUBMISSION = "--kill";
protected final String MASTER = "--master";
protected final String NAME = "--name";
protected final String PACKAGES = "--packages";
protected final String PACKAGES_EXCLUDE = "--exclude-packages";
protected final String PROPERTIES_FILE = "--properties-file";
protected final String PROXY_USER = "--proxy-user";
protected final String PY_FILES = "--py-files";
protected final String REPOSITORIES = "--repositories";
protected final String STATUS = "--status";
protected final String TOTAL_EXECUTOR_CORES = "--total-executor-cores";
// Options that do not take arguments.
protected final String HELP = "--help";
protected final String SUPERVISE = "--supervise";
protected final String USAGE_ERROR = "--usage-error";
protected final String VERBOSE = "--verbose";
protected final String VERSION = "--version";
// Standalone-only options.
// YARN-only options.
protected final String ARCHIVES = "--archives";
protected final String EXECUTOR_CORES = "--executor-cores";
protected final String KEYTAB = "--keytab";
protected final String NUM_EXECUTORS = "--num-executors";
protected final String PRINCIPAL = "--principal";
protected final String QUEUE = "--queue";
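Each of these option names is matched in SparkSubmitArguments.handle(opt, value), which assigns the corresponding field; an abbreviated sketch (only a few branches shown):
// Abbreviated sketch of SparkSubmitArguments.handle()
override protected def handle(opt: String, value: String): Boolean = {
  opt match {
    case NAME => name = value
    case MASTER => master = value
    case CLASS => mainClass = value
    case DEPLOY_MODE => deployMode = value
    case NUM_EXECUTORS => numExecutors = value
    case EXECUTOR_MEMORY => executorMemory = value
    // ... the remaining options are assigned the same way ...
    case _ => // unknown options are reported as a usage error (omitted)
  }
  true
}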
6. The submit method is executed:
7. Depending on the mode, the doRunMain method is executed:
8. The runMain() method is executed
**Important:** the execution environment is prepared first, returning four values: (childArgs, childClasspath, sparkConf, childMainClass):
Key points during the submit-environment preparation phase:
a. Match the cluster-manager (resource management) component;
b. Match the deploy mode;
c. From the master URL and deploy mode, determine how the Spark job will be managed on the cluster, as sketched below:
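A simplified sketch of this matching inside prepareSubmitEnvironment (error handling and version-specific branches omitted):
// a. resolve the cluster manager from --master
val clusterManager: Int = args.master match {
  case "yarn" => YARN
  case m if m.startsWith("spark") => STANDALONE
  case m if m.startsWith("mesos") => MESOS
  case m if m.startsWith("k8s") => KUBERNETES
  case m if m.startsWith("local") => LOCAL
  case _ => -1 // invalid master, reported as an error (omitted)
}
// b. resolve the deploy mode from --deploy-mode (client is the default)
val deployMode: Int = args.deployMode match {
  case "client" | null => CLIENT
  case "cluster" => CLUSTER
  case _ => -1 // invalid deploy mode, reported as an error (omitted)
}
// c. the (clusterManager, deployMode) pair then drives flags such as isYarnCluster, isStandaloneCluster, etc.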
d. Resolve glob patterns in the submitted jars, files, pyFiles, and archives (archives apply to Spark on YARN only) into concrete paths:
// Resolve glob path for different resources.
args.jars = Option(args.jars).map(resolveGlobPaths(_, hadoopConf)).orNull
args.files = Option(args.files).map(resolveGlobPaths(_, hadoopConf)).orNull
args.pyFiles = Option(args.pyFiles).map(resolveGlobPaths(_, hadoopConf)).orNull
args.archives = Option(args.archives).map(resolveGlobPaths(_, hadoopConf)).orNull
// Method that resolves the glob paths
def resolveGlobPaths(paths: String, hadoopConf: Configuration): String = {
require(paths != null, "paths cannot be null.")
Utils.stringToSeq(paths).flatMap { path =>
val (base, fragment) = splitOnFragment(path)
// call resolveGlobPath on the base path
(resolveGlobPath(base, hadoopConf), fragment) match {
case (resolved, Some(_)) if resolved.length > 1 => throw new SparkException(
s"${base.toString} resolves ambiguously to multiple files: ${resolved.mkString(",")}")
case (resolved, Some(namedAs)) => resolved.map(_ + "#" + namedAs)
case (resolved, _) => resolved
}
}.mkString(",")
}
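As a standalone illustration of the same idea (hypothetical path; assumes a Hadoop client on the classpath), Hadoop's FileSystem.globStatus is what performs the expansion:
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object GlobDemo {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    val uri = new URI("hdfs:///user/demo/libs/*.jar")   // hypothetical glob path
    val fs = FileSystem.get(uri, hadoopConf)
    // globStatus returns the matching entries, or null if the parent path does not exist
    val resolved = Option(fs.globStatus(new Path(uri)))
      .map(_.filter(_.isFile).map(_.getPath.toUri.toString))
      .getOrElse(Array(uri.toString))
    resolved.foreach(println)
  }
}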
// This method does the actual resolution
private def resolveGlobPath(uri: URI, hadoopConf: Configuration): Array[String] = {
uri.getScheme match {
// If the scheme is local or remotely fetchable (http/https/ftp), return the path as a single-element array
case "local" | "http" | "https" | "ftp" => Array(uri.toString)
case _ =>
// Otherwise resolve it through the Hadoop FileSystem (HDFS by default) and expand the glob
val fs = FileSystem.get(uri, hadoopConf)
Option(fs.globStatus(new Path(uri))).map { status =>
status.filter(_.isFile).map(_.getPath.toUri.toString)
}.getOrElse(Array(uri.toString))
}
}
e. User authentication;
f. Download the files and jars the job depends on into the target directory (taking YARN as the example):
// When running in YARN, for some remote resources with scheme:
// 1. Hadoop FileSystem doesn't support them.
// 2. We explicitly bypass Hadoop FileSystem with "spark.yarn.dist.forceDownloadSchemes".
// We will download them to local disk prior to add to YARN's distributed cache.
// For yarn client mode, since we already download them with above code, so we only need to
// figure out the local path and replace the remote one.
if (clusterManager == YARN) {
val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES)
def shouldDownload(scheme: String): Boolean = {
forceDownloadSchemes.contains("*") || forceDownloadSchemes.contains(scheme) ||
Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure
}
def downloadResource(resource: String): String = {
val uri = Utils.resolveURI(resource)
uri.getScheme match {
case "local" | "file" => resource
case e if shouldDownload(e) =>
val file = new File(targetDir, new Path(uri).getName)
if (file.exists()) {
file.toURI.toString
} else {
downloadFile(resource, targetDir, sparkConf, hadoopConf, secMgr)
}
case _ => uri.toString
}
}
args.primaryResource = Option(args.primaryResource).map { downloadResource }.orNull
args.files = Option(args.files).map { files =>
Utils.stringToSeq(files).map(downloadResource).mkString(",")
}.orNull
args.pyFiles = Option(args.pyFiles).map { pyFiles =>
Utils.stringToSeq(pyFiles).map(downloadResource).mkString(",")
}.orNull
args.jars = Option(args.jars).map { jars =>
Utils.stringToSeq(jars).map(downloadResource).mkString(",")
}.orNull
args.archives = Option(args.archives).map { archives =>
Utils.stringToSeq(archives).map(downloadResource).mkString(",")
}.orNull
}
g. Add the main application jar and dependency jars to the classpath:
// Add the main application jar and any added jars to classpath in case YARN client
// requires these jars.
// This assumes both primaryResource and user jars are local jars, or already downloaded
// to local by configuring "spark.yarn.dist.forceDownloadSchemes", otherwise it will not be
// added to the classpath of YARN client.
if (isYarnCluster) {
if (isUserJar(args.primaryResource)) {
childClasspath += args.primaryResource
}
if (args.jars != null) { childClasspath ++= args.jars.split(",") }
}
h. Determine the mainClass: different modes load different main classes, depending both on the language used (Python, R, etc.) and on the deploy mode (YARN, K8s, local, etc.);
Here, taking yarn-cluster mode as the example:
// In yarn-cluster mode, use yarn.Client as a wrapper around the user class
if (isYarnCluster) {
// private[deploy] val YARN_CLUSTER_SUBMIT_CLASS = "org.apache.spark.deploy.yarn.YarnClusterApplication"
// The main class required for yarn-cluster mode
childMainClass = YARN_CLUSTER_SUBMIT_CLASS
if (args.isPython) {
childArgs += ("--primary-py-file", args.primaryResource)
childArgs += ("--class", "org.apache.spark.deploy.PythonRunner")
} else if (args.isR) {
val mainFile = new Path(args.primaryResource).getName
childArgs += ("--primary-r-file", mainFile)
childArgs += ("--class", "org.apache.spark.deploy.RRunner")
} else {
if (args.primaryResource != SparkLauncher.NO_RESOURCE) {
childArgs += ("--jar", args.primaryResource)
}
childArgs += ("--class", args.mainClass)
}
if (args.childArgs != null) {
args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
}
}
9. Build the SparkApplication object and invoke its start() method;
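At the end of runMain, the childMainClass chosen above is loaded and wrapped into a SparkApplication; a simplified sketch (exception handling omitted):
// Simplified sketch of the end of SparkSubmit.runMain
val mainClass = Utils.classForName(childMainClass)
val app: SparkApplication =
  if (classOf[SparkApplication].isAssignableFrom(mainClass)) {
    // e.g. org.apache.spark.deploy.yarn.YarnClusterApplication in yarn-cluster mode
    mainClass.getConstructor().newInstance().asInstanceOf[SparkApplication]
  } else {
    // a plain class with a main() method is wrapped so it exposes the same start() interface
    new JavaMainApplication(mainClass)
  }
app.start(childArgs.toArray, sparkConf)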
10. The start() method of YarnClusterApplication is executed.
YarnClusterApplication extends SparkApplication:
private[spark] class YarnClusterApplication extends SparkApplication {
override def start(args: Array[String], conf: SparkConf): Unit = {
// SparkSubmit would use yarn cache to distribute files & jars in yarn mode,
// so remove them from sparkConf here for yarn mode.
conf.remove(JARS)
conf.remove(FILES)
// Build a ClientArguments object from the submission arguments, construct a Client with it, and call run()
new Client(new ClientArguments(args), conf, null).run()
}
}
The ClientArguments class:
// TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware !
private[spark] class ClientArguments(args: Array[String]) {
var userJar: String = null
var userClass: String = null
var primaryPyFile: String = null
var primaryRFile: String = null
var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()
parseArgs(args.toList)
private def parseArgs(inputArgs: List[String]): Unit = {
var args = inputArgs
while (!args.isEmpty) {
args match {
case ("--jar") :: value :: tail => //任务提交时,--jar 传入的jar包
userJar = value
args = tail
case ("--class") :: value :: tail => //--class指定的Class类
userClass = value
args = tail
case ("--primary-py-file") :: value :: tail =>
primaryPyFile = value
args = tail
case ("--primary-r-file") :: value :: tail =>
primaryRFile = value
args = tail
case ("--arg") :: value :: tail =>
userArgs += value
args = tail
case Nil =>
case _ =>
throw new IllegalArgumentException(getUsageMessage(args))
}
}
if (primaryPyFile != null && primaryRFile != null) {
throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" +
" at the same time")
}
}
private def getUsageMessage(unknownParam: List[String] = null): String = {
val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else ""
message +
s"""
|Usage: org.apache.spark.deploy.yarn.Client [options]
|Options:
| --jar JAR_PATH Path to your application's JAR file (required in YARN cluster
| mode)
| --class CLASS_NAME Name of your application's main class (required)
| --primary-py-file A main Python file
| --primary-r-file A main R file
| --arg ARG Argument to be passed to your application's main class.
| Multiple invocations are possible, each will be passed in order.
""".stripMargin
}
}
11. Build the YarnClient object;
This corresponds to the new Client(new ClientArguments(args), conf, null).run() call from the previous step.
Key pieces inside YarnClientImpl:
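For orientation: Spark's Client creates the Hadoop YarnClient internally; the Hadoop-side detail below is an assumption sketched from the Hadoop client sources, shown only to indicate where the RM communication happens:
// Inside org.apache.spark.deploy.yarn.Client (Spark side)
private val yarnClient = YarnClient.createYarnClient
// Inside Hadoop's YarnClientImpl (assumption, roughly): the client holds an RPC proxy to the
// ResourceManager (a field like `ApplicationClientProtocol rmClient`), which backs the
// createApplication() and submitApplication() calls used in step 13.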
12. Execute the Client object's run() method:
private[spark] class Client(
val args: ClientArguments,
val sparkConf: SparkConf,
val rpcEnv: RpcEnv)
extends Logging {
/**
* Submit an application to the ResourceManager.
* If set spark.yarn.submit.waitAppCompletion to true, it will stay alive
* reporting the application's status until the application has exited for any reason.
* Otherwise, the client process will exit after submission.
* If the application finishes with a failed, killed, or undefined status,
* throw an appropriate SparkException.
*/
def run(): Unit = {
this.appId = submitApplication() // submit the application and get back the unique YARN application id (appId)
if (!launcherBackend.isConnected() && fireAndForget) {
val report = getApplicationReport(appId)
val state = report.getYarnApplicationState
logInfo(s"Application report for $appId (state: $state)")
logInfo(formatReportDetails(report))
if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
throw new SparkException(s"Application $appId finished with status: $state")
}
} else {
val YarnAppReport(appState, finalState, diags) = monitorApplication(appId)
if (appState == YarnApplicationState.FAILED || finalState == FinalApplicationStatus.FAILED) {
diags.foreach { err =>
logError(s"Application diagnostics message: $err")
}
throw new SparkException(s"Application $appId finished with failed status")
}
if (appState == YarnApplicationState.KILLED || finalState == FinalApplicationStatus.KILLED) {
throw new SparkException(s"Application $appId is killed")
}
if (finalState == FinalApplicationStatus.UNDEFINED) {
throw new SparkException(s"The final status of application $appId is undefined")
}
}
}
}
13. Execute the submitApplication() method
/**
* Submit an application running our ApplicationMaster to the ResourceManager.
*
* The stable Yarn API provides a convenience method (YarnClient#createApplication) for
* creating applications and setting up the application submission context. This was not
* available in the alpha API.
*/
def submitApplication(): ApplicationId = {
ResourceRequestHelper.validateResources(sparkConf)
var appId: ApplicationId = null
try {
launcherBackend.connect() // connect the launcher backend (used to report state back to the launcher)
yarnClient.init(hadoopConf) // initialize the yarnClient from the Hadoop configuration
yarnClient.start() // start the client
logInfo("Requesting a new application from cluster with %d NodeManagers"
.format(yarnClient.getYarnClusterMetrics.getNumNodeManagers))
// Get a new application from our RM
val newApp = yarnClient.createApplication() // ask the ResourceManager to create a new application
val newAppResponse = newApp.getNewApplicationResponse()
appId = newAppResponse.getApplicationId() // get the unique application id from the response
// The app staging dir based on the STAGING_DIR configuration if configured
// otherwise based on the users home directory.
val appStagingBaseDir = sparkConf.get(STAGING_DIR)
.map { new Path(_, UserGroupInformation.getCurrentUser.getShortUserName) }
.getOrElse(FileSystem.get(hadoopConf).getHomeDirectory())
stagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
new CallerContext("CLIENT", sparkConf.get(APP_CALLER_CONTEXT),
Option(appId.toString)).setCurrentContext()
// Verify whether the cluster has enough resources for our AM
verifyClusterResources(newAppResponse)
// Set up the appropriate contexts to launch our AM
val containerContext = createContainerLaunchContext(newAppResponse) // create the container launch context
val appContext = createApplicationSubmissionContext(newApp, containerContext) // create the application submission context
// Finally, submit and monitor the application
logInfo(s"Submitting application $appId to ResourceManager")
// yarnClient submits appContext to the ResourceManager; the RM then acts on the request
yarnClient.submitApplication(appContext)
launcherBackend.setAppId(appId.toString)
reportLauncherState(SparkAppHandle.State.SUBMITTED) // report the handle state as SUBMITTED
appId
} catch {
case e: Throwable =>
if (stagingDirPath != null) {
cleanupStagingDir()
}
throw e
}
}
14. Create the YARN container launch context (the important part)
/**
* Set up a ContainerLaunchContext to launch our ApplicationMaster container.
* This sets up the launch environment, java options, and the command for launching the AM.
*/
private def createContainerLaunchContext(newAppResponse: GetNewApplicationResponse)
: ContainerLaunchContext = {
logInfo("Setting up container launch context for our AM")
val appId = newAppResponse.getApplicationId
// ... intermediate configuration code omitted ...
// For log4j configuration to reference
javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR)
val userClass =
if (isClusterMode) {
Seq("--class", YarnSparkHadoopUtil.escapeForShell(args.userClass))
} else {
Nil
}
val userJar =
if (args.userJar != null) {
Seq("--jar", args.userJar)
} else {
Nil
}
val primaryPyFile =
if (isClusterMode && args.primaryPyFile != null) {
Seq("--primary-py-file", new Path(args.primaryPyFile).getName())
} else {
Nil
}
val primaryRFile =
if (args.primaryRFile != null) {
Seq("--primary-r-file", args.primaryRFile)
} else {
Nil
}
// Choose the AM class depending on the deploy mode
val amClass =
if (isClusterMode) {
// cluster mode: use ApplicationMaster
Utils.classForName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
} else {
// client mode: use ExecutorLauncher
Utils.classForName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
}
if (args.primaryRFile != null &&
(args.primaryRFile.endsWith(".R") || args.primaryRFile.endsWith(".r"))) {
args.userArgs = ArrayBuffer(args.primaryRFile) ++ args.userArgs
}
val userArgs = args.userArgs.flatMap { arg =>
Seq("--arg", YarnSparkHadoopUtil.escapeForShell(arg))
}
// Arguments used to launch the AM process
val amArgs =
Seq(amClass) ++ userClass ++ userJar ++ primaryPyFile ++ primaryRFile ++ userArgs ++
Seq("--properties-file",
buildPath(Environment.PWD.$$(), LOCALIZED_CONF_DIR, SPARK_CONF_FILE)) ++
Seq("--dist-cache-conf",
buildPath(Environment.PWD.$$(), LOCALIZED_CONF_DIR, DIST_CACHE_CONF_FILE))
// Command for the ApplicationMaster
// The AM process is started with a plain java command
val commands = prefixEnv ++
Seq(Environment.JAVA_HOME.$$() + "/bin/java", "-server") ++
javaOpts ++ amArgs ++
Seq(
"1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout",
"2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr")
// TODO: it would be nicer to just make sure there are no null commands here
val printableCommands = commands.map(s => if (s == null) "null" else s).toList
// Put the AM launch command into the AM container spec
amContainer.setCommands(printableCommands.asJava)
// send the acl settings into YARN to control who has access via YARN interfaces
val securityManager = new SecurityManager(sparkConf)
amContainer.setApplicationACLs(
YarnSparkHadoopUtil.getApplicationAclsForYarn(securityManager).asJava)
setupSecurityToken(amContainer)
amContainer
}
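Putting the pieces together, the command the NodeManager eventually runs to start the AM JVM looks roughly like the following (paths and options are placeholders and vary by environment):
{{JAVA_HOME}}/bin/java -server <javaOpts> org.apache.spark.deploy.yarn.ApplicationMaster --class <userClass> --jar <userJar> --arg <userArgs> --properties-file <localized spark conf> 1> <LOG_DIR>/stdout 2> <LOG_DIR>/stderr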
15. Launch the ApplicationMaster process
Once the process starts, look at the AM's main method:
object ApplicationMaster extends Logging {
// exit codes for different causes, no reason behind the values
private val EXIT_SUCCESS = 0
private val EXIT_UNCAUGHT_EXCEPTION = 10
private val EXIT_MAX_EXECUTOR_FAILURES = 11
private val EXIT_REPORTER_FAILURE = 12
private val EXIT_SC_NOT_INITED = 13
private val EXIT_SECURITY = 14
private val EXIT_EXCEPTION_USER_CLASS = 15
private val EXIT_EARLY = 16
private var master: ApplicationMaster = _
// Entry point of the AM process
def main(args: Array[String]): Unit = {
SignalUtils.registerLogger(log)
val amArgs = new ApplicationMasterArguments(args) // parse the arguments passed at submission
/* For reference, ApplicationMasterArguments parses the submission-time arguments like this:
while (!args.isEmpty) {
// --num-workers, --worker-memory, and --worker-cores are deprecated since 1.0,
// the properties with executor in their names are preferred.
args match {
case ("--jar") :: value :: tail =>
userJar = value
args = tail
case ("--class") :: value :: tail =>
userClass = value
args = tail
case ("--primary-py-file") :: value :: tail =>
primaryPyFile = value
args = tail
case ("--primary-r-file") :: value :: tail =>
primaryRFile = value
args = tail
case ("--arg") :: value :: tail =>
userArgsBuffer += value
args = tail
case ("--properties-file") :: value :: tail =>
propertiesFile = value
args = tail
case ("--dist-cache-conf") :: value :: tail =>
distCacheConf = value
args = tail
case _ =>
printUsageAndExit(1, args)
}
*/
val sparkConf = new SparkConf()
if (amArgs.propertiesFile != null) {
Utils.getPropertiesFromFile(amArgs.propertiesFile).foreach { case (k, v) =>
sparkConf.set(k, v)
}
}
// Set system properties for each config entry. This covers two use cases:
// - The default configuration stored by the SparkHadoopUtil class
// - The user application creating a new SparkConf in cluster mode
//
// Both cases create a new SparkConf object which reads these configs from system properties.
sparkConf.getAll.foreach { case (k, v) =>
sys.props(k) = v
}
val yarnConf = new YarnConfiguration(SparkHadoopUtil.newConfiguration(sparkConf))
// Create the ApplicationMaster object
master = new ApplicationMaster(amArgs, sparkConf, yarnConf)
val ugi = sparkConf.get(PRINCIPAL) match {
// We only need to log in with the keytab in cluster mode. In client mode, the driver
// handles the user keytab.
case Some(principal) if master.isClusterMode =>
val originalCreds = UserGroupInformation.getCurrentUser().getCredentials()
SparkHadoopUtil.get.loginUserFromKeytab(principal, sparkConf.get(KEYTAB).orNull)
val newUGI = UserGroupInformation.getCurrentUser()
if (master.appAttemptId == null || master.appAttemptId.getAttemptId > 1) {
// Re-obtain delegation tokens if this is not a first attempt, as they might be outdated
// as of now. Add the fresh tokens on top of the original user's credentials (overwrite).
// Set the context class loader so that the token manager has access to jars
// distributed by the user.
Utils.withContextClassLoader(master.userClassLoader) {
val credentialManager = new HadoopDelegationTokenManager(sparkConf, yarnConf, null)
credentialManager.obtainDelegationTokens(originalCreds)
}
}
// Transfer the original user's tokens to the new user, since it may contain needed tokens
// (such as those user to connect to YARN).
newUGI.addCredentials(originalCreds)
newUGI
case _ =>
SparkHadoopUtil.get.createSparkUser()
}
ugi.doAs(new PrivilegedExceptionAction[Unit]() {
override def run(): Unit = System.exit(master.run())
})
}
private[spark] def sparkContextInitialized(sc: SparkContext): Unit = {
master.sparkContextInitialized(sc)
}
private[spark] def getAttemptId(): ApplicationAttemptId = {
master.appAttemptId
}
private[spark] def getHistoryServerAddress(
sparkConf: SparkConf,
yarnConf: YarnConfiguration,
appId: String,
attemptId: String): String = {
sparkConf.get(HISTORY_SERVER_ADDRESS)
.map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) }
.map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" }
.getOrElse("")
}
}
Inside the ApplicationMaster object, a YarnRMClient is created:
private[spark] class ApplicationMaster(
args: ApplicationMasterArguments,
sparkConf: SparkConf,
yarnConf: YarnConfiguration) extends Logging {
// ... preceding members omitted ...
// Create the YarnRMClient object
private val client = new YarnRMClient()
// ... remaining members omitted ...
}
Inside YarnRMClient, an AMRMClient is created:
private[spark] class YarnRMClient extends Logging {
// AMRMClient: the client through which the ApplicationMaster talks to the ResourceManager
private var amClient: AMRMClient[ContainerRequest] = _
private var uiHistoryAddress: String = _
private var registered: Boolean = false
}
16. Execute the run method of the AM;
final def run(): Int = {
try {
// preparation code for runDriver / runExecutorLauncher omitted
// Most important: in cluster mode, runDriver is executed
if (isClusterMode) {
runDriver()
} else {
// otherwise (client mode), runExecutorLauncher is executed
runExecutorLauncher()
}
} catch {
// ... exception handling omitted ...
} finally {
// ... cleanup omitted ...
}
exitCode
}
This concludes part 1; the rest is covered in part 2:
Spark notes: task submission source code walkthrough, part 2