Continuing from the previous article, "Source Code Analysis of org.apache.spark.launcher.Main".
The parent class of SparkSubmitArgumentsParser is SparkSubmitOptionParser, and the OptionParser used when launcher.Main runs also has SparkSubmitOptionParser as its parent. This parent class defines a method named parse, whose job is to assign the values passed in by spark-submit to the corresponding Spark variables, e.g. the value of --class is stored in the mainClass variable. SparkSubmitArguments also calls this inherited parse method during its own initialization.
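Before reading the real source, here is a minimal, self-contained sketch of this template-method pattern (hypothetical classes, not Spark's actual code): the parent class owns the parsing loop, and a subclass's handle() callback stores each value.

// Hypothetical mini-parser illustrating the parse/handle pattern used by
// SparkSubmitOptionParser and its subclasses.
abstract class MiniOptionParser {
  protected def handle(opt: String, value: String): Boolean

  // The base class owns the loop; it only reports (option, value) pairs.
  final def parse(args: Seq[String]): Unit = {
    var i = 0
    while (i + 1 < args.length && args(i).startsWith("--")) {
      handle(args(i), args(i + 1)) // e.g. ("--class", "dt.scala.App")
      i += 2
    }
  }
}

class MiniSubmitArguments(args: Seq[String]) extends MiniOptionParser {
  var mainClass: String = null
  parse(args) // mirrors SparkSubmitArguments calling parse() during init

  override protected def handle(opt: String, value: String): Boolean = {
    if (opt == "--class") mainClass = value
    true
  }
}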
object SparkSubmit {
...
  private[spark] def printVersionAndExit(): Unit = {
    printStream.println("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version %s
      /_/
                        """.format(SPARK_VERSION))
    printStream.println("Type --help for more information.")
exitFn(0)
}
// scalastyle:on println
def main(args: Array[String]): Unit = {
    /** SparkSubmitArguments encapsulates the arguments passed in by spark-submit.
     * What spark-shell passes in: "org.apache.spark.deploy.SparkSubmit"
     * --class org.apache.spark.repl.Main --name "Spark shell" --master spark://luyl152:7077,luyl153:7077,luyl154:7077
     * What a user application passes in via spark-submit:
     * --master spark://luyl152:7077,luyl153:7077,luyl154:7077 --class dt.scala.App /tool/jarDir/maven_scala-1.0-SNAPSHOT.jar
     */
val appArgs = new SparkSubmitArguments(args)
I. First, look at the parse(args) call made while new SparkSubmitArguments(args) initializes
/**
 * Parses and encapsulates arguments from the spark-submit script.
 * The env argument is used for testing.
 * sys.env pulls in every environment variable defined on the system.
 */
private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, String] = sys.env)
  extends SparkSubmitArgumentsParser {
  var master: String = null
  var deployMode: String = null
  var executorMemory: String = null
  var executorCores: String = null
  var totalExecutorCores: String = null
  var propertiesFile: String = null
  var driverMemory: String = null
  var driverExtraClassPath: String = null
  var driverExtraLibraryPath: String = null
  var driverExtraJavaOptions: String = null
  ...
  // Set parameters from command line arguments
  try {
    parse(args.asJava) // parse() takes a java.util.List, so the Scala Seq has to be converted
  } catch {
    case e: IllegalArgumentException =>
      SparkSubmit.printErrorAndExit(e.getMessage())
  }
  ...
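As an aside, a two-line sketch of the Seq-to-java.util.List conversion used by parse(args.asJava) above (Spark 2.x uses scala.collection.JavaConverters for this):

import scala.collection.JavaConverters._

val scalaArgs: Seq[String] = Seq("--class", "dt.scala.App")
val javaArgs: java.util.List[String] = scalaArgs.asJava // what parse() receives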
===> Now look at the parse code in SparkSubmitOptionParser:
/**
 * Parse a list of spark-submit command line options.
 * <p>
 * See SparkSubmitArguments.scala for a more formal description of available options.
 *
 * @throws IllegalArgumentException If an error is found during parsing.
 * The arguments look like: --class org.apache.spark.repl.Main --name "Spark shell" --master spark://luyl152:7077.
 * The job of this method is to assign each spark-submit argument's value to the corresponding Spark
 * variable, e.g. the value of --class is stored in the mainClass variable.
 */
protected final void parse(List<String> args) {
  // spark-submit also accepts SparkConf settings via --conf PROP=VALUE; see the end of
  // org.apache.spark.deploy.SparkSubmitArguments for the full list, or run spark-submit -h.
  Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)");
  int idx = 0;
  for (idx = 0; idx < args.size(); idx++) {
    String arg = args.get(idx);
    String value = null;
    // For an argument written in the --name=value form, arg and value are re-assigned inside the if block.
    Matcher m = eqSeparatedOpt.matcher(arg);
    if (m.matches()) {
      arg = m.group(1);   // the option name, e.g. --conf
      value = m.group(2); // the value
    }
    // Look for options with a value.
    // findCliOption matches a leading -- option (e.g. "--class") against the opts two-dimensional array;
    // it returns the canonical option name on a match, or null otherwise.
    String name = findCliOption(arg, opts);
    if (name != null) {
      if (value == null) {
        if (idx == args.size() - 1) { // the option matched but no value follows, so fail;
                                      // e.g. with only "--class", size is 1, idx is 0, and 1 - 1 == 0
          throw new IllegalArgumentException(
              String.format("Missing argument for option '%s'.", arg));
        }
        idx++;
        value = args.get(idx); // otherwise the value is the argument right after the option
      }
      // name is a spark-submit option such as --class and value is its argument.
      // handle() is implemented in OptionParser itself; it assigns the value to the matching Spark
      // variable, e.g. --class goes into mainClass (the implementation is trivial, so it is not shown here).
      if (!handle(name, value)) {
        break;
      }
      continue; // idx is advanced past the value only when the option matched
    }
    // Look for a switch. If nothing matched above, check for a bare switch such as --verbose.
    name = findCliOption(arg, switches);
    if (name != null) {
      if (!handle(name, null)) {
        break;
      }
      continue;
    }
    if (!handleUnknown(arg)) {
      break;
    }
  }
  if (idx < args.size()) {
    idx++;
  }
  // Any leftover arguments are handed to SparkSubmitCommandBuilder() { this.sparkArgs = new ArrayList<String>(); ... }
  handleExtraArgs(args.subList(idx, args.size()));
}
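To make the regex branch concrete, a small runnable sketch of what eqSeparatedOpt does with an =-separated argument (the values here are only illustrative):

import java.util.regex.Pattern

val eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)")
val m = eqSeparatedOpt.matcher("--master=spark://luyl152:7077")
if (m.matches()) {
  val name  = m.group(1) // "--master"
  val value = m.group(2) // "spark://luyl152:7077"
  println(s"$name -> $value")
}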
===> The OptionParser implementation of handle(name, value) seen above:
/**
 * Assigns each spark-submit argument's value to the corresponding Spark variable.
 */
@Override
protected boolean handle(String opt, String value) {
  if (opt.equals(MASTER)) {
    master = value;
  } else if (opt.equals(DEPLOY_MODE)) {
    deployMode = value;
  } else if (opt.equals(PROPERTIES_FILE)) {
    propertiesFile = value;
  } else if (opt.equals(DRIVER_MEMORY)) {
    conf.put(SparkLauncher.DRIVER_MEMORY, value);
  } else if (opt.equals(DRIVER_JAVA_OPTIONS)) {
    conf.put(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, value);
  } else if (opt.equals(DRIVER_LIBRARY_PATH)) {
    conf.put(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, value); ...
===> Once new SparkSubmitArguments(args) has finished initializing, we return to the SparkSubmit$.main method.
    // verbose becomes true whenever --verbose is passed on the spark-submit command line (the source is
    // trivial); it prints a lot of useful information, e.g.:
    // ./spark-submit --class dt.spark.DriverInWinDebuggerCluster --master spark://luyl152:7077,luyl153:7077,luyl154:7077 --verbose /tool/maven_scala-1.0-SNAPSHOT.jar
if (appArgs.verbose) {
// scalastyle:off println
printStream.println(appArgs)
/**
* Main class:
dt.spark.DriverInWinDebuggerCluster
Arguments:
System properties:
       spark.yarn.historyServer.address -> luyl152:18080
       spark.eventLog.enabled -> true
       SPARK_SUBMIT -> true
       spark.executor.extraJavaOptions -> -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
       spark.history.fs.logDirectory -> hdfs://ns1/historyserverforspark
       spark.app.name -> dt.spark.DriverInWinDebuggerCluster
       spark.jars -> file:/tool/maven_scala-1.0-SNAPSHOT.jar
       spark.submit.deployMode -> client   # client is the default; search this class for --deploy-mode for details
       spark.eventLog.dir -> hdfs://ns1/historyserverforspark
       spark.master -> spark://luyl152:7077,luyl153:7077,luyl154:7077
Classpath elements:
file:/tool/maven_scala-1.0-SNAPSHOT.jar
*/
// scalastyle:on println
}
    /** The action here is what spark-submit was asked to do: SUBMIT, KILL, or REQUEST_STATUS
     * (wrapped in SparkSubmitAction). If none was specified, SparkSubmitArguments defaults it to
     * SparkSubmitAction.SUBMIT, so the pattern match below executes submit(appArgs).
     */
    appArgs.action match {
case SparkSubmitAction.SUBMIT => submit(appArgs)
case SparkSubmitAction.KILL => kill(appArgs)
case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
}
}
II. Step into submit(appArgs) and analyze it:
/**
 * Submit the application using the provided parameters.
 *
 * This runs in two steps. First, we prepare the launch environment by setting up
 * the appropriate classpath, system properties, and application arguments for
 * running the child main class based on the cluster manager and the deploy mode.
 * Second, we use this launch environment to invoke the main method of the child
 * main class.
 *
 * In short, submit() submits the application with the arguments passed in, in two steps:
 * 1. Prepare the launch environment: the classpath, system properties, and application arguments
 *    (for running the child main class according to the deploy mode and cluster manager).
 * 2. Use that environment to invoke the main method of the child main class. Here we only consider
 *    client mode; cluster mode will be analyzed separately later.
 * So for spark-shell the child main class is org.apache.spark.repl.Main, and for a direct
 * spark-submit it is the user application class (the one containing the main method).
 */
private def submit(args: SparkSubmitArguments): Unit = {
  // childArgs: the arguments for the main class's main method (an ArrayBuffer);
  // childClasspath: the main class's jar, /tool/jarDir/maven_scala-1.0-SNAPSHOT.jar;
  // sysProps: a Map of SparkConf-related properties; childMainClass: the fully qualified main class, dt.scala.App
  val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)
1) First, let's analyze the prepareSubmitEnvironment(args) method:
===> This method is very long; we only walk through the code relevant to the current scenario.
private[deploy] def prepareSubmitEnvironment(args: SparkSubmitArguments)
    : (Seq[String], Seq[String], Map[String, String], String) = {
  // Return values
  val childArgs = new ArrayBuffer[String]()
  val childClasspath = new ArrayBuffer[String]()
  val sysProps = new HashMap[String, String]()
  var childMainClass = ""
  // Set the cluster manager; master is spark://luyl152:7077,luyl153:7077,luyl154:7077
  val clusterManager: Int = args.master match {
    case m if m.startsWith("yarn") => YARN
    case m if m.startsWith("spark") => STANDALONE
    case m if m.startsWith("mesos") => MESOS
    case m if m.startsWith("local") => LOCAL
    case _ => printErrorAndExit("Master must start with yarn, spark, mesos, or local"); -1
  }
  // Set the deploy mode; default is client mode. args.deployMode is null by default, so deployMode becomes CLIENT.
  var deployMode: Int = args.deployMode match {
    case "client" | null => CLIENT
    case "cluster" => CLUSTER
    case _ => printErrorAndExit("Deploy mode must be either client or cluster"); -1
  }
  ... code related to YARN, Mesos, R, etc. — we will look at it when we study those modes
  // Update args.deployMode if it is null. It will be passed down as a Spark property later.
  // Since deployMode resolved to CLIENT when args.deployMode was null, args.deployMode is set to "client" here.
  (args.deployMode, deployMode) match {
    case (null, CLIENT) => args.deployMode = "client"
    case (null, CLUSTER) => args.deployMode = "cluster"
    case _ =>
  }
  ...
  // Special flag to avoid deprecation warnings at the client
  sysProps("SPARK_SUBMIT") = "true"
  // A list of rules to map each argument to system properties or command-line options in
  // each deploy mode; we iterate through these below
  val options = List[OptionAssigner](
    // All cluster managers
    OptionAssigner(args.master, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.master"),
    OptionAssigner(args.deployMode, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
      sysProp = "spark.submit.deployMode"),
    OptionAssigner(args.name, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.app.name"),
    OptionAssigner(args.jars, ALL_CLUSTER_MGRS, CLIENT, sysProp = "spark.jars"),
    OptionAssigner(args.ivyRepoPath, ALL_CLUSTER_MGRS, CLIENT, sysProp = "spark.jars.ivy"),
    OptionAssigner(args.driverMemory, ALL_CLUSTER_MGRS, CLIENT,
      sysProp = "spark.driver.memory"),
    OptionAssigner(args.driverExtraClassPath, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
      sysProp = "spark.driver.extraClassPath"),
    OptionAssigner(args.driverExtraJavaOptions, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
      sysProp = "spark.driver.extraJavaOptions"),
    OptionAssigner(args.driverExtraLibraryPath, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
      sysProp = "spark.driver.extraLibraryPath"),
    // Yarn client only
    ...
    // Other options
    OptionAssigner(args.executorCores, STANDALONE | YARN, ALL_DEPLOY_MODES,
      sysProp = "spark.executor.cores"),
    OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN, ALL_DEPLOY_MODES,
      sysProp = "spark.executor.memory"),
    OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS, ALL_DEPLOY_MODES,
      sysProp = "spark.cores.max"),
    OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, ALL_DEPLOY_MODES,
      sysProp = "spark.files"),
    OptionAssigner(args.jars, STANDALONE | MESOS, CLUSTER, sysProp = "spark.jars"),
    OptionAssigner(args.driverMemory, STANDALONE | MESOS, CLUSTER,
      sysProp = "spark.driver.memory"),
    OptionAssigner(args.driverCores, STANDALONE | MESOS, CLUSTER,
      sysProp = "spark.driver.cores"),
    OptionAssigner(args.supervise.toString, STANDALONE | MESOS, CLUSTER,
      sysProp = "spark.driver.supervise"),
    OptionAssigner(args.ivyRepoPath, STANDALONE, CLUSTER, sysProp = "spark.jars.ivy")
  )
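===> The loop that consumes this list comes right after it in the real source (elided here). A hedged sketch of the selection logic, with simplified bit-flag constants standing in for Spark's real ones: an OptionAssigner entry is applied only when its cluster-manager mask and deploy-mode mask both match the current run.

// Simplified stand-ins for Spark's bit-flag constants (illustrative values only).
val STANDALONE = 2
val CLIENT = 1

case class OptionAssigner(value: String, clusterManager: Int,
                          deployMode: Int, sysProp: String)

val clusterManager = STANDALONE // resolved from args.master earlier
val deployMode = CLIENT         // resolved from args.deployMode earlier
val sysProps = scala.collection.mutable.HashMap[String, String]()

val options = Seq(
  OptionAssigner("spark://luyl152:7077", STANDALONE, CLIENT, "spark.master"))

// An entry is copied into sysProps only when both masks match and the value is set.
options.foreach { opt =>
  if (opt.value != null &&
      (deployMode & opt.deployMode) != 0 &&
      (clusterManager & opt.clusterManager) != 0) {
    sysProps(opt.sysProp) = opt.value
  }
}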
  // In client mode, launch the application main class directly
  // In addition, add the main application jar and any added jars (if any) to the classpath
  if (deployMode == CLIENT) {
    childMainClass = args.mainClass
    // args.mainClass is the user's own main class;
    // primaryResource is the user-supplied jar: /tool/jarDir/maven_scala-1.0-SNAPSHOT.jar
    if (isUserJar(args.primaryResource)) {
      childClasspath += args.primaryResource
    }
    // add any third-party --jars
    if (args.jars != null) { childClasspath ++= args.jars.split(",") }
    if (args.childArgs != null) { childArgs ++= args.childArgs }
  }
  ...
  // Add the application jar automatically so the user doesn't have to call sc.addJar
  // For YARN cluster mode, the jar is already distributed on each node as "app.jar"
  // For python and R files, the primary resource is already distributed as a regular file
  // This branch is entered when the app is not YARN cluster, not Python, and not R.
  if (!isYarnCluster && !args.isPython && !args.isR) {
    var jars = sysProps.get("spark.jars").map(x => x.split(",").toSeq).getOrElse(Seq.empty)
    if (isUserJar(args.primaryResource)) {
      jars = jars ++ Seq(args.primaryResource)
    }
    // store the third-party --jars in spark.jars, comma-separated
    sysProps.put("spark.jars", jars.mkString(","))
  }
  ...
  // Load any properties specified through --conf and the default properties file
  for ((k, v) <- args.sparkProperties) {
    sysProps.getOrElseUpdate(k, v)
  }
  ...
  // Resolve paths in certain spark properties
  val pathConfigs = Seq(
    "spark.jars",
    "spark.files",
    "spark.yarn.jar",
    "spark.yarn.dist.files",
    "spark.yarn.dist.archives")
  pathConfigs.foreach { config =>
    // Replace old URIs with resolved URIs, if they exist
    sysProps.get(config).foreach { oldValue =>
      sysProps(config) = Utils.resolveURIs(oldValue)
    }
  }
  ...
  (childArgs, childClasspath, sysProps, childMainClass)
}
===> When finished, the method returns a tuple of (the arguments for the --class main method, the main class's jar(s), the sysProps map, and the fully qualified name of the --class main class).
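One detail worth pausing on is the path-resolution step above: every entry of spark.jars, spark.files, etc. is normalized into a full URI. A hedged sketch approximating what Utils.resolveURIs does (the real implementation in org.apache.spark.util.Utils handles more edge cases):

import java.io.File
import java.net.URI

// Approximation only: make each comma-separated path an absolute URI,
// defaulting bare local paths to the file: scheme.
def resolveURIsSketch(paths: String): String =
  paths.split(",").filter(_.nonEmpty).map { p =>
    val uri = new URI(p)
    if (uri.getScheme != null) uri.toString
    else new File(p).getAbsoluteFile.toURI.toString
  }.mkString(",")

// resolveURIsSketch("maven_scala-1.0-SNAPSHOT.jar,hdfs://ns1/a.jar")
// => "file:/<current dir>/maven_scala-1.0-SNAPSHOT.jar,hdfs://ns1/a.jar"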
III. Back in the submit(appArgs) method
// doRunMain() is invoked by the code that follows it
def doRunMain(): Unit = {
  if (args.proxyUser != null) {
    // spark-submit can impersonate a Linux user when --proxy-user is given at submit time;
    // otherwise args.proxyUser is null
    val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
      UserGroupInformation.getCurrentUser())
    try {
      proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
        override def run(): Unit = {
          runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
        }
      })
    } catch {
      case e: Exception =>
        // Hadoop's AuthorizationException suppresses the exception's stack trace, which
        // makes the message printed to the output by the JVM not very helpful. Instead,
        // detect exceptions with empty stack traces here, and treat them differently.
        if (e.getStackTrace().length == 0) {
          // scalastyle:off println
          printStream.println(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
          // scalastyle:on println
          exitFn(1)
        } else {
          throw e
        }
    }
  } else {
    runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
  }
}
    // In standalone cluster mode, there are two submission gateways:
    //  (1) The traditional Akka gateway using o.a.s.deploy.Client as a wrapper
    //  (2) The new REST-based gateway introduced in Spark 1.3
    // The latter is the default behavior as of Spark 1.3, but Spark submit will fail over
    // to use the legacy gateway if the master endpoint turns out to be not a REST server.
    /** http://spark.apache.org/docs/latest/submitting-applications.html
     * http://blog.csdn.net/Trigl/article/details/72732241
     * 1. In standalone mode, running spark-submit without --deploy-mode gives it the value client. Use this
     *    mode when the client machine is on the same network segment as the master, because the driver must
     *    communicate with the executors, e.g. to distribute the jar to them over Netty HTTP and to assign
     *    them tasks.
     *    a. The machine running the spark-submit script is the client, and the process running your own main
     *       method is the driver; in client mode they are on the same node.
     *    b. The driver acts as an independent third-party client: the driver process does not run on a worker
     *       node, so it does not consume the worker cluster's resources.
     *    c. Client mode has no supervised-restart mechanism (setting --supervise has no effect); if the driver
     *       process dies, something external must restart it.
     *
     * 2. Passing --deploy-mode cluster to spark-submit is meant for the case where the workers and the
     *    submitting machine are not on the same network segment; the driver then runs on a worker node and
     *    consumes that worker's resources.
     *    a. In cluster mode, --supervise can be set to monitor the driver and restart it automatically if it dies.
     *    b. In cluster mode the submitting machine and the workers are usually on different network segments,
     *       so the jars should be placed on the worker nodes in advance.
     *    c. The driver class, i.e. your own main class, is assigned to a node by the master.
     */
    // isStandaloneCluster means master.startsWith("spark://") && deployMode == "cluster"; deployMode defaults
    // to client, and useRest defaults to true
    if (args.isStandaloneCluster && args.useRest) {
      try {
        // scalastyle:off println
        printStream.println("Running Spark using the REST application submission protocol.")
        // scalastyle:on println
        doRunMain()
      } catch {
        // Fail over to use the legacy submission gateway
        case e: SubmitRestConnectionException =>
          printWarning(s"Master endpoint ${args.master} was not a REST server. " +
            "Falling back to legacy submission gateway instead.")
          args.useRest = false
          submit(args)
      }
    // In all other modes, just run the main class as prepared
    } else {
      doRunMain()
    }
}
IV. Step into the implementation of runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose):
/** childArgs: the arguments for the main class's main method (an ArrayBuffer);
 * childClasspath: the main class's jar, /tool/jarDir/maven_scala-1.0-SNAPSHOT.jar;
 * sysProps: a Map of SparkConf-related properties (produced by SparkSubmit.prepareSubmitEnvironment());
 * childMainClass: the fully qualified main class, e.g. dt.scala.App
 */
private def runMain(
    childArgs: Seq[String],
    childClasspath: Seq[String],
    sysProps: Map[String, String],
    childMainClass: String,
    verbose: Boolean): Unit = {
  // scalastyle:off println
  // verbose becomes true whenever --verbose is passed to spark-submit (the source is trivial)
  if (verbose) {
    printStream.println(s"Main class:\n$childMainClass")
    printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
    printStream.println(s"System properties:\n${sysProps.mkString("\n")}")
    printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
    printStream.println("\n")
  }
  // scalastyle:on println
  // The class loader extends URLClassLoader and adds the jars to the JVM; MutableURLClassLoader is used to
  // load them (or ChildFirstURLClassLoader when spark.driver.userClassPathFirst is set).
  val loader =
    if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) {
      new ChildFirstURLClassLoader(new Array[URL](0),
        Thread.currentThread.getContextClassLoader)
    } else {
      new MutableURLClassLoader(new Array[URL](0),
        Thread.currentThread.getContextClassLoader)
    }
  Thread.currentThread.setContextClassLoader(loader)
  for (jar <- childClasspath) {
    addJarToClasspath(jar, loader)
  }
  for ((key, value) <- sysProps) {
    System.setProperty(key, value)
  }
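===> For intuition: MutableURLClassLoader essentially widens URLClassLoader.addURL to public so that addJarToClasspath can feed it jar URLs at runtime. A plain-JDK approximation (MiniMutableLoader is a hypothetical name):

import java.io.File
import java.net.{URL, URLClassLoader}

// Hypothetical stand-in for MutableURLClassLoader: expose addURL publicly.
class MiniMutableLoader(urls: Array[URL], parent: ClassLoader)
    extends URLClassLoader(urls, parent) {
  override def addURL(url: URL): Unit = super.addURL(url)
}

val loader = new MiniMutableLoader(new Array[URL](0),
  Thread.currentThread.getContextClassLoader)
loader.addURL(new File("/tool/maven_scala-1.0-SNAPSHOT.jar").toURI.toURL)
Thread.currentThread.setContextClassLoader(loader)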
  var mainClass: Class[_] = null
  // load the main class reflectively via Class.forName()
  try {
    mainClass = Utils.classForName(childMainClass)
  } catch {
    case e: ClassNotFoundException =>
      e.printStackTrace(printStream)
      if (childMainClass.contains("thriftserver")) {
        // scalastyle:off println
        printStream.println(s"Failed to load main class $childMainClass.")
        printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
        // scalastyle:on println
      }
      System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
    case e: NoClassDefFoundError =>
      e.printStackTrace(printStream)
      if (e.getMessage.contains("org/apache/hadoop/hive")) {
        // scalastyle:off println
        printStream.println(s"Failed to load hive class.")
        printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
        // scalastyle:on println
      }
      System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
  }
  // SPARK-4170
  if (classOf[scala.App].isAssignableFrom(mainClass)) {
    printWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
  }
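===> The SPARK-4170 warning exists because scala.App runs the object body through DelayedInit, so vals may not yet be initialized at the point closures (e.g. ones shipped to executors) capture them. A short illustration of the discouraged and the recommended style:

// Discouraged: the body (including val initialization) runs via DelayedInit.
object BadStyle extends App {
  val spark = "initialized lazily by DelayedInit"
}

// Recommended: an explicit main(), which is exactly what runMain invokes.
object GoodStyle {
  def main(args: Array[String]): Unit = {
    val spark = "initialized inside main"
    println(spark)
  }
}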
  // obtain the main method from the main class's Class object
  val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass)
  if (!Modifier.isStatic(mainMethod.getModifiers)) {
    throw new IllegalStateException("The main method in the given main class must be static")
  }
  def findCause(t: Throwable): Throwable = t match {
    case e: UndeclaredThrowableException =>
      if (e.getCause() != null) findCause(e.getCause()) else e
    case e: InvocationTargetException =>
      if (e.getCause() != null) findCause(e.getCause()) else e
    case e: Throwable =>
      e
  }
  // the Method is static, so the first argument to invoke can be null
  try {
    // Invoke the main class handed to spark-submit. In standalone client mode the driver therefore calls
    // the user's main method via reflection, and the driver runs on the machine that submitted the job.
    mainMethod.invoke(null, childArgs.toArray)
  } catch {
    case t: Throwable =>
      findCause(t) match {
        case SparkUserAppException(exitCode) =>
          System.exit(exitCode)
        case t: Throwable =>
          throw t
      }
  }
}
At this point, if this is your own application, execution enters the main method of the class you passed with --class.
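To close, a minimal runnable sketch of the reflective dispatch runMain performs: load the class, get the static main(Array[String]) method, and invoke it with null as the receiver (static methods need no instance):

object DemoTarget {
  def main(args: Array[String]): Unit =
    println(s"DemoTarget.main invoked with: ${args.mkString(" ")}")
}

object ReflectMainDemo {
  def main(args: Array[String]): Unit = {
    // mirrors runMain: Class.forName -> getMethod("main", ...) -> invoke(null, args)
    val mainClass = Class.forName("DemoTarget") // scalac emits a static forwarder for top-level objects
    val mainMethod = mainClass.getMethod("main", classOf[Array[String]])
    mainMethod.invoke(null, Array("hello", "driver")) // the array is passed as the single String[] argument
  }
}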