Spark 2.2 Source Code Analysis: spark-submit
This article analyzes the first real step of a Spark application: the SparkSubmit class. The earlier shell stage is mostly about loading environment variables; in my view, spark-submit is where a Spark program truly starts. As the previous article showed, org.apache.spark.launcher.Main launches org.apache.spark.deploy.SparkSubmit. Let's look at SparkSubmit's main entry function:
override def main(args: Array[String]): Unit = {
  // Initialize logging if it hasn't been done yet. Keep track of whether logging needs to
  // be reset before the application starts.
  val uninitLog = initializeLogIfNecessary(true, silent = true)

  // SparkSubmitArguments wraps the arguments passed to spark-submit,
  // e.g. /bin/spark-submit --master yarn-cluster --num-executors 10
  // It has many fields; see the variable list in the source for details.
  val appArgs: SparkSubmitArguments = new SparkSubmitArguments(args)

  // Print the parsed arguments when verbose (debug-like) mode is enabled.
  if (appArgs.verbose) {
    // scalastyle:off println
    printStream.println(appArgs)
    // scalastyle:on println
  }
  appArgs.action match {
    // SUBMIT is the default action; it calls the submit entry function.
    case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog)
    case SparkSubmitAction.KILL => kill(appArgs)
    case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
  }
}
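For orientation: the action is decided while the arguments are parsed. Passing --kill <submissionId> or --status <submissionId> switches it to KILL or REQUEST_STATUS; otherwise it defaults to SUBMIT. The snippet below is a simplified, self-contained sketch of that selection, not the actual SparkSubmitArguments code, and the submission id shown is only an example.

// Simplified sketch of how spark-submit's action is chosen from the command line.
object ActionSketch {
  sealed trait Action
  case object Submit extends Action
  case object Kill extends Action
  case object RequestStatus extends Action

  def resolveAction(args: Seq[String]): Action =
    if (args.contains("--kill")) Kill                  // --kill <submissionId>
    else if (args.contains("--status")) RequestStatus  // --status <submissionId>
    else Submit                                        // the default action

  def main(args: Array[String]): Unit = {
    println(resolveAction(Seq("--master", "spark://host:6066", "--kill", "driver-20170101120000-0001")))
    println(resolveAction(Seq("--master", "yarn", "--class", "com.example.App", "app.jar")))
  }
}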
A quick note on SparkSubmitArguments: it holds a large number of fields, so they are not all listed here (see the variable list in the source). Most of the values are assigned by the loadEnvironmentArguments function, shown below:
private def loadEnvironmentArguments(): Unit = {
  master = Option(master)
    .orElse(sparkProperties.get("spark.master"))
    .orElse(env.get("MASTER"))
    .orNull
  driverExtraClassPath = Option(driverExtraClassPath)
    .orElse(sparkProperties.get("spark.driver.extraClassPath"))
    .orNull
  driverExtraJavaOptions = Option(driverExtraJavaOptions)
    .orElse(sparkProperties.get("spark.driver.extraJavaOptions"))
    .orNull
  driverExtraLibraryPath = Option(driverExtraLibraryPath)
    .orElse(sparkProperties.get("spark.driver.extraLibraryPath"))
    .orNull
  driverMemory = Option(driverMemory)
    .orElse(sparkProperties.get("spark.driver.memory"))
    .orElse(env.get("SPARK_DRIVER_MEMORY"))
    .orNull
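The pattern is the same for every field: a value given explicitly on the command line wins, then the corresponding entry in sparkProperties (from --conf or spark-defaults.conf), then an environment variable, and otherwise null. Below is a minimal standalone sketch of that precedence chain, using spark.master purely as an example:

// Illustrative only: reproduces the Option(...).orElse(...).orNull chain used by
// loadEnvironmentArguments for a single setting.
object PrecedenceSketch {
  def resolveMaster(cliValue: String,
                    sparkProperties: Map[String, String],
                    env: Map[String, String]): String =
    Option(cliValue)                                 // 1. value from the command line
      .orElse(sparkProperties.get("spark.master"))   // 2. --conf / spark-defaults.conf
      .orElse(env.get("MASTER"))                     // 3. environment variable
      .orNull                                        // 4. otherwise null

  def main(args: Array[String]): Unit = {
    // No --master flag, so the property value wins over the environment variable.
    println(resolveMaster(null, Map("spark.master" -> "yarn"), Map("MASTER" -> "local[2]")))
    // Nothing is set anywhere, so the result is null.
    println(resolveMaster(null, Map.empty, Map.empty))
  }
}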
Now let's look at the submit function:
private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {
  // childMainClass is the main class that will actually be run. It is assigned
  // differently depending on the cluster manager (--master) and the driver deploy
  // mode (--deploy-mode). Since most companies run Spark on YARN, we focus on
  // YARN's two deploy modes here.
  val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)

  def doRunMain(): Unit = {
    if (args.proxyUser != null) {
      val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
        UserGroupInformation.getCurrentUser())
      try {
        proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
          override def run(): Unit = {
            runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
          }
        })
      } catch {
        case e: Exception =>
          // Hadoop's AuthorizationException suppresses the exception's stack trace, which
          // makes the message printed to the output by the JVM not very helpful. Instead,
          // detect exceptions with empty stack traces here, and treat them differently.
          if (e.getStackTrace().length == 0) {
            // scalastyle:off println
            printStream.println(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
            // scalastyle:on println
            exitFn(1)
          } else {
            throw e
          }
      }
    } else {
      runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
    }
  }

When --proxy-user is set, doRunMain wraps the call in a Hadoop PrivilegedExceptionAction and runs it through UserGroupInformation.doAs (a standalone sketch of that pattern follows below); otherwise runMain is called directly. Either way, once prepareSubmitEnvironment has set up the environment and chosen the main class, runMain loads and invokes that class via reflection.
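The doAs pattern above is plain Hadoop impersonation and is not specific to Spark. Below is a minimal, self-contained sketch of it, assuming the Hadoop client libraries are on the classpath; the user name "hive" is only an example, and impersonation still has to be allowed by the cluster's hadoop.proxyuser.* settings.

import java.security.PrivilegedExceptionAction
import org.apache.hadoop.security.UserGroupInformation

object ProxyUserSketch {
  // Run an arbitrary block of work as the given proxy user, mirroring doRunMain.
  def runAs(proxyUserName: String)(body: => Unit): Unit = {
    val proxyUser = UserGroupInformation.createProxyUser(
      proxyUserName, UserGroupInformation.getCurrentUser())
    proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
      override def run(): Unit = body
    })
  }

  def main(args: Array[String]): Unit = {
    runAs("hive") {
      // Inside this block the effective Hadoop user is "hive".
      println(UserGroupInformation.getCurrentUser().getShortUserName)
    }
  }
}

With that aside, here is runMain itself: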
private def runMain(
    childArgs: Seq[String],
    childClasspath: Seq[String],
    sysProps: Map[String, String],
    childMainClass: String,
    verbose: Boolean): Unit = {
  // scalastyle:off println
  if (verbose) {
    printStream.println(s"Main class:\n$childMainClass")
    printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
    // sysProps may contain sensitive information, so redact before printing
    printStream.println(s"System properties:\n${Utils.redact(sysProps).mkString("\n")}")
    printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
    printStream.println("\n")
  }
  // scalastyle:on println

  val loader =
    if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) {
      new ChildFirstURLClassLoader(new Array[URL](0),
        Thread.currentThread.getContextClassLoader)
    } else {
      new MutableURLClassLoader(new Array[URL](0),
        Thread.currentThread.getContextClassLoader)
    }
  Thread.currentThread.setContextClassLoader(loader)

  for (jar <- childClasspath) {
    // add the jar dependencies to the classpath
    addJarToClasspath(jar, loader)
  }

  for ((key, value) <- sysProps) {
    System.setProperty(key, value)
  }

  var mainClass: Class[_] = null

  try {
    // load the class that holds the main method via reflection
    mainClass = Utils.classForName(childMainClass)
  } catch {
    case e: ClassNotFoundException =>
      e.printStackTrace(printStream)
      if (childMainClass.contains("thriftserver")) {
        // scalastyle:off println
        printStream.println(s"Failed to load main class $childMainClass.")
        printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
        // scalastyle:on println
      }
      System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
    case e: NoClassDefFoundError =>
      e.printStackTrace(printStream)
      if (e.getMessage.contains("org/apache/hadoop/hive")) {
        // scalastyle:off println
        printStream.println(s"Failed to load hive class.")
        printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
        // scalastyle:on println
      }
      System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
  }

  // SPARK-4170
  if (classOf[scala.App].isAssignableFrom(mainClass)) {
    printWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
  }

  // look up the main method on mainClass
  val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass)
  // the main method must be static, so check it here
  if (!Modifier.isStatic(mainMethod.getModifiers)) {
    throw new IllegalStateException("The main method in the given main class must be static")
  }

  @tailrec
  def findCause(t: Throwable): Throwable = t match {
    case e: UndeclaredThrowableException =>
      if (e.getCause() != null) findCause(e.getCause()) else e
    case e: InvocationTargetException =>
      if (e.getCause() != null) findCause(e.getCause()) else e
    case e: Throwable =>
      e
  }

  try {
    // invoke the main method with the child arguments
    mainMethod.invoke(null, childArgs.toArray)
  } catch {
    case t: Throwable =>
      findCause(t) match {
        case SparkUserAppException(exitCode) =>
          System.exit(exitCode)
        case t: Throwable =>
          throw t
      }
  }
}

At this point the user's submitted main method begins to execute: it creates the SparkSession, and the RDD jobs that follow run from there.
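To make the reflection step concrete, here is a small self-contained sketch of the same pattern runMain uses: load a class by name, look up its static main(Array[String]) method, check that it is static, and invoke it. The Hello object here is purely illustrative.

import java.lang.reflect.Modifier

object Hello {
  def main(args: Array[String]): Unit = println(s"Hello, ${args.mkString(" ")}")
}

object ReflectiveMainSketch {
  def main(args: Array[String]): Unit = {
    // Load the target class through the context class loader, as runMain does.
    val mainClass = Class.forName("Hello", true, Thread.currentThread.getContextClassLoader)
    val mainMethod = mainClass.getMethod("main", classOf[Array[String]])
    // main must be a static method (scalac generates a static forwarder on class Hello).
    require(Modifier.isStatic(mainMethod.getModifiers), "main must be static")
    // The receiver is null because the method is static; the array is the single argument.
    mainMethod.invoke(null, Array("from", "reflection"))
  }
}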
It is worth noting that in spark-shell mode, the class that actually gets launched is org.apache.spark.repl.Main, which handles the corresponding setup.
Also, the flow differs between client and cluster deploy modes, as the sketch below shows.
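As a rough illustration only (the real prepareSubmitEnvironment also builds childArgs, the classpath, and the system properties), this is how childMainClass ends up different for the two YARN deploy modes in Spark 2.2: in client mode the user's own class is run directly inside the spark-submit JVM, while in cluster mode org.apache.spark.deploy.yarn.Client is run instead and submits the user class to the YARN cluster.

// Simplified sketch, not the real prepareSubmitEnvironment.
def chooseChildMainClass(isYarnCluster: Boolean, userMainClass: String): String =
  if (isYarnCluster) "org.apache.spark.deploy.yarn.Client" // driver will run in a YARN container
  else userMainClass                                       // driver runs inside spark-submit's JVM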