Spark 2.2 Source Code Analysis: Spark-submit


This article analyzes the first real step of a Spark application: the org.apache.spark.deploy.SparkSubmit class. The shell stage covered previously was mostly about loading environment variables; spark-submit is where the program truly begins. As we saw in the previous article, org.apache.spark.launcher.Main launches org.apache.spark.deploy.SparkSubmit. Let's look at SparkSubmit's main entry point:

override def main(args: Array[String]): Unit = {
  // Initialize logging if it hasn't been done yet. Keep track of whether logging needs to
  // be reset before the application starts.
  val uninitLog = initializeLogIfNecessary(true, silent = true)

  // SparkSubmitArguments wraps the arguments passed to spark-submit, e.g.
  // /bin/spark-submit --master yarn-cluster --num-executors 10
  // SparkSubmitArguments holds many fields; see the variable list in the source.
  val appArgs: SparkSubmitArguments = new SparkSubmitArguments(args)

  // If verbose (debug-style) output is enabled, print the parsed arguments
  if (appArgs.verbose) {
    // scalastyle:off println
    printStream.println(appArgs)
    // scalastyle:on println
  }
  appArgs.action match {
    // SUBMIT is the default action; it calls the submit entry function
    case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog)
    case SparkSubmitAction.KILL => kill(appArgs)
    case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
  }
}
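
Where does appArgs.action come from? SparkSubmitArguments derives it from the command-line flags, defaulting to SUBMIT. Below is a minimal standalone sketch of that dispatch, not Spark's actual code (the real parsing lives in SparkSubmitOptionParser and validateArguments); --kill and --status are real spark-submit flags:

// Simplified, hypothetical sketch of the action dispatch:
// anything other than --kill / --status falls through to the default SUBMIT.
def resolveAction(args: Array[String]): String =
  if (args.contains("--kill")) "KILL"                   // kill a submitted driver
  else if (args.contains("--status")) "REQUEST_STATUS"  // query a driver's status
  else "SUBMIT"                                         // default: submit the app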

A note on SparkSubmitArguments: it contains too many fields to enumerate here, so please refer to the source for the full list. The key piece is the loadEnvironmentArguments function, which assigns their values:

private def loadEnvironmentArguments(): Unit = {
  master = Option(master)
    .orElse(sparkProperties.get("spark.master"))
    .orElse(env.get("MASTER"))
    .orNull
  driverExtraClassPath = Option(driverExtraClassPath)
    .orElse(sparkProperties.get("spark.driver.extraClassPath"))
    .orNull
  driverExtraJavaOptions = Option(driverExtraJavaOptions)
    .orElse(sparkProperties.get("spark.driver.extraJavaOptions"))
    .orNull
  driverExtraLibraryPath = Option(driverExtraLibraryPath)
    .orElse(sparkProperties.get("spark.driver.extraLibraryPath"))
    .orNull
  driverMemory = Option(driverMemory)
    .orElse(sparkProperties.get("spark.driver.memory"))
    .orElse(env.get("SPARK_DRIVER_MEMORY"))
    .orNull
  // ... the remaining fields are resolved the same way (snippet truncated)
}
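
The pattern above encodes a clear precedence: an explicit command-line value wins, then the matching spark property (e.g. from spark-defaults.conf), then an environment variable, and finally null. Here is a small self-contained sketch of the same orElse chain (demo code, not Spark's):

object PrecedenceDemo {
  // Mirrors loadEnvironmentArguments: CLI value > spark property > env var > null
  def resolve(cliValue: Option[String],
              sparkProperties: Map[String, String],
              env: Map[String, String]): String =
    cliValue
      .orElse(sparkProperties.get("spark.master"))
      .orElse(env.get("MASTER"))
      .orNull

  def main(args: Array[String]): Unit = {
    val props = Map("spark.master" -> "yarn")
    val env = Map("MASTER" -> "local[*]")
    println(resolve(Some("spark://host:7077"), props, env)) // spark://host:7077
    println(resolve(None, props, env))                      // yarn (property beats env var)
    println(resolve(None, Map.empty, env))                  // local[*]
  }
}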

Now let's look at the submit function:

private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {
  // childMainClass is the main class that will actually be run; its value depends on the
  // cluster manager (--master) and the driver deploy mode (--deploy-mode). Since most
  // companies use YARN these days, we focus on YARN's two deploy modes.
  val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)

  def doRunMain(): Unit = {
    if (args.proxyUser != null) {
      val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
        UserGroupInformation.getCurrentUser())
      try {
        proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
          override def run(): Unit = {
            
            runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
          }
        })
      } catch {
        case e: Exception =>
          // Hadoop's AuthorizationException suppresses the exception's stack trace, which
          // makes the message printed to the output by the JVM not very helpful. Instead,
          // detect exceptions with empty stack traces here, and treat them differently.
          if (e.getStackTrace().length == 0) {
            // scalastyle:off println
            printStream.println(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
            // scalastyle:on println
            exitFn(1)
          } else {
            throw e
          }
      }
    } else {
      runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
    }
  }
  // ... (the tail of submit, which actually invokes doRunMain, is omitted here)
}
As shown above, once prepareSubmitEnvironment has assembled the environment and determined the main class, submit calls doRunMain (wrapping it in proxyUser.doAs when --proxy-user is set), which in turn calls runMain to launch the main class via reflection.
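
For reference, here is a condensed sketch of how prepareSubmitEnvironment picks childMainClass for the two YARN deploy modes (heavily simplified; the real method handles every cluster manager and many more options):

// In client mode the user class runs in this JVM, so it is invoked directly;
// in cluster mode a YARN client is launched instead, and the user class later
// runs inside the ApplicationMaster on the cluster.
def chooseChildMainClass(deployMode: String, userMainClass: String): String =
  deployMode match {
    case "client"  => userMainClass
    case "cluster" => "org.apache.spark.deploy.yarn.Client"
  }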

private def runMain(
    childArgs: Seq[String],
    childClasspath: Seq[String],
    sysProps: Map[String, String],
    childMainClass: String,
    verbose: Boolean): Unit = {
  // scalastyle:off println
  if (verbose) {
    printStream.println(s"Main class:\n$childMainClass")
    printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
    // sysProps may contain sensitive information, so redact before printing
    printStream.println(s"System properties:\n${Utils.redact(sysProps).mkString("\n")}")
    printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
    printStream.println("\n")
  }
  // scalastyle:on println

  val loader =
    if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) {
      new ChildFirstURLClassLoader(new Array[URL](0),
        Thread.currentThread.getContextClassLoader)
    } else {
      new MutableURLClassLoader(new Array[URL](0),
        Thread.currentThread.getContextClassLoader)
    }
  Thread.currentThread.setContextClassLoader(loader)
  for (jar <- childClasspath) {
    // add each jar dependency to the classloader
    addJarToClasspath(jar, loader)
  }

  for ((key, value) <- sysProps) {
    System.setProperty(key, value)
  }

  var mainClass: Class[_] = null

  try {
    // load the main class by name via reflection
    mainClass = Utils.classForName(childMainClass)
  } catch {
    case e: ClassNotFoundException =>
      e.printStackTrace(printStream)
      if (childMainClass.contains("thriftserver")) {
        // scalastyle:off println
        printStream.println(s"Failed to load main class $childMainClass.")
        printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
        // scalastyle:on println
      }
      System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
    case e: NoClassDefFoundError =>
      e.printStackTrace(printStream)
      if (e.getMessage.contains("org/apache/hadoop/hive")) {
        // scalastyle:off println
        printStream.println(s"Failed to load hive class.")
        printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
        // scalastyle:on println
      }
      System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
  }

  // SPARK-4170
  if (classOf[scala.App].isAssignableFrom(mainClass)) {
    printWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
  }
  // Look up the main(String[]) method on the main class
  val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass)
  // The main method must be static; check this before invoking
  if (!Modifier.isStatic(mainMethod.getModifiers)) {
    throw new IllegalStateException("The main method in the given main class must be static")
  }

  @tailrec
  def findCause(t: Throwable): Throwable = t match {
    case e: UndeclaredThrowableException =>
      if (e.getCause() != null) findCause(e.getCause()) else e
    case e: InvocationTargetException =>
      if (e.getCause() != null) findCause(e.getCause()) else e
    case e: Throwable =>
      e
  }

  try {
    // invoke the user's main method
    mainMethod.invoke(null, childArgs.toArray)
  } catch {
    case t: Throwable =>
      findCause(t) match {
        case SparkUserAppException(exitCode) =>
          System.exit(exitCode)

        case t: Throwable =>
          throw t
      }
  }
}
At this point the user's submitted main method starts executing: the SparkSession is created, RDDs are built and run, and so on.
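
To make the reflective hand-off concrete, here is a minimal standalone sketch of what runMain does at its core (com.example.MyApp is a placeholder class name, not anything from Spark):

import java.lang.reflect.Modifier

object ReflectiveLaunchDemo {
  def main(args: Array[String]): Unit = {
    // Load the target class by name, as Utils.classForName does
    val mainClass = Class.forName("com.example.MyApp") // hypothetical user class
    // Resolve the main(String[]) method
    val mainMethod = mainClass.getMethod("main", classOf[Array[String]])
    // main must be static, i.e. defined on a Scala `object`
    require(Modifier.isStatic(mainMethod.getModifiers),
      "The main method in the given main class must be static")
    // The argument array is passed as the single parameter of main(String[])
    mainMethod.invoke(null, Array("--input", "/tmp/data"))
  }
}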

Note that in spark-shell mode it is actually org.apache.spark.repl.Main that performs the corresponding setup.

Also, the client and cluster deploy modes differ here: in client mode the driver runs in the submitting JVM and the user class is invoked directly, while in cluster mode the class invoked is a cluster-manager client (for YARN, org.apache.spark.deploy.yarn.Client) that submits the driver to run inside the cluster.
