9. Spark源码解析之Yarn Cluster模式启动流程源码解析

最新推荐文章于 2021-04-30 17:39:57 发布

訾零

最新推荐文章于 2021-04-30 17:39:57 发布

阅读量790

点赞数 2

分类专栏： Spark

本文链接：https://blog.csdn.net/lingeio/article/details/98477183

版权

这里解读通过SparkSubmit以Yarn Cluster模式提交应用时的启动流程。

SparkSubmit类的runMain()中执行到start()时，本地模式会进入本地提交的--class类的main中开始执行。

      // Start the application instance, handing it the child arguments and the Spark configuration
      app.start(childArgs.toArray, sparkConf)

而Yarn Cluster模式，在prepareSubmitEnvironment()中准备运行环境时有判断过，所以start()其实调用的是org.apache.spark.deploy.yarn.YarnClusterApplication类的start()。

    // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
    // (the submitted user class is not the child main class; the wrapper class below is).
    if (isYarnCluster) {
      // YARN_CLUSTER_SUBMIT_CLASS is defined in object SparkSubmit as
      // "org.apache.spark.deploy.yarn.YarnClusterApplication"
      childMainClass = YARN_CLUSTER_SUBMIT_CLASS
     
       ...

      }
 
      // Forward every entry of args.childArgs to the child as a ("--arg", value) pair
      if (args.childArgs != null) {
        args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
      }
    }

YarnClusterApplication

YarnClusterApplication类定义在org.apache.spark.deploy.yarn包的Client.scala文件中（与Client类位于同一文件）。它的start()先从SparkConf中移除spark.jars和spark.files（yarn模式下这些资源通过yarn缓存分发），然后构建Client实例并调用其run()来提交应用。

/**
 * SparkApplication implementation that SparkSubmit starts in yarn-cluster mode.
 *
 * Its `start` strips the jar/file lists from the configuration, parses the
 * forwarded arguments and hands both to a [[Client]], whose `run()` is then invoked.
 */
private[spark] class YarnClusterApplication extends SparkApplication {

  /**
   * @param args arguments forwarded by SparkSubmit; parsed by [[ClientArguments]]
   * @param conf Spark configuration; "spark.jars" and "spark.files" are removed from it
   */
  override def start(args: Array[String], conf: SparkConf): Unit = {
    // SparkSubmit would use yarn cache to distribute files & jars in yarn mode,
    // so these two entries are dropped from the configuration before the client is built.
    Seq("spark.jars", "spark.files").foreach { key =>
      conf.remove(key)
    }

    // Arguments are parsed first, then the client is created and run.
    val clientArgs = new ClientArguments(args)
    val client = new Client(clientArgs, conf)
    client.run()
  }

}

ClientArguments

ClientArguments负责解析传入的参数：--jar（用户jar包）、--class（主类）、--primary-py-file、--primary-r-file，以及可重复出现的--arg（传给用户主类的参数）。

// TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware !
/**
 * Parsed view of the option list handed to the YARN client.
 *
 * The constructor parses `args` eagerly. Recognised options are `--jar`, `--class`,
 * `--primary-py-file`, `--primary-r-file` (each followed by one value; a later
 * occurrence overwrites an earlier one) and `--arg` (repeatable; values are collected
 * in order). Fields that were never set stay `null`.
 *
 * @param args raw option list, e.g. `Array("--jar", "app.jar", "--arg", "x")`
 * @throws IllegalArgumentException if an unknown option (or an option missing its value)
 *                                  is met, or if both a primary Python file and a primary
 *                                  R file are given
 */
private[spark] class ClientArguments(args: Array[String]) {

  var userJar: String = null
  var userClass: String = null
  var primaryPyFile: String = null
  var primaryRFile: String = null
  var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()

  parseArgs(args.toList)

  /** Walks the option list front to back, filling in the fields above. */
  private def parseArgs(inputArgs: List[String]): Unit = {
    // Each step consumes one "option value" pair and recurses on what is left.
    @scala.annotation.tailrec
    def consume(remaining: List[String]): Unit = remaining match {
      case Nil =>
        // nothing left to parse

      case "--jar" :: value :: rest =>
        userJar = value
        consume(rest)

      case "--class" :: value :: rest =>
        userClass = value
        consume(rest)

      case "--primary-py-file" :: value :: rest =>
        primaryPyFile = value
        consume(rest)

      case "--primary-r-file" :: value :: rest =>
        primaryRFile = value
        consume(rest)

      case "--arg" :: value :: rest =>
        userArgs += value
        consume(rest)

      case unrecognised =>
        // The whole unparsed remainder is reported, not just its first element.
        throw new IllegalArgumentException(getUsageMessage(unrecognised))
    }

    consume(inputArgs)

    // A primary Python file and a primary R file are mutually exclusive.
    val bothPrimaries = primaryPyFile != null && primaryRFile != null
    if (bothPrimaries) {
      throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" +
        " at the same time")
    }
  }

  /**
   * Builds the usage text, prefixed with a line naming the offending parameters
   * when `unknownParam` is not null.
   */
  private def getUsageMessage(unknownParam: List[String] = null): String = {
    val prefix = Option(unknownParam).fold("")(p => s"Unknown/unsupported param $p\n")
    val usage =
      s"""
      |Usage: org.apache.spark.deploy.yarn.Client [options]
      |Options:
      |  --jar JAR_PATH           Path to your application's JAR file (required in yarn-cluster
      |                           mode)
      |  --class CLASS_NAME       Name of your application's main class (required)
      |  --primary-py-file        A main Python file
      |  --primary-r-file         A main R file
      |  --arg ARG                Argument to be passed to your application's main class.
      |                           Multiple invocations are possible, each will be passed in order.
      """.stripMargin
    prefix + usage
  }
}

Client

直接进入Client的run()。

private[spark] class Client(
    val args: ClientArguments,
    val sparkConf: SparkConf)
  extends Logging {
    ...

      /**
   * Submit an application to the ResourceManager.
   * If set spark.yarn.submit.waitAppCompletion to true, it will stay alive
   * reporting the application's status until the application has exited for any reason.
   * Otherwise, the client process will exit after submission.
   * If the application finishes with a failed, killed, or undefined status,
   * throw an appropriate SparkException.
   */
  // 向RM提交app
  def run(): Unit = {
    // Submit first; the returned id is stored on the instance and used by every check below.
    this.appId = submitApplication()

    val skipMonitoring = !launcherBackend.isConnected() && fireAndForget
    if (skipMonitoring) {
      // No launcher connection and fire-and-forget requested: fetch one report, log it,
      // and fail only if the application is already FAILED or KILLED.
      val report = getApplicationReport(appId)
      val state = report.getYarnApplicationState
      logInfo(s"Application report for $appId (state: $state)")
      logInfo(formatReportDetails(report))
      state match {
        case YarnApplicationState.FAILED | YarnApplicationState.KILLED =>
          throw new SparkException(s"Application $appId finished with status: $state")
        case _ =>
          // any other state is accepted here
      }
    } else {
      // Otherwise delegate to monitorApplication and turn a failed, killed or
      // undefined outcome in the returned report into a SparkException.
      monitorApplication(appId) match {
        case YarnAppReport(appState, finalState, diags) =>
          val failed =
            appState == YarnApplicationState.FAILED || finalState == FinalApplicationStatus.FAILED
          val killed =
            appState == YarnApplicationState.KILLED || finalState == FinalApplicationStatus.KILLED

          if (failed) {
            // Surface the diagnostics in the log before raising.
            for (err <- diags) {
              logError(s"Application diagnostics message: $err")
            }
            throw new SparkException(s"Application $appId finished with failed status")
          } else if (killed) {
            throw new SparkException(s"Application $appId is killed")
          } else if (finalState == FinalApplicationStatus.UNDEFINED) {
            throw new SparkException(s"The final status of application $appId is undefined")
          }
      }
    }
  }
}

submitApplication()

看看提交app获取id的过程。

  def submitApplication(): ApplicationId = {
    var appId: ApplicationId = null
    try {
      // 初始化launcherBackend,与launcherServer建立连接
      launcherBackend.connect()
      // 初始化yarnClient
      yarnClient.init(hadoopConf)
      // 启动yarnClient,连接到集群,获取节点信息
      yarnClient.start()

      // 输出节点个数
      logInfo("Requesting a new application from cluster with %d NodeManagers"
        .format(yarnClient.getYarnClusterMetrics.getNumNodeManagers))

      // Get a new application from our RM
      // 调用接口向RM创建一个app
      val newApp = yarnClient.createApplication()
      // 获取app请求的响应
      val newAppResponse = newApp.getNewApplicationResponse()
      // 获取app的id
      appId = newAppResponse.getApplicationId()

      // 设置调用方上下文(CallerContext),把"CLIENT"标识和appId等信息作为当前上下文,供Hadoop侧识别调用来源
      new CallerContext("CLIENT", sparkConf.get(APP_CALLER_CONTEXT),
        Option(appId.toString)).setCurrentContext()

      // Verify whether the cluster has enough resources for our AM
      // 验证集群是否有足够资源运行AM
      verifyClusterResources(newAppResponse)

      // Set up the appropriate contexts to launch our AM
      // 启动Container用于启动AM,并设置环境变量
      val containerContext = createContainerLaunchContext(newAppResponse)
      val appContext = createApplicationSubmissionContext(newApp, containerContext)

      // Finally, submit and monitor the application
      logInfo(s"Submitting application $appId to ResourceManager")
      // 提交app,通过appContext获取资源情况
      yarnClient.submitApplication(appContext)
      // 监控提交的状况
      launcherBackend.setAppId(appId.toString)
      reportLauncherState(SparkAppHandle.State.SUBMITTED)

      // 返回appId
      appId
    } catch {
      case e: Throwable =>