Last time we saw how YarnClient submits the launch command and container context to the ResourceManager, which then starts the ApplicationMaster (AM). Let's start with the AM's main method:
def main(args: Array[String]): Unit = {
  SignalUtils.registerLogger(log)
  val amArgs = new ApplicationMasterArguments(args)
  // ...
  SparkHadoopUtil.get.runAsSparkUser { () =>
    // YarnRMClient handles all communication with the ResourceManager
    master = new ApplicationMaster(amArgs, new YarnRMClient)
    // The interesting work happens in master.run()
    System.exit(master.run())
  }
}
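Before runAsSparkUser fires, ApplicationMasterArguments scans the argv that the client baked into the container launch command. As a rough illustration of that flag-scanning style, here is a minimal standalone sketch; the flag names (--jar, --class, --arg) mirror what the client passes, but treat the exact set and the error handling as assumptions, not Spark's actual implementation:

// Minimal sketch of tail-recursive flag parsing, modeled on the style of
// ApplicationMasterArguments; the flag set here is an assumption.
object AmArgsSketch {
  case class Parsed(userJar: String = null, userClass: String = null,
                    userArgs: List[String] = Nil)

  @annotation.tailrec
  def parse(args: List[String], acc: Parsed): Parsed = args match {
    case "--jar" :: value :: tail   => parse(tail, acc.copy(userJar = value))
    case "--class" :: value :: tail => parse(tail, acc.copy(userClass = value))
    case "--arg" :: value :: tail   => parse(tail, acc.copy(userArgs = acc.userArgs :+ value))
    case Nil                        => acc
    case unknown :: _               => sys.error(s"Unknown argument: $unknown")
  }

  def main(args: Array[String]): Unit = {
    val parsed = parse(args.toList, Parsed())
    println(parsed)
  }
}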
The run method:
final def run(): Int = {
  try {
    val appAttemptId = client.getAttemptId()
    var attemptID: Option[String] = None
    if (isClusterMode) {
      // ... set cluster-mode specific configuration
    }
    new CallerContext("APPMASTER",
      Option(appAttemptId.getApplicationId.toString), attemptID).setCurrentContext()
    logInfo("ApplicationAttemptId: " + appAttemptId)
    // Handle to the Hadoop file system, used below to clean up the staging dir
    val fs = FileSystem.get(yarnConf)
    // This shutdown hook should run *after* the SparkContext is shut down.
    val priority = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1
    ShutdownHookManager.addShutdownHook(priority) { () =>
      val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf)
      val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts
      if (!finished) {
        // The default state of ApplicationMaster is failed if it is invoked by shutdown hook.
        // This behavior is different compared to 1.x version.
        // If user application is exited ahead of time by calling System.exit(N), here mark
        // this application as failed with EXIT_EARLY. For a good shutdown, user shouldn't call
        // System.exit(0) to terminate the application.
        finish(finalStatus,
          ApplicationMaster.EXIT_EARLY,
          "Shutdown hook called before final status was reported.")
      }
      if (!unregistered) {
        // We only want to unregister if we don't want the RM to retry
        if (finalStatus == FinalApplicationStatus.SUCCEEDED || isLastAttempt) {
          unregister(finalStatus, finalMsg)
          cleanupStagingDir(fs)
        }
      }
    }
    // ...
    if (isClusterMode) {
      // Cluster mode: the AM runs the driver itself
      runDriver(securityMgr)
    } else {
      // Client mode: the driver already runs on the client, so the AM
      // only launches executors
      runExecutorLauncher(securityMgr)
    }
  } catch {
    case e: Exception =>
      // Catch everything else if not specifically handled
      logError("Uncaught exception: ", e)
      finish(FinalApplicationStatus.FAILED,
        ApplicationMaster.EXIT_UNCAUGHT_EXCEPTION,
        "Uncaught exception: " + e)
  }
  exitCode
}
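One detail worth pausing on is the priority arithmetic: Spark's ShutdownHookManager runs hooks from highest priority to lowest, so registering at SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1 guarantees the AM's hook fires after the SparkContext's own shutdown hook has run. A minimal sketch of that ordering idea, using plain JVM shutdown hooks rather than Spark's internal API (the priority value is illustrative):

import scala.collection.mutable

// Minimal sketch: hooks registered with a priority, run highest-first on
// JVM shutdown, mimicking Spark's ShutdownHookManager ordering.
object PriorityHooksSketch {
  private val hooks = mutable.ArrayBuffer.empty[(Int, () => Unit)]

  def addShutdownHook(priority: Int)(body: => Unit): Unit =
    hooks.synchronized { hooks += ((priority, () => body)) }

  sys.addShutdownHook {
    // Higher priority runs first, so priority - 1 runs *after* priority.
    hooks.synchronized(hooks.sortBy(-_._1)).foreach { case (_, h) => h() }
  }

  def main(args: Array[String]): Unit = {
    val SPARK_CONTEXT_SHUTDOWN_PRIORITY = 50 // illustrative value
    addShutdownHook(SPARK_CONTEXT_SHUTDOWN_PRIORITY) { println("stop SparkContext") }
    addShutdownHook(SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1) { println("report AM final status / cleanup") }
  }
}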
ApplicationMaster's runDriver() method:
private def runDriver(securityMgr: SecurityManager): Unit = {
  addAmIpFilter()
  // Start a thread named "Driver" that runs the user class's main method.
  // The code below keeps running concurrently with it: waiting for the
  // SparkContext, registering with the RM, requesting resources, etc.
  userClassThread = startUserApplication()

  // This a bit hacky, but we need to wait until the spark.driver.port property has
  // been set by the Thread executing the user class.
  logInfo("Waiting for spark context initialization...")
  val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME)
  try {
    val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
      Duration(totalWaitTime, TimeUnit.MILLISECONDS))
    if (sc != null) {
      rpcEnv = sc.env.rpcEnv
      val driverRef = runAMEndpoint(
        sc.getConf.get("spark.driver.host"),
        sc.getConf.get("spark.driver.port"),
        isClusterMode = true)
      // Register the application with the RM and request resources.
      // The RM answers with containers, the AM picks suitable ones, and
      // the NMClient launches them with the prepared command:
      // java -cp ... CoarseGrainedExecutorBackend
      registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.appUIAddress).getOrElse(""),
        securityMgr)
    } else {
      // Sanity check; should never happen in normal operation, since sc should only be null
      // if the user app did not create a SparkContext.
      if (!finished) {
        throw new IllegalStateException("SparkContext is null but app is still running!")
      }
    }
    // The user class still has plenty to do (initialize the SparkContext,
    // split jobs into tasks, ...), so the ApplicationMaster has to wait
    // for that thread to finish.
    userClassThread.join()
  } catch {
    case e: SparkException if e.getCause().isInstanceOf[TimeoutException] =>
      logError(
        s"SparkContext did not initialize after waiting for $totalWaitTime ms. " +
        "Please check earlier log output for errors. Failing the application.")
      finish(FinalApplicationStatus.FAILED,
        ApplicationMaster.EXIT_SC_NOT_INITED,
        "Timed out waiting for SparkContext.")
  }
}
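The coordination between the two threads is worth spelling out: the "Driver" thread completes sparkContextPromise once the user code has built a SparkContext (or completes it with null, or with a failure), while the AM thread blocks on the future with a timeout. Here is a self-contained sketch of the same Promise handshake; the names and the String stand-in for the SparkContext are illustrative only:

import java.util.concurrent.TimeUnit
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, Promise}

// Minimal sketch of the AM <-> Driver-thread handshake via a Promise.
object ScHandshakeSketch {
  // Stands in for sparkContextPromise; completed by the user-class thread.
  private val scPromise = Promise[String]()

  def main(args: Array[String]): Unit = {
    val userThread = new Thread {
      override def run(): Unit = {
        try {
          Thread.sleep(500)                 // pretend: user main() builds a SparkContext
          scPromise.trySuccess("sc-ready")  // hand the "SparkContext" back to the AM
        } catch {
          case e: Exception => scPromise.tryFailure(e)
        } finally {
          scPromise.trySuccess(null)        // no-op if already completed
        }
      }
    }
    userThread.setName("Driver")
    userThread.start()

    // The AM side: wait up to a deadline for the promise to resolve.
    val sc = Await.result(scPromise.future, Duration(10, TimeUnit.SECONDS))
    println(s"AM got: $sc")
    userThread.join() // mirror runDriver: AM waits for the user class to finish
  }
}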
ApplicationMaster's startUserApplication() method. It starts a separate thread to run the user class's main method and renames that thread to "Driver". That thread initializes the SparkContext, splits jobs into tasks, and so on, while the ApplicationMaster thread concurrently carries on with the steps that follow, such as requesting resources.
/**
 * Start the user class, which contains the spark driver, in a separate Thread.
 * If the main routine exits cleanly or exits with System.exit(N) for any N
 * we assume it was successful, for all other cases we assume failure.
 *
 * Returns the user thread that was started.
 */
private def startUserApplication(): Thread = {
  logInfo("Starting the user application in a separate Thread")

  val classpath = Client.getUserClasspath(sparkConf)
  val urls = classpath.map { entry =>
    new URL("file:" + new File(entry.getPath()).getAbsolutePath())
  }
  val userClassLoader =
    if (Client.isUserClassPathFirst(sparkConf, isDriver = true)) {
      new ChildFirstURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
    } else {
      new MutableURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
    }

  var userArgs = args.userArgs
  if (args.primaryPyFile != null && args.primaryPyFile.endsWith(".py")) {
    // When running pyspark, the app is run using PythonRunner. The second argument is the list
    // of files to add to PYTHONPATH, which Client.scala already handles, so it's empty.
    userArgs = Seq(args.primaryPyFile, "") ++ userArgs
  }
  if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
    // TODO(davies): add R dependencies here
  }
  // userClass is the user's own class, which initializes the SparkContext etc.
  val mainMethod = userClassLoader.loadClass(args.userClass)
    .getMethod("main", classOf[Array[String]])

  // Create a thread to execute the user's main method
  val userThread = new Thread {
    override def run() {
      try {
        // Invoke main via reflection
        mainMethod.invoke(null, userArgs.toArray)
        finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
        logDebug("Done running users class")
      } catch {
        case e: InvocationTargetException =>
          e.getCause match {
            case _: InterruptedException =>
              // Reporter thread can interrupt to stop user class
            case SparkUserAppException(exitCode) =>
              val msg = s"User application exited with status $exitCode"
              logError(msg)
              finish(FinalApplicationStatus.FAILED, exitCode, msg)
            case cause: Throwable =>
              logError("User class threw exception: " + cause, cause)
              finish(FinalApplicationStatus.FAILED,
                ApplicationMaster.EXIT_EXCEPTION_USER_CLASS,
                "User class threw exception: " + cause)
          }
          sparkContextPromise.tryFailure(e.getCause())
      } finally {
        // Notify the thread waiting for the SparkContext, in case the application did not
        // instantiate one. This will do nothing when the user code instantiates a SparkContext
        // (with the correct master), or when the user code throws an exception (due to the
        // tryFailure above).
        sparkContextPromise.trySuccess(null)
      }
    }
  }
  userThread.setContextClassLoader(userClassLoader)
  // Name the thread "Driver" before starting it: the "driver" in cluster
  // mode is just this thread running the user class's main method
  userThread.setName("Driver")
  userThread.start()
  userThread
}
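Stripped of Spark's classloader options and error handling, the core of startUserApplication is plain reflection: load the user class through a URLClassLoader and invoke its static main on a thread named "Driver". A minimal sketch of that pattern (the jar path and class name are placeholders):

import java.io.File
import java.net.{URL, URLClassLoader}

// Minimal sketch: run an arbitrary class's main() on a thread named
// "Driver", the same reflective pattern startUserApplication uses.
// The jar path and class name below are placeholders.
object ReflectiveMainSketch {
  def main(args: Array[String]): Unit = {
    val urls = Array(new URL("file:" + new File("/path/to/user.jar").getAbsolutePath))
    val loader = new URLClassLoader(urls, getClass.getClassLoader)
    val mainMethod = loader.loadClass("com.example.UserApp")
      .getMethod("main", classOf[Array[String]])

    val userThread = new Thread {
      override def run(): Unit =
        mainMethod.invoke(null, Array.empty[String]) // static method: receiver is null
    }
    userThread.setContextClassLoader(loader)
    userThread.setName("Driver")
    userThread.start()
    userThread.join()
  }
}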
To recap the whole ApplicationMaster flow:

main()
  SparkHadoopUtil.get.runAsSparkUser { () =>
    // Holds the YarnRMClient used to talk to the RM
    master = new ApplicationMaster(amArgs, new YarnRMClient)
    System.exit(master.run())
  }

run()
  // Cluster mode: what runs is the user-defined class (userClass)
  runDriver(securityMgr)

runDriver()
  // Start a separate thread, renamed "Driver", to run userClass:
  // it initializes the SparkContext, splits jobs into tasks, and so on
  userClassThread = startUserApplication()
  // Meanwhile the ApplicationMaster process requests resources;
  // registering with the RM is what triggers the requests. The RM
  // answers with containers, and the NMClient starts each container
  // with the prepared command, launching CoarseGrainedExecutorBackend
  // inside it.
  registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr)
  // Once the resource requests are in flight, wait for the userClass
  // thread to finish
  userClassThread.join()
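What registerAM kicks off can be pictured with a couple of hypothetical stand-in traits (the real code goes through YarnRMClient and a YarnAllocator; the traits, method names, and command-line flags below are simplifications for illustration, not the actual YARN API):

// Hypothetical stand-ins for the AM's two client-side roles:
// talk to the RM for containers, talk to NMs to launch them.
trait RmClientSketch {
  def register(appUiAddress: String): Unit
  def allocate(numExecutors: Int): Seq[String]   // returns container ids
}
trait NmClientSketch {
  def startContainer(containerId: String, command: String): Unit
}

class AllocatorSketch(rm: RmClientSketch, nm: NmClientSketch) {
  def run(numExecutors: Int, driverUrl: String): Unit = {
    // Registration is what makes the RM start honoring requests
    rm.register(appUiAddress = "http://driver-host:4040") // hypothetical address
    val containers = rm.allocate(numExecutors)
    containers.foreach { id =>
      // The same kind of command line the AM hands to the NodeManager;
      // the exact flags are an assumption here.
      val cmd = "java -cp <classpath> " +
        "org.apache.spark.executor.CoarseGrainedExecutorBackend " +
        s"--driver-url $driverUrl --executor-id $id"
      nm.startContainer(id, cmd)
    }
  }
}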