Spark Event Log Start
The previous section showed that the class responsible for writing the Spark Event Log is org.apache.spark.scheduler.EventLoggingListener. Now let's analyze its start() method.
/**
 * Creates the log file in the configured log directory.
 */
def start() {
  if (!fileSystem.getFileStatus(new Path(logBaseDir)).isDirectory) {
    throw new IllegalArgumentException(s"Log directory $logBaseDir is not a directory.")
  }

  val workingPath = logPath + IN_PROGRESS
  val path = new Path(workingPath)
  val uri = path.toUri
  val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme
  val isDefaultLocal = defaultFs == null || defaultFs == "file"

  if (shouldOverwrite && fileSystem.delete(path, true)) {
    logWarning(s"Event log $path already exists. Overwriting...")
  }

  /* The Hadoop LocalFileSystem (r1.0.4) has known issues with syncing (HADOOP-7844).
   * Therefore, for local files, use FileOutputStream instead. */
  // Create an OutputStream to the event log file
  val dstream =
    if ((isDefaultLocal && uri.getScheme == null) || uri.getScheme == "file") {
      new FileOutputStream(uri.getPath)
    } else {
      hadoopDataStream = Some(fileSystem.create(path))
      hadoopDataStream.get
    }

  try {
    val cstream = compressionCodec.map(_.compressedOutputStream(dstream)).getOrElse(dstream)
    val bstream = new BufferedOutputStream(cstream, outputBufferSize)
    // Initialize the event log by writing the metadata line
    EventLoggingListener.initEventLog(bstream, testing, loggedEvents)
    fileSystem.setPermission(path, LOG_FILE_PERMISSIONS)
    writer = Some(new PrintWriter(bstream))
    logInfo("Logging events to %s".format(logPath))
  } catch {
    case e: Exception =>
      dstream.close()
      throw e
  }
}
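Before moving on, note the layering of streams that start() builds: a raw output stream (a local FileOutputStream or an HDFS stream), an optional compression stream, a BufferedOutputStream, and finally a PrintWriter. A minimal standalone sketch of the same layering (the file name and payload are hypothetical, and the compression layer is omitted):

import java.io.{BufferedOutputStream, FileOutputStream, PrintWriter}

// Minimal sketch of the stream layering in start(): raw output stream,
// (optional compression omitted), buffer, then a PrintWriter for
// line-oriented writes. The path and payload are hypothetical.
object StreamLayeringSketch {
  def main(args: Array[String]): Unit = {
    val dstream = new FileOutputStream("/tmp/app-20200101-0001.inprogress")
    val bstream = new BufferedOutputStream(dstream, 100 * 1024)
    val writer = new PrintWriter(bstream)
    writer.println("""{"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}""")
    writer.close() // flushes the buffer and closes the underlying file stream
  }
}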
Next, let's follow EventLoggingListener.initEventLog(bstream, testing, loggedEvents) to see how the log is initialized.
/**
 * Write metadata about an event log to the given stream.
 * The metadata is encoded in the first line of the event log as JSON.
 *
 * @param logStream Raw output stream to the event log file.
 */
def initEventLog(
    logStream: OutputStream,
    testing: Boolean,
    loggedEvents: ArrayBuffer[JValue]): Unit = {
  // The content to write is the Spark version information
  val metadata = SparkListenerLogStart(SPARK_VERSION)
  // Convert it to JSON
  val eventJson = JsonProtocol.logStartToJson(metadata)
  // Render it as a string and append a newline
  val metadataJson = compact(eventJson) + "\n"
  // Write it to the event log file
  logStream.write(metadataJson.getBytes(StandardCharsets.UTF_8))
  if (testing && loggedEvents != null) {
    loggedEvents += eventJson
  }
}
Let's take a look at val eventJson = JsonProtocol.logStartToJson(metadata):
def logStartToJson(logStart: SparkListenerLogStart): JValue = {
  ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.logStart) ~
  ("Spark Version" -> SPARK_VERSION)
}
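The ~ operator here comes from the json4s DSL, which Spark uses for event serialization. A minimal standalone sketch of the same construction (the object name is illustrative):

import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods.{compact, render}

// Minimal sketch of the json4s DSL used by logStartToJson: ~ merges
// key/value pairs into one JSON object, and compact renders it on a
// single line -- the same form written as the log's first line.
object LogStartJsonSketch {
  def main(args: Array[String]): Unit = {
    val json: JValue = ("Event" -> "SparkListenerLogStart") ~ ("Spark Version" -> "2.4.4")
    println(compact(render(json)))
    // {"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}
  }
}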
Compare this with the first line of the event log from the previous post:
{"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}
Looks familiar? It matches the output of logStartToJson exactly.
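If you want to check this against your own run, here is a quick sketch that prints the first line of a local event log. The file name is hypothetical; substitute the application id of your own run under spark.eventLog.dir:

import scala.io.Source

// Quick check: print the first line of a local event log file.
// The file name is hypothetical; use your own application id.
object ReadLogStartLine {
  def main(args: Array[String]): Unit = {
    val source = Source.fromFile("/Users/darren.zhang/test/local-1577808000000")
    try {
      println(source.getLines().next())
      // expected: {"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}
    } finally {
      source.close()
    }
  }
}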
That raises the next question: when is the start() method actually called?
When Is start() Called
It is called in SparkContext, when _eventLogger is initialized:
_eventLogger =
  // When spark.eventLog.enabled is true, an EventLoggingListener is created
  if (isEventLogEnabled) {
    // Construct the event logger
    val logger =
      new EventLoggingListener(_applicationId, _applicationAttemptId, _eventLogDir.get,
        _conf, _hadoopConfiguration)
    // Call start() to write the Spark version info to the event log file
    logger.start()
    listenerBus.addToEventLogQueue(logger)
    Some(logger)
  } else {
    None
  }
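A side note on the isEventLogEnabled guard above: it is essentially a boolean read of spark.eventLog.enabled, which defaults to false. A runnable sketch of the equivalent check, with a hypothetical log directory:

import org.apache.spark.SparkConf

// Sketch of the isEventLogEnabled guard: a boolean read of
// spark.eventLog.enabled, defaulting to false. The object name is
// illustrative; the config keys are the real Spark ones.
object EventLogGuardSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.eventLog.enabled", "true")
      .set("spark.eventLog.dir", "file:///tmp/spark-events") // hypothetical directory
    println(conf.getBoolean("spark.eventLog.enabled", false)) // true
  }
}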
So when is _eventLogger initialized? Looking at the source, this block sits directly in the body of SparkContext, which means it runs whenever a SparkContext is constructed.
Tracing back one more step: when is SparkContext created? Looking at how a SparkSession is built makes this clear:
val spark = SparkSession.builder()
  .appName("TestEventLog")
  .master("local")
  .config("spark.eventLog.enabled", true)
  .config("spark.eventLog.dir", "file:///Users/darren.zhang/test/")
  // Create the SparkSession
  .getOrCreate()
/**
* Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
* one based on the options set in this builder.
*
* This method first checks whether there is a valid thread-local SparkSession,
* and if yes, return that one. It then checks whether there is a valid global
* default SparkSession, and if yes, return that one. If no valid global default
* SparkSession exists, the method creates a new SparkSession and assigns the
* newly created SparkSession as the global default.
*
* In case an existing SparkSession is returned, the config options specified in
* this builder will be applied to the existing SparkSession.
*
* @since 2.0.0
*/
def getOrCreate(): SparkSession = synchronized {
  assertOnDriver()
  // Get the session from current thread's active session.
  // On the first call nothing has been created yet, so this returns null
  var session = activeThreadSession.get()
  if ((session ne null) && !session.sparkContext.isStopped) {
    options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }
    if (options.nonEmpty) {
      logWarning("Using an existing SparkSession; some configuration may not take effect.")
    }
    return session
  }

  // Global synchronization so we will only set the default session once.
  // Same here: on the first call there is no default session either
  SparkSession.synchronized {
    // If the current thread does not have an active session, get it from the global session.
    session = defaultSession.get()
    if ((session ne null) && !session.sparkContext.isStopped) {
      options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }
      if (options.nonEmpty) {
        logWarning("Using an existing SparkSession; some configuration may not take effect.")
      }
      return session
    }

    // No active nor global default session. Create a new one.
    // As the comment above says: when neither exists, create a new session,
    // and the first step is creating a SparkContext
    val sparkContext = userSuppliedContext.getOrElse {
      val sparkConf = new SparkConf()
      options.foreach { case (k, v) => sparkConf.set(k, v) }

      // set a random app name if not given.
      if (!sparkConf.contains("spark.app.name")) {
        sparkConf.setAppName(java.util.UUID.randomUUID().toString)
      }

      // The method name makes the intent clear: get or create the SparkContext
      SparkContext.getOrCreate(sparkConf)
      // Do not update `SparkConf` for existing `SparkContext`, as it's shared by all sessions.
    }

    // Initialize extensions if the user has defined a configurator class.
    val extensionConfOption = sparkContext.conf.get(StaticSQLConf.SPARK_SESSION_EXTENSIONS)
    if (extensionConfOption.isDefined) {
      val extensionConfClassName = extensionConfOption.get
      try {
        val extensionConfClass = Utils.classForName(extensionConfClassName)
        val extensionConf = extensionConfClass.newInstance()
          .asInstanceOf[SparkSessionExtensions => Unit]
        extensionConf(extensions)
      } catch {
        // Ignore the error if we cannot find the class or when the class has the wrong type.
        case e @ (_: ClassCastException |
            _: ClassNotFoundException |
            _: NoClassDefFoundError