Spark Event Log Start
The previous section showed that the class responsible for writing the Spark Event Log is org.apache.spark.scheduler.EventLoggingListener. Now let's analyze its start() method.
/**
 * Creates the log file in the configured log directory.
 */
def start() {
  if (!fileSystem.getFileStatus(new Path(logBaseDir)).isDirectory) {
    throw new IllegalArgumentException(s"Log directory $logBaseDir is not a directory.")
  }

  val workingPath = logPath + IN_PROGRESS
  val path = new Path(workingPath)
  val uri = path.toUri
  val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme
  val isDefaultLocal = defaultFs == null || defaultFs == "file"

  if (shouldOverwrite && fileSystem.delete(path, true)) {
    logWarning(s"Event log $path already exists. Overwriting...")
  }

  /* The Hadoop LocalFileSystem (r1.0.4) has known issues with syncing (HADOOP-7844).
   * Therefore, for local files, use FileOutputStream instead. */
  // Create an OutputStream to the event log file
  val dstream =
    if ((isDefaultLocal && uri.getScheme == null) || uri.getScheme == "file") {
      new FileOutputStream(uri.getPath)
    } else {
      hadoopDataStream = Some(fileSystem.create(path))
      hadoopDataStream.get
    }

  try {
    val cstream = compressionCodec.map(_.compressedOutputStream(dstream)).getOrElse(dstream)
    val bstream = new BufferedOutputStream(cstream, outputBufferSize)
    // Initialize the event log by writing the metadata line
    EventLoggingListener.initEventLog(bstream, testing, loggedEvents)
    fileSystem.setPermission(path, LOG_FILE_PERMISSIONS)
    writer = Some(new PrintWriter(bstream))
    logInfo("Logging events to %s".format(logPath))
  } catch {
    case e: Exception =>
      dstream.close()
      throw e
  }
}
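Before moving on, note the layering of streams that start() builds: a raw output stream (a local FileOutputStream or an HDFS stream), an optional compression stream, a BufferedOutputStream, and finally a PrintWriter. A minimal standalone sketch of the same layering (the file name and payload are hypothetical, and the compression layer is omitted):

import java.io.{BufferedOutputStream, FileOutputStream, PrintWriter}

// Minimal sketch of the stream layering in start(): raw output stream,
// (optional compression omitted), buffer, then a PrintWriter for
// line-oriented writes. The path and payload are hypothetical.
object StreamLayeringSketch {
  def main(args: Array[String]): Unit = {
    val dstream = new FileOutputStream("/tmp/app-20200101-0001.inprogress")
    val bstream = new BufferedOutputStream(dstream, 100 * 1024)
    val writer = new PrintWriter(bstream)
    writer.println("""{"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}""")
    writer.close() // flushes the buffer and closes the underlying file stream
  }
}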
Next, let's follow EventLoggingListener.initEventLog(bstream, testing, loggedEvents) to see how the log is initialized.
/**
 * Write metadata about an event log to the given stream.
 * The metadata is encoded in the first line of the event log as JSON.
 *
 * @param logStream Raw output stream to the event log file.
 */
def initEventLog(
    logStream: OutputStream,
    testing: Boolean,
    loggedEvents: ArrayBuffer[JValue]): Unit = {
  // The content to write is the Spark version information
  val metadata = SparkListenerLogStart(SPARK_VERSION)
  // Convert it to JSON
  val eventJson = JsonProtocol.logStartToJson(metadata)
  // Render it as a string and append a newline
  val metadataJson = compact(eventJson) + "\n"
  // Write it to the event log file
  logStream.write(metadataJson.getBytes(StandardCharsets.UTF_8))
  if (testing && loggedEvents != null) {
    loggedEvents += eventJson
  }
}
Let's take a look at val eventJson = JsonProtocol.logStartToJson(metadata):
def logStartToJson(logStart: SparkListenerLogStart): JValue = {
  ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.logStart) ~
  ("Spark Version" -> SPARK_VERSION)
}
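The ~ operator here comes from the json4s DSL, which Spark uses for event serialization. A minimal standalone sketch of the same construction (the object name is illustrative):

import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods.{compact, render}

// Minimal sketch of the json4s DSL used by logStartToJson: ~ merges
// key/value pairs into one JSON object, and compact renders it on a
// single line -- the same form written as the log's first line.
object LogStartJsonSketch {
  def main(args: Array[String]): Unit = {
    val json: JValue = ("Event" -> "SparkListenerLogStart") ~ ("Spark Version" -> "2.4.4")
    println(compact(render(json)))
    // {"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}
  }
}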
Compare this with the first line of the event log from the previous post:
{"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}
Looks familiar? It matches the output of logStartToJson exactly.
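If you want to check this against your own run, here is a quick sketch that prints the first line of a local event log. The file name is hypothetical; substitute the application id of your own run under spark.eventLog.dir:

import scala.io.Source

// Quick check: print the first line of a local event log file.
// The file name is hypothetical; use your own application id.
object ReadLogStartLine {
  def main(args: Array[String]): Unit = {
    val source = Source.fromFile("/Users/darren.zhang/test/local-1577808000000")
    try {
      println(source.getLines().next())
      // expected: {"Event":"SparkListenerLogStart","Spark Version":"2.4.4"}
    } finally {
      source.close()
    }
  }
}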
That raises the next question: when is the start() method actually called?
When Is start() Called
It is called in SparkContext, when _eventLogger is initialized:
_eventLogger =
  // When spark.eventLog.enabled is true, an EventLoggingListener is created
  if (isEventLogEnabled) {
    // Construct the event logger
    val logger =
      new EventLoggingListener(_applicationId, _applicationAttemptId, _eventLogDir.get,
        _conf, _hadoopConfiguration)
    // Call start() to write the Spark version info to the event log file
    logger.start()
    listenerBus.addToEventLogQueue(logger)
    Some(logger)
  } else {
    None
  }
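A side note on the isEventLogEnabled guard above: it is essentially a boolean read of spark.eventLog.enabled, which defaults to false. A runnable sketch of the equivalent check, with a hypothetical log directory:

import org.apache.spark.SparkConf

// Sketch of the isEventLogEnabled guard: a boolean read of
// spark.eventLog.enabled, defaulting to false. The object name is
// illustrative; the config keys are the real Spark ones.
object EventLogGuardSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.eventLog.enabled", "true")
      .set("spark.eventLog.dir", "file:///tmp/spark-events") // hypothetical directory
    println(conf.getBoolean("spark.eventLog.enabled", false)) // true
  }
}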
So when is _eventLogger initialized? Looking at the source, this block sits directly in the body of SparkContext, which means it runs whenever a SparkContext is constructed.
Tracing back one more step: when is SparkContext created? Looking at how a SparkSession is built makes this clear:
val spark = SparkSession.builder()
  .appName("TestEventLog")
  .master("local")
  .config("spark.eventLog.enabled", true)
  .config("spark.eventLog.dir", "file:///Users/darren.zhang/test/")
  // Create the SparkSession
  .getOrCreate()
/**
* Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
* one based on the options set in this builder.
*
* This method first checks whether there is a valid thread-local SparkSession,
* and if yes, return that one. It then checks whether there is a valid global
* default SparkSession, and if yes, return that one. If no valid global default
* SparkSession exists, the method creates a new SparkSession and assigns the
* newly created SparkSession as the global default.
*
* In case an existing SparkSession is returned, the config options specified in
* this builder will be applied to the existing SparkSession.
*
* @since 2.0.0
*/
def getOrCreate(): SparkSession = synchronized {
  assertOnDriver()
  // Get the session from current thread's active session.
  // On the first call nothing has been created yet, so this returns null
  var session = activeThreadSession.get()
  if ((session ne null) && !session.sparkContext.isStopped) {
    options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }
    if (options.nonEmpty) {
      logWarning("Using an existing SparkSession; some configuration may not take effect.")
    }
    return session
  }

  // Global synchronization so we will only set the default session once.
  // Same here: on the first call there is no default session either
  SparkSession.synchronized {
    // If the current thread does not have an active session, get it from the global session.
    session = defaultSession.get()
    if ((session ne null) && !session.sparkContext.isStopped) {
      options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }
      if (options.nonEmpty) {
        logWarning("Using an existing SparkSession; some configuration may not take effect.")
      }
      return session
    }

    // No active nor global default session. Create a new one.
    // As the comment above says: when neither exists, create a new session,
    // and the first step is creating a SparkContext
    val sparkContext = userSuppliedContext.getOrElse {
      val sparkConf = new SparkConf()
      options.foreach { case (k, v) => sparkConf.set(k, v) }

      // set a random app name if not given.
      if (!sparkConf.contains("spark.app.name")) {
        sparkConf.setAppName(java.util.UUID.randomUUID().toString)
      }

      // The method name makes the intent clear: get or create the SparkContext
      SparkContext.getOrCreate(sparkConf)
      // Do not update `SparkConf` for existing `SparkContext`, as it's shared by all sessions.
    }

    // Initialize extensions if the user has defined a configurator class.
    val extensionConfOption = sparkContext.conf.get(StaticSQLConf.SPARK_SESSION_EXTENSIONS)
    if (extensionConfOption.isDefined) {
      val extensionConfClassName = extensionConfOption.get
      try {
        val extensionConfClass = Utils.classForName(extensionConfClassName)
        val extensionConf = extensionConfClass.newInstance()
          .asInstanceOf[SparkSessionExtensions => Unit]
        extensionConf(extensions)
      } catch {
        // Ignore the error if we cannot find the class or when the class has the wrong type.
        case e @ (_: ClassCastException |
            _: ClassNotFoundException |
            _: NoClassDefFoundError