1. SparkListener
Spark provides an event-listening mechanism that reports state changes at every phase of a job's lifecycle; hooking into it lets you run custom logic at each of those phases. SparkListener is the listener interface class for these events: implement its methods to handle whichever events you need.
In my ETL work I needed row-count statistics for the data being moved. This is built on a Spark listener that watches the job triggered by each action operator; the counting itself relies on persisting the DataFrame and on an accumulator.
2. Key methods
A custom listener overrides whichever of the following methods it needs:
package org.apache.spark.scheduler

@org.apache.spark.annotation.DeveloperApi
abstract class SparkListener extends SparkListenerInterface {
  // Fired when a stage of a job completes.
  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { /* compiled code */ }
  // Fired when a stage is submitted.
  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { /* compiled code */ }
  // Fired when a task starts.
  override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { /* compiled code */ }
  override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = { /* compiled code */ }
  // Fired when a task ends.
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { /* compiled code */ }
  // Fired when a job starts.
  override def onJobStart(jobStart: SparkListenerJobStart): Unit = { /* compiled code */ }
  // Fired when a job ends.
  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { /* compiled code */ }
  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate): Unit = { /* compiled code */ }
  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = { /* compiled code */ }
  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved): Unit = { /* compiled code */ }
  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = { /* compiled code */ }
  // Fired when the application starts / ends.
  override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = { /* compiled code */ }
  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { /* compiled code */ }
  override def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit = { /* compiled code */ }
  override def onStageExecutorMetrics(executorMetrics: SparkListenerStageExecutorMetrics): Unit = { /* compiled code */ }
  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { /* compiled code */ }
  override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { /* compiled code */ }
  override def onExecutorBlacklisted(executorBlacklisted: SparkListenerExecutorBlacklisted): Unit = { /* compiled code */ }
  override def onExecutorBlacklistedForStage(executorBlacklistedForStage: SparkListenerExecutorBlacklistedForStage): Unit = { /* compiled code */ }
  override def onNodeBlacklistedForStage(nodeBlacklistedForStage: SparkListenerNodeBlacklistedForStage): Unit = { /* compiled code */ }
  override def onExecutorUnblacklisted(executorUnblacklisted: SparkListenerExecutorUnblacklisted): Unit = { /* compiled code */ }
  override def onNodeBlacklisted(nodeBlacklisted: SparkListenerNodeBlacklisted): Unit = { /* compiled code */ }
  override def onNodeUnblacklisted(nodeUnblacklisted: SparkListenerNodeUnblacklisted): Unit = { /* compiled code */ }
  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { /* compiled code */ }
  override def onSpeculativeTaskSubmitted(speculativeTask: SparkListenerSpeculativeTaskSubmitted): Unit = { /* compiled code */ }
  // Catch-all for custom or future event types.
  override def onOtherEvent(event: SparkListenerEvent): Unit = { /* compiled code */ }
}
Every callback has an empty default implementation, so a custom listener overrides only the events it cares about. For example:

package cn.my.apps

import org.apache.spark.internal.Logging
import org.apache.spark.scheduler._

class MySparkAppListener extends SparkListener with Logging {

  override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = {
    // appId is an Option[String], so avoid a bare .get.
    val appId = applicationStart.appId
    logInfo("***************************************************" + appId.getOrElse("unknown"))
  }

  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
    logInfo("************************ app end time ************************ " + applicationEnd.time)
  }

  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
    logInfo("onStageCompleted************************************************" + stageCompleted.stageInfo)
  }

  override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
    logInfo("onTaskStart************************************************" + taskStart.stageId)
  }

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    logInfo("taskEnd************************************************" + taskEnd.stageId)
  }

  override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
    logInfo("jobStart************************************************" + jobStart.jobId)
  }

  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    logInfo("jobEnd************************************************" + jobEnd.jobId)
  }
}
To register the listener, you can either set the spark.extraListeners config property or add it on the SparkContext:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local").appName("jdbc test").getOrCreate()
spark.sparkContext.addSparkListener(new MySparkAppListener)
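The config route registers the listener by class name instead; a minimal sketch, assuming MySparkAppListener is on the driver classpath (Spark instantiates listeners named in spark.extraListeners via their zero-argument constructor):

// Alternative: register by class name through the spark.extraListeners property.
val sparkViaConf = SparkSession.builder()
  .master("local")
  .appName("jdbc test")
  .config("spark.extraListeners", "cn.my.apps.MySparkAppListener")
  .getOrCreate()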
Run the program and the corresponding log lines are printed.
For the data volume, count rows with an accumulator. The DataFrame is persisted first so that the counting action does not force a recomputation when the DataFrame is reused later:
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.LongAccumulator

// Cache the DataFrame so the counting pass (and any later reuse) computes it only once.
val persistedData = df.persist(StorageLevel.MEMORY_ONLY)

// Create a long accumulator and register it with the SparkContext.
val acc = new LongAccumulator()
spark.sparkContext.register(acc, "rowCount")

// foreach is an action: it materializes the DataFrame and bumps the counter once per row.
persistedData.foreach(_ => acc.add(1))
println("row count: " + acc.value)
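As a variant, the listener itself can aggregate counts from task metrics, so no extra foreach pass is needed. The sketch below is illustrative rather than part of the original workflow; RecordCountListener and its recordsRead field are assumed names:

package cn.my.apps

import java.util.concurrent.atomic.AtomicLong
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Illustrative listener: sums the records read by every finished task.
class RecordCountListener extends SparkListener {
  val recordsRead = new AtomicLong(0L)

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    // taskMetrics can be null for failed tasks, so guard before reading it.
    Option(taskEnd.taskMetrics).foreach { m =>
      recordsRead.addAndGet(m.inputMetrics.recordsRead)
    }
  }
}

Reading recordsRead after an action completes (for example from onJobEnd) gives a per-job total without persisting the DataFrame.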