Spark listeners
Spark provides an event-listening mechanism that covers every phase of an application's lifecycle. Through it you can attach custom actions to each phase of a job.
SparkListener is the listener interface for these events; by overriding its methods you can implement custom handling for each of them.
Below is an example SparkListener implementation; the event objects passed to each callback expose Spark's runtime metrics.
import java.text.SimpleDateFormat

import org.apache.spark.scheduler._
import org.slf4j.LoggerFactory

// org.apache.spark.internal.Logging is private[spark] in recent Spark versions,
// so user code logs through SLF4J directly instead of mixing in "with Logging"
object SparkAppListener extends SparkListener {
  private val log = LoggerFactory.getLogger(getClass)
  // total number of tasks in the most recently started job (set in onJobStart)
  private var JobNum = 0
  // jobId -> (total tasks, finished tasks, completion percentage)
  val jobToJobInfo = new scala.collection.mutable.HashMap[Int, (Int, Int, Int)]
  // stageId -> jobId, used to find which job a finished task belongs to
  private val stageToJob = new scala.collection.mutable.HashMap[Int, Int]
  // number of completed jobs
  private var finishJobNum = 0
  // whether any failure has been observed (placeholder, not set in this example)
  private var hasException: Boolean = false
  // overall progress percentage of the current job
  var totalPercent = 0
  // start/end time and final progress of the current job
  val jobMap = new scala.collection.mutable.HashMap[String, String]
  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
    println("stageCompleted")
  }
  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {}
  override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
    println("taskStart")
  }
  override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = {}
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    // taskMetrics may be null if the task failed before reporting any metrics
    val metrics = taskEnd.taskMetrics
    if (metrics == null) return
    //==========================input/output metrics================================
    val inputMetrics = metrics.inputMetrics
    val outputMetrics = metrics.outputMetrics
    val input_output = scala.collection.mutable.HashMap(
      "bytesRead" -> inputMetrics.bytesRead,            // bytes read
      "recordsRead" -> inputMetrics.recordsRead,        // records read
      "bytesWritten" -> outputMetrics.bytesWritten,     // bytes written
      "recordsWritten" -> outputMetrics.recordsWritten  // records written
    )
    println(input_output)
    val taskMetricsMap = scala.collection.mutable.HashMap(
      "executorDeserializeTime" -> metrics.executorDeserializeTime,       // time spent deserializing the task on the executor
      "executorDeserializeCpuTime" -> metrics.executorDeserializeCpuTime, // CPU time spent deserializing the task
      "executorRunTime" -> metrics.executorRunTime,                       // time the task ran on the executor
      "resultSize" -> metrics.resultSize,                                 // size of the task result
      "jvmGCTime" -> metrics.jvmGCTime,                                   // JVM GC time while the task was running
      "resultSerializationTime" -> metrics.resultSerializationTime,       // time spent serializing the task result
      "memoryBytesSpilled" -> metrics.memoryBytesSpilled,                 // in-memory bytes spilled by this task
      "diskBytesSpilled" -> metrics.diskBytesSpilled,                     // bytes spilled to disk
      "peakExecutionMemory" -> metrics.peakExecutionMemory                // peak memory used by shuffles, aggregations and joins
    )
    println(taskMetricsMap)
    //======================shuffle metrics================================
    val shuffleReadMetrics = metrics.shuffleReadMetrics
    val shuffleWriteMetrics = metrics.shuffleWriteMetrics
    val shuffleMap = scala.collection.mutable.HashMap(
      "remoteBlocksFetched" -> shuffleReadMetrics.remoteBlocksFetched, // shuffle blocks fetched from remote executors
      "localBlocksFetched" -> shuffleReadMetrics.localBlocksFetched,   // shuffle blocks fetched locally
      "remoteBytesRead" -> shuffleReadMetrics.remoteBytesRead,         // shuffle bytes read from remote executors
      "localBytesRead" -> shuffleReadMetrics.localBytesRead,           // shuffle bytes read locally
      "fetchWaitTime" -> shuffleReadMetrics.fetchWaitTime,             // time spent waiting for remote shuffle data
      "recordsRead" -> shuffleReadMetrics.recordsRead,                 // total records read in the shuffle
      "bytesWritten" -> shuffleWriteMetrics.bytesWritten,              // total bytes written by the shuffle
      "recordsWritten" -> shuffleWriteMetrics.recordsWritten,          // total records written by the shuffle
      "writeTime" -> shuffleWriteMetrics.writeTime                     // time spent writing shuffle data (nanoseconds)
    )
    println(shuffleMap)
    val taskInfo: TaskInfo = taskEnd.taskInfo
    val taskInfoMap = scala.collection.mutable.HashMap(
      "taskId" -> taskInfo.taskId,
      "host" -> taskInfo.host,
      "speculative" -> taskInfo.speculative, // whether this is a speculative task
      "failed" -> taskInfo.failed,
      "killed" -> taskInfo.killed,
      "running" -> taskInfo.running
    )
    println(taskInfoMap)
    // update the progress of the job this task belongs to
    val stageId = taskEnd.stageId
    val jobId = stageToJob(stageId)
    val (totalTaskNum, finishTaskNum, percent) = jobToJobInfo(jobId)
    val currentFinishTaskNum = finishTaskNum + 1
    val newPercent = currentFinishTaskNum * 100 / totalTaskNum
    jobToJobInfo(jobId) = (totalTaskNum, currentFinishTaskNum, newPercent)
    if (newPercent > percent) {
      // handle application progress here
      totalPercent = newPercent
      println(totalPercent)
    }
    log.info("taskEnd")
  }
  override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    jobMap += ("startTime" -> dateFormat.format(jobStart.time))
    println(jobMap)
    val jobId = jobStart.jobId
    // total task count across all stages of this job
    JobNum = jobStart.stageInfos.map(_.numTasks).sum
    jobToJobInfo += (jobId -> (JobNum, 0, 0))
    jobStart.stageIds.foreach(stageId => stageToJob(stageId) = jobId)
    log.info("jobStart")
  }
  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    jobMap += ("totalPercent" -> totalPercent.toString)
    jobMap += ("endTime" -> dateFormat.format(jobEnd.time))
    println(jobMap)
    finishJobNum += 1 // keep the completed-job counter in sync
    log.info("jobEnd")
  }
  override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = {
    log.info("applicationStart")
  }
  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
    log.info("applicationEnd")
  }
  // The remaining callbacks are left as no-ops. SparkListener already provides
  // empty default implementations, so unused overrides could simply be omitted.
  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate): Unit = {}
  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = {}
  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved): Unit = {}
  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = {}
  override def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit = {}
  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {}
  override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = {}
  override def onExecutorBlacklisted(executorBlacklisted: SparkListenerExecutorBlacklisted): Unit = {}
  override def onExecutorUnblacklisted(executorUnblacklisted: SparkListenerExecutorUnblacklisted): Unit = {}
  override def onNodeBlacklisted(nodeBlacklisted: SparkListenerNodeBlacklisted): Unit = {}
  override def onNodeUnblacklisted(nodeUnblacklisted: SparkListenerNodeUnblacklisted): Unit = {}
  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {}
  override def onSpeculativeTaskSubmitted(speculativeTask: SparkListenerSpeculativeTaskSubmitted): Unit = {}
  override def onOtherEvent(event: SparkListenerEvent): Unit = {}
}
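
The listener receives events only after it is registered with the SparkContext. Below is a minimal sketch of wiring it up programmatically; the application name and the sample job are illustrative placeholders, not part of the listener itself.

import org.apache.spark.sql.SparkSession

object ListenerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("listener-demo") // placeholder name
      .master("local[*]")
      .getOrCreate()

    // register before running any jobs so no events are missed
    spark.sparkContext.addSparkListener(SparkAppListener)

    // any action now drives job/stage/task events through SparkAppListener
    spark.range(0, 1000000).selectExpr("sum(id)").show()

    spark.stop()
  }
}

Alternatively, Spark can instantiate the listener itself via the spark.extraListeners configuration (for example with --conf on spark-submit). That route requires a class with a zero-argument constructor (or one taking a SparkConf), so it would need a class rather than an object.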