Spark Streaming: JobScheduler

1. The processEvent method

//Event hierarchy: a sealed trait with one case class per scheduler event
private[scheduler] sealed trait JobSchedulerEvent
private[scheduler] case class JobStarted(job: Job, startTime: Long) extends JobSchedulerEvent
private[scheduler] case class JobCompleted(job: Job, completedTime: Long) extends JobSchedulerEvent
private[scheduler] case class ErrorReported(msg: String, e: Throwable) extends JobSchedulerEvent
private def processEvent(event: JobSchedulerEvent) {
  try {
    event match {
      // a job has started: record its start time
      case JobStarted(job, startTime) => handleJobStart(job, startTime)
      // a job has finished: record its completion time
      case JobCompleted(job, completedTime) => handleJobCompletion(job, completedTime)
      // an error was reported: handleError wakes every thread waiting on the context's lock via condition.signalAll (sketched below)
      case ErrorReported(m, e) => handleError(m, e)
    }
  } catch {
    case e: Throwable =>
      reportError("Error in job scheduler", e)
  }
}
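
The error path ultimately wakes up any thread blocked in awaitTermination. Below is a minimal sketch of that lock/condition signaling pattern; SimpleWaiter is an illustrative stand-in, not the actual ContextWaiter source.

import java.util.concurrent.locks.ReentrantLock

class SimpleWaiter {
  private val lock = new ReentrantLock()
  private val condition = lock.newCondition()
  private var error: Throwable = null
  private var stopped = false

  // called from the error path: store the error and wake every waiter
  def notifyError(e: Throwable): Unit = {
    lock.lock()
    try { error = e; condition.signalAll() } finally { lock.unlock() }
  }

  // called on normal shutdown
  def notifyStop(): Unit = {
    lock.lock()
    try { stopped = true; condition.signalAll() } finally { lock.unlock() }
  }

  // what awaitTermination boils down to: block until stopped or an error arrives
  def waitForStopOrError(): Unit = {
    lock.lock()
    try {
      while (!stopped && error == null) condition.await()
      if (error != null) throw error
    } finally { lock.unlock() }
  }
}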

1.1 The handleJobStart method

private def handleJobStart(job: Job, startTime: Long) {
  // look up this batch's JobSet in the jobSets map
  val jobSet = jobSets.get(job.time)
  // is this the first job of the JobSet to start? (hasStarted stays false while processingStartTime < 0; see the sketch after this method)
  val isFirstJobOfJobSet = !jobSet.hasStarted
  // record the job start in the JobSet (the first call also sets the batch's processing start time)
  jobSet.handleJobStart(job)
  if (isFirstJobOfJobSet) {
    // first job of the batch: post a BatchStarted event to the listener bus
    listenerBus.post(StreamingListenerBatchStarted(jobSet.toBatchInfo))
  }
  // record the job's own start time
  job.setStartTime(startTime)
  listenerBus.post(StreamingListenerOutputOperationStarted(job.toOutputOperationInfo))
  logInfo("Starting job " + job.id + " from job set of time " + jobSet.time)
}
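
A simplified sketch of the JobSet bookkeeping used above (illustrative only, not the actual JobSet source): processingStartTime starts at -1, so the first handleJobStart call is the one that flips hasStarted to true.

class SimpleJobSet(val time: Long) {
  // -1 means "no job of this batch has started yet"
  private var processingStartTime: Long = -1L

  def hasStarted: Boolean = processingStartTime >= 0

  def handleJobStart(jobId: Int): Unit = {
    if (processingStartTime < 0) {
      // only the first job of the batch records the batch's processing start time
      processingStartTime = System.currentTimeMillis()
    }
  }
}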

1.2 The handleJobCompletion method

private def handleJobCompletion(job: Job, completedTime: Long) {
  val jobSet = jobSets.get(job.time)
  jobSet.handleJobCompletion(job)
  job.setEndTime(completedTime)
  listenerBus.post(StreamingListenerOutputOperationCompleted(job.toOutputOperationInfo))
  logInfo("Finished job " + job.id + " from job set of time " + jobSet.time)
  if (jobSet.hasCompleted) {
    listenerBus.post(StreamingListenerBatchCompleted(jobSet.toBatchInfo))
  }
  job.result match {
    case Failure(e) =>
      reportError("Error running job " + job, e)
    case _ =>
      if (jobSet.hasCompleted) {
        // every job of this JobSet has completed: drop it from the jobSets map
        jobSets.remove(jobSet.time)
        // tell the JobGenerator the batch is done so it can checkpoint and clear old metadata
        jobGenerator.onBatchCompletion(jobSet.time)
        // finally, log the batch's delays (a rough worked example follows the method)
        logInfo("Total delay: %.3f s for time %s (execution: %.3f s)".format(
          jobSet.totalDelay / 1000.0, jobSet.time.toString,
          jobSet.processingDelay / 1000.0
        ))
      }
  }
}
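
The delays logged above come from simple timestamp arithmetic on the JobSet. A rough worked example (the numbers are hypothetical):

val batchTime           = 1000L  // the batch's scheduled time, in ms
val processingStartTime = 1200L  // first job of the batch started
val processingEndTime   = 1700L  // last job of the batch finished

val schedulingDelay = processingStartTime - batchTime          // 200 ms spent waiting to start
val processingDelay = processingEndTime - processingStartTime  // 500 ms running, logged as "execution: 0.500 s"
val totalDelay      = processingEndTime - batchTime            // 700 ms, logged as "Total delay: 0.700 s"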

2. The start method

def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started

  logDebug("Starting JobScheduler")
  // This line is neat: EventLoop is an abstract class, and constructing it sets up a daemon thread that keeps taking events off a blocking queue and hands each one to the subclass's onReceive method (see the simplified sketch after this method).
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  // eventLoop.start() first calls onStart, which guarantees any setup happens before onReceive ever runs; because the event queue is blocking, the loop only invokes onReceive once events actually arrive. It then starts the background thread.
  eventLoop.start()

  // attach rate controllers of input streams to receive batch completion updates
  for {
    // each registered input stream
    inputDStream <- ssc.graph.getInputStreams
    // its optional rate controller, which tracks how fast the stream is being consumed
    rateController <- inputDStream.rateController
  } {
    // the listener bus keeps listeners in a copy-on-write collection, so rate updates stay visible without extra locking
    ssc.addStreamingListener(rateController)
  }

  listenerBus.start()
  // initialize the receiver tracker and the input-info tracker:
  receiverTracker = new ReceiverTracker(ssc)
  inputInfoTracker = new InputInfoTracker(ssc)

  val executorAllocClient: ExecutorAllocationClient = ssc.sparkContext.schedulerBackend match {
    case b: ExecutorAllocationClient => b.asInstanceOf[ExecutorAllocationClient]
    case _ => null
  }
// ExecutorAllocationManager manages the executors allocated to this StreamingContext: it can
// dynamically request new executors and kill idle ones, so the application can scale up and down.
// The core of its manageAllocation() logic looks like this:
//   val averageBatchProcTime = batchProcTimeSum / batchProcTimeCount
//   val ratio = averageBatchProcTime.toDouble / batchDurationMs
//   if (ratio >= scalingUpRatio) {
//     logDebug("Requesting executors")
//     val numNewExecutors = math.max(math.round(ratio).toInt, 1)
//     ...
//   }
//   if (ratio <= scalingDownRatio) {   // in this case an executor gets killed
//     logDebug("Killing executors")
//     killExecutor()
//   }
// How are the thresholds tuned?
//   private val scalingUpRatio = conf.getDouble(SCALING_UP_RATIO_KEY, SCALING_UP_RATIO_DEFAULT)
//     SCALING_UP_RATIO_KEY = "spark.streaming.dynamicAllocation.scalingUpRatio"
//     SCALING_UP_RATIO_DEFAULT = 0.9 (the default)
//   private val scalingDownRatio = conf.getDouble(SCALING_DOWN_RATIO_KEY, SCALING_DOWN_RATIO_DEFAULT)
//     SCALING_DOWN_RATIO_KEY = "spark.streaming.dynamicAllocation.scalingDownRatio"
//     SCALING_DOWN_RATIO_DEFAULT = 0.3
//   private val minNumExecutors = conf.getInt(MIN_EXECUTORS_KEY,
//     math.max(1, receiverTracker.numReceivers))   // numReceivers is the size of the receiverInputStreams array
//     MIN_EXECUTORS_KEY = "spark.streaming.dynamicAllocation.minExecutors"
//   private val maxNumExecutors = conf.getInt(MAX_EXECUTORS_KEY, Integer.MAX_VALUE)
//     MAX_EXECUTORS_KEY = "spark.streaming.dynamicAllocation.maxExecutors"
//   private val timer = new RecurringTimer(clock, scalingIntervalSecs * 1000,
//     _ => manageAllocation(), "streaming-executor-allocation-manager")
//     the timer periodically calls manageAllocation(), which adjusts the executor count
//     SCALING_INTERVAL_KEY = "spark.streaming.dynamicAllocation.scalingInterval"
//     SCALING_INTERVAL_DEFAULT_SECS = 60
// To use dynamic allocation, set ENABLED_KEY = "spark.streaming.dynamicAllocation.enabled" to true
// and do NOT set a fixed number of executor instances:
//   val numExecutor = conf.getInt("spark.executor.instances", 0)
//   val streamingDynamicAllocationEnabled = conf.getBoolean(ENABLED_KEY, false)
//   if (numExecutor != 0 && streamingDynamicAllocationEnabled) {   // setting both is rejected
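// A rough worked example of the ratio check described above (the numbers are hypothetical, not from the source):
//   batchDurationMs = 600, averageBatchProcTime = 570  =>  ratio = 570.0 / 600 = 0.95 >= 0.9  -> request more executors
//   batchDurationMs = 600, averageBatchProcTime = 150  =>  ratio = 150.0 / 600 = 0.25 <= 0.3  -> kill an idle executor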


  executorAllocationManager = ExecutorAllocationManager.createIfEnabled(
    executorAllocClient,
    receiverTracker,
    ssc.conf,
    ssc.graph.batchDuration.milliseconds,
    clock)
  executorAllocationManager.foreach(ssc.addStreamingListener)
  // Inside receiverTracker.start():
  //   endpoint = ssc.env.rpcEnv.setupEndpoint(
  //     "ReceiverTracker", new ReceiverTrackerEndpoint(ssc.env.rpcEnv))
  //   if (!skipReceiverLaunch) launchReceivers()   // ships the receivers out to the worker nodes; how that works deserves its own post
  receiverTracker.start()
  // jobGenerator uses an EventLoop in exactly the same way
  jobGenerator.start()
  executorAllocationManager.foreach(_.start()) // if dynamic allocation is configured, start() begins periodically adjusting the executor count
  logInfo("Started JobScheduler")
}
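
Below is a minimal sketch of the EventLoop pattern described above: a blocking queue plus a daemon thread, with onStart guaranteed to run before the first onReceive. It is a simplification, not the real org.apache.spark.util.EventLoop, which also handles onStop and richer error propagation.

import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.atomic.AtomicBoolean

abstract class SimpleEventLoop[E](name: String) {
  private val queue = new LinkedBlockingQueue[E]()
  private val stopped = new AtomicBoolean(false)

  private val eventThread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      try {
        while (!stopped.get) {
          val event = queue.take() // blocks until something is posted
          try onReceive(event) catch { case e: Throwable => onError(e) }
        }
      } catch {
        case _: InterruptedException => // loop stopped
      }
    }
  }

  def start(): Unit = {
    onStart()            // runs before the thread starts, hence before any onReceive
    eventThread.start()
  }

  def post(event: E): Unit = queue.put(event)
  def stop(): Unit = { stopped.set(true); eventThread.interrupt() }

  protected def onStart(): Unit = {}
  protected def onReceive(event: E): Unit
  protected def onError(e: Throwable): Unit
}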

3. The submitJobSet method

  // listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
  // jobSets.put(jobSet.time, jobSet)
  // jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
  // logInfo("Added jobs for time " + jobSet.time)

jobExecutor is really just a thread pool:

// private val jobExecutor = ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor")

// private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)   // only 1 concurrent job by default
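
ThreadUtils is a Spark-internal helper; roughly, newDaemonFixedThreadPool is just a fixed-size pool whose threads are daemons with a recognizable name prefix. A sketch of an equivalent built on plain java.util.concurrent (my own approximation, not the Spark source):

import java.util.concurrent.{ExecutorService, Executors, ThreadFactory}
import java.util.concurrent.atomic.AtomicInteger

def newDaemonFixedThreadPool(nThreads: Int, prefix: String): ExecutorService = {
  val counter = new AtomicInteger(0)
  val factory = new ThreadFactory {
    override def newThread(r: Runnable): Thread = {
      val t = new Thread(r, s"$prefix-${counter.incrementAndGet()}")
      t.setDaemon(true) // daemon threads never keep the JVM alive on their own
      t
    }
  }
  Executors.newFixedThreadPool(nThreads, factory)
}

So with spark.streaming.concurrentJobs left at 1, the output operations of each batch run one after another; raising it lets several JobHandlers run in parallel on this pool.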

4. The JobHandler runnable

private class JobHandler(job: Job) extends Runnable with Logging {
    import JobScheduler._

    def run() {

      // save the caller's local properties so they can be restored in the finally block
      val oldProps = ssc.sparkContext.getLocalProperties
      try {
        ssc.sparkContext.setLocalProperties(SerializationUtils.clone(ssc.savedProperties.get()))
        val formattedTime = UIUtils.formatBatchTime(
          job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
        val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
        val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"

        ssc.sc.setJobDescription(
          s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")

        // these two keys are also defined in the JobScheduler companion object (hence the import at the top)
        val BATCH_TIME_PROPERTY_KEY = "spark.streaming.internal.batchTime"
        val OUTPUT_OP_ID_PROPERTY_KEY = "spark.streaming.internal.outputOpId"
        ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
        ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)
        // checkpoint all marked ancestor RDDs so that their lineage can be truncated
        ssc.sparkContext.setLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS, "true")

        
        var _eventLoop = eventLoop
        if (_eventLoop != null) {

          // post JobStarted to the scheduler's event loop: the batch job is about to run
          _eventLoop.post(JobStarted(job, clock.getTimeMillis()))
          PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
            // run the job: this invokes the output operation and triggers the actual Spark job
            // (disableOutputSpecValidation is a DynamicVariable; see the note after this class)
            job.run()
          }
          _eventLoop = eventLoop
          if (_eventLoop != null) {
            _eventLoop.post(JobCompleted(job, clock.getTimeMillis()))
          }
        } else {
          // JobScheduler has been stopped.
        }
      } finally {
        ssc.sparkContext.setLocalProperties(oldProps)
      }
    }
  }
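
One detail worth calling out: PairRDDFunctions.disableOutputSpecValidation is a scala.util.DynamicVariable, so withValue(true) { ... } changes the flag only for the current thread and only while the enclosed block runs. A small illustration with a stand-in variable (the Flags object below is hypothetical, purely for demonstration):

import scala.util.DynamicVariable

object Flags {
  val disableOutputSpecValidation = new DynamicVariable[Boolean](false)
}

Flags.disableOutputSpecValidation.withValue(true) {
  // inside the block, this thread sees true
  assert(Flags.disableOutputSpecValidation.value)
}
// outside the block (and on other threads) the default value is back
assert(!Flags.disableOutputSpecValidation.value)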
