Spark Study Notes: Spark Streaming

spark-streaming source code analysis

I have recently been reading through the spark-streaming execution flow; these are my lightly organized notes on it.



1. Start with the entry point. To use spark-streaming you first instantiate a StreamingContext (call it ssc); everything kicks off from ssc.start().

Instantiation also initializes a number of fields/objects: val graph, val progressListener, val startSite, var state: StreamingContextState = INITIALIZED, scheduler = new JobScheduler(this), and so on.
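
For reference, a minimal driver looks roughly like this (the host, port, and 5-second batch interval are arbitrary examples):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WordCountApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("WordCountApp")
    // Instantiating StreamingContext builds the DStreamGraph, JobScheduler, etc.
    val ssc = new StreamingContext(conf, Seconds(5))

    // Defining the pipeline only builds the DStream lineage; nothing runs yet.
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()

    ssc.start()            // the entry point analyzed below
    ssc.awaitTermination()
  }
}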

file: StreamingContext.scala

def start(): Unit = synchronized {
  state match {
    case INITIALIZED => // start only from the initial state
      startSite.set(DStream.getCreationSite())
      sparkContext.setCallSite(startSite.get)
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate() // perform sanity checks
          scheduler.start() // the entry point we follow next
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        StreamingContext.setActiveContext(this)
      }
      shutdownHookRef = Utils.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}


2. Step into scheduler.start() above; it starts the listenerBus, the receiverTracker, and the jobGenerator.

file: JobScheduler.scala

 def start(): Unit = synchronized {
    if (eventLoop != null) return // scheduler has already been started

    logDebug("Starting JobScheduler")
    eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
      override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event) // receive and dispatch scheduler events

      override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
    }
    eventLoop.start()

    listenerBus.start(ssc.sparkContext)
    receiverTracker = new ReceiverTracker(ssc)
    inputInfoTracker = new InputInfoTracker(ssc)
    receiverTracker.start()
    jobGenerator.start() // start job generation
    logInfo("Started JobScheduler")
  }
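
Both JobScheduler and JobGenerator build on Spark's EventLoop (org.apache.spark.util.EventLoop): a daemon thread drains an event queue and hands each event to onReceive. A minimal sketch of the pattern, much simplified from the real class (which also has careful stop and error handling):

import java.util.concurrent.LinkedBlockingQueue

// Simplified sketch of the EventLoop pattern, not the actual Spark class.
abstract class SimpleEventLoop[E](name: String) {
  private val queue = new LinkedBlockingQueue[E]()
  @volatile private var stopped = false

  private val thread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      try {
        while (!stopped) {
          onReceive(queue.take())  // block until an event is posted
        }
      } catch {
        case _: InterruptedException => // stop() interrupts the blocking take
      }
    }
  }

  def start(): Unit = thread.start()
  def post(event: E): Unit = queue.put(event)
  def stop(): Unit = { stopped = true; thread.interrupt() }

  protected def onReceive(event: E): Unit
}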

3. file: JobGenerator.scala

When JobGenerator is instantiated, its timer is instantiated along with it; the constructor initializes the callback with an anonymous function, as follows:

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
    longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")


Now back to jobGenerator.start():

/** Start generation of jobs */
  def start(): Unit = synchronized {
    if (eventLoop != null) return // generator has already been started

    eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
      override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event) // handle job-generation events

      override protected def onError(e: Throwable): Unit = {
        jobScheduler.reportError("Error in job generator", e)
      }
    }
    eventLoop.start()

    if (ssc.isCheckpointPresent) {
      restart()
    } else {
      startFirstTime() // first-time start
    }
  }
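
Which branch runs depends on whether the context was recovered from checkpoint data. On the application side this is usually driven by StreamingContext.getOrCreate; a minimal sketch (the checkpoint path is a hypothetical example):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointedApp {
  val checkpointDir = "hdfs:///tmp/streaming-checkpoint"  // hypothetical path

  def createContext(): StreamingContext = {
    val conf = new SparkConf().setAppName("CheckpointedApp")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint(checkpointDir)
    // ... define the DStream pipeline here ...
    ssc
  }

  def main(args: Array[String]): Unit = {
    // If checkpoint data exists under checkpointDir, the recovered context
    // takes the restart() branch in JobGenerator.start(); otherwise
    // createContext() runs and the fresh context takes startFirstTime().
    val ssc = StreamingContext.getOrCreate(checkpointDir, createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}
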
Step into startFirstTime():

  /** Starts the generator for the first time */
  private def startFirstTime() {
    val startTime = new Time(timer.getStartTime())
    graph.start(startTime - graph.batchDuration)
    timer.start(startTime.milliseconds) // start the timer; each tick invokes the callback above, posting GenerateJobs(new Time(longTime))
    logInfo("Started JobGenerator at " + startTime)
  }

4. Step into timer.start.

file: RecurringTimer.scala

  /**
   * Start at the given start time.
   */
  // starts the loop below, which fires at startTime and then once every period
  def start(startTime: Long): Long = synchronized {
    nextTime = startTime
    thread.start()
    logInfo("Started timer for " + name + " at time " + nextTime)
    nextTime
  }
private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }
  }

private def loop() {
    try {
      while (!stopped) {
        clock.waitTillTime(nextTime)
        callback(nextTime) // actually calls eventLoop.post(GenerateJobs(new Time(longTime)))
        prevTime = nextTime
        nextTime += period
        logDebug("Callback for " + name + " called at time " + prevTime)
      }
    } catch {
      case e: InterruptedException =>
    }
  }
}
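
One detail worth noting: timer.getStartTime() (used by startFirstTime above) aligns the first tick to the next multiple of the period, so batch times land on round multiples of batchDuration. The alignment is essentially:

// Sketch of the alignment in RecurringTimer.getStartTime(): the first tick
// is the next multiple of `period` strictly after `now`.
def alignedStartTime(now: Long, period: Long): Long =
  (math.floor(now.toDouble / period) + 1).toLong * period

// e.g. with a 5000 ms batch interval and now = 1234567 ms,
// the first GenerateJobs event fires at 1235000 ms.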

5. Above, the callback posted the GenerateJobs message; the JobGeneratorEvent is received here.

file: JobGenerator.scala

 /** Processes all events */
  private def processEvent(event: JobGeneratorEvent) {
    logDebug("Got event " + event)
    event match {
      case GenerateJobs(time) => generateJobs(time) // here it is
      case ClearMetadata(time) => clearMetadata(time)
      case DoCheckpoint(time, clearCheckpointDataLater) =>
        doCheckpoint(time, clearCheckpointDataLater)
      case ClearCheckpointData(time) => clearCheckpointData(time)
    }
  }
Step into generateJobs(time):

 /** Generate jobs and perform checkpoint for the given `time`.  */
  private def generateJobs(time: Time) {
    // Set the SparkEnv in this thread, so that job generation code can access the environment
    // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
    // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
    SparkEnv.set(ssc.env)
    Try {
      // allocate received blocks to this batch
      jobScheduler.receiverTracker.allocateBlocksToBatch(time)
      // generate jobs using the allocated blocks; this Seq[Job] is the value
      // of the Try, i.e. the `jobs` bound by Success(jobs) below
      graph.generateJobs(time)
    } match {
      case Success(jobs) =>
        val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
        val streamIdToNumRecords = streamIdToInputInfos.mapValues(_.numRecords)
        jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToNumRecords)) // submit the JobSet
      case Failure(e) =>
        jobScheduler.reportError("Error generating jobs for time " + time, e)
    }
    eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
  }


6. Step into graph.generateJobs.

file: DStreamGraph.scala

def generateJobs(time: Time): Seq[Job] = {
    logDebug("Generating jobs for time " + time)
    val jobs = this.synchronized {
      outputStreams.flatMap(outputStream => outputStream.generateJob(time))
    }
    logDebug("Generated " + jobs.length + " jobs for time " + time)
    jobs
  }
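
outputStreams here is populated on the application side: each output operation (print, foreachRDD, saveAsTextFiles, ...) wraps the stream in an output DStream and registers it with the graph, so each batch yields one Job per output operation. Reusing lines from the sketch in step 1:

// Each output operation registers one output stream on the DStreamGraph;
// generateJobs(time) then yields one Job per registered output stream.
val words = lines.flatMap(_.split(" "))
words.print()                                   // output op #1 -> one Job per batch
words.foreachRDD(rdd => println(rdd.count()))   // output op #2 -> another Job per batch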

Drill further into outputStream.generateJob:

file: DStream.scala

  /**
   * Generate a SparkStreaming job for the given time. This is an internal method that
   * should not be called directly. This default implementation creates a job
   * that materializes the corresponding RDD. Subclasses of DStream may override this
   * to generate their own jobs.
   */
  private[streaming] def generateJob(time: Time): Option[Job] = {
    getOrCompute(time) match {
      case Some(rdd) => {
        val jobFunc = () => {
          val emptyFunc = { (iterator: Iterator[T]) => {} }
          context.sparkContext.runJob(rdd, emptyFunc)
        }
        Some(new Job(time, jobFunc))
      }
      case None => None
    }
  }
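
The default implementation above merely materializes the RDD with a no-op function; real output operations override it. As a simplified sketch (paraphrased from ForEachDStream.scala; the real version also wraps the call in createRDDWithLocalProperties), ForEachDStream builds the job around the user's foreachFunc:

// Simplified sketch of ForEachDStream.generateJob:
override def generateJob(time: Time): Option[Job] = {
  parent.getOrCompute(time) match {
    case Some(rdd) =>
      val jobFunc = () => foreachFunc(rdd, time)  // run the user's output function
      Some(new Job(time, jobFunc))
    case None => None
  }
}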


7. submitJobSet

file: JobScheduler.scala

def submitJobSet(jobSet: JobSet) {
    if (jobSet.jobs.isEmpty) {
      logInfo("No jobs added for time " + jobSet.time)
    } else {
      listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
      jobSets.put(jobSet.time, jobSet)
      jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job))) // run the JobSet: each Job is wrapped in a JobHandler and handed to jobExecutor
      logInfo("Added jobs for time " + jobSet.time)
    }
  }
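
jobExecutor above is a fixed-size daemon thread pool declared near the top of JobScheduler.scala; its size comes from spark.streaming.concurrentJobs (default 1), which is why the jobs of successive batches normally execute one at a time. Paraphrased from the source:

// Paraphrased from JobScheduler.scala: with the default pool size of 1,
// JobHandlers (and therefore batches) execute serially; raising
// spark.streaming.concurrentJobs allows jobs to run in parallel.
private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
private val jobExecutor =
  ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor")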

8. JobHandler processes the job: run() posts a JobStarted event, runs the job, then posts JobCompleted; these events are received by the processEvent handler set up in step 2.

file: JobScheduler.scala

private class JobHandler(job: Job) extends Runnable with Logging {
    def run() {
      ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
      ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)
      try {
        eventLoop.post(JobStarted(job))
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during checkpoint
        // recovery; see SPARK-4835 for more details.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          job.run()
        }
        eventLoop.post(JobCompleted(job))
      } finally {
        ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null)
        ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null)
      }
    }
  }
}
The corresponding event-handling function:

private def processEvent(event: JobSchedulerEvent) {
    try {
      event match {
        case JobStarted(job) => handleJobStart(job)
        case JobCompleted(job) => handleJobCompletion(job)
        case ErrorReported(m, e) => handleError(m, e)
      }
    } catch {
      case e: Throwable =>
        reportError("Error in job scheduler", e)
    }
  }

