第18课：Spark Streaming中空RDD处理及流处理程序优雅的停止

最新推荐文章于 2022-10-22 09:21:50 发布

罗白莲

最新推荐文章于 2022-10-22 09:21:50 发布

阅读量2.7k

点赞数

分类专栏： Spark发行版笔记

本文链接：https://blog.csdn.net/luobailian/article/details/51726686

版权

Spark发行版笔记专栏收录该内容

20 篇文章 0 订阅

订阅专栏

1.SparkStreaming中空RDD处理

JobGenerator

/**
 * Generate jobs and perform checkpoint for the given `time`.
 *
 * The received blocks are first allocated to the batch, then the DStream graph is asked for
 * the jobs of that batch. On success the jobs are submitted, together with the input info
 * of the batch, as a single `JobSet`; an exception captured by `Try` is reported to the job
 * scheduler instead. A `DoCheckpoint` event is posted afterwards in both cases.
 *
 * @param time the batch time for which jobs are generated
 */
private def generateJobs(time: Time): Unit = {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)
  Try {
    // Step 1: pick the data of the current batch interval, i.e. allocate received blocks
    // to the batch identified by `time`.
    jobScheduler.receiverTracker.allocateBlocksToBatch(time)
    // Step 2: generate the jobs for the batch using the allocated blocks.
    graph.generateJobs(time)
  } match {
    case Success(jobs) =>
      // Step 3: look up the input info recorded for this batch time.
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      // Step 4: hand the generated jobs over to the job scheduler as one JobSet.
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  // Step 5: request a checkpoint for this batch time.
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}

ReceiverTracker 与 ReceivedBlockTracker

/**
 * Allocate all unallocated blocks to the given batch.
 *
 * Nothing happens when there are no receiver input streams; otherwise the call is
 * forwarded to `receivedBlockTracker`.
 *
 * @param batchTime time of the batch the blocks are allocated to
 */
def allocateBlocksToBatch(batchTime: Time): Unit = {
  val hasReceiverStreams = receiverInputStreams.nonEmpty
  if (hasReceiverStreams) receivedBlockTracker.allocateBlocksToBatch(batchTime)
}

/**
 * Allocate all unallocated blocks to the given batch.
 * This event will get written to the write ahead log (if enabled).
 *
 * The allocation only happens when nothing has been allocated yet or when `batchTime` is
 * newer than the last allocated batch time; otherwise only an informational message is
 * logged.
 *
 * @param batchTime time of the batch that receives the currently queued blocks
 */
def allocateBlocksToBatch(batchTime: Time): Unit = synchronized {
  if (lastAllocatedBatchTime == null || batchTime > lastAllocatedBatchTime) {
    // Drain the received-block queue of every stream and key the result by stream id.
    val streamIdToBlocks = streamIds.map { streamId =>
      (streamId, getReceivedBlockQueue(streamId).dequeueAll(_ => true))
    }.toMap
    val allocatedBlocks = AllocatedBlocks(streamIdToBlocks)
    if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) {
      // Store the allocation under its batch time and remember that time as the most
      // recently allocated batch.
      timeToAllocatedBlocks.put(batchTime, allocatedBlocks)
      lastAllocatedBatchTime = batchTime
    } else {
      logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
    }
  } else {
    // This situation occurs when:
    // 1. WAL is ended with BatchAllocationEvent, but without BatchCleanupEvent,
    // possibly processed batch job or half-processed batch job need to be processed again,
    // so the batchTime will be equal to lastAllocatedBatchTime.
    // 2. Slow checkpointing makes recovered batch time older than WAL recovered
    // lastAllocatedBatchTime.
    // This situation will only occurs in recovery time.
    logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
  }
}

DStreamGraph

JobGenerator.generateJobs方法中graph.generateJobs(time)点击进入

/**
 * Generate the jobs of all output streams for the given batch `time`.
 *
 * Each output stream is the last DStream of its chain (presumably a ForEachDStream, per the
 * author's note). A stream that yields no job for `time` contributes nothing to the result.
 *
 * @param time the batch time to generate jobs for
 * @return the generated jobs, each tagged with the creation site of its output stream
 */
def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val generated = this.synchronized {
    outputStreams.flatMap { stream =>
      // Ask the output stream for its job at this time and tag it with the stream's call site.
      stream.generateJob(time).map { job =>
        job.setCallSite(stream.creationSite)
        job
      }
    }
  }
  logDebug("Generated " + generated.length + " jobs for time " + time)
  generated
}

DStream

   /**
    * Generate a Spark Streaming job for the given time.
    *
    * The RDD for `time` is obtained through `getOrCompute`; when there is none, no job is
    * produced. Otherwise the job wraps a function (`jobFunc`) that runs a no-op over every
    * partition of that RDD via `SparkContext.runJob`. Author's note: a DStream is the
    * logical-level description, while the RDD produced here is the physical-level one.
    *
    * @param time the batch time to generate the job for
    * @return the job for `time`, or `None` if no RDD is available for that time
    */
   private[streaming] def generateJob(time: Time): Option[Job] = {
     getOrCompute(time).map { rdd =>
       val jobFunc = () => {
         val emptyFunc = { (iterator: Iterator[T]) => {} }
         // Running the job on `rdd` drives the computation of its whole dependency chain.
         context.sparkContext.runJob(rdd, emptyFunc)
       }
       // A Job stands for one unit of streaming business logic; it may launch several Spark jobs.
       new Job(time, jobFunc)
     }
   }

/**
 * Get the RDD corresponding to the given time; either retrieve it from cache
 * or compute-and-cache it.
 *
 * A freshly computed RDD is persisted when a storage level is set, marked for checkpointing
 * when `time` falls on a checkpoint interval, and then stored in `generatedRDDs` under `time`.
 *
 * @param time the batch time whose RDD is requested
 * @return the cached or newly computed RDD, or `None` when `time` is not valid or
 *         `compute` produced nothing
 */
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
  // If the RDD was already generated, retrieve it from the map; otherwise compute it.
  generatedRDDs.get(time).orElse {
    if (!isTimeValid(time)) {
      // Not a valid time for RDD generation (e.g. off the sliding-window grid): generate nothing.
      None
    } else {
      val computed = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during
        // checkpoint recovery; see SPARK-4835 for more details. We need to have this call
        // here because compute() might cause Spark jobs to be launched.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          compute(time)
        }
      }

      // Register the generated RDD for caching and checkpointing.
      computed.foreach { rdd =>
        if (storageLevel != StorageLevel.NONE) {
          rdd.persist(storageLevel)
          logDebug(s"Persisting RDD ${rdd.id} for time $time to $storageLevel")
        }
        if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
          rdd.checkpoint()
          logInfo(s"Marking RDD ${rdd.id} for time $time for checkpointing")
        }
        // Keyed by time; the value is the RDD generated for that time.
        generatedRDDs.put(time, rdd)
      }
      computed
    }
  }
}

RDD

/**
 * @note due to complications in the internal implementation, this method will raise an
 * exception if called on an RDD of `Nothing` or `Null`. This may come up in practice
 * because, for example, the type of `parallelize(Seq())` is `RDD[Nothing]`.
 * (`parallelize(Seq())` should be avoided anyway in favor of `parallelize(Seq[T]())`.)
 * @return true if and only if the RDD contains no elements at all. Note that an RDD
 * may be empty even when it has at least 1 partition.
 */
def isEmpty(): Boolean = withScope {
  if (partitions.length == 0) {
    // No partitions at all: nothing to look at.
    true
  } else {
    // Partitions exist, so try to fetch a single element to decide.
    take(1).length == 0
  }
}

判断是否为空

if(rdd.partitions.length>0){

仅凭 rdd.partitions.length>0 或 rdd.partitions.isEmpty 来判断不行：RDD即使至少有1个分区，也可能不包含任何元素

}

要采用

if(!rdd.isEmpty())

2.SparkStreaming流处理优雅的停止

如果什么值也不传的时候会停止sparkContext，而且数据没有处理完也会被停止

StreamingContext

/**
 * Stop the execution of the streams immediately (does not wait for all received data
 * to be processed). By default, if `stopSparkContext` is not specified, the underlying
 * SparkContext will also be stopped. This implicit behavior can be configured using the
 * SparkConf configuration spark.streaming.stopSparkContextByDefault.
 *
 * @param stopSparkContext If true, stops the associated SparkContext. The underlying
 *                         SparkContext will be stopped regardless of whether this
 *                         StreamingContext has been started.
 */
def stop(
    stopSparkContext: Boolean = conf.getBoolean("spark.streaming.stopSparkContextByDefault", true)
  ): Unit = synchronized {
  // Delegate to the two-argument overload, explicitly asking for a non-graceful stop.
  stop(stopSparkContext, stopGracefully = false)
}

这个stop在stopGracefully为true时，会等已接收的数据处理完后再停止

/**
 * Stop the execution of the streams, with option of ensuring all received data
 * has been processed.
 *
 * @param stopSparkContext if true, stops the associated SparkContext. The underlying
 *                         SparkContext will be stopped regardless of whether this
 *                         StreamingContext has been started.
 * @param stopGracefully if true, stops gracefully by waiting for the processing of all
 *                       received data to be completed
 * @throws SparkException if called from within the listener thread of AsynchronousListenerBus
 */
def stop(stopSparkContext: Boolean, stopGracefully: Boolean): Unit = {
  // Filled in while holding the lock, consumed after the lock has been released.
  var shutdownHookRefToRemove: AnyRef = null
  if (AsynchronousListenerBus.withinListenerThread.value) {
    throw new SparkException("Cannot stop StreamingContext within listener thread of" +
      " AsynchronousListenerBus")
  }
  synchronized {
    try {
      state match {
        case INITIALIZED =>
          logWarning("StreamingContext has not been started yet")
        case STOPPED =>
          logWarning("StreamingContext has already been stopped")
        case ACTIVE =>
          scheduler.stop(stopGracefully)
          // Removing the streamingSource to de-register the metrics on stop()
          env.metricsSystem.removeSource(streamingSource)
          uiTab.foreach(_.detach())
          StreamingContext.setActiveContext(null)
          waiter.notifyStop()
          if (shutdownHookRef != null) {
            shutdownHookRefToRemove = shutdownHookRef
            shutdownHookRef = null
          }
          logInfo("StreamingContext stopped successfully")
      }
    } finally {
      // The state should always be Stopped after calling `stop()`, even if we haven't started yet
      state = STOPPED
    }
  }
  if (shutdownHookRefToRemove != null) {
    ShutdownHookManager.removeShutdownHook(shutdownHookRefToRemove)
  }
  // Even if we have already stopped, we still need to attempt to stop the SparkContext because
  // a user might stop(stopSparkContext = false) and then call stop(stopSparkContext = true).
  if (stopSparkContext) sc.stop()
}

  /**
   * Stop this context from the shutdown hook.
   *
   * Whether the stop is graceful is read from the configuration key
   * `spark.streaming.stopGracefullyOnShutdown` (default `false`).
   */
  private def stopOnShutdown(): Unit = {
    val graceful = conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", false)
    logInfo(s"Invoking stop(stopGracefully=$graceful) from shutdown hook")
    // Do not stop SparkContext, let its own shutdown hook stop it
    stop(stopSparkContext = false, stopGracefully = graceful)
  }
}

/**
 * Start the execution of the streams.
 *
 * Starting an already active context only logs a warning. If starting the scheduler fails
 * with a non-fatal error, the scheduler is stopped, the context is marked as stopped and
 * the error is rethrown.
 *
 * @throws IllegalStateException if the StreamingContext is already stopped.
 */
def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      startSite.set(DStream.getCreationSite())
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()

          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        StreamingContext.setActiveContext(this)
      }
      // Register the hook that stops this context on shutdown; the reference is kept so
      // that it can be removed again later.
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      assert(env.metricsSystem != null)
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}