Lesson 18: Handling Empty RDDs in Spark Streaming and Gracefully Stopping a Streaming Application

1. Handling Empty RDDs in Spark Streaming

JobGenerator

/** Generate jobs and perform checkpoint for the given `time`. */
private def generateJobs(time: Time) {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)
  Try {
    // Step 1: take the data received in this batch interval and allocate it to the batch for `time`.
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    // Step 2: generate the Jobs and the RDD DAG dependencies; the RDD instances are created here from the DStreams.
    graph.generateJobs(time) // generate jobs using allocated block
  } match {
    case Success(jobs) =>
      // Step 3: get streamIdToInputInfos, i.e. the input data to be processed in this batchDuration together with the business logic to run.
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      // Step 4: hand the generated Jobs over to the jobScheduler.
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  // Step 5: perform the checkpoint.
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}

 

ReceivedBlockTracker

 

/** Allocate all unallocated blocks to the given batch. */
def allocateBlocksToBatch(batchTime: Time): Unit = {
  if (receiverInputStreams.nonEmpty) {
    receivedBlockTracker.allocateBlocksToBatch(batchTime)
  }
}

/**
 * Allocate all unallocated blocks to the given batch.
 * This event will get written to the write ahead log (if enabled).
 */
def allocateBlocksToBatch(batchTime: Time): Unit = synchronized {
  if (lastAllocatedBatchTime == null || batchTime > lastAllocatedBatchTime) {
    val streamIdToBlocks = streamIds.map { streamId =>
        (streamId, getReceivedBlockQueue(streamId).dequeueAll(x => true))
    }.toMap
    val allocatedBlocks = AllocatedBlocks(streamIdToBlocks)
    if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) {
      // Store the received blocks keyed by the batch time, then update lastAllocatedBatchTime.
      timeToAllocatedBlocks.put(batchTime, allocatedBlocks)
      lastAllocatedBatchTime = batchTime
    } else {
      logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
    }
  } else {
    // This situation occurs when:
    // 1. WAL is ended with BatchAllocationEvent, but without BatchCleanupEvent,
    // possibly processed batch job or half-processed batch job need to be processed again,
    // so the batchTime will be equal to lastAllocatedBatchTime.
    // 2. Slow checkpointing makes recovered batch time older than WAL recovered
    // lastAllocatedBatchTime.
    // This situation will only occurs in recovery time.
    logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
  }
}

DStreamGraph

From the JobGenerator.generateJobs method, step into graph.generateJobs(time):

// Each outputStream here is the last DStream in the DStream lineage, i.e. a ForEachDStream.
def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val jobs = this.synchronized {
    outputStreams.flatMap { outputStream =>
      // Generate a Job for the given batch time from this last DStream.
      val jobOption = outputStream.generateJob(time)
      jobOption.foreach(_.setCallSite(outputStream.creationSite))
      jobOption
    }
  }
  logDebug("Generated " + jobs.length + " jobs for time " + time)
  jobs
}

 

DStream

/**
 * The jobFunc here is the function that wraps the Job, as mentioned earlier.
 * generateJob produces a Spark Streaming Job for the given time; it materializes the
 * DStream operations into RDDs, which shows that a DStream is at the logical level
 * while an RDD is at the physical level.
 */
private[streaming] def generateJob(time: Time): Option[Job] = {
  getOrCompute(time) match {
    case Some(rdd) => {
      val jobFunc = () => {
        val emptyFunc = { (iterator: Iterator[T]) => {} }
        // rdd here carries the whole RDD dependency chain.
        context.sparkContext.runJob(rdd, emptyFunc)
      }
      // The Job class represents the streaming business logic and may contain many Spark jobs.
      Some(new Job(time, jobFunc))
    }
    case None => None
  }
}

 

 

/**
 * Get the RDD corresponding to the given time; either retrieve it from cache
 * or compute-and-cache it.
 */
// The RDD put into generatedRDDs here is the last RDD in the lineage; although the Job is
// triggered based on time, it is also driven by the DStream action.
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
  // If RDD was already generated, then retrieve it from HashMap,
  // or else compute the RDD
  // Generate the RDD for this batch time.
  generatedRDDs.get(time).orElse {
    // Compute the RDD if time is valid (e.g. correct time in a sliding window)
    // of RDD generation, else generate nothing.
    if (isTimeValid(time)) {

      val rddOption = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during checkpoint
        // recovery; see SPARK-4835 for more details. We need to have this call here because
        // compute() might cause Spark jobs to be launched.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          compute(time)
        }
      }
      // Then persist/checkpoint the generated RDD.
      rddOption.foreach { case newRDD =>
        // Register the generated RDD for caching and checkpointing
        if (storageLevel != StorageLevel.NONE) {
          newRDD.persist(storageLevel)
          logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
        }
        if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
          newRDD.checkpoint()
          logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
        }
        // Key: batch time, value: the generated RDD (the last RDD in the lineage).
        generatedRDDs.put(time, newRDD)
      }
      rddOption
    } else {
      None
    }
  }
}

RDD

/**
 * @note due to complications in the internal implementation, this method will raise an
 * exception if called on an RDD of `Nothing` or `Null`. This may come up in practice
 * because, for example, the type of `parallelize(Seq())` is `RDD[Nothing]`.
 * (`parallelize(Seq())` should be avoided anyway in favor of `parallelize(Seq[T]())`.)
 *
 * @return true if and only if the RDD contains no elements at all. Note that an RDD
 *         may be empty even when it has at least 1 partition.
 */
def isEmpty(): Boolean = withScope {
  partitions.length == 0 || take(1).length == 0
}

 

 

To check whether the RDD of a batch is empty:

Checking the partitions, for example if (rdd.partitions.length > 0) or rdd.partitions.isEmpty, does not work, because an RDD may be empty even when it has one or more partitions.

Use instead:

if (!rdd.isEmpty())
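
A minimal sketch of this pattern inside foreachRDD; the socket source, 30-second batch interval, and output path are illustrative assumptions, not part of the code quoted above:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object EmptyRDDGuard {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("EmptyRDDGuard")
    val ssc = new StreamingContext(conf, Seconds(30))    // assumed 30s batch interval
    val lines = ssc.socketTextStream("localhost", 9999)  // assumed socket source

    lines.foreachRDD { rdd =>
      // Guard with rdd.isEmpty(): for a batch interval with no received data the generated
      // BlockRDD contains no records, and without this check saveAsTextFile would still run
      // a Spark job and write an empty output directory for every batch.
      if (!rdd.isEmpty()) {
        rdd.saveAsTextFile(s"/tmp/output-${System.currentTimeMillis()}")  // assumed output path
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}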

2. Gracefully Stopping a Spark Streaming Application

If stop() is called with no arguments, the SparkContext is stopped as well, and the application is shut down even if the received data has not been fully processed.

StreamingContext

 

/**
 * Stop the execution of the streams immediately (does not wait for all received data
 * to be processed). By default, if `stopSparkContext` is not specified, the underlying
 * SparkContext will also be stopped. This implicit behavior can be configured using the
 * SparkConf configuration spark.streaming.stopSparkContextByDefault.
 *
 * @param stopSparkContext If true, stops the associated SparkContext. The underlying SparkContext
 *                         will be stopped regardless of whether this StreamingContext has been
 *                         started.
 */
def stop(
    stopSparkContext: Boolean = conf.getBoolean("spark.streaming.stopSparkContextByDefault", true)
   ): Unit = synchronized {
  stop(stopSparkContext, false)
}

 

The following overload of stop can wait until all received data has been processed before shutting down:

/**
   * Stop the execution of the streams, with option of ensuring all received data
   * has been processed.
   *
   * @param stopSparkContext if true, stops the associated SparkContext. The underlying SparkContext
   *                         will be stopped regardless of whether this StreamingContext has been
   *                         started.
   * @param stopGracefully if true, stops gracefully by waiting for the processing of all
   *                       received data to be completed
   */
  def stop(stopSparkContext: Boolean, stopGracefully: Boolean): Unit = {
    var shutdownHookRefToRemove: AnyRef = null
    if (AsynchronousListenerBus.withinListenerThread.value) {
      throw new SparkException("Cannot stop StreamingContext within listener thread of" +
        " AsynchronousListenerBus")
    }
    synchronized {
      try {
        state match {
          case INITIALIZED =>
            logWarning("StreamingContext has not been started yet")
          case STOPPED =>
            logWarning("StreamingContext has already been stopped")
          case ACTIVE =>
            scheduler.stop(stopGracefully)
            // Removing the streamingSource to de-register the metrics on stop()
            env.metricsSystem.removeSource(streamingSource)
            uiTab.foreach(_.detach())
            StreamingContext.setActiveContext(null)
            waiter.notifyStop()
            if (shutdownHookRef != null) {
              shutdownHookRefToRemove = shutdownHookRef
              shutdownHookRef = null
            }
            logInfo("StreamingContext stopped successfully")
        }
      } finally {
        // The state should always be Stopped after calling `stop()`, even if we haven't started yet
        state = STOPPED
      }
    }
    if (shutdownHookRefToRemove != null) {
      ShutdownHookManager.removeShutdownHook(shutdownHookRefToRemove)
    }
    // Even if we have already stopped, we still need to attempt to stop the SparkContext because
    // a user might stop(stopSparkContext = false) and then call stop(stopSparkContext = true).
    if (stopSparkContext) sc.stop()
  }

  private def stopOnShutdown(): Unit = {
    val stopGracefully = conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", false)
    logInfo(s"Invoking stop(stopGracefully=$stopGracefully) from shutdown hook")
    // Do not stop SparkContext, let its own shutdown hook stop it
    stop(stopSparkContext = false, stopGracefully = stopGracefully)
  }
}
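
As a usage sketch, an application can call this graceful overload itself once an external stop signal is observed. The marker-file path and the 10-second polling interval below are illustrative assumptions, not part of the Spark API shown above:

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.streaming.StreamingContext

// Sketch: after ssc.start(), poll for an external "stop marker" file and then stop gracefully.
def awaitGracefulStop(ssc: StreamingContext): Unit = {
  val stopMarker = new Path("/tmp/stop-streaming-app")  // assumed marker path
  var stopped = false
  while (!stopped) {
    // Returns true once the context has terminated; otherwise times out after 10 seconds.
    stopped = ssc.awaitTerminationOrTimeout(10000)
    val fs = FileSystem.get(ssc.sparkContext.hadoopConfiguration)
    if (!stopped && fs.exists(stopMarker)) {
      // stopGracefully = true: finish processing the data already received before shutting down.
      ssc.stop(stopSparkContext = true, stopGracefully = true)
      stopped = true
    }
  }
}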

 

 

/**
 * Start the execution of the streams.
 *
 * @throws IllegalStateException if the StreamingContext is already stopped.
 */
def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      startSite.set(DStream.getCreationSite())
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()

          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        StreamingContext.setActiveContext(this)
      }
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      assert(env.metricsSystem != null)
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}

Setting spark.streaming.stopGracefullyOnShutdown to true makes the shutdown hook stop the streams gracefully, so the batches already received are fully processed and no data is lost.
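
A minimal sketch of enabling this, assuming the setting is placed on the SparkConf when the application is built (it can equally be passed via --conf on spark-submit); the application name and batch interval are assumptions:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("GracefulShutdownDemo")                  // assumed application name
  // Let the JVM shutdown hook call stop(stopGracefully = true) when the process receives SIGTERM.
  .set("spark.streaming.stopGracefullyOnShutdown", "true")

val ssc = new StreamingContext(conf, Seconds(30))      // assumed 30s batch interval
// ... define the DStream pipeline here ...
ssc.start()
ssc.awaitTermination()

With this set, stopping the driver with a plain kill (SIGTERM) triggers stopOnShutdown shown above, which calls stop(stopSparkContext = false, stopGracefully = true) and leaves the SparkContext to be stopped by its own shutdown hook.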


