1.SparkStreaming中空RDD处理
JobGenerator
/**
 * Generate jobs and perform checkpoint for the given `time`.
 *
 * In order: allocates the received blocks to this batch, asks the DStream graph to generate
 * the jobs, submits them to the job scheduler as a `JobSet` (or reports the failure), and
 * finally posts a `DoCheckpoint` event to the event loop.
 *
 * @param time the batch time to generate jobs for
 */
private def generateJobs(time: Time): Unit = {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)
  Try {
    // Step 1: pick the data belonging to the current batch interval; the received blocks
    // are assigned to the batch identified by `time`.
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    // Step 2: generate the jobs. Per the author's notes, this is where the RDD instances
    // (and hence the RDD dependency DAG) are produced from the DStreams.
    graph.generateJobs(time) // generate jobs using allocated block
  } match {
    case Success(jobs) =>
      // Step 3: fetch the input info recorded for this batch time, keyed by stream id.
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      // Step 4: hand the generated jobs over to the job scheduler.
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  // Step 5: request a checkpoint for this batch time. This runs on both the success and
  // the failure path.
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
ReceiverTracker(下面第一个方法,委托给receivedBlockTracker)与 ReceivedBlockTracker(下面第二个方法)
/**
 * Allocate all unallocated blocks to the given batch.
 *
 * Delegates to `receivedBlockTracker`; nothing happens when there are no receiver
 * input streams.
 */
def allocateBlocksToBatch(batchTime: Time): Unit = {
  val hasReceiverStreams = receiverInputStreams.nonEmpty
  if (hasReceiverStreams) receivedBlockTracker.allocateBlocksToBatch(batchTime)
}
/**
 * Allocate all unallocated blocks to the given batch.
 * This event will get written to the write ahead log (if enabled).
 *
 * @param batchTime the batch the currently queued blocks are assigned to
 */
def allocateBlocksToBatch(batchTime: Time): Unit = synchronized {
  if (lastAllocatedBatchTime == null || batchTime > lastAllocatedBatchTime) {
    // Drain the received-block queue of every stream into a per-stream map.
    val streamIdToBlocks = streamIds.map { streamId =>
      (streamId, getReceivedBlockQueue(streamId).dequeueAll(x => true))
    }.toMap
    val allocatedBlocks = AllocatedBlocks(streamIdToBlocks)
    if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) {
      // Record the allocation keyed by batch time and remember the latest allocated batch.
      timeToAllocatedBlocks.put(batchTime, allocatedBlocks)
      lastAllocatedBatchTime = batchTime
    } else {
      logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
    }
  } else {
    // This situation occurs when:
    // 1. WAL is ended with BatchAllocationEvent, but without BatchCleanupEvent,
    // possibly processed batch job or half-processed batch job need to be processed again,
    // so the batchTime will be equal to lastAllocatedBatchTime.
    // 2. Slow checkpointing makes recovered batch time older than WAL recovered
    // lastAllocatedBatchTime.
    // This situation will only occurs in recovery time.
    logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
  }
}
DStreamGraph
JobGenerator.generateJobs方法中graph.generateJobs(time)点击进入
// Author's note: each output stream here is the last DStream of a DStream chain,
// i.e. the one created by the foreach-style output operation.
/**
 * Generate the jobs of every output stream for the given batch time.
 *
 * @param time the batch time
 * @return the jobs produced by the output streams; streams yielding no job are skipped
 */
def generateJobs(time: Time): Seq[Job] = {
  logDebug(s"Generating jobs for time $time")
  val generated = this.synchronized {
    outputStreams.flatMap { stream =>
      // Ask the output stream for its job at this time and tag it with the call site
      // at which the stream was created.
      stream.generateJob(time).map { job =>
        job.setCallSite(stream.creationSite)
        job
      }
    }
  }
  logDebug(s"Generated ${generated.length} jobs for time $time")
  generated
}
DStream
/**
 * Generate a Spark Streaming job for the given time.
 *
 * The job wraps a function (`jobFunc` in the author's notes) that runs a Spark job over
 * the RDD obtained from `getOrCompute(time)`. Author's note: this is where the DStream
 * operations are materialized as an RDD; the DStream is the logical level, the RDD the
 * physical level.
 *
 * @param time the batch time
 * @return the job, or `None` when no RDD is available for `time`
 */
private[streaming] def generateJob(time: Time): Option[Job] = {
  getOrCompute(time).map { rdd =>
    val runOnRdd = () => {
      // A no-op per-partition function: the job only forces evaluation of `rdd`
      // (and therefore of its dependency chain).
      val noOp = { (iterator: Iterator[T]) => {} }
      context.sparkContext.runJob(rdd, noOp)
    }
    // Author's note: a Job represents the business logic and may contain many Spark jobs.
    new Job(time, runOnRdd)
  }
}
/**
 * Get the RDD corresponding to the given time; either retrieve it from `generatedRDDs`
 * or compute it and store it there.
 *
 * Author's note: the RDD stored here is the last RDD of the chain; although job
 * triggering is time-based, it is also driven by the DStream's output action.
 *
 * @param time the batch time
 * @return the RDD for `time`, or `None` when the time is not valid or nothing was computed
 */
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
  // Reuse the RDD if one was already generated for this time, otherwise compute it.
  generatedRDDs.get(time).orElse {
    if (!isTimeValid(time)) {
      // Not a valid time for RDD generation (e.g. wrong time in a sliding window).
      None
    } else {
      val computed = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during
        // checkpoint recovery; see SPARK-4835 for more details. We need to have this call
        // here because compute() might cause Spark jobs to be launched.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          compute(time)
        }
      }
      computed.foreach { rdd =>
        // Register the generated RDD for caching and checkpointing.
        if (storageLevel != StorageLevel.NONE) {
          rdd.persist(storageLevel)
          logDebug(s"Persisting RDD ${rdd.id} for time $time to $storageLevel")
        }
        val checkpointDue =
          checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)
        if (checkpointDue) {
          rdd.checkpoint()
          logInfo(s"Marking RDD ${rdd.id} for time $time for checkpointing")
        }
        // Keyed by time, valued by the generated RDD.
        generatedRDDs.put(time, rdd)
      }
      computed
    }
  }
}
RDD
/**
 * @note due to complications in the internal implementation, this method will raise an
 * exception if called on an RDD of `Nothing` or `Null`. This may be come up in practice
 * because, for example, the type of `parallelize(Seq())` is `RDD[Nothing]`.
 * (`parallelize(Seq())` should be avoided anyway in favor of `parallelize(Seq[T]())`.)
 * @return true if and only if the RDD contains no elements at all. Note that an RDD
 *         may be empty even when it has at least 1 partition.
 */
def isEmpty(): Boolean = withScope {
  // No partitions means no elements; otherwise try to fetch a single element.
  val hasNoPartitions = partitions.length == 0
  hasNoPartitions || take(1).length == 0
}
判断RDD是否为空:
不能只根据分区来判断,例如 if (rdd.partitions.length > 0) 或 !rdd.partitions.isEmpty 都不行,
因为RDD即使至少有1个分区,也可能不包含任何元素(见上面isEmpty的注释)。
要采用:
if (!rdd.isEmpty())
2.SparkStreaming流处理优雅的停止
如果什么值也不传的时候会停止sparkContext,而且数据没有处理完也会被停止
StreamingContext
/**
 * Stop the execution of the streams immediately (does not wait for all received data
 * to be processed). By default, if `stopSparkContext` is not specified, the underlying
 * SparkContext will also be stopped. This implicit behavior can be configured using the
 * SparkConf configuration spark.streaming.stopSparkContextByDefault.
 *
 * @param stopSparkContext If true, stops the associated SparkContext. The underlying
 *                         SparkContext will be stopped regardless of whether this
 *                         StreamingContext has been started.
 */
def stop(
    stopSparkContext: Boolean = conf.getBoolean("spark.streaming.stopSparkContextByDefault", true)
  ): Unit = synchronized {
  // Non-graceful stop: delegate to the two-argument overload.
  stop(stopSparkContext, stopGracefully = false)
}
这个stop在stopGracefully为true时,会等已接收的数据处理完后再停止
/**
 * Stop the execution of the streams, with option of ensuring all received data
 * has been processed.
 *
 * @param stopSparkContext if true, stops the associated SparkContext. The underlying
 *                         SparkContext will be stopped regardless of whether this
 *                         StreamingContext has been started.
 * @param stopGracefully if true, stops gracefully by waiting for the processing of all
 *                       received data to be completed
 * @throws SparkException if called from within the listener thread of
 *                        AsynchronousListenerBus
 */
def stop(stopSparkContext: Boolean, stopGracefully: Boolean): Unit = {
  var shutdownHookRefToRemove: AnyRef = null
  if (AsynchronousListenerBus.withinListenerThread.value) {
    throw new SparkException("Cannot stop StreamingContext within listener thread of" +
      " AsynchronousListenerBus")
  }
  synchronized {
    try {
      state match {
        case INITIALIZED =>
          logWarning("StreamingContext has not been started yet")
        case STOPPED =>
          logWarning("StreamingContext has already been stopped")
        case ACTIVE =>
          scheduler.stop(stopGracefully)
          // Removing the streamingSource to de-register the metrics on stop()
          env.metricsSystem.removeSource(streamingSource)
          uiTab.foreach(_.detach())
          StreamingContext.setActiveContext(null)
          waiter.notifyStop()
          if (shutdownHookRef != null) {
            // Remember the hook so it can be removed after leaving the synchronized block.
            shutdownHookRefToRemove = shutdownHookRef
            shutdownHookRef = null
          }
          logInfo("StreamingContext stopped successfully")
      }
    } finally {
      // The state should always be Stopped after calling `stop()`, even if we haven't started yet
      state = STOPPED
    }
  }
  if (shutdownHookRefToRemove != null) {
    ShutdownHookManager.removeShutdownHook(shutdownHookRefToRemove)
  }
  // Even if we have already stopped, we still need to attempt to stop the SparkContext because
  // a user might stop(stopSparkContext = false) and then call stop(stopSparkContext = true).
  if (stopSparkContext) sc.stop()
}
/**
 * Shutdown-hook entry point: stops this context, gracefully or not depending on
 * `spark.streaming.stopGracefullyOnShutdown` (read with a default of false).
 */
private def stopOnShutdown(): Unit = {
  val graceful = conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", false)
  logInfo(s"Invoking stop(stopGracefully=$graceful) from shutdown hook")
  // Do not stop SparkContext, let its own shutdown hook stop it
  stop(stopSparkContext = false, stopGracefully = graceful)
}
}
/**
 * Start the execution of the streams.
 *
 * Starting an already started context only logs a warning. If starting fails with a
 * non-fatal error, the scheduler is stopped, the context is marked as stopped and the
 * error is rethrown.
 *
 * @throws IllegalStateException if the StreamingContext is already stopped.
 */
def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      startSite.set(DStream.getCreationSite())
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()
          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        StreamingContext.setActiveContext(this)
      }
      // Register a shutdown hook that calls stopOnShutdown.
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      assert(env.metricsSystem != null)
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}
spark.streaming.stopGracefullyOnShutdown可以设置为true,可以完成整个处理,不丢失数据