Driver fault tolerance mainly comes down to the fault-tolerance guarantees of ReceivedBlockTracker, DStreamGraph, and JobGenerator.
On the data side, ReceivedBlockTracker records the metadata of all received blocks for the entire Spark Streaming application.
On the scheduling side, DStreamGraph and JobGenerator are the core of Spark Streaming scheduling: they record how far the application's processing has progressed, which is state tied to the application's own logic.
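Both mechanisms have to be switched on by the application: the Driver-side WAL and the checkpoints discussed below require a checkpoint directory, and making the received data itself durable additionally requires the receiver WAL flag. A minimal sketch, with an illustrative app name, batch interval and path:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("wal-enabled-app")                                  // illustrative name
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")   // receiver-side data WAL
val ssc = new StreamingContext(conf, Seconds(5))                  // illustrative batch interval
ssc.checkpoint("hdfs:///tmp/streaming-checkpoint")                // enables Driver-side WAL and checkpointing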
When a receiver reports a block, addBlock first writes the block's metadata to the WAL (if enabled), then appends it to the stream's ReceivedBlockQueue, which only holds blocks not yet allocated to a batch, and finally returns the write result.
// ReceivedBlockTracker.scala line 85
def addBlock(receivedBlockInfo: ReceivedBlockInfo): Boolean = {
try {
val writeResult = writeToLog(BlockAdditionEvent(receivedBlockInfo))
if (writeResult) {
synchronized {
getReceivedBlockQueue(receivedBlockInfo.streamId) += receivedBlockInfo
}
logDebug(s"Stream ${receivedBlockInfo.streamId} received " +
s"block ${receivedBlockInfo.blockStoreResult.blockId}")
} else {
logDebug(s"Failed to acknowledge stream ${receivedBlockInfo.streamId} receiving " +
s"block ${receivedBlockInfo.blockStoreResult.blockId} in the Write Ahead Log.")
}
writeResult
} catch {
case NonFatal(e) =>
logError(s"Error adding block $receivedBlockInfo", e)
false
}
}
See writeToLog:
// ReceivedBlockTracker.scala line 227
/** Write an update to the tracker to the write ahead log */
private def writeToLog(record: ReceivedBlockTrackerLogEvent): Boolean = {
if (isWriteAheadLogEnabled) {
logTrace(s"Writing record: $record")
try {
writeAheadLogOption.get.write(ByteBuffer.wrap(Utils.serialize(record)),
clock.getTimeMillis())
true
} catch {
case NonFatal(e) =>
logWarning(s"Exception thrown while writing record: $record to the WriteAheadLog.", e)
false
}
} else {
true
}
}
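For reference, isWriteAheadLogEnabled simply checks whether the optional WAL instance exists; paraphrased (not copied verbatim from the source):
// ReceivedBlockTracker.scala, paraphrased
private[streaming] def isWriteAheadLogEnabled: Boolean = writeAheadLogOption.nonEmpty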
And writeAheadLogOption is only populated when a checkpoint directory has been configured, as createWriteAheadLog shows:
// ReceivedBlockTracker.scala line 250
/** Optionally create the write ahead log manager only if the feature is enabled */
private def createWriteAheadLog(): Option[WriteAheadLog] = {
checkpointDirOption.map { checkpointDir =>
val logDir = ReceivedBlockTracker.checkpointDirToLogDir(checkpointDirOption.get)
WriteAheadLogUtils.createLogForDriver(conf, logDir, hadoopConf)
}
}
The WAL directory is derived from the checkpoint directory:
// ReceivedBlockTracker.scala line 263
def checkpointDirToLogDir(checkpointDir: String): String = {
new Path(checkpointDir, "receivedBlockMetadata").toString
}
Everything so far is the WAL of ReceivedBlockInfo, i.e. the metadata of the received data, stored under <checkpointDir>/receivedBlockMetadata.
As described earlier, the time dimension is added by the JobGenerator's timer:
// JobGenerator.scala line 58
private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")
// RecurringTimer.scala line 23
// period here is ssc.graph.batchDuration.milliseconds from above
private[streaming]
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
extends Logging
// RecurringTimer.scala line 92
private def triggerActionForNextInterval(): Unit = {
clock.waitTillTime(nextTime)
callback(nextTime)
prevTime = nextTime
nextTime += period // the next trigger time is this trigger time plus the interval
logDebug("Callback for " + name + " called at time " + prevTime)
}
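The batch boundaries therefore come purely from this timer loop. A simplified, self-contained sketch of the same recurring-timer pattern (plain Scala, not the private[streaming] RecurringTimer class; period and output are illustrative):
object RecurringTimerSketch {
  def main(args: Array[String]): Unit = {
    val period = 1000L                                     // "batch interval" in ms
    val callback: Long => Unit = t => println(s"GenerateJobs($t)")

    // Align the first trigger to the next period boundary.
    var nextTime = (System.currentTimeMillis() / period + 1) * period

    val loop = new Thread {
      override def run(): Unit = {
        for (_ <- 1 to 5) {                                // fire a few times, then stop
          val waitMs = nextTime - System.currentTimeMillis()
          if (waitMs > 0) Thread.sleep(waitMs)             // clock.waitTillTime(nextTime)
          callback(nextTime)                               // post GenerateJobs(new Time(nextTime))
          nextTime += period                               // next trigger = this trigger + period
        }
      }
    }
    loop.start()
    loop.join()
  }
}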
Blocks are then allocated to their batch by time:
// JobGenerator.scala line 240 spark 1.6.0
private def generateJobs(time: Time) {
// Set the SparkEnv in this thread, so that job generation code can access the environment
// Example: BlockRDDs are created in this thread, and it needs to access BlockManager
// Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
SparkEnv.set(ssc.env)
Try {
// assign the received blocks to this batch, keyed by batch time
jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
graph.generateJobs(time) // generate jobs using allocated block
} match {
case Success(jobs) =>
val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
case Failure(e) =>
jobScheduler.reportError("Error generating jobs for time " + time, e)
}
// finally, post a checkpoint event
eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
What does jobScheduler.receiverTracker.allocateBlocksToBatch(time) do? It delegates to ReceivedBlockTracker.allocateBlocksToBatch:
def allocateBlocksToBatch(batchTime: Time): Unit = synchronized {
if (lastAllocatedBatchTime == null || batchTime > lastAllocatedBatchTime) {
val streamIdToBlocks = streamIds.map { streamId =>
(streamId, getReceivedBlockQueue(streamId).dequeueAll(x => true))
}.toMap
val allocatedBlocks = AllocatedBlocks(streamIdToBlocks)
// writeToLog is called on every allocation, but internally it checks whether the WAL is enabled
if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) {
timeToAllocatedBlocks.put(batchTime, allocatedBlocks)
lastAllocatedBatchTime = batchTime
} else {
logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
}
} else {
// This situation occurs when:
// 1. WAL is ended with BatchAllocationEvent, but without BatchCleanupEvent,
// possibly processed batch job or half-processed batch job need to be processed again,
// so the batchTime will be equal to lastAllocatedBatchTime.
// 2. Slow checkpointing makes recovered batch time older than WAL recovered
// lastAllocatedBatchTime.
// This situation will only occurs in recovery time.
logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
}
}
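The key move above is draining each stream's queue of unallocated blocks into the timeToAllocatedBlocks map. A tiny self-contained sketch of that pattern with plain Scala collections (block ids and the batch time are made up):
import scala.collection.mutable

object AllocateSketch {
  def main(args: Array[String]): Unit = {
    val unallocated = mutable.Queue("blk-1", "blk-2", "blk-3")   // per-stream ReceivedBlockQueue
    val timeToBlocks = mutable.HashMap.empty[Long, List[String]] // batchTime -> allocated blocks

    val batchTime = 1000L
    val drained = unallocated.dequeueAll(_ => true).toList       // take every pending block
    timeToBlocks.put(batchTime, drained)                         // bind them to this batch

    println(timeToBlocks)  // Map(1000 -> List(blk-1, blk-2, blk-3))
    println(unallocated)   // empty: nothing is left unallocated
  }
}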
Inside writeToLog:
// ReceivedBlockTracker.scala line 227
// what gets written to the WAL here is the block metadata
/** Write an update to the tracker to the write ahead log */
private def writeToLog(record: ReceivedBlockTrackerLogEvent): Boolean = {
if (isWriteAheadLogEnabled) {
logTrace(s"Writing record: $record")
try {
writeAheadLogOption.get.write(ByteBuffer.wrap(Utils.serialize(record)),
clock.getTimeMillis())
true
} catch {
case NonFatal(e) =>
logWarning(s"Exception thrown while writing record: $record to the WriteAheadLog.", e)
false
}
} else {
true
}
}
At this point, ReceivedBlockTracker's fault tolerance is complete: every block addition and every batch allocation is recorded in the WAL, so after a Driver restart these events can be replayed to rebuild the tracker's state.
Back in JobGenerator.generateJobs, a DoCheckpoint event is posted every time jobs are generated:
// JobGenerator.scala line 256, at the end of generateJobs
eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
DoCheckpoint first walks back from every outputStream along its lineage, checkpointing each DStream it depends on, and then checkpoints the StreamingContext (ssc) itself.
// JobGenerator.scala line 290
private def doCheckpoint(time: Time, clearCheckpointDataLater: Boolean) {
if (shouldCheckpoint && (time - graph.zeroTime).isMultipleOf(ssc.checkpointDuration)) {
logInfo("Checkpointing graph for time " + time)
ssc.graph.updateCheckpointData(time)
checkpointWriter.write(new Checkpoint(ssc, time), clearCheckpointDataLater)
}
}
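The interval gate (time - graph.zeroTime).isMultipleOf(ssc.checkpointDuration) means a graph checkpoint is only written for batch times that are multiples of the checkpoint interval. A tiny illustration with made-up numbers (5 s batches, 10 s checkpoint interval):
object CheckpointGateSketch {
  def main(args: Array[String]): Unit = {
    val zeroTime = 0L
    val batchMs = 5000L
    val checkpointMs = 10000L
    (1 to 6).map(_ * batchMs).foreach { t =>
      val shouldWrite = (t - zeroTime) % checkpointMs == 0   // mimics Time.isMultipleOf
      println(s"batch time $t ms -> checkpoint? $shouldWrite")
    }
  }
}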
The DStream checkpoint:
// DStreamGraph.scala line 132
def updateCheckpointData(time: Time) {
logInfo("Updating checkpoint data for time " + time)
this.synchronized {
outputStreams.foreach(_.updateCheckpointData(time))
}
logInfo("Updated checkpoint data for time " + time)
}
// DStream.scala line 491
private[streaming] def updateCheckpointData(currentTime: Time) {
logDebug("Updating checkpoint data for time " + currentTime)
checkpointData.update(currentTime)
dependencies.foreach(_.updateCheckpointData(currentTime))
logDebug("Updated checkpoint data for time " + currentTime + ": " + checkpointData)
}
The ssc (StreamingContext) checkpoint:
// Checkpoint.scala line 275
def write(checkpoint: Checkpoint, clearCheckpointDataLater: Boolean) {
try {
val bytes = Checkpoint.serialize(checkpoint, conf)
executor.execute(new CheckpointWriteHandler(
checkpoint.checkpointTime, bytes, clearCheckpointDataLater))
logInfo("Submitted checkpoint of time " + checkpoint.checkpointTime + " writer queue")
} catch {
case rej: RejectedExecutionException =>
logError("Could not submit checkpoint task to the thread pool executor", rej)
}
}
Summary:
ReceivedBlockTracker achieves fault tolerance for the received-data metadata through the WAL.
DStreamGraph and JobGenerator achieve fault tolerance for the scheduling state through checkpointing.
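To actually benefit from the checkpointed DStreamGraph/JobGenerator state, the application has to be (re)started through StreamingContext.getOrCreate. A minimal sketch, with an illustrative checkpoint path and input source:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DriverRecoverySketch {
  val checkpointDir = "hdfs:///tmp/streaming-checkpoint"      // illustrative path

  def createContext(): StreamingContext = {
    val conf = new SparkConf().setAppName("driver-recovery-sketch")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint(checkpointDir)                             // enables the mechanisms described above
    val lines = ssc.socketTextStream("localhost", 9999)       // illustrative input
    lines.count().print()                                     // an output operation so jobs are generated
    ssc
  }

  def main(args: Array[String]): Unit = {
    // Fresh start: createContext() runs. After a Driver failure: the context,
    // including the DStreamGraph and pending batches, is rebuilt from the checkpoint.
    val ssc = StreamingContext.getOrCreate(checkpointDir, createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}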