Lesson 12: Spark Streaming Source Code Walkthrough: Executor Fault Tolerance and Safety

Topics in this lesson:

1. Executor WAL (write-ahead log)

2. Message replay

3. Other topics

 

StorageLevel.scala

Disk is only considered when memory is insufficient.

class StorageLevel private(
    private var _useDisk: Boolean,
    private var _useMemory: Boolean,
    private var _useOffHeap: Boolean,
    private var _deserialized: Boolean,
    private var _replication: Int = 1)
  extends Externalizable {
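
As a quick illustration of the flags above, the sketch below (my own example, using the public StorageLevel constants rather than the private constructor) prints the flag combinations behind a few common levels; MEMORY_AND_DISK is the one that falls back to disk only when memory runs out:

import org.apache.spark.storage.StorageLevel

object StorageLevelFlags {
  def main(args: Array[String]): Unit = {
    // Each predefined level is just a particular combination of the private flags above.
    val levels = Seq(
      "MEMORY_ONLY"           -> StorageLevel.MEMORY_ONLY,           // memory only
      "MEMORY_AND_DISK"       -> StorageLevel.MEMORY_AND_DISK,       // spill to disk when memory is short
      "MEMORY_AND_DISK_SER_2" -> StorageLevel.MEMORY_AND_DISK_SER_2  // serialized, replicated twice
    )
    levels.foreach { case (name, lvl) =>
      println(s"$name: useMemory=${lvl.useMemory}, useDisk=${lvl.useDisk}, " +
        s"deserialized=${lvl.deserialized}, replication=${lvl.replication}")
    }
  }
}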

ReceiverSupervisorImpl.scala

/** Store the bytes of received data as a data block into Spark's memory. */
def pushBytes(
    bytes: ByteBuffer,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  pushAndReportBlock(ByteBufferBlock(bytes), metadataOption, blockIdOption)
}

/** Store block and report it to driver */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  val blockId = blockIdOption.getOrElse(nextBlockId)
  val time = System.currentTimeMillis
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
  val numRecords = blockStoreResult.numRecords
  val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}

receivedBlockHandler

private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}

def enableReceiverLog(conf: SparkConf): Boolean = {
  conf.getBoolean(RECEIVER_WAL_ENABLE_CONF_KEY, false)
}
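
This is the switch that decides between the two block handlers. A minimal sketch of how an application would flip it (my own example; the key string is the documented value of RECEIVER_WAL_ENABLE_CONF_KEY, while the checkpoint directory and socket source are placeholders), matching the error message above that requires streamingContext.checkpoint():

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WalEnabledApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("WalEnabledApp")
      .setMaster("local[2]") // local mode just for the sketch
      // RECEIVER_WAL_ENABLE_CONF_KEY seen above; defaults to false
      .set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(5))
    // Required once the receiver WAL is enabled, otherwise receivedBlockHandler
    // throws the SparkException shown above. The path is a placeholder.
    ssc.checkpoint("hdfs:///tmp/streaming-checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) // placeholder source
    lines.count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}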

ReceivedBlockHandler

def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {

  var numRecords = None: Option[Long]

  val putResult: Seq[(BlockId, BlockStatus)] = block match {
    case ArrayBufferBlock(arrayBuffer) =>
      numRecords = Some(arrayBuffer.size.toLong)
      blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
        tellMaster = true)
    case IteratorBlock(iterator) =>
      val countIterator = new CountingIterator(iterator)
      val putResult = blockManager.putIterator(blockId, countIterator, storageLevel,
        tellMaster = true)
      numRecords = countIterator.count
      putResult
    case ByteBufferBlock(byteBuffer) =>
      blockManager.putBytes(blockId, byteBuffer, storageLevel, tellMaster = true)
    case o =>
      throw new SparkException(
        s"Could not store $blockId to block manager, unexpected block type ${o.getClass.getName}")
  }
  if (!putResult.map { _._1 }.contains(blockId)) {
    throw new SparkException(
      s"Could not store $blockId to block manager with storage level $storageLevel")
  }
  BlockManagerBasedStoreResult(blockId, numRecords)
}
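
For IteratorBlock the record count is only known after the BlockManager has drained the iterator, which is why it is wrapped in a counting iterator first. A minimal sketch of that idea (my own simplified class, not Spark's CountingIterator):

// Simplified counting wrapper: counts elements as the consumer pulls them through.
class SimpleCountingIterator[T](inner: Iterator[T]) extends Iterator[T] {
  private var _count = 0L

  override def hasNext: Boolean = inner.hasNext

  override def next(): T = {
    val item = inner.next()
    _count += 1
    item
  }

  // Defined only once the underlying iterator has been fully consumed,
  // mirroring how numRecords is read after putIterator returns.
  def count: Option[Long] = if (inner.hasNext) None else Some(_count)
}

object SimpleCountingIteratorDemo {
  def main(args: Array[String]): Unit = {
    val it = new SimpleCountingIterator(Iterator("a", "b", "c"))
    println(it.count)   // None: nothing consumed yet
    it.foreach(_ => ()) // simulate the block store draining the iterator
    println(it.count)   // Some(3)
  }
}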

def putIterator(
    blockId: BlockId,
    values: Iterator[Any],
    level: StorageLevel,
    tellMaster: Boolean = true,
    effectiveStorageLevel: Option[StorageLevel] = None): Seq[(BlockId, BlockStatus)] = {
  require(values != null, "Values is null")
  doPut(blockId, IteratorValues(values), level, tellMaster, effectiveStorageLevel)
}

FileBasedWriteAheadLog
write

/**
 * Write a byte buffer to the log file. This method synchronously writes the data in the
 * ByteBuffer to HDFS. When this method returns, the data is guaranteed to have been flushed
 * to HDFS, and will be available for readers to read.
 */
def write(byteBuffer: ByteBuffer, time: Long): FileBasedWriteAheadLogSegment = synchronized {
  var fileSegment: FileBasedWriteAheadLogSegment = null
  var failures = 0
  var lastException: Exception = null
  var succeeded = false
  while (!succeeded && failures < maxFailures) {
    try {
      fileSegment = getLogWriter(time).write(byteBuffer)
      if (closeFileAfterWrite) {
        resetWriter()
      }
      succeeded = true
    } catch {
      case ex: Exception =>
        lastException = ex
        logWarning("Failed to write to write ahead log")
        resetWriter()
        failures += 1
    }
  }
  if (fileSegment == null) {
    logError(s"Failed to write to write ahead log after $failures failures")
    throw lastException
  }
  fileSegment
}

/** Get the current log writer while taking care of rotation */
private def getLogWriter(currentTime: Long): FileBasedWriteAheadLogWriter = synchronized {
  if (currentLogWriter == null || currentTime > currentLogWriterStopTime) {
    resetWriter()
    currentLogPath.foreach {
      pastLogs += LogInfo(currentLogWriterStartTime, currentLogWriterStopTime, _)
    }
    currentLogWriterStartTime = currentTime
    currentLogWriterStopTime = currentTime + (rollingIntervalSecs * 1000)
    val newLogPath = new Path(logDirectory,
      timeToLogFile(currentLogWriterStartTime, currentLogWriterStopTime))
    currentLogPath = Some(newLogPath.toString)
    currentLogWriter = new FileBasedWriteAheadLogWriter(currentLogPath.get, hadoopConf)
  }
  currentLogWriter
}
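
Rotation is purely time based: a new log file is opened whenever the write time passes the current writer's stop time, and the file name encodes the interval. A small standalone sketch of that arithmetic (the interval value and timestamps are made up; the log-<start>-<stop> naming mirrors the timeToLogFile helper in the same file):

object WalRollingSketch {
  // Assumed rolling interval; in Spark this comes from the WAL configuration.
  val rollingIntervalSecs = 60

  // Mirrors the "log-<startTime>-<stopTime>" file-name convention.
  def timeToLogFile(start: Long, stop: Long): String = s"log-$start-$stop"

  def main(args: Array[String]): Unit = {
    var writerStop = -1L
    var currentFile: Option[String] = None

    // Simulate writes arriving every 25 s: a new file is opened whenever the write
    // time passes the current writer's stop time, exactly as in getLogWriter above.
    for (t <- 0L to 200000L by 25000L) {
      if (currentFile.isEmpty || t > writerStop) {
        val writerStart = t
        writerStop = t + rollingIntervalSecs * 1000L
        currentFile = Some(timeToLogFile(writerStart, writerStop))
        println(s"t=$t ms -> rolled to ${currentFile.get}")
      } else {
        println(s"t=$t ms -> still appending to ${currentFile.get}")
      }
    }
  }
}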

readAll

def readAll(): JIterator[ByteBuffer] = synchronized {
  val logFilesToRead = pastLogs.map{ _.path} ++ currentLogPath
  logInfo("Reading from the logs:\n" + logFilesToRead.mkString("\n"))
  def readFile(file: String): Iterator[ByteBuffer] = {
    logDebug(s"Creating log reader with $file")
    val reader = new FileBasedWriteAheadLogReader(file, hadoopConf)
    CompletionIterator[ByteBuffer, Iterator[ByteBuffer]](reader, reader.close _)
  }
  if (!closeFileAfterWrite) {
    logFilesToRead.iterator.map(readFile).flatten.asJava
  } else {
    // For performance gains, it makes sense to parallelize the recovery if
    // closeFileAfterWrite = true
    seqToParIterator(threadpool, logFilesToRead, readFile).asJava
  }
}
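
readAll() is what recovery uses to replay every record still in the log directory. Below is a hedged write-then-read round trip; because FileBasedWriteAheadLog is private[streaming], the sketch has to live under the org.apache.spark.streaming package tree, and the constructor argument order (conf, log directory, Hadoop conf, rolling interval, max failures, closeFileAfterWrite) is taken from the 1.6-era source, so treat it as an assumption:

// Placed under org.apache.spark.streaming only because the class is private[streaming].
package org.apache.spark.streaming.walsketch

import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf
import org.apache.spark.streaming.util.FileBasedWriteAheadLog

object WalRoundTrip {
  def main(args: Array[String]): Unit = {
    // Log directory is a placeholder; the last three arguments are
    // rollingIntervalSecs, maxFailures and closeFileAfterWrite.
    val wal = new FileBasedWriteAheadLog(
      new SparkConf(), "/tmp/wal-demo", new Configuration(), 60, 3, false)

    // Write a few records, as ReceiverSupervisorImpl does for received blocks.
    (1 to 3).foreach { i =>
      wal.write(ByteBuffer.wrap(s"record-$i".getBytes("UTF-8")), System.currentTimeMillis())
    }

    // Replay everything still in the log, exactly the path recovery takes.
    val it = wal.readAll()
    while (it.hasNext) {
      val buf = it.next()
      val bytes = new Array[Byte](buf.remaining())
      buf.get(bytes)
      println(new String(bytes, "UTF-8"))
    }
    wal.close()
  }
}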

FileBasedWriteAheadLogReader

private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration)
  extends Iterator[ByteBuffer] with Closeable with Logging {

  private val instream = HdfsUtils.getInputStream(path, conf)
  private var closed = (instream == null) // the file may be deleted as we're opening the stream
  private var nextItem: Option[ByteBuffer] = None

  override def hasNext: Boolean = synchronized {
    if (closed) {
      return false
    }

    if (nextItem.isDefined) { // handle the case where hasNext is called without calling next
      true
    } else {
      try {
        val length = instream.readInt()
        val buffer = new Array[Byte](length)
        instream.readFully(buffer)
        nextItem = Some(ByteBuffer.wrap(buffer))
        logTrace("Read next item " + nextItem.get)
        true
      } catch {
        case e: EOFException =>
          logDebug("Error reading next item, EOF reached", e)
          close()
          false
        case e: IOException =>
          logWarning("Error while trying to read data. If the file was deleted, " +
            "this should be okay.", e)
          close()
          if (HdfsUtils.checkFileExists(path, conf)) {
            // If file exists, this could be a legitimate error
            throw e
          } else {
            // File was deleted. This can occur when the daemon cleanup thread takes time to
            // delete the file during recovery.
            false
          }
        case e: Exception =>
          logWarning("Error while trying to read data from HDFS.", e)
          close()
          throw e
      }
    }
  }

getInputStream

def getInputStream(path: String, conf: Configuration): FSDataInputStream = {
  val dfsPath = new Path(path)
  val dfs = getFileSystemForPath(dfsPath, conf)
  if (dfs.isFile(dfsPath)) {
    try {
      dfs.open(dfsPath)
    } catch {
      case e: IOException =>
        // If we are really unlucky, the file may be deleted as we're opening the stream.
        // This can happen as clean up is performed by daemon threads that may be left over from
        // previous runs.
        if (!dfs.isFile(dfsPath)) null else throw e
    }
  } else {
    null
  }
}

DirectKafkaInputDStream

@tailrec
protected final def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, LeaderOffset] = {
  val o = kc.getLatestLeaderOffsets(currentOffsets.keySet)
  // Either.fold would confuse @tailrec, do it manually
  if (o.isLeft) {
    val err = o.left.get.toString
    if (retries <= 0) {
      throw new SparkException(err)
    } else {
      log.error(err)
      Thread.sleep(kc.config.refreshLeaderBackoffMs)
      latestLeaderOffsets(retries - 1)
    }
  } else {
    o.right.get
  }
}
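
With the direct approach, executor-side fault tolerance does not rely on a receiver WAL at all: every batch is defined by explicit offset ranges, so lost data is simply re-read (replayed) from Kafka. A hedged sketch with the 0.8 direct API (broker list and topic are placeholders):

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}

object DirectKafkaReplaySketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DirectKafkaReplaySketch")
    val ssc = new StreamingContext(conf, Seconds(5))

    val kafkaParams = Map("metadata.broker.list" -> "broker1:9092") // placeholder broker
    val topics = Set("demo-topic")                                  // placeholder topic

    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics)

    stream.foreachRDD { rdd =>
      // The offset ranges travel with the RDD; on failure the same ranges are re-read from Kafka.
      val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      ranges.foreach(r => println(s"${r.topic}-${r.partition}: [${r.fromOffset}, ${r.untilOffset})"))
    }

    ssc.start()
    ssc.awaitTermination()
  }
}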

KafkaRDD

private[kafka]
class KafkaRDD[
  K: ClassTag,
  V: ClassTag,
  U <: Decoder[_]: ClassTag,
  T <: Decoder[_]: ClassTag,
  R: ClassTag] private[spark] (
    sc: SparkContext,
    kafkaParams: Map[String, String],
    val offsetRanges: Array[OffsetRange],
    leaders: Map[TopicAndPartition, (String, Int)],
    messageHandler: MessageAndMetadata[K, V] => R
  ) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges {

  override def getPartitions: Array[Partition] = {
    offsetRanges.zipWithIndex.map { case (o, i) =>
      val (host, port) = leaders(TopicAndPartition(o.topic, o.partition))
      new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, host, port)
    }.toArray
  }

getPreferredLocations

override def getPreferredLocations(thePart: Partition): Seq[String] = {
  val part = thePart.asInstanceOf[KafkaRDDPartition]
  // TODO is additional hostname resolution necessary here
  Seq(part.host)
}

connectLeader

// The idea is to use the provided preferred host, except on task retry attempts,
// to minimize number of kafka metadata requests
private def connectLeader: SimpleConsumer = {
  if (context.attemptNumber > 0) {
    kc.connectLeader(part.topic, part.partition).fold(
      errs => throw new SparkException(
        s"Couldn't connect to leader for topic ${part.topic} ${part.partition}: " +
          errs.mkString("\n")),
      consumer => consumer
    )
  } else {
    kc.connect(part.host, part.port)
  }
}
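
Since a KafkaRDD is completely determined by its offset ranges and Kafka parameters, the same messages can be replayed later as a plain batch job. A hedged sketch using KafkaUtils.createRDD from the same 0.8 API (broker, topic and offsets are placeholders):

import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

object KafkaReplayRddSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KafkaReplayRddSketch"))
    val kafkaParams = Map("metadata.broker.list" -> "broker1:9092") // placeholder broker

    // Replay exactly the records in offsets [100, 200) of partition 0 of "demo-topic".
    val ranges = Array(OffsetRange("demo-topic", 0, 100, 200)) // placeholder offsets

    val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
      sc, kafkaParams, ranges)

    println(s"Replayed ${rdd.count()} messages")
    sc.stop()
  }
}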

 
