A Case-Based Walkthrough of Spark Streaming's Runtime Source Code (Part 10)

The previous post traced the source code up to the point where ReceiverSupervisorImpl has been fully instantiated.

First, let's recall the surrounding code:

// ReceiverTracker.scala line 573
    val supervisor = new ReceiverSupervisorImpl(
      receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
    supervisor.start()
    supervisor.awaitTermination()

The next step is start:

// ReceiverSupervisor.scala line 128
/** Start the supervisor */
def start() {
  onStart()
  startReceiver()
}

onStart()

// ReceiverSupervisorImpl.scala line 172
override protected def onStart() {
  registeredBlockGenerators.foreach { _.start() }
}

_.start() launches the two threads mentioned in the previous post. These two threads are important; let's plant a teaser here and come back to them later.

// BlockGenerator.scala line 114
/** Start block generating and pushing threads. */
def start(): Unit = synchronized {
  if (state == Initialized) {
    state = Active
    blockIntervalTimer.start()
    blockPushingThread.start()
    logInfo("Started BlockGenerator")
  } else {
    throw new SparkException(
      s"Cannot start BlockGenerator as its not in the Initialized state [state = $state]")
  }
}

Next, startReceiver is called; as the code below shows, it does two things:

  1. onReceiverStart: registers the receiver with the driver's ReceiverTracker and checks whether the driver allows it to start

  2. receiver.onStart: starts the Receiver itself, which in this example is a SocketReceiver

// ReceiverSupervisor.scala line 143
/** Start receiver */
def startReceiver(): Unit = synchronized {
  try {
    if (onReceiverStart()) {
      logInfo("Starting receiver")
      receiverState = Started
      receiver.onStart()
      logInfo("Called receiver onStart")
    } else {
      // The driver refused us
      stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
    }
  } catch {
    case NonFatal(t) =>
      stop("Error starting receiver " + streamId, Some(t))
  }
}
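
For reference, onReceiverStart is overridden in ReceiverSupervisorImpl: it asks the driver's ReceiverTracker to register this receiver, and the Boolean reply is exactly what the if above tests. A paraphrased sketch, not a verbatim quote (the exact field list of RegisterReceiver varies across Spark versions):

// ReceiverSupervisorImpl.scala (paraphrased sketch)
override protected def onReceiverStart(): Boolean = {
  val msg = RegisterReceiver(
    streamId, receiver.getClass.getSimpleName, host, executorId, endpoint)
  trackerEndpoint.askWithRetry[Boolean](msg)  // driver replies true (accepted) or false (refused)
}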

SocketReceiver.onStart

// SocketInputDStream.scala line 55
def onStart() {
  // Start the thread that receives data over a connection
  new Thread("Socket Receiver") {
    setDaemon(true)
    override def run() { receive() }
  }.start()
}
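
For context, the SocketReceiver in this case study comes from the classic socket word-count setup on the driver side; a minimal sketch (the app name, "localhost" and port 9999 are placeholders):

// Driver-side setup that leads to SocketInputDStream / SocketReceiver
val conf = new SparkConf().setAppName("WordCountOnline")
val ssc = new StreamingContext(conf, Seconds(5))
// socketTextStream creates a SocketInputDStream, whose getReceiver()
// returns the SocketReceiver we are tracing here
val lines = ssc.socketTextStream("localhost", 9999)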

receive()

// SocketInputDStream.scala line 69
/** Create a socket connection and receive data until receiver is stopped */
def receive() {
  var socket: Socket = null
  try {
    logInfo("Connecting to " + host + ":" + port)
    socket = new Socket(host, port)
    logInfo("Connected to " + host + ":" + port)
    val iterator = bytesToObjects(socket.getInputStream())
    while(!isStopped && iterator.hasNext) {
      store(iterator.next)
    }
    if (!isStopped()) {
      restart("Socket data stream had no more data")
    } else {
      logInfo("Stopped receiving")
    }
  } catch { // some code omitted
  } finally { // some code omitted
  }
}
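
Here bytesToObjects is the conversion function the SocketInputDStream was constructed with; for socketTextStream it is SocketReceiver.bytesToLines, which turns the raw InputStream into an iterator of text lines. A minimal sketch of such a function (the real implementation differs in buffering and error handling):

// Minimal sketch of a bytesToObjects-style converter for text lines
def bytesToLines(inputStream: java.io.InputStream): Iterator[String] = {
  val reader = new java.io.BufferedReader(
    new java.io.InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8))
  new Iterator[String] {
    private var nextLine = reader.readLine()
    override def hasNext: Boolean = nextLine != null
    override def next(): String = {
      val line = nextLine
      nextLine = reader.readLine()  // readLine() returns null at end of stream
      line
    }
  }
}

The store(iterator.next) call inside the while loop brings us to Receiver.store: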
// Receiver.scala line 113
/**
 * Store a single item of received data to Spark's memory.
 * These single items will be aggregated together into data blocks before
 * being pushed into Spark's memory.
 */
def store(dataItem: T) {
  supervisor.pushSingle(dataItem)
}

supervisor.pushSingle

// ReceiverSupervisorImpl.scala line 118
/** Push a single record of received data into block generator. */
def pushSingle(data: Any) {
  defaultBlockGenerator.addData(data)
}
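
addData appends the record to the in-memory currentBuffer under the BlockGenerator's lock, after an optional rate-limiting wait. A paraphrased sketch of BlockGenerator.addData (exact checks and error messages vary by version):

// BlockGenerator.scala (paraphrased sketch, not a verbatim quote)
def addData(data: Any): Unit = {
  if (state == Active) {
    waitToPush()  // rate limiting, inherited from RateLimiter
    synchronized {
      if (state == Active) {
        currentBuffer += data  // buffered until the next block interval fires
      } else {
        throw new SparkException("Cannot add data as BlockGenerator has not been started or has been stopped")
      }
    }
  } else {
    throw new SparkException("Cannot add data as BlockGenerator has not been started or has been stopped")
  }
}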

At this point, the data has been received and handed over to the BlockGenerator.

Don't celebrate just yet: the data has only reached the Receiver, and the Receiver is not what actually performs the computation. So how does the data get downstream?

Let's go back to the teaser we planted earlier.

// BlockGenerator.scala line 118
blockIntervalTimer.start()

This starts the timer:

// RecurringTimer.scala
private[streaming]
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
  extends Logging {

  private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }  // run the loop method
  }

  // some code omitted

  // RecurringTimer.scala line 66
  /**
   * Start at the earliest time it can start based on the period.
   */
  def start(): Long = {
    start(getStartTime())  // delegates to start(startTime)
  }

  // RecurringTimer.scala line 56
  /**
   * Start at the given start time.
   */
  def start(startTime: Long): Long = synchronized {
    nextTime = startTime
    thread.start()  // the thread is finally started here
    logInfo("Started timer for " + name + " at time " + nextTime)
    nextTime
  }

  // some code omitted

  // line 92
  private def triggerActionForNextInterval(): Unit = {
    clock.waitTillTime(nextTime)  // wait until the time scheduled in the previous round
    callback(nextTime)            // invoke the callback passed to the constructor
    prevTime = nextTime
    nextTime += period
    logDebug("Callback for " + name + " called at time " + prevTime)
  }

  /**
   * Repeatedly call the callback every interval.
   */
  private def loop() {
    try {
      while (!stopped) {
        triggerActionForNextInterval()  // keep calling triggerActionForNextInterval until stopped
      }
      triggerActionForNextInterval()
    } catch {
      case e: InterruptedException =>
    }
  }

  // some code omitted
}
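
The getStartTime() used by the no-argument start() simply aligns the first tick to the next multiple of the period, so block boundaries fall on round timestamps. A paraphrased sketch:

// RecurringTimer.scala (paraphrased sketch)
private def getStartTime(): Long = {
  (math.floor(clock.getTimeMillis().toDouble / period) + 1).toLong * period
}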

Now look at the callback passed into the primary constructor: callback = updateCurrentBuffer.

// BlockGenerator.scala line 105
private val blockIntervalTimer =
  new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator")
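
The blockIntervalMs passed as the period is read from the spark.streaming.blockInterval configuration, 200ms by default; a paraphrased sketch of the nearby definition:

// BlockGenerator.scala (nearby, paraphrased sketch)
private val blockIntervalMs = conf.getTimeAsMs("spark.streaming.blockInterval", "200ms")
require(blockIntervalMs > 0, s"'spark.streaming.blockInterval' should be a positive value")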

The definition of updateCurrentBuffer:

// BlockGenerator.scala line 231
/** Change the buffer to which single records are added to. */
private def updateCurrentBuffer(time: Long): Unit = {
  try {
    var newBlock: Block = null
    synchronized {
      if (currentBuffer.nonEmpty) {
        val newBlockBuffer = currentBuffer
        currentBuffer = new ArrayBuffer[Any]
        val blockId = StreamBlockId(receiverId, time - blockIntervalMs)
        listener.onGenerateBlock(blockId)
        newBlock = new Block(blockId, newBlockBuffer)          // a block is created here
      }
    }

    if (newBlock != null) {
      blocksForPushing.put(newBlock)  // put blocks when the queue is full; here the block is added to the to-be-pushed queue
    }
  } catch {
    case ie: InterruptedException =>
      logInfo("Block updating timer thread was interrupted")
    case e: Exception =>
      reportError("Error in block updating thread", e)
  }
}
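
The reason put can block is that blocksForPushing is a bounded queue. Paraphrased from its definition nearby, its capacity comes from spark.streaming.blockQueueSize (10 by default):

// BlockGenerator.scala (nearby, paraphrased sketch)
private val blockQueueSize = conf.getInt("spark.streaming.blockQueueSize", 10)
private val blocksForPushing = new ArrayBlockingQueue[Block](blockQueueSize)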

At this point, the records received by the receiver have been moved into the to-be-pushed queue.

So how does this data make its way into the executor's BlockManager?

BlockGenerator.scala starts one more thread.

When started, it calls the keepPushingBlocks method.

// BlockGenerator.scala line 109
private val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } }

// BlockGenerator.scala line 256
/** Keep pushing blocks to the BlockManager. */
private def keepPushingBlocks() {
  logInfo("Started block pushing thread")

  def areBlocksBeingGenerated: Boolean = synchronized {
    state != StoppedGeneratingBlocks
  }

  try {
    // While blocks are being generated, keep polling for to-be-pushed blocks and push them.
    // keep looping as long as the state is not stopped
    while (areBlocksBeingGenerated) {
      Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match {
        case Some(block) => pushBlock(block)
        case None =>
      }
    }

    // At this point, state is StoppedGeneratingBlock. So drain the queue of to-be-pushed blocks.
    // reaching this point means the while loop above has exited and the state is StoppedGeneratingBlocks, so the remaining blocks must all be drained.
    logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
    while (!blocksForPushing.isEmpty) {
      val block = blocksForPushing.take()
      logDebug(s"Pushing block $block")
      pushBlock(block)
      logInfo("Blocks left to push " + blocksForPushing.size())
    }
    logInfo("Stopped block pushing thread")
  } catch {
    case ie: InterruptedException =>
      logInfo("Block pushing thread was interrupted")
    case e: Exception =>
      reportError("Error in block pushing thread", e)
  }
}
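
Putting the two threads together resolves the earlier teaser. Here is a minimal self-contained sketch, not Spark code, of the same pattern: records accumulate in a shared buffer, a timer thread snapshots the buffer into a bounded queue every interval, and a pusher thread drains the queue:

import java.util.concurrent.{ArrayBlockingQueue, TimeUnit}
import scala.collection.mutable.ArrayBuffer

object TwoThreadSketch {
  private var buffer = new ArrayBuffer[Any]
  private val queue = new ArrayBlockingQueue[Seq[Any]](10)  // bounded, like blocksForPushing
  @volatile private var stopped = false

  def add(record: Any): Unit = synchronized { buffer += record }  // the pushSingle side

  private val timer = new Thread("interval-timer") {
    override def run(): Unit = while (!stopped) {
      Thread.sleep(200)  // stand-in for the block interval
      val snapshot = TwoThreadSketch.synchronized {
        val old = buffer; buffer = new ArrayBuffer[Any]; old
      }
      if (snapshot.nonEmpty) queue.put(snapshot)  // blocks when the queue is full
    }
  }

  private val pusher = new Thread("block-pusher") {
    override def run(): Unit = while (!stopped) {
      Option(queue.poll(10, TimeUnit.MILLISECONDS)).foreach { batch =>
        println(s"pushing a batch of ${batch.size} records")  // stand-in for pushBlock
      }
    }
  }

  def start(): Unit = { timer.setDaemon(true); pusher.setDaemon(true); timer.start(); pusher.start() }
  def stop(): Unit = { stopped = true }
}

Back in the real code, the pushBlock called above: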

// BlockGenerator.scala
private def pushBlock(block: Block) {
  listener.onPushBlock(block.id, block.buffer)
  logInfo("Pushed block " + block.id)
}

listener.onPushBlock(block.id, block.buffer)

// ReceiverSupervisorImpl.scala line 108
def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]) {
  pushArrayBuffer(arrayBuffer, None, Some(blockId))
}

pushArrayBuffer

// ReceiverSupervisorImpl.scala line 122
/** Store an ArrayBuffer of received data as a data block into Spark's memory. */
def pushArrayBuffer(
    arrayBuffer: ArrayBuffer[_],
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  pushAndReportBlock(ArrayBufferBlock(arrayBuffer), metadataOption, blockIdOption) // wrapped into an ArrayBufferBlock; pattern-matched later in storeBlock
}

pushAndReportBlock

// ReceiverSupervisorImpl.scala line 149
/** Store block and report it to driver */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  val blockId = blockIdOption.getOrElse(nextBlockId)
  val time = System.currentTimeMillis
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
  val numRecords = blockStoreResult.numRecords
  val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}
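
On the other end of askWithRetry, the driver's ReceiverTracker endpoint handles the AddBlock message by recording the block's metadata in its ReceivedBlockTracker; note that only metadata crosses the wire, while the data itself stays in the executor's BlockManager. A paraphrased sketch ignoring the write-ahead-log batching branch:

// ReceiverTracker.scala (paraphrased sketch, inside receiveAndReply)
case AddBlock(receivedBlockInfo) =>
  context.reply(addBlock(receivedBlockInfo))

// ... which delegates to the ReceivedBlockTracker
private def addBlock(receivedBlockInfo: ReceivedBlockInfo): Boolean = {
  receivedBlockTracker.addBlock(receivedBlockInfo)
}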

Back on the executor side, storeBlock(blockId, receivedBlock):

// BlockManagerBasedBlockHandler.scala line 70
def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {

  var numRecords = None: Option[Long]

  val putResult: Seq[(BlockId, BlockStatus)] = block match {
    case ArrayBufferBlock(arrayBuffer) =>
      numRecords = Some(arrayBuffer.size.toLong)
      blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
        tellMaster = true)
    // other case classes omitted
  }
  if (!putResult.map { _._1 }.contains(blockId)) {
    throw new SparkException(
      s"Could not store $blockId to block manager with storage level $storageLevel")
  }
  BlockManagerBasedStoreResult(blockId, numRecords)
}
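
Which implementation receivedBlockHandler points to is decided when ReceiverSupervisorImpl is constructed: with the receiver write-ahead log enabled it is a WriteAheadLogBasedBlockHandler, otherwise the BlockManagerBasedBlockHandler shown above. A paraphrased sketch with constructor arguments elided:

// ReceiverSupervisorImpl.scala (paraphrased sketch)
private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    new WriteAheadLogBasedBlockHandler(/* blockManager, streamId, storage level, conf, WAL dir, ... */)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}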

blockManager.putIterator

// BlockManager.scala line 638
def putIterator(
    blockId: BlockId,
    values: Iterator[Any],
    level: StorageLevel,
    tellMaster: Boolean = true,
    effectiveStorageLevel: Option[StorageLevel] = None): Seq[(BlockId, BlockStatus)] = {
  require(values != null, "Values is null")
  doPut(blockId, IteratorValues(values), level, tellMaster, effectiveStorageLevel)
}

doPut

// BlockManager.scala line 797
case IteratorValues(iterator) =>
  blockStore.putIterator(blockId, iterator, putLevel, returnValues)
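
Earlier in doPut, blockStore is bound to one of the concrete stores according to the effective storage level. Roughly, as a paraphrased sketch (details differ across versions): memory-backed levels go to the MemoryStore first even if useDisk is also set, off-heap levels to the external store, and disk-only levels to the DiskStore.

// BlockManager.scala, earlier in doPut (paraphrased sketch)
val blockStore: BlockStore =
  if (putLevel.useMemory) memoryStore          // memory first, may later drop to disk
  else if (putLevel.useOffHeap) externalBlockStore
  else if (putLevel.useDisk) diskStore
  else throw new SparkException(s"Invalid storage level: $putLevel")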

At this point, the data is fully in the hands of the BlockManager, and the analysis of data reception is complete.

Next question: how is the received data actually processed?

Reposted from: https://my.oschina.net/corleone/blog/672711
