Spark存储管理源码分析系列之BlockManager块写入分析

本文链接：https://blog.csdn.net/lidongmeng0213/article/details/109279364

上一节我们讲述了BlockManager是如何进行数据获取的，当需要的数据还没有计算时候，就需要先生成相应的Block数据，本节我们就来看下BlockManager是怎么把数据写入到Spark存储中的。

写入入口

BlockManager将读取和写入统一入口在getOrElseUpdate，会先从本地&远端获取数据，如果没有，就进行写入操作，写入成功，读取本地数据返回给使用方，如下所示:

/**
 * Single entry point for reading a block or, when it is not available, computing and
 * storing it.
 *
 * @param blockId      id of the block to read or create
 * @param level        storage level to use if the block has to be written
 * @param classTag     class tag of the block's values
 * @param makeIterator function producing the values when the block must be computed
 * @return `Left(blockResult)` when the block was found or was stored and read back;
 *         `Right(iterator)` when the write did not store the block and handed the
 *         values back instead
 * @throws SparkException if the block was stored but cannot be read back locally
 */
def getOrElseUpdate[T](
  blockId: BlockId,
  level: StorageLevel,
  classTag: ClassTag[T],
  makeIterator: () => Iterator[T]): Either[BlockResult, Iterator[T]] = {
  get[T](blockId)(classTag) match {
    case Some(existing) =>
      // The block is already available, so nothing has to be computed.
      Left(existing)
    case _ =>
      // The block has to be computed and written; keepReadLock = true is requested so the
      // freshly written block can be read back below.
      doPutIterator(blockId, makeIterator, level, classTag, keepReadLock = true) match {
        case Some(unstored) =>
          // The write did not store the block; return the values to the caller directly.
          Right(unstored)
        case None =>
          val stored = getLocalValues(blockId) match {
            case Some(found) => found
            case None =>
              // Release the lock before failing so it is not left held.
              releaseLock(blockId)
              throw new SparkException(
                s"get() failed for block $blockId even though we held a lock")
          }
          // NOTE(review): presumably this balances the read lock kept by the put, on top of
          // the one taken by getLocalValues — confirm against BlockInfoManager.
          releaseLock(blockId)
          Left(stored)
      }
  }
}

可以看出来，写入操作主要是调用了doPutIterator方法。

doPutIterator

doPutIterator是将迭代器数据写入到存储系统中，它调用了doPut()方法，但是具体的写入存储的逻辑是在本函数中定义的，doPut只是做了一些块写入前和写入后的一些准备和清理工作，块写入的具体步骤如下：

如果块的存储等级使用了内存，就优先将块展开到内存中;
如果存储等级是未序列化的，就调用MemoryStore.putIteratorAsValues()方法，将块数据作为对象写入。反之，如果是序列化的，就调用MemoryStore.putIteratorAsBytes()方法将块数据作为字节流写入；
如果存储内存充足，完全写入了存储内存，则写入完成；但是如果只展开了一部分，说明内存中无法容纳块数据：在存储等级允许使用磁盘的情况下，要继续调用DiskStore.put()方法，将块数据序列化地写到磁盘；如果存储等级不允许使用磁盘，则本次写入失败，就将未能存储的数据记录在iteratorFromFailedMemoryStorePut这个迭代器中[类型为PartiallyUnrolledIterator]；
如果块的存储等级只是使用磁盘，就直接调用DiskStore.put()方法写到磁盘中；
块数据写入完毕之后，如果tellMaster标记为真，调用reportBlockStatus()方法将新块的信息报告给BlockManagerMaster；
检查块的存储等级是否有副本，如果有，还需要调用replicate()方法将块向其他节点复制一份；
方法返回iteratorFromFailedMemoryStorePut迭代器。

/**
 * Writes the values produced by `iterator` into the memory and/or disk store according to
 * `level`. The write itself is the closure passed to `doPut`.
 *
 * @param blockId      id of the block to write
 * @param iterator     function that produces the block's values; called when the data is written
 * @param level        target storage level (memory/disk, deserialized or not, replication)
 * @param classTag     class tag of the values, passed to the stores and the serializer
 * @param tellMaster   whether the new block status may be reported to the master
 * @param keepReadLock forwarded to `doPut`
 * @return `None` when the block was stored; `Some(iterator)` with the values handed back
 *         when the memory store could not hold the block and the level does not use disk
 */
private def doPutIterator[T](
  blockId: BlockId,
  iterator: () => Iterator[T], // produces the values to store; not evaluated until written
  level: StorageLevel,
  classTag: ClassTag[T],
  tellMaster: Boolean = true,
  keepReadLock: Boolean = false): Option[PartiallyUnrolledIterator[T]] = {
  // doPut wraps the closure below, which contains the actual write logic
  doPut(blockId, level, classTag, tellMaster = tellMaster, keepReadLock = keepReadLock) { info =>
    val startTimeNs = System.nanoTime()
    var iteratorFromFailedMemoryStorePut: Option[PartiallyUnrolledIterator[T]] = None
    // Size of the block in bytes
    var size = 0L
    if (level.useMemory) {
      // Put it in memory first, even if it also has useDisk set to true;
      // We will drop it to disk later if the memory store can't hold it.
      if (level.deserialized) { // store the values as deserialized objects
        memoryStore.putIteratorAsValues(blockId, iterator(), classTag) match {
          case Right(s) =>
          size = s
          case Left(iter) =>
          // Not enough space to unroll this block; drop to disk if applicable
          if (level.useDisk) { // memory was insufficient; serialize the values to disk instead
            logWarning(s"Persisting block $blockId to disk instead.")
            diskStore.put(blockId) { channel =>
              val out = Channels.newOutputStream(channel)
              serializerManager.dataSerializeStream(blockId, out, iter)(classTag)
            }
            size = diskStore.getSize(blockId)
          } else {
            iteratorFromFailedMemoryStorePut = Some(iter) // no disk fallback: hand the values back
          }
        }
      } else { // store the values as serialized bytes
        memoryStore.putIteratorAsBytes(blockId, iterator(), classTag, level.memoryMode) match {
          // memory is tried first
          case Right(s) =>
          size = s
          case Left(partiallySerializedValues) =>
          // Not enough space to unroll this block; drop to disk if applicable
          if (level.useDisk) { // memory was insufficient; finish writing the bytes to disk
            logWarning(s"Persisting block $blockId to disk instead.")
            diskStore.put(blockId) { channel =>
              val out = Channels.newOutputStream(channel)
              partiallySerializedValues.finishWritingToStream(out)
            }
            size = diskStore.getSize(blockId)
          } else {
            iteratorFromFailedMemoryStorePut = Some(partiallySerializedValues.valuesIterator)
          }
        }
      }

    } else if (level.useDisk) { // disk-only storage level: serialize straight to disk
      diskStore.put(blockId) { channel =>
        val out = Channels.newOutputStream(channel)
        serializerManager.dataSerializeStream(blockId, out, iterator())(classTag)
      }
      size = diskStore.getSize(blockId)
    }

    // The put counts as successful when the block's current status has a valid storage level.
    val putBlockStatus = getCurrentBlockStatus(blockId, info)
    val blockWasSuccessfullyStored = putBlockStatus.storageLevel.isValid
    if (blockWasSuccessfullyStored) {
      // Now that the block is in either the memory or disk store, tell the master about it.
      info.size = size
      if (tellMaster && info.tellMaster) {
        reportBlockStatus(blockId, putBlockStatus) // report the new block status to the master
      }
      addUpdatedBlockStatusToTaskMetrics(blockId, putBlockStatus)
      logDebug(s"Put block $blockId locally took ${Utils.getUsedTimeNs(startTimeNs)}")
      if (level.replication > 1) {
        val remoteStartTimeNs = System.nanoTime()
        val bytesToReplicate = doGetLocalBytes(blockId, info)
        // [SPARK-16550] Erase the typed classTag when using default serialization, since
        // NettyBlockRpcServer crashes when deserializing repl-defined classes.
        // TODO(ekl) remove this once the classloader issue on the remote end is fixed.
        val remoteClassTag = if (!serializerManager.canUseKryo(classTag)) {
          scala.reflect.classTag[Any]
        } else {
          classTag
        }
        try { // copy the block to peers; the local bytes are disposed even if this throws
          replicate(blockId, bytesToReplicate, level, remoteClassTag)
        } finally {
          bytesToReplicate.dispose()
        }
        logDebug(s"Put block $blockId remotely took ${Utils.getUsedTimeNs(remoteStartTimeNs)}")
      }
    }
    // Invariant: the block was stored exactly when no iterator is being handed back.
    assert(blockWasSuccessfullyStored == iteratorFromFailedMemoryStorePut.isEmpty)
    iteratorFromFailedMemoryStorePut
  }
}

doPut

该方法是doPutBytes()和doPutIterator()两个方法共用的方法，在具体的写入逻辑之外添加了公共的前置和后置工作，具体的写入操作由第二个参数列表传入的写入函数(putBody)进行处理。另外keepReadLock标记表示是否在写入后继续持有对块的读锁：需要保持读锁时只需进行锁降级，不需要时直接释放锁即可。整体执行步骤如下:

首先生成新的BlockInfo，并调用BlockInfoManager.lockNewBlockForWriting()加写锁，准备写入;
调用putBody函数的逻辑，真正地写入块数据;
若写入成功，当keepReadLock为真时，就调用BlockInfoManager.downgradeLock()方法将原先持有的写锁降级为读锁，方便后续读取。反之，当keepReadLock为假时，就直接调用BlockInfoManager.unlock()方法直接释放锁;
若putBody未能写入全部的块数据[返回的迭代器不为空]或者中途抛出了异常，说明写入不成功，调用removeBlockInternal()方法移除失败的块。

/**
 * Common scaffolding around a block write: takes the write lock for a new block, runs
 * `putBody`, then either keeps or releases the lock on success or removes the block on
 * failure.
 *
 * @param blockId      id of the block being written
 * @param level        storage level recorded in the new `BlockInfo`
 * @param classTag     class tag recorded in the new `BlockInfo`
 * @param tellMaster   recorded in the new `BlockInfo`; also used when removing the block
 *                     after an exception
 * @param keepReadLock if true, a successful write downgrades the write lock to a read lock
 *                     instead of unlocking
 * @param putBody      the actual write; returns `None` on success or `Some(value)` when the
 *                     block could not be stored
 * @return the result of `putBody`, or `None` when the write lock for a new block could not
 *         be obtained and `putBody` was never run
 * @throws BlockSavedOnDecommissionedBlockManagerException if this BlockManager is
 *         decommissioning
 */
private def doPut[T](
  blockId: BlockId,
  level: StorageLevel,
  classTag: ClassTag[_],
  tellMaster: Boolean,
  keepReadLock: Boolean)(putBody: BlockInfo => Option[T]): Option[T] = {

  // Refuse to store new blocks while this BlockManager is decommissioning.
  if (isDecommissioning()) {
    throw new BlockSavedOnDecommissionedBlockManagerException(blockId)
  }

  // Create the BlockInfo first, then try to register it under blockId with a write lock.
  val putBlockInfo = new BlockInfo(level, classTag, tellMaster)
  if (!blockInfoManager.lockNewBlockForWriting(blockId, putBlockInfo)) {
    // No write lock on a new entry was obtained, so nothing is written by this call.
    // NOTE(review): presumably the block already exists and a read lock on it was acquired
    // instead, which is why it is released when the caller does not want to keep it — confirm
    // against BlockInfoManager.lockNewBlockForWriting.
    if (!keepReadLock) {
      releaseLock(blockId)
    }
    None
  } else {
    // Stays true unless putBody returns normally; checked in `finally` so that the cleanup
    // runs for every kind of throwable without having to catch and rethrow it.
    var exceptionWasThrown = true
    try {
      val res = putBody(putBlockInfo) // the actual write, to memory and/or disk
      exceptionWasThrown = false
      if (res.isEmpty) {
        // The block was stored.
        if (keepReadLock) {
          // The caller still wants to read it: downgrade the write lock to a read lock.
          blockInfoManager.downgradeLock(blockId)
        } else {
          blockInfoManager.unlock(blockId)
        }
      } else {
        // The block could not be stored: remove it locally.
        removeBlockInternal(blockId, tellMaster = false)
      }
      res
    } finally {
      if (exceptionWasThrown) {
        // putBody failed part-way: remove the block, honouring the caller's tellMaster flag.
        removeBlockInternal(blockId, tellMaster = tellMaster)
        // putBody may already have recorded a status for this block in the task metrics;
        // overwrite it with an empty status.
        addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty)
      }
    }
  }
}

副本写入

当存储级别中的副本数量大于1时，需要对块进行副本存储。但是复制块的过程是阻塞的（uploadBlockSync），会造成块写入性能下降，并带来较大的网络传输开销。代码比较简单，源码如下:

/**
 * Copies a block to other peers until `level.replication - 1` peers hold it, retrying with
 * a fresh peer selection after each failed upload.
 *
 * @param blockId                id of the block to replicate
 * @param data                   the block's data, as read locally by the caller
 * @param level                  storage level of the block; `replication` gives the target
 *                               replica count
 * @param classTag               class tag passed along with each upload
 * @param existingReplicas       peers already holding the block; they are skipped and counted
 *                               as replicated
 * @param maxReplicationFailures failure limit; falls back to
 *                               `config.STORAGE_MAX_REPLICATION_FAILURE` when not given
 * @return true if enough peers hold the block when the loop ends, false otherwise
 */
private def replicate(
  blockId: BlockId,
  data: BlockData,
  level: StorageLevel,
  classTag: ClassTag[_],
  existingReplicas: Set[BlockManagerId] = Set.empty,
  maxReplicationFailures: Option[Int] = None): Boolean = {
  // Upper bound on failed uploads before giving up.
  val maxReplicationFailureCount = maxReplicationFailures.getOrElse(
    conf.get(config.STORAGE_MAX_REPLICATION_FAILURE))
  // Level handed to each peer: same storage flags as `level`, with replication set to 1.
  val tLevel = StorageLevel(
    useDisk = level.useDisk,
    useMemory = level.useMemory,
    useOffHeap = level.useOffHeap,
    deserialized = level.deserialized,
    replication = 1)

  val numPeersToReplicateTo = level.replication - 1

  val peersReplicatedTo = mutable.HashSet.empty ++ existingReplicas
  val peersFailedToReplicateTo = mutable.HashSet.empty[BlockManagerId]
  var numFailures = 0

  // Candidate peers, excluding those that already hold a replica.
  val initialPeers = getPeers(false).filterNot(existingReplicas.contains)

  // The replication policy picks and orders the peers to try.
  var peersForReplication = blockReplicationPolicy.prioritize(
    blockManagerId,
    initialPeers,
    peersReplicatedTo,
    blockId,
    numPeersToReplicateTo)

  while (numFailures <= maxReplicationFailureCount &&
         peersForReplication.nonEmpty &&
         peersReplicatedTo.size < numPeersToReplicateTo) {
    val peer = peersForReplication.head
    try {
      // Wrap the data so it can be sent over the network; unlockOnDeallocate = false means
      // releasing the buffer does not unlock the block.
      val buffer = new BlockManagerManagedBuffer(blockInfoManager, blockId, data, false,
                                                 unlockOnDeallocate = false)
      // Blocking upload to the peer.
      blockTransferService.uploadBlockSync(
        peer.host,
        peer.port,
        peer.executorId,
        blockId,
        buffer,
        tLevel,
        classTag)
      peersForReplication = peersForReplication.tail
      peersReplicatedTo += peer
    } catch {
      // Interrupts are never retried.
      case e: InterruptedException =>
        throw e
      // Any other non-fatal failure counts against the limit and triggers a re-selection.
      case NonFatal(e) =>
        peersFailedToReplicateTo += peer
        // Fetch the peers again and drop those that already failed or already hold a replica.
        // NOTE(review): the `true` argument presumably forces a refresh of the peer list —
        // confirm against getPeers.
        val filteredPeers = getPeers(true).filter { p =>
          !peersFailedToReplicateTo.contains(p) && !peersReplicatedTo.contains(p)
        }

        numFailures += 1
        peersForReplication = blockReplicationPolicy.prioritize(
          blockManagerId,
          filteredPeers,
          peersReplicatedTo,
          blockId,
          numPeersToReplicateTo - peersReplicatedTo.size)
    }
  }

  // Successful only if the target number of replicas was reached.
  peersReplicatedTo.size >= numPeersToReplicateTo
}