上一节我们讲述了BlockManager
是如何进行数据获取的。当需要的数据还没有被计算出来时,就需要先生成相应的Block数据。本节我们就来看下BlockManager
是怎么把数据写入到Spark存储中的。
写入入口
BlockManager
将读取和写入的入口统一在getOrElseUpdate
方法中:先尝试从本地或远端获取数据;如果获取不到,就进行写入操作,写入成功后再读取本地数据返回给使用方;如果写入没有成功,则把未能写入的数据迭代器直接返回给使用方,如下所示:
/**
 * Single entry point for reading and writing a block: returns the block when `get`
 * can already supply it, otherwise computes it via `makeIterator` and tries to store it.
 *
 * @param blockId      identifier of the block to fetch or create
 * @param level        storage level used if the block has to be written
 * @param classTag     class tag of the block's element type
 * @param makeIterator produces the block's values when they must be computed
 * @return `Left` with the block result when the block was available or was stored by this
 *         call; `Right` with the iterator handed back by `doPutIterator` when the put did
 *         not store the block
 * @throws SparkException if the block cannot be read locally right after a put that
 *                        reported nothing left over
 */
def getOrElseUpdate[T](
    blockId: BlockId,
    level: StorageLevel,
    classTag: ClassTag[T],
    makeIterator: () => Iterator[T]): Either[BlockResult, Iterator[T]] = {
  get[T](blockId)(classTag) match {
    case Some(existing) =>
      // The block is already available, nothing has to be computed.
      Left(existing)
    case _ =>
      // The block is not available yet: compute it and try to store it, asking doPut to
      // leave a read lock in place (keepReadLock = true) so it can be read back below.
      doPutIterator(blockId, makeIterator, level, classTag, keepReadLock = true) match {
        case Some(unstoredValues) =>
          // The put handed the values back; pass them on so the caller can still use them.
          Right(unstoredValues)
        case None =>
          val stored = getLocalValues(blockId).getOrElse {
            releaseLock(blockId)
            throw new SparkException(s"get() failed for block $blockId even though we held a lock")
          }
          // NOTE(review): presumably getLocalValues acquires its own read lock on top of the
          // one kept by the put, so one of the two is released here - confirm against
          // BlockInfoManager.
          releaseLock(blockId)
          Left(stored)
      }
  }
}
可以看出来,写入操作主要是调用了doPutIterator
方法。
doPutIterator
doPutIterator
是将迭代器数据写入到存储系统中,它调用了doPut()
方法,但是具体的写入存储的逻辑是在本函数中定义的,doPut
只是做了块写入前后的准备和清理工作。块写入的具体步骤如下:
- 如果块的存储等级使用了内存,就优先将块展开到内存中:
  - 如果存储等级是未序列化的,就调用MemoryStore.putIteratorAsValues()方法,将块数据作为对象写入;反之,如果是序列化的,就调用MemoryStore.putIteratorAsBytes()方法将块数据作为字节流写入;
  - 如果存储内存充足,块被完全写入了存储内存,则写入完成;如果只展开了一部分,说明内存中无法容纳块数据,此时在存储等级同时使用磁盘的情况下,要继续调用DiskStore.put()方法,将数据序列化地写到磁盘;如果存储等级不使用磁盘,就将未能写入的数据记录在iteratorFromFailedMemoryStorePut这个迭代器中[类型为PartiallyUnrolledIterator];
- 如果块的存储等级只是使用磁盘,就直接调用DiskStore.put()方法写到磁盘中;
- 块数据写入完毕之后,如果tellMaster标记为真,调用reportBlockStatus()方法将新块的信息报告给BlockManagerMaster;
- 检查块的存储等级是否有副本,如果有,还需要调用replicate()方法将块向其他节点复制一份;
- 方法返回iteratorFromFailedMemoryStorePut迭代器。
/**
 * Stores the values produced by `iterator` as block `blockId`, running the actual write
 * as the body passed to `doPut`.
 *
 * When the level uses memory the block goes to the memory store first (as values when the
 * level is deserialized, as bytes otherwise). If the memory store cannot take it, the block
 * is written to the disk store when the level uses disk; otherwise the values handed back
 * by the memory store are returned. A level that uses only disk is written to disk directly.
 * After a successful store the block status is reported and, when `level.replication > 1`,
 * the block is replicated.
 *
 * @param blockId      identifier of the block being written
 * @param iterator     supplies the values to store
 * @param level        storage level deciding between memory and disk and the replica count
 * @param classTag     class tag of the block's element type
 * @param tellMaster   whether the new block status may be reported via `reportBlockStatus`
 * @param keepReadLock forwarded to `doPut`
 * @return `None` when the block was stored; otherwise the iterator over the values that the
 *         memory store could not keep
 */
private def doPutIterator[T](
    blockId: BlockId,
    iterator: () => Iterator[T], // supplies the values to store
    level: StorageLevel,
    classTag: ClassTag[T],
    tellMaster: Boolean = true,
    keepReadLock: Boolean = false): Option[PartiallyUnrolledIterator[T]] = {
  doPut(blockId, level, classTag, tellMaster = tellMaster, keepReadLock = keepReadLock) { info =>
    val startTimeNs = System.nanoTime()

    // Runs one disk-store write for this block and returns the size the disk store reports.
    def persistToDisk(writeTo: java.io.OutputStream => Unit): Long = {
      diskStore.put(blockId) { channel =>
        writeTo(Channels.newOutputStream(channel))
      }
      diskStore.getSize(blockId)
    }

    // First component: size of the block in bytes (0 when nothing was stored).
    // Second component: values the memory store handed back and that were not written anywhere.
    val (storedBytes, leftoverValues): (Long, Option[PartiallyUnrolledIterator[T]]) =
      if (level.useMemory) {
        // Memory comes first even when the level also uses disk; disk is only the fallback.
        if (level.deserialized) {
          memoryStore.putIteratorAsValues(blockId, iterator(), classTag) match {
            case Right(bytes) =>
              (bytes, None)
            case Left(unrolledSoFar) if level.useDisk =>
              // Memory store could not hold the block: write the returned values to disk.
              logWarning(s"Persisting block $blockId to disk instead.")
              val bytes = persistToDisk { out =>
                serializerManager.dataSerializeStream(blockId, out, unrolledSoFar)(classTag)
              }
              (bytes, None)
            case Left(unrolledSoFar) =>
              // No disk to fall back to: hand the values back to the caller.
              (0L, Some(unrolledSoFar))
          }
        } else {
          memoryStore.putIteratorAsBytes(blockId, iterator(), classTag, level.memoryMode) match {
            case Right(bytes) =>
              (bytes, None)
            case Left(partiallySerialized) if level.useDisk =>
              // Memory store could not hold the block: finish the write on disk.
              logWarning(s"Persisting block $blockId to disk instead.")
              val bytes = persistToDisk { out =>
                partiallySerialized.finishWritingToStream(out)
              }
              (bytes, None)
            case Left(partiallySerialized) =>
              (0L, Some(partiallySerialized.valuesIterator))
          }
        }
      } else if (level.useDisk) {
        // Disk-only level: serialize the values straight to the disk store.
        val bytes = persistToDisk { out =>
          serializerManager.dataSerializeStream(blockId, out, iterator())(classTag)
        }
        (bytes, None)
      } else {
        (0L, None)
      }

    val statusAfterPut = getCurrentBlockStatus(blockId, info)
    val stored = statusAfterPut.storageLevel.isValid
    if (stored) {
      // The block now lives in the memory or disk store: record its size and publish it.
      info.size = storedBytes
      if (tellMaster && info.tellMaster) {
        reportBlockStatus(blockId, statusAfterPut)
      }
      addUpdatedBlockStatusToTaskMetrics(blockId, statusAfterPut)
      logDebug(s"Put block $blockId locally took ${Utils.getUsedTimeNs(startTimeNs)}")
      if (level.replication > 1) {
        val remoteStartTimeNs = System.nanoTime()
        val bytesToReplicate = doGetLocalBytes(blockId, info)
        // [SPARK-16550] Erase the typed classTag when using default serialization, since
        // NettyBlockRpcServer crashes when deserializing repl-defined classes.
        // TODO(ekl) remove this once the classloader issue on the remote end is fixed.
        val remoteClassTag = if (serializerManager.canUseKryo(classTag)) {
          classTag
        } else {
          scala.reflect.classTag[Any]
        }
        try {
          replicate(blockId, bytesToReplicate, level, remoteClassTag)
        } finally {
          // Dispose of the local bytes whether or not replication succeeded.
          bytesToReplicate.dispose()
        }
        logDebug(s"Put block $blockId remotely took ${Utils.getUsedTimeNs(remoteStartTimeNs)}")
      }
    }
    // A stored block must leave nothing over, and a failed store must hand values back.
    assert(stored == leftoverValues.isEmpty)
    leftoverValues
  }
}
doPut
该方法是doPutBytes()
和doPutIterator()
两个方法共用的方法,它在具体的写入逻辑前后添加了一些公共的前置和后置工作,具体的写入操作由第二个参数列表传入的写入函数(putBody)进行处理。另外,keepReadLock
标记表示是否在写入后还继续持有对块的读锁:需要保持读锁时只需要进行锁降级,不需要保持时直接释放锁即可。整体执行步骤如下:
- 首先生成新的BlockInfo,并调用BlockInfoManager.lockNewBlockForWriting()加写锁,准备写入;
- 调用putBody函数的逻辑,真正地写入块数据;
- 若写入成功,当keepReadLock为真时,就调用BlockInfoManager.downgradeLock()方法将原先持有的写锁降级为读锁,方便后续读取;反之,当keepReadLock为假时,就调用BlockInfoManager.unlock()方法直接释放锁;
- 若putBody未能写入全部的块数据[返回的迭代器不为空]或者中途抛出了异常,说明写入不成功,调用removeBlockInternal()方法移除失败的块。
/**
 * Shared wrapper around a block write: registers and write-locks a new `BlockInfo`, runs
 * `putBody`, then either adjusts the lock (success) or removes the block (failure).
 *
 * @param blockId      identifier of the block being written
 * @param level        storage level recorded in the new `BlockInfo`
 * @param classTag     class tag recorded in the new `BlockInfo`
 * @param tellMaster   recorded in the new `BlockInfo`; also passed to `removeBlockInternal`
 *                     when `putBody` throws
 * @param keepReadLock on success, downgrade the write lock to a read lock instead of
 *                     unlocking
 * @param putBody      the actual write; returns `None` when the block was stored and
 *                     `Some(...)` when it was not
 * @return `None` when `lockNewBlockForWriting` returned false or the block was stored;
 *         otherwise whatever `putBody` returned
 * @throws BlockSavedOnDecommissionedBlockManagerException if `isDecommissioning()` is true
 */
private def doPut[T](
    blockId: BlockId,
    level: StorageLevel,
    classTag: ClassTag[_],
    tellMaster: Boolean,
    keepReadLock: Boolean)(putBody: BlockInfo => Option[T]): Option[T] = {
  if (isDecommissioning()) {
    // Refuse new blocks while this block manager is decommissioning.
    throw new BlockSavedOnDecommissionedBlockManagerException(blockId)
  }
  // Create the metadata first, then register it under blockId and take the write lock.
  val putBlockInfo = new BlockInfo(level, classTag, tellMaster)
  if (!blockInfoManager.lockNewBlockForWriting(blockId, putBlockInfo)) {
    // NOTE(review): presumably the block already exists and the call left this thread
    // holding a read lock on it, which is dropped unless the caller asked to keep it -
    // confirm against BlockInfoManager.lockNewBlockForWriting.
    if (!keepReadLock) {
      releaseLock(blockId)
    }
    None
  } else {
    // Stays true unless putBody returns normally; drives the cleanup in `finally`.
    var exceptionWasThrown: Boolean = true
    try {
      val res = putBody(putBlockInfo)
      exceptionWasThrown = false
      if (res.isEmpty) {
        // Stored successfully: keep a read lock for the caller or drop the lock entirely.
        if (keepReadLock) {
          blockInfoManager.downgradeLock(blockId)
        } else {
          blockInfoManager.unlock(blockId)
        }
      } else {
        // putBody handed values back, so the block was not stored: remove it.
        removeBlockInternal(blockId, tellMaster = false)
      }
      res
    } finally {
      // Cleanup lives in `finally` (not `catch`) so it runs for every kind of throwable
      // while the original exception keeps propagating untouched.
      if (exceptionWasThrown) {
        removeBlockInternal(blockId, tellMaster = tellMaster)
        // putBody may already have recorded a status for this block in the task metrics;
        // overwrite it with an empty status.
        addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty)
      }
    }
  }
}
副本写入
当存储级别中的副本数量大于1的时候,需要对块进行副本存储。但是复制块的过程是阻塞的(通过uploadBlockSync
同步上传),会造成块写入性能下降,并带来较大的网络传输开销。代码比较简单,源码如下:
/**
 * Uploads a block to other block managers, one peer at a time with a blocking upload,
 * until enough peers hold it or too many uploads have failed.
 *
 * @param blockId                identifier of the block to replicate
 * @param data                   block contents to upload
 * @param level                  storage level of the block; `level.replication - 1` is the
 *                               number of peers to reach, and peers receive the same level
 *                               with `replication = 1`
 * @param classTag               class tag sent along with the upload
 * @param existingReplicas       peers already holding the block; they are excluded from the
 *                               initial candidates and counted as already replicated
 * @param maxReplicationFailures overrides `config.STORAGE_MAX_REPLICATION_FAILURE` when set
 * @return true when the number of peers holding the block (including `existingReplicas`)
 *         reached `level.replication - 1`, false otherwise
 * @throws InterruptedException rethrown unchanged if an upload is interrupted
 */
private def replicate(
    blockId: BlockId,
    data: BlockData,
    level: StorageLevel,
    classTag: ClassTag[_],
    existingReplicas: Set[BlockManagerId] = Set.empty,
    maxReplicationFailures: Option[Int] = None): Boolean = {
  // Upper bound on failed uploads before giving up.
  val maxReplicationFailureCount = maxReplicationFailures.getOrElse(
    conf.get(config.STORAGE_MAX_REPLICATION_FAILURE))
  // Same storage options as the local block, but peers must not replicate any further.
  val tLevel = StorageLevel(
    useDisk = level.useDisk,
    useMemory = level.useMemory,
    useOffHeap = level.useOffHeap,
    deserialized = level.deserialized,
    replication = 1)
  val numPeersToReplicateTo = level.replication - 1
  val peersReplicatedTo = mutable.HashSet.empty ++ existingReplicas
  val peersFailedToReplicateTo = mutable.HashSet.empty[BlockManagerId]
  var numFailures = 0
  // Candidate peers, minus those that already hold the block.
  val initialPeers = getPeers(false).filterNot(existingReplicas.contains)
  // Let the replication policy pick and order the peers to try.
  var peersForReplication = blockReplicationPolicy.prioritize(
    blockManagerId,
    initialPeers,
    peersReplicatedTo,
    blockId,
    numPeersToReplicateTo)
  while (numFailures <= maxReplicationFailureCount &&
    peersForReplication.nonEmpty &&
    peersReplicatedTo.size < numPeersToReplicateTo) {
    val peer = peersForReplication.head
    try {
      // Wrap the data for network transfer; unlockOnDeallocate = false so that releasing
      // the buffer does not unlock the block.
      val buffer = new BlockManagerManagedBuffer(blockInfoManager, blockId, data, false,
        unlockOnDeallocate = false)
      // Blocking upload of the block to the chosen peer.
      blockTransferService.uploadBlockSync(
        peer.host,
        peer.port,
        peer.executorId,
        blockId,
        buffer,
        tLevel,
        classTag)
      peersForReplication = peersForReplication.tail
      peersReplicatedTo += peer
    } catch {
      // Interruption is never retried.
      case e: InterruptedException =>
        throw e
      // Any other non-fatal failure: drop this peer and re-plan with the remaining ones.
      case NonFatal(e) =>
        peersFailedToReplicateTo += peer
        // NOTE(review): getPeers(true) presumably forces a refresh of the peer list -
        // confirm against getPeers.
        val filteredPeers = getPeers(true).filter { p =>
          !peersFailedToReplicateTo.contains(p) && !peersReplicatedTo.contains(p)
        }
        numFailures += 1
        peersForReplication = blockReplicationPolicy.prioritize(
          blockManagerId,
          filteredPeers,
          peersReplicatedTo,
          blockId,
          numPeersToReplicateTo - peersReplicatedTo.size)
    }
  }
  // Succeeded only if the target number of peers was reached.
  peersReplicatedTo.size >= numPeersToReplicateTo
}