1.每次rdd调用iterator方法时,都会先从cacheManager里找(就是在blockManager里找),找不到就调用computeOrReadCheckpoint方法去读取数据(HadoopRDD的compute方法),读取到数据之后就会放在blockManager里面。下次就可以直接从本地取数据,不用再到hdfs上读取数据。
2.每次在worker节点上launch一个新的Executor的时候都会调用SparkEnv的create方法,这个方法会新建一个blockManager的实例,则每一个Executor都会有一个blockManager.
// Initialize Spark environment (using system properties read above).
// When not running locally, a new SparkEnv is created for this executor, installed
// through SparkEnv.set, and the executor's metrics source is registered with its
// metrics system. In local mode the SparkEnv that is already set is reused.
private val env = {
  if (!isLocal) {
    val _env = SparkEnv.create(conf, executorId, slaveHostname, 0,
      isDriver = false, isLocal = false)
    SparkEnv.set(_env)
    _env.metricsSystem.registerSource(executorSource)
    _env
  } else {
    SparkEnv.get
  }
}
3.当RDD的storageLevel不为NONE时,每次调用iterator方法都会先调用cacheManager.getOrCompute方法:此方法先在本地的blockManager上寻找对应的block,如果没有则到远程的blockManager上寻找,还找不到则调用RDD的compute方法从hdfs上读取(此处以HadoopRDD为例);storageLevel为NONE时则直接调用computeOrReadCheckpoint。
先看看RDD的iterator方法的代码
/**
 * Entry point for reading one partition of this RDD.
 *
 * When the RDD has no storage level (StorageLevel.NONE) the partition is obtained
 * directly through computeOrReadCheckpoint; otherwise the request is routed through
 * the cache manager of the current SparkEnv.
 */
final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
  val persisted = storageLevel != StorageLevel.NONE
  if (!persisted) {
    computeOrReadCheckpoint(split, context)
  } else {
    SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
  }
}
/**
 * Returns the values of one RDD partition, reusing a stored block when one exists.
 *
 * The block manager is asked for the block keyed by (rdd.id, split.index). On a hit
 * the stored values are wrapped in an InterruptibleIterator. On a miss the partition
 * is computed via rdd.computeOrReadCheckpoint and, unless the task is running locally,
 * the computed elements are buffered and put into the block manager before being
 * returned.
 */
def getOrCompute[T](rdd: RDD[T], split: Partition, context: TaskContext, storageLevel: StorageLevel)
    : Iterator[T] = {
  val key = RDDBlockId(rdd.id, split.index)
  blockManager.get(key) match {
    case Some(values) =>
      // Partition is already materialized, so just return its values
      new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
    case None =>
      // Block not found: compute the partition (or read it from a checkpoint)
      val computedValues = rdd.computeOrReadCheckpoint(split, context)
      // Persist the result, so long as the task is not running locally
      if (context.runningLocally) {
        computedValues
      } else {
        // The iterator is drained into a buffer so the same elements can be both
        // stored and handed back to the caller.
        val elements = new ArrayBuffer[Any]
        elements ++= computedValues
        blockManager.put(key, elements, storageLevel, tellMaster = true)
        elements.iterator.asInstanceOf[Iterator[T]]
      }
  }
}
4.BlockManager的get方法
/**
 * Fetches a block, trying getLocal first and falling back to getRemote.
 *
 * @return the block's values from whichever lookup succeeds first, or None when
 *         neither lookup yields the block
 */
def get(blockId: BlockId): Option[Iterator[Any]] = {
  val local = getLocal(blockId)
  if (local.isDefined) {
    logInfo("Found block %s locally".format(blockId))
    local
  } else {
    // Not available locally, so try the remote lookup
    val remote = getRemote(blockId)
    if (remote.isDefined) {
      logInfo("Found block %s remotely".format(blockId))
    }
    remote
  }
}
getLocal会调用doGetLocal
/**
 * Looks a block up in the memory store and then the disk store, according to the
 * storage level recorded in the block's info entry.
 *
 * @param blockId  the block to look up
 * @param asValues when true the block is returned as values (memoryStore.getValues, or
 *                 disk bytes passed through dataDeserialize); when false as bytes
 * @return the block contents, or None when no info entry exists for the block or no
 *         enabled store holds it
 * @throws Exception if the level uses disk but the disk store has no bytes for the block
 */
private def doGetLocal(blockId: BlockId, asValues: Boolean): Option[Any] = {
  blockInfo.get(blockId) match {
    case None =>
      // No info entry for this block; report a miss instead of dereferencing null.
      None
    case Some(info) =>
      val level = info.level

      // Look for the block in memory
      val fromMemory: Option[Any] =
        if (!level.useMemory) {
          None
        } else {
          // Two representations are supported: values or bytes
          val result =
            if (asValues) memoryStore.getValues(blockId) else memoryStore.getBytes(blockId)
          if (result.isEmpty) {
            logDebug("Block " + blockId + " not found in memory")
          }
          result
        }

      // Not in memory: fall back to the disk store when the level allows it
      fromMemory.orElse {
        if (!level.useDisk) {
          None
        } else {
          val bytes: ByteBuffer = diskStore.getBytes(blockId).getOrElse {
            throw new Exception("Block " + blockId + " not found on disk, though it should be")
          }
          // NOTE(review): the data read from disk is returned even when the level also
          // uses memory; this block does not copy it back into the memory store.
          // Confirm whether callers expect that promotion to happen here.
          Some(if (asValues) dataDeserialize(blockId, bytes) else bytes)
        }
      }
  }
}
getRemote 会调用doGetRemote
/**
 * Tries to fetch a block from the other block managers that the master reports as
 * holding it.
 *
 * The reported locations are shuffled and queried one at a time through
 * BlockManagerWorker.syncGetBlock; the first non-null reply is used.
 *
 * @param blockId  the block to fetch
 * @param asValues when true the reply is passed through dataDeserialize, otherwise it
 *                 is returned as received
 * @return the fetched data, or None when every location replied with null (or there
 *         were no locations)
 */
private def doGetRemote(blockId: BlockId, asValues: Boolean): Option[Any] = {
  val candidates = Random.shuffle(master.getLocations(blockId))
  // The iterator keeps the fetch lazy: no further location is contacted once one
  // of them has returned data.
  val firstReply = candidates.iterator
    .map { loc =>
      BlockManagerWorker.syncGetBlock(
        GetBlock(blockId), ConnectionManagerId(loc.host, loc.port))
    }
    .find(_ != null)

  firstReply match {
    case Some(data) =>
      Some(if (asValues) dataDeserialize(blockId, data) else data)
    case None =>
      logDebug("Block " + blockId + " not found")
      None
  }
}
5.blockManager的put方法
/**
 * Stores a block given as an iterator of values: the iterator is drained into an
 * ArrayBuffer and the call is forwarded to the ArrayBuffer-based overload of put.
 *
 * @return the value returned by the ArrayBuffer-based overload
 */
def put(blockId: BlockId, values: Iterator[Any], level: StorageLevel, tellMaster: Boolean)
    : Long = {
  val buffered = ArrayBuffer.empty[Any]
  values.foreach(buffered += _)
  put(blockId, buffered, level, tellMaster)
}
/**
 * Put a new block of values to the block manager. Returns its (estimated) size in bytes.
 *
 * The buffer must not be null; it is wrapped in a Left and handed to doPut, whose
 * result is returned unchanged.
 */
def put(blockId: BlockId, values: ArrayBuffer[Any], level: StorageLevel,
    tellMaster: Boolean = true): Long = {
  require(values != null, "Values is null")
  val payload = Left(values)
  doPut(blockId, payload, level, tellMaster)
}
<pre name="code" class="ruby">private def doPut(blockId: BlockId, data: Either[ArrayBuffer[Any], ByteBuffer], level: StorageLevel, tellMaster: Boolean = true): Long = {
data match {
case Left(values) => {
if (level.useMemory) {
// Save it just to memory first, even if it also has useDisk set to true; we will
// drop it to disk later if the memory store can't hold it.
val res = memoryStore.putValues(blockId, values, level, true)
size = res.size
res.data match {
case Right(newBytes) => bytesAfterPut = newBytes
case Left(newIterator) => valuesAfterPut = newIterator
}
} else {
// Save directly to disk.
// Don't get back the bytes unless we replicate them.
val askForBytes = level.replication > 1
val res = <span style="color:#FF0000;">diskStore.putValues(blockId, values, level, askForBytes)</span>
size = res.size
res.data match {
case Right(newBytes) => bytesAfterPut = newBytes
case _ =>
}
}
}
case Right(bytes) => { //当是内容是字节的情况
bytes.rewind()
// Store it only in memory at first, even if useDisk is also set to true
(if (level.useMemory) memoryStore else diskStore).putBytes(blockId, bytes, level)
size = bytes.limit
}
}