BlockManager



1.每次rdd调用iterator方法时(前提是该rdd设置了storageLevel,即storageLevel != StorageLevel.NONE),都会先从cacheManager里找(实际上就是在blockManager里找),找不到就调用computeOrReadCheckpoint方法去读取数据(例如HadoopRDD的compute方法),读取到数据之后就会放在blockManager里面。下次就可以直接从本地取数据,不用再到hdfs上读取数据。


2.每次在worker节点上launch一个新的Executor的时候(非local模式下)都会调用SparkEnv的create方法,这个方法会新建一个blockManager的实例,因此每一个Executor都会有一个自己的blockManager。

// Initialize Spark environment (using system properties read above).
// In non-local mode a fresh SparkEnv is created for this executor, installed as the current
// environment, and the executor's metrics source is registered with it. In local mode the
// environment that is already installed is reused.
  private val env = {
    if (!isLocal) {
      val _env = SparkEnv.create(conf, executorId, slaveHostname, 0,
        isDriver = false, isLocal = false)
      // Install the new environment so that later SparkEnv.get calls return it.
      SparkEnv.set(_env)
      _env.metricsSystem.registerSource(executorSource)
      _env
    } else {
      SparkEnv.get
    }
  }



3.当RDD的storageLevel不为StorageLevel.NONE时,每次调用iterator方法都会先调用cacheManager.getOrCompute方法。此方法先在本地的blockManager上寻找对应的block,如果没有则到远程的blockManager上寻找,还找不到则调用RDD的computeOrReadCheckpoint方法(最终调用compute)从hdfs上读取(此处以HadoopRDD为例)。

先看看RDD的iterator方法的代码

 /**
  * Returns an iterator over the elements of the partition `split`.
  *
  * When the storage level is anything other than `StorageLevel.NONE`, the request is delegated
  * to the cache manager's `getOrCompute`; otherwise the partition is obtained directly through
  * `computeOrReadCheckpoint`.
  *
  * @param split   the partition to iterate over
  * @param context the context of the running task
  */
 final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
    if (storageLevel == StorageLevel.NONE) {
      // No storage level is set, so the cache manager is bypassed.
      computeOrReadCheckpoint(split, context)
    } else {
      SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
    }
  }


  /**
   * Returns the values of partition `split` of `rdd`, taking them from the block manager when
   * the block is already available and computing (then storing) them otherwise.
   *
   * @param rdd          the RDD the partition belongs to
   * @param split        the partition whose values are requested
   * @param context      the context of the running task
   * @param storageLevel the storage level used when the computed values are put into the
   *                     block manager
   * @return an iterator over the partition's values
   */
  def getOrCompute[T](rdd: RDD[T], split: Partition, context: TaskContext, storageLevel: StorageLevel)
      : Iterator[T] = {

    val key = RDDBlockId(rdd.id, split.index)

    // blockManager.get looks in the local block manager first and then in remote ones.
    blockManager.get(key) match {
      case Some(values) =>
        // Partition is already materialized, so just return its values
        new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])

      case None =>
        // Block not found anywhere: fall back to computing the partition
        // (or reading it from a checkpoint).
        val computedValues = rdd.computeOrReadCheckpoint(split, context)
        if (context.runningLocally) {
          // The result is not persisted when the task is running locally.
          computedValues
        } else {
          // Materialize the values and store them in the block manager so that the next
          // request for this partition can be served from there.
          val elements = new ArrayBuffer[Any]
          elements ++= computedValues
          blockManager.put(key, elements, storageLevel, tellMaster = true)
          elements.iterator.asInstanceOf[Iterator[T]]
        }
    }
  }



4.BlockManager的Get方法


/**
 * Gets a block by id, trying the local lookup first and the remote lookup second.
 *
 * @param blockId identifier of the block to fetch
 * @return the block's values, or `None` when neither `getLocal` nor `getRemote` finds it
 */
def get(blockId: BlockId): Option[Iterator[Any]] = {
    // Try the local lookup first.
    val local = getLocal(blockId)
    if (local.isDefined) {
      logInfo("Found block %s locally".format(blockId))
      local
    } else {
      // Not available locally, so try the remote lookup.
      val remote = getRemote(blockId)
      if (remote.isDefined) {
        logInfo("Found block %s remotely".format(blockId))
      }
      // `remote` is None when the block was not found remotely either.
      remote
    }
  }


getLocal会调用doGetLocal

 /**
  * Looks up `blockId` in the local stores: the memory store first, then the disk store.
  *
  * @param blockId  identifier of the block to fetch
  * @param asValues when true the block is returned as values (memory-store values, or disk
  *                 bytes passed through `dataDeserialize`); when false it is returned as bytes
  * @return the block contents, or `None` when the block is not registered in `blockInfo` or
  *         its storage level uses neither store that holds it
  * @throws Exception when the storage level says the block is on disk but the disk store
  *                   does not have it
  */
 private def doGetLocal(blockId: BlockId, asValues: Boolean): Option[Any] = {
    blockInfo.get(blockId) match {
      case None =>
        // An unregistered block has no storage level to consult; report a miss instead of
        // dereferencing a null info entry.
        logDebug("Block " + blockId + " not registered locally")
        None

      case Some(info) =>
        val level = info.level

        // Look for the block in memory. There are two formats: values or bytes.
        val inMemory: Option[Any] =
          if (level.useMemory) {
            val hit =
              if (asValues) {
                memoryStore.getValues(blockId)
              } else {
                memoryStore.getBytes(blockId)
              }
            if (hit.isEmpty) {
              logDebug("Block " + blockId + " not found in memory")
            }
            hit
          } else {
            None
          }

        // Not in memory: read it from the disk store if the storage level uses disk.
        inMemory.orElse {
          if (level.useDisk) {
            val bytes: ByteBuffer = diskStore.getBytes(blockId).getOrElse {
              throw new Exception("Block " + blockId + " not found on disk, though it should be")
            }
            // NOTE(review): when level.useMemory is also set, the complete implementation
            // presumably re-caches the block in the memory store before returning it; that
            // step is not shown here, so the data is only returned. Confirm against upstream.
            Some(if (asValues) dataDeserialize(blockId, bytes) else bytes)
          } else {
            None
          }
        }
    }
 }

getRemote 会调用doGetRemote


 

 /**
  * Tries to fetch `blockId` from the remote locations reported by the master.
  *
  * The locations are visited in random order and the first non-null answer wins; later
  * locations are not contacted once a block has been received.
  *
  * @param blockId  identifier of the block to fetch
  * @param asValues when true the fetched data is passed through `dataDeserialize`; when false
  *                 it is returned as received
  * @return the fetched block, or `None` when no location returned data
  */
 private def doGetRemote(blockId: BlockId, asValues: Boolean): Option[Any] = {
    // Each location carries the host and port used to address the remote block manager.
    val candidates = Random.shuffle(master.getLocations(blockId))

    // The iterator keeps the fetches lazy: `find` stops at the first location that answers.
    val firstAnswer = candidates.iterator
      .map { loc =>
        BlockManagerWorker.syncGetBlock(
          GetBlock(blockId), ConnectionManagerId(loc.host, loc.port))
      }
      .find(_ != null)

    firstAnswer match {
      case Some(data) =>
        Some(if (asValues) dataDeserialize(blockId, data) else data)
      case None =>
        logDebug("Block " + blockId + " not found")
        None
    }
  }


5.blockManager的put方法

  /**
   * Puts a block given as an iterator of values. The iterator is drained into an
   * `ArrayBuffer` and the call is forwarded to the `ArrayBuffer` overload of `put`.
   *
   * @return whatever the `ArrayBuffer` overload returns for the buffered values
   */
  def put(blockId: BlockId, values: Iterator[Any], level: StorageLevel, tellMaster: Boolean)
    : Long = {
    val buffered = new ArrayBuffer[Any]
    values.foreach { value => buffered += value }
    put(blockId, buffered, level, tellMaster)
  }

/**
   * Stores a new block of values in the block manager.
   *
   * @param blockId    identifier of the block being stored
   * @param values     the block's values; must not be null
   * @param level      storage level to store the block with
   * @param tellMaster forwarded unchanged to `doPut`
   * @return the block's (estimated) size in bytes
   */
  def put(blockId: BlockId, values: ArrayBuffer[Any], level: StorageLevel,
          tellMaster: Boolean = true) : Long = {
    require(values != null, "Values is null")
    // Values travel on the Left side of the Either; the Right side carries bytes.
    val payload: Either[ArrayBuffer[Any], ByteBuffer] = Left(values)
    doPut(blockId, payload, level, tellMaster)
  }


private def doPut(blockId: BlockId, data: Either[ArrayBuffer[Any], ByteBuffer], level: StorageLevel, tellMaster: Boolean = true): Long = {

    data match {
          case Left(values) => {
            if (level.useMemory) {
              // Save it just to memory first, even if it also has useDisk set to true; we will
              // drop it to disk later if the memory store can't hold it.
              val res = memoryStore.putValues(blockId, values, level, true)
              size = res.size
              res.data match {
                case Right(newBytes) => bytesAfterPut = newBytes
                case Left(newIterator) => valuesAfterPut = newIterator
              }
            } else {
              // Save directly to disk.
              // Don't get back the bytes unless we replicate them.
              val askForBytes = level.replication > 1
              val res = diskStore.putValues(blockId, values, level, askForBytes)
              size = res.size
              res.data match {
                case Right(newBytes) => bytesAfterPut = newBytes
                case _ =>
              }
            }
          }
          case Right(bytes) => { //当内容是字节的情况
            bytes.rewind()
            // Store it only in memory at first, even if useDisk is also set to true
            (if (level.useMemory) memoryStore else diskStore).putBytes(blockId, bytes, level)
            size = bytes.limit
          }

}

 



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值