spark-shuffle的读数据源码分析

最新推荐文章于 2021-07-19 21:49:02 发布

ZL小屁孩

最新推荐文章于 2021-07-19 21:49:02 发布

阅读量246

点赞数

分类专栏： spark

本文链接：https://blog.csdn.net/ZH519080/article/details/82787642

版权

spark 专栏收录该内容

23 篇文章 0 订阅

订阅专栏

对指定分区进行计算的抽象接口，以CoGroupedRDD（或者ShuffleRDD，二者compute的细节可能不同，但shuffle读取所调用的类和方法是一样的）的compute方法实现为例，源码：

/**
 * Computes one co-grouped partition.
 *
 * Collects one iterator per dependency (read directly from the parent RDD for a
 * one-to-one dependency, or through the shuffle reader for a shuffle dependency),
 * inserts every record into the map returned by `createExternalMap`, records the
 * spill / peak-memory metrics of that map on the task context, and returns the
 * map's iterator wrapped in an `InterruptibleIterator`.
 *
 * @param s       the partition to compute; cast to `CoGroupPartition`
 * @param context the context of the running task
 * @return an iterator over the contents of the external map
 */
override def compute(s: Partition, context: TaskContext): Iterator[(K, Array[Iterable[_]])] = {
  val coGroupSplit = s.asInstanceOf[CoGroupPartition]
  val rddCount = dependencies.length

  // Pairs of (record iterator, index of the dependency the iterator belongs to).
  val parentIterators = new ArrayBuffer[(Iterator[Product2[K, Any]], Int)]
  dependencies.zipWithIndex.foreach { case (dep, depIndex) =>
    dep match {
      case narrowDep: OneToOneDependency[Product2[K, Any]] @unchecked =>
        // Read the records straight from the parent RDD's partition.
        val parentPartition = coGroupSplit.narrowDeps(depIndex).get.split
        val records = narrowDep.rdd.iterator(parentPartition, context)
        parentIterators += ((records, depIndex))
      case shuffleDep: ShuffleDependency[_, _, _] =>
        // Ask the ShuffleManager held by SparkEnv for a ShuffleReader covering this
        // task's partition [index, index + 1), using the dependency's shuffleHandle,
        // then call read() on that reader.
        val reader = SparkEnv.get.shuffleManager.getReader(
          shuffleDep.shuffleHandle, coGroupSplit.index, coGroupSplit.index + 1, context)
        val records = reader.read()
        parentIterators += ((records, depIndex))
    }
  }

  val externalMap = createExternalMap(rddCount)
  // Tag every value with the index of the dependency it came from before inserting.
  parentIterators.foreach { case (records, depIndex) =>
    val tagged = records.map(kv => (kv._1, new CoGroupValue(kv._2, depIndex)))
    externalMap.insertAll(tagged)
  }

  // Report the map's spill and peak-memory figures on the task.
  context.taskMetrics().incMemoryBytesSpilled(externalMap.memoryBytesSpilled)
  context.taskMetrics().incDiskBytesSpilled(externalMap.diskBytesSpilled)
  val peakMemoryAccumulator =
    context.internalMetricsToAccumulators(InternalAccumulator.PEAK_EXECUTION_MEMORY)
  peakMemoryAccumulator.add(externalMap.peakMemoryUsedBytes)

  val grouped = externalMap.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]
  new InterruptibleIterator(context, grouped)
}

从源码可知，带有宽依赖的RDD的compute操作，最终通过SparkEnv的ShuffleManager实例的getReader方法获取数据读取器，然后再调用读取器的read方法读取指定分区范围的Shuffle数据。

特质ShuffleReader是由子类BlockStoreShuffleReader实现，其中BlockStoreShuffleReader的read方法的源码：

/**
 * Reads and merges the key-value pairs for this reduce task.
 *
 * Steps, in order: fetch the shuffle blocks, wrap each stream with
 * `blockManager.wrapForCompression`, deserialize them into key-value records,
 * count the records read, make the iterator interruptible, aggregate when
 * `dep.aggregator` is defined, and sort when `dep.keyOrdering` is defined.
 *
 * @return an iterator over the (possibly aggregated and sorted) records
 */
override def read(): Iterator[Product2[K, C]] = {
  val blockFetcherItr = new ShuffleBlockFetcherIterator(context,  blockManager.shuffleClient,  blockManager,
/** Map-output sizes for the partition range [startPartition, endPartition), looked up through mapOutputTracker (per the original note: this is the metadata registered when a ShuffleMapTask finishes). */
    mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition, endPartition),
/** In-flight byte limit: read in MB with default "48m", then converted to bytes. Per the original note, this bounds bandwidth use on the target machines while still allowing parallel fetches. */
    SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024)
  // Process each fetched (blockId, inputStream) pair.
  val wrappedStreams = blockFetcherItr.map { case (blockId, inputStream) =>
    blockManager.wrapForCompression(blockId, inputStream)  // original note: one of three codecs, lz4 / lzf / snappy
  }
  val ser = Serializer.getSerializer(dep.serializer)
  val serializerInstance = ser.newInstance()
  // Create a key-value iterator for each stream.
  val recordIter = wrappedStreams.flatMap { wrappedStream =>
    serializerInstance.deserializeStream(wrappedStream).asKeyValueIterator
  }
  // Count each record as it is read; the second CompletionIterator argument updates the
  // task's shuffle read metrics (presumably run once the iterator is exhausted — TODO confirm).
  val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
  val metricIter = CompletionIterator[(Any, Any), Iterator[(Any, Any)]](
    recordIter.map(record => {
      readMetrics.incRecordsRead(1)
      record
    }),
    context.taskMetrics().updateShuffleReadMetrics())
  // An interruptible iterator must be used to support task cancellation.
  val interruptibleIter = new InterruptibleIterator[(Any, Any)](context, metricIter)
// Aggregate the data that was read.
  val aggregatedIter: Iterator[Product2[K, C]] = if (dep.aggregator.isDefined) {
    if (dep.mapSideCombine) { // values were already combined on the map side
      val combinedKeyValuesIterator = interruptibleIter.asInstanceOf[Iterator[(K, C)]]
// Merge the per-key combiners produced by the map-side partitions; per the original note, map-side combining greatly reduces network transfer.
      dep.aggregator.get.combineCombinersByKey(combinedKeyValuesIterator, context)
    } else { // aggregation happens on the reduce side only
      val keyValuesIterator = interruptibleIter.asInstanceOf[Iterator[(K, Nothing)]]
      dep.aggregator.get.combineValuesByKey(keyValuesIterator, context)
    }
  } else {  // no aggregation needed
    require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!")
    interruptibleIter.asInstanceOf[Iterator[Product2[K, C]]]
  }

  // Original note: in the sort-based shuffle, data is ordered by partition id only and is not sorted within a partition; keyOrdering indicates whether records inside the partition must be sorted. When it is defined, the output is sorted below.
  dep.keyOrdering match {  // decide whether sorting is required
    case Some(keyOrd: Ordering[K]) =>
      // Sort with an ExternalSorter. Original note: this reduces memory pressure and GC overhead; when memory cannot hold the data, the spark.shuffle.spill property (on by default) decides whether to spill to disk — not visible in this method, TODO confirm.
      val sorter =
        new ExternalSorter[K, C, C](context, ordering = Some(keyOrd), serializer = Some(ser))
      sorter.insertAll(aggregatedIter)
      context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled)
      context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled)
      context.internalMetricsToAccumulators(
        InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes)
      CompletionIterator[Product2[K, C], Iterator[Product2[K, C]]](sorter.iterator, sorter.stop())
    case None =>
      aggregatedIter // no ordering defined: return the aggregated iterator as is
  }
}

在BlockStoreShuffleReader的read方法中调用了ShuffleBlockFetcherIterator的构造器，构造过程中会调用ShuffleBlockFetcherIterator.initialize方法，而initialize方法又先后调用splitLocalRemoteBlocks、fetchUpToMaxBytes和fetchLocalBlocks等方法，首先来分析ShuffleBlockFetcherIterator的splitLocalRemoteBlocks方法的源码：

/**
 * Splits the blocks in `blocksByAddress` into local blocks and remote fetch requests.
 *
 * Blocks whose address has this block manager's executor id are appended to
 * `localBlocks` (zero-sized ones are dropped). All other blocks are recorded in
 * `remoteBlocks` and batched, per address, into `FetchRequest`s; a batch is closed
 * as soon as its accumulated size reaches one fifth of `maxBytesInFlight`.
 * `numBlocksToFetch` is updated along the way.
 *
 * @return the remote fetch requests that were built
 * @throws BlockException if a remote block reports a negative size
 */
private[this] def splitLocalRemoteBlocks(): ArrayBuffer[FetchRequest] = {
  // Target size of one request: maxBytesInFlight / 5, but at least 1 byte. Original note:
  // this lets up to 5 requests run in parallel against 5 nodes.
  val requestSizeTarget = math.max(maxBytesInFlight / 5, 1L)
  val requests = new ArrayBuffer[FetchRequest]
  var totalBlocks = 0

  blocksByAddress.foreach { case (address, blockInfos) =>
    totalBlocks += blockInfos.size
    val isRemote = address.executorId != blockManager.blockManagerId.executorId
    if (isRemote) {
      var batchBytes = 0L
      var batch = new ArrayBuffer[(BlockId, Long)]
      // Original note: a BlockId looks like shuffle_<shuffleId>_<mapId>_<reduceId>.
      blockInfos.foreach { case (blockId, size) =>
        if (size < 0) {
          throw new BlockException(blockId, "Negative block size " + size)
        } else if (size > 0) {
          // Empty blocks are skipped; everything else joins the current batch.
          batch += ((blockId, size))
          remoteBlocks += blockId
          numBlocksToFetch += 1
          batchBytes += size
        }
        if (batchBytes >= requestSizeTarget) {
          // The batch is large enough: emit it and start a new one.
          requests += new FetchRequest(address, batch)
          batch = new ArrayBuffer[(BlockId, Long)]
          batchBytes = 0
        }
      }
      // Emit whatever is left over for this address.
      if (batch.nonEmpty) {
        requests += new FetchRequest(address, batch)
      }
      // Original note: a single very large block may use a lot of memory when fetched (OOM risk).
    } else {
      // Same executor: keep the non-empty blocks for local reading.
      localBlocks ++= blockInfos.filter(_._2 != 0).map(_._1)
      numBlocksToFetch += localBlocks.size
    }
  }
  requests
}

ShuffleBlockFetcherIterator的fetchUpToMaxBytes方法用于发送请求获取远程的数据：只有当前没有在途数据（bytesInFlight为0），或者在途数据量与下一个请求的数据量之和不超过maxBytesInFlight时，才会发送请求：

/**
 * Sends queued fetch requests for as long as the in-flight limit allows.
 *
 * The request at the front of `fetchRequests` is dequeued and sent when either
 * nothing is in flight (`bytesInFlight == 0`) or adding its size keeps
 * `bytesInFlight` within `maxBytesInFlight`.
 */
private def fetchUpToMaxBytes(): Unit = {
  // Only evaluated while the queue is non-empty, so `front` is safe to read.
  def nextRequestFits: Boolean = {
    val nothingInFlight = bytesInFlight == 0
    nothingInFlight || bytesInFlight + fetchRequests.front.size <= maxBytesInFlight
  }

  while (fetchRequests.nonEmpty && nextRequestFits) {
    val request = fetchRequests.dequeue()
    sendRequest(request)
  }
}

通过fetchUpToMaxBytes方法发送远程数据的获取请求之后，再通过fetchLocalBlocks方法获取本地数据，ShuffleBlockFetcherIterator的fetchLocalBlocks的源码：

/**
 * Reads every block listed in `localBlocks` through `blockManager` and puts one
 * result per block into `results`.
 *
 * On success the local-read metrics are updated and the buffer is retained before
 * a `SuccessFetchResult` is enqueued. On the first exception a `FailureFetchResult`
 * is enqueued and the remaining blocks are not read.
 */
private[this] def fetchLocalBlocks(): Unit = {
  val pending = localBlocks.iterator
  var failed = false
  while (!failed && pending.hasNext) {
    val blockId = pending.next()
    try {
      val data = blockManager.getBlockData(blockId)
      shuffleMetrics.incLocalBlocksFetched(1)
      shuffleMetrics.incLocalBytesRead(data.size)
      data.retain()
      // NOTE(review): the meaning of the literal 0 is not visible here — confirm against SuccessFetchResult.
      results.put(new SuccessFetchResult(blockId, blockManager.blockManagerId, 0, data))
    } catch {
      case e: Exception =>
        results.put(new FailureFetchResult(blockId, blockManager.blockManagerId, e))
        // Stop after the first failure.
        failed = true
    }
  }
}

fetchLocalBlocks方法获取本地数据块其实是调用BlockManager的getBlockData方法，而BlockManager的getBlockData方法真正调用的是IndexShuffleBlockResolver或FileShuffleBlockResolver（这两个类都继承特质ShuffleBlockResolver）的getBlockData：

ShuffleBlockFetcherIterator.fetchLocalBlocks -> BlockManager.getBlockData -> ShuffleBlockResolver.getBlockData流程。

IndexShuffleBlockResolver的getBlockData调用的是DiskBlockManager的getFile方法；FileShuffleBlockResolver的getBlockData调用的是FileSegmentManagedBuffer的构造函数。

/**
 * Splits the blocks in `blocksByAddress` into local blocks and remote fetch requests.
 *
 * Blocks whose address has this block manager's executor id are appended to
 * `localBlocks` (zero-sized ones are dropped). All other blocks are recorded in
 * `remoteBlocks` and batched, per address, into `FetchRequest`s; a batch is closed
 * as soon as its accumulated size reaches one fifth of `maxBytesInFlight`.
 * `numBlocksToFetch` is updated along the way.
 *
 * @return the remote fetch requests that were built
 * @throws BlockException if a remote block reports a negative size
 */
private[this] def splitLocalRemoteBlocks(): ArrayBuffer[FetchRequest] = {
  // Target size of one request: maxBytesInFlight / 5, but at least 1 byte. Original note:
  // this lets up to 5 requests run in parallel against 5 nodes.
  val targetRequestSize = math.max(maxBytesInFlight / 5, 1L)
  // Declared as `remoteRequests` so it matches every later use in this method.
  val remoteRequests = new ArrayBuffer[FetchRequest]
  for ((address, blockInfos) <- blocksByAddress) {
    if (address.executorId == blockManager.blockManagerId.executorId) {
      // Same executor: keep the non-empty blocks for local reading.
      localBlocks ++= blockInfos.filter(_._2 != 0).map(_._1)
      // NOTE(review): this adds the whole size of localBlocks, not only the blocks appended
      // above; that is the same number only if localBlocks was empty beforehand — confirm.
      numBlocksToFetch += localBlocks.size
    } else {
      // Blocks living on another executor are batched into fetch requests.
      val iterator = blockInfos.iterator
      var curRequestSize = 0L
      var curBlocks = new ArrayBuffer[(BlockId, Long)]
      // Original note: a BlockId looks like shuffle_<shuffleId>_<mapId>_<reduceId>.
      while (iterator.hasNext) {
        val (blockId, size) = iterator.next()
        if (size > 0) {
          // Empty blocks are skipped; everything else joins the current batch.
          curBlocks += ((blockId, size))
          remoteBlocks += blockId
          numBlocksToFetch += 1
          curRequestSize += size
        } else if (size < 0) {
          throw new BlockException(blockId, "Negative block size " + size)
        }
        if (curRequestSize >= targetRequestSize) {
          // The batch is large enough: emit it and start a new one.
          remoteRequests += new FetchRequest(address, curBlocks)
          curBlocks = new ArrayBuffer[(BlockId, Long)]
          curRequestSize = 0
        }
      }
      // Emit whatever is left over for this address.
      if (curBlocks.nonEmpty) {
        remoteRequests += new FetchRequest(address, curBlocks)
      }
      // Original note: a single very large block may use a lot of memory when fetched (OOM risk).
    }
  }
  remoteRequests
}

ZL小屁孩

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
spark-shuffle的读数据源码分析

对指定分区进行计算的抽象接口，以为CoGroupedRDD（或者ShuffleRDD，可能compute细节不同，但是shuffle读取的类或方法的调用时一样的）的compute方法为实现，源码：override def compute(s: Partition, context: TaskContext): Iterator[(K, Array[Iterable[_]])] = {val ...
复制链接

扫一扫

专栏目录