Spark 2.3 Source Code Analysis: In-Memory Collections

AppendOnlyMap

Overview

An implementation of a hash table that only supports appending data: keys are never removed, but the value for each key may change.

The hash table resolves collisions with quadratic probing, an open-addressing scheme, so its only internal data structure is a single array.

The table size is always a power of 2, and it can hold at most 0.7 * 2^29 (375,809,638) elements.

For memory locality, keys and values are stored in the same array; specifically, the element order is key0, value0, key1, value1, key2, value2, ...

AppendOnlyMap allows null as a key. The null key is handled separately (haveNullValue / nullValue), so that null entries in the data array can represent empty slots.

Apart from the missing delete operation, it provides everything you would expect from a map: insert, update, grow, lookup, and iterate:

  • Insert: implemented by update. Sets the value for a key.
  • Update: implemented by changeValue. Changes the value associated with a key.
  • Grow: implemented by growTable. Doubles the table's capacity and re-hashes every element.
  • Lookup: implemented by apply. Returns the value for a given key.
  • Iterate: implemented by destructiveSortedIterator. Returns an iterator over the map in the order of the given comparator. It sorts the map's data without using additional memory, but it destroys the validity of the map: the underlying array must not be used again afterwards.

In that it can return an iterator ordered by a comparator, AppendOnlyMap is similar to a SortedMap.
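
The following minimal sketch exercises the operations listed above. It uses only the public API shown in the source below (AppendOnlyMap is a @DeveloperApi class in org.apache.spark.util.collection); the usage context is illustrative, not taken from Spark itself:

import org.apache.spark.util.collection.AppendOnlyMap

val map = new AppendOnlyMap[String, Int]()
map.update("a", 1)                                            // insert
map.changeValue("a", (had, old) => if (had) old + 1 else 1)   // modify: "a" now maps to 2
println(map("a"))                                             // lookup via apply, prints 2
map.iterator.foreach { case (k, v) => println(s"$k -> $v") }  // plain, non-destructive iteration
// Sorted, destructive iteration: the map must not be used afterwards
val sorted = map.destructiveSortedIterator(Ordering[String])
sorted.foreach { case (k, v) => println(s"$k -> $v") }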

/**
 * :: DeveloperApi ::
 * A simple open hash table optimized for the append-only use case, where keys
 * are never removed, but the value for each key may be changed.
 *
 * This implementation uses quadratic probing with a power-of-2 hash table
 * size, which is guaranteed to explore all spaces for each key (see
 * http://en.wikipedia.org/wiki/Quadratic_probing).
 *
 * The map can support up to `375809638 (0.7 * 2 ^ 29)` elements.
 *
 * TODO: Cache the hash values of each key? java.util.HashMap does that.
 */
@DeveloperApi
class AppendOnlyMap[K, V](initialCapacity: Int = 64)
  extends Iterable[(K, V)] with Serializable {

  import AppendOnlyMap._

  require(initialCapacity <= MAXIMUM_CAPACITY,
    s"Can't make capacity bigger than ${MAXIMUM_CAPACITY} elements")
  require(initialCapacity >= 1, "Invalid initial capacity")

  private val LOAD_FACTOR = 0.7

  private var capacity = nextPowerOf2(initialCapacity)
  private var mask = capacity - 1
  private var curSize = 0
  private var growThreshold = (LOAD_FACTOR * capacity).toInt

  // Holds keys and values in the same array for memory locality; specifically, the order of
  // elements is key0, value0, key1, value1, key2, value2, etc.
  private var data = new Array[AnyRef](2 * capacity)

  // Treat the null key differently so we can use nulls in "data" to represent empty items.
  private var haveNullValue = false
  private var nullValue: V = null.asInstanceOf[V]

  // Triggered by destructiveSortedIterator; the underlying data array may no longer be used
  private var destroyed = false
  private val destructionMessage = "Map state is invalid from destructive sorting!"

  /** Get the value for a given key */
  def apply(key: K): V = {
    assert(!destroyed, destructionMessage)
    val k = key.asInstanceOf[AnyRef]
    if (k.eq(null)) {
      return nullValue
    }
    // Rehash the key's hashCode and AND it with the mask to get the probe position pos;
    // the key is stored at data(2 * pos) and its value at data(2 * pos + 1)
    var pos = rehash(k.hashCode) & mask
    var i = 1
    while (true) {
      // Keys occupy the even indices (0, 2, 4, ...) of data and values the odd ones,
      // so 2 * pos is the key's index in data and 2 * pos + 1 is the matching value's index
      val curKey = data(2 * pos)
      if (k.eq(curKey) || k.equals(curKey)) {
        // Found the key: return its value
        return data(2 * pos + 1).asInstanceOf[V]
      } else if (curKey.eq(null)) {
        return null.asInstanceOf[V]
      } else {
        // The key at this slot is not the one we are looking for, so keep probing
        // (open addressing with quadratic probing). The step grows by one on every
        // collision (delta = i, then i += 1), so the offsets from the original position
        // are the triangular numbers 1, 3, 6, 10, ...; with a power-of-2 table size this
        // probe sequence is guaranteed to eventually visit every slot.
        // pos + delta advances pos by delta slots; & mask wraps the index back into the array
        val delta = i
        pos = (pos + delta) & mask
        i += 1
      }
    }
    null.asInstanceOf[V]
  }

  /** Set the value for a key */
  def update(key: K, value: V): Unit = {
    assert(!destroyed, destructionMessage)
    val k = key.asInstanceOf[AnyRef]
    if (k.eq(null)) {
      if (!haveNullValue) {
        incrementSize()
      }
      nullValue = value
      haveNullValue = true
      return
    }
    // Rehash the key's hashCode and AND it with the mask to get the probe position
    var pos = rehash(key.hashCode) & mask
    var i = 1
    while (true) {
      val curKey = data(2 * pos)
      if (curKey.eq(null)) {
        data(2 * pos) = k
        data(2 * pos + 1) = value.asInstanceOf[AnyRef]
        incrementSize()  // Since we added a new key
        return
      } else if (k.eq(curKey) || k.equals(curKey)) {
        data(2 * pos + 1) = value.asInstanceOf[AnyRef]
        return
      } else {
        // Collision: keep probing (quadratic probing)
        val delta = i
        pos = (pos + delta) & mask
        i += 1
      }
    }
  }

  /**
   * Set the value for key to updateFunc(hadValue, oldValue), where oldValue will be the old value
   * for key, if any, or null otherwise. Returns the newly updated value.
   */
  def changeValue(key: K, updateFunc: (Boolean, V) => V): V = {
    assert(!destroyed, destructionMessage)
    val k = key.asInstanceOf[AnyRef]
    if (k.eq(null)) {
      if (!haveNullValue) {
        incrementSize()
      }
      nullValue = updateFunc(haveNullValue, nullValue)
      haveNullValue = true
      return nullValue
    }
    var pos = rehash(k.hashCode) & mask
    var i = 1
    while (true) {
      val curKey = data(2 * pos)
      // When driven by an Aggregator, updateFunc runs mergeValue if the key already
      // has a value, and createCombiner if it does not
      if (curKey.eq(null)) {
        // No value yet: updateFunc effectively runs the aggregator's createCombiner
        val newValue = updateFunc(false, null.asInstanceOf[V])
        // Store the key at data(2 * pos)
        data(2 * pos) = k
        // Store the value at data(2 * pos + 1)
        data(2 * pos + 1) = newValue.asInstanceOf[AnyRef]
        incrementSize()
        return newValue
      } else if (k.eq(curKey) || k.equals(curKey)) {
        // Key already present: updateFunc effectively runs the aggregator's mergeValue,
        // combining the old value into a new one
        val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V])
        data(2 * pos + 1) = newValue.asInstanceOf[AnyRef]
        return newValue
      } else {
        // Collision: keep probing (quadratic probing)
        val delta = i
        pos = (pos + delta) & mask
        i += 1
      }
    }
    null.asInstanceOf[V] // Never reached but needed to keep compiler happy
  }

  /** Iterator method from Iterable */
  override def iterator: Iterator[(K, V)] = {
    assert(!destroyed, destructionMessage)
    new Iterator[(K, V)] {
      var pos = -1

      /** Get the next value we should return from next(), or null if we're finished iterating */
      // Called by hasNext() and next() to locate the next key-value pair to return
      def nextValue(): (K, V) = {
        if (pos == -1) {    // Treat position -1 as looking at the null value
          if (haveNullValue) {
            return (null.asInstanceOf[K], nullValue)
          }
          pos += 1
        }
        while (pos < capacity) {
          if (!data(2 * pos).eq(null)) {
            // Return the (key, value) pair stored at this slot
            return (data(2 * pos).asInstanceOf[K], data(2 * pos + 1).asInstanceOf[V])
          }
          pos += 1
        }
        null
      }

      override def hasNext: Boolean = nextValue() != null

      override def next(): (K, V) = {
        val value = nextValue()
        if (value == null) {
          throw new NoSuchElementException("End of iterator")
        }
        pos += 1
        value
      }
    }
  }

  override def size: Int = curSize

  /** Increase table size by 1, rehashing if necessary */
  private def incrementSize() {
    curSize += 1
    if (curSize > growThreshold) {
      growTable()
    }
  }

  /**
   * Re-hash a value to deal better with hash functions that don't differ in the lower bits.
   */
  private def rehash(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt()

  /** Double the table's size and re-hash everything */
  protected def growTable() {
    // capacity < MAXIMUM_CAPACITY (2 ^ 29) so capacity * 2 won't overflow
    // The new capacity is twice the old one
    val newCapacity = capacity * 2
    require(newCapacity <= MAXIMUM_CAPACITY, s"Can't contain more than ${growThreshold} elements")
    // Allocate the new array; keys and values share it, so its length is 2 * newCapacity
    val newData = new Array[AnyRef](2 * newCapacity)
    // New mask for the new capacity
    val newMask = newCapacity - 1
    // Insert all our old values into the new array. Note that because our old keys are
    // unique, there's no need to check for equality here when we insert.
    var oldPos = 0
    while (oldPos < capacity) {
      if (!data(2 * oldPos).eq(null)) {
        // Read the key and value from the old data array
        val key = data(2 * oldPos)
        val value = data(2 * oldPos + 1)
        // Re-hash to find the key's slot in the new data array
        var newPos = rehash(key.hashCode) & newMask
        var i = 1
        var keepGoing = true
        while (keepGoing) {
          val curKey = newData(2 * newPos)
          if (curKey.eq(null)) {
            // Store the key and value at the new slot
            newData(2 * newPos) = key
            newData(2 * newPos + 1) = value
            keepGoing = false
          } else {
            // Collision: keep probing (quadratic probing)
            val delta = i
            newPos = (newPos + delta) & newMask
            i += 1
          }
        }
      }
      oldPos += 1
    }
    data = newData
    capacity = newCapacity
    mask = newMask
    growThreshold = (LOAD_FACTOR * newCapacity).toInt
  }

  // Returns n itself if n is already a power of 2, otherwise the next power of 2 above n
  // (e.g. nextPowerOf2(64) == 64, nextPowerOf2(70) == 128)
  private def nextPowerOf2(n: Int): Int = {
    val highBit = Integer.highestOneBit(n)
    if (highBit == n) n else highBit << 1
  }

  /**
   * Return an iterator of the map in sorted order. This provides a way to sort the map without
   * using additional memory, at the expense of destroying the validity of the map.
   */
  def destructiveSortedIterator(keyComparator: Comparator[K]): Iterator[(K, V)] = {
    destroyed = true
    // Pack KV pairs into the front of the underlying array
    // Compact the non-null key-value pairs to the front of the underlying array;
    // slots beyond the compacted prefix are no longer meaningful
    var keyIndex, newIndex = 0
    while (keyIndex < capacity) {
      if (data(2 * keyIndex) != null) {
        data(2 * newIndex) = data(2 * keyIndex)
        data(2 * newIndex + 1) = data(2 * keyIndex + 1)
        newIndex += 1
      }
      keyIndex += 1
    }
    assert(curSize == newIndex + (if (haveNullValue) 1 else 0))
   
    // Sort the compacted prefix [0, newIndex) of the data array in place
    new Sorter(new KVArraySortDataFormat[K, AnyRef]).sort(data, 0, newIndex, keyComparator)

    // Return an iterator over the sorted array (yielding the null-key entry first, if present)
    new Iterator[(K, V)] {
      var i = 0
      var nullValueReady = haveNullValue
      def hasNext: Boolean = (i < newIndex || nullValueReady)
      def next(): (K, V) = {
        if (nullValueReady) {
          nullValueReady = false
          (null.asInstanceOf[K], nullValue)
        } else {
          val item = (data(2 * i).asInstanceOf[K], data(2 * i + 1).asInstanceOf[V])
          i += 1
          item
        }
      }
    }
  }

  /**
   * Return whether the next insert will cause the map to grow
   */
  def atGrowThreshold: Boolean = curSize == growThreshold
}

// Companion object
private object AppendOnlyMap {
  val MAXIMUM_CAPACITY = (1 << 29)
}

SizeTrackingAppendOnlyMap

A subclass of AppendOnlyMap that keeps track of its estimated size in bytes.
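
A minimal sketch of how the size tracking is typically consumed. It assumes the snippet is compiled under the org.apache.spark package (the class is private[spark]) and that estimateSize() is the SizeTracker method callers such as ExternalSorter consult after each update when deciding whether to spill:

import org.apache.spark.util.collection.SizeTrackingAppendOnlyMap

val map = new SizeTrackingAppendOnlyMap[String, Long]
map.update("a", 1L)
map.changeValue("a", (had, old) => if (had) old + 1L else 1L)
// update/changeValue have already called afterUpdate(), so the sampled estimate is current
val estimatedBytes: Long = map.estimateSize()   // a sampled estimate, not an exact measurement
println(s"~$estimatedBytes bytes")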

/**
 * An append-only map that keeps track of its estimated size in bytes.
 */
private[spark] class SizeTrackingAppendOnlyMap[K, V]
  extends AppendOnlyMap[K, V] with SizeTracker
{
  override def update(key: K, value: V): Unit = {
    super.update(key, value)
    super.afterUpdate()
  }

  override def changeValue(key: K, updateFunc: (Boolean, V) => V): V = {
    val newValue = super.changeValue(key, updateFunc)
    super.afterUpdate()
    newValue
  }

  override protected def growTable(): Unit = {
    super.growTable()
    resetSamples()
  }
}

WritablePartitionedPairCollection

A common interface for size-tracking collections of key-value pairs that provide the following:

    1. Each key-value pair has an associated partition.
    2. A memory-efficient sorted iterator is supported.
    3. A WritablePartitionedIterator is supported, for writing the contents directly as bytes.

In practice these size-tracking collections are PartitionedAppendOnlyMap (a subclass of SizeTrackingAppendOnlyMap) and PartitionedPairBuffer (which mixes in SizeTracker). A sketch of how a caller drains such a collection follows.
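
The hedged sketch below shows one way a caller might drain such a collection through destructiveSortedWritablePartitionedIterator; it is a simplified, hypothetical helper (roughly what ExternalSorter does in its spill/write paths) and assumes it is compiled under the org.apache.spark package, because these types are private[spark]:

import java.util.Comparator

import org.apache.spark.storage.DiskBlockObjectWriter
import org.apache.spark.util.collection.WritablePartitionedPairCollection

// Hypothetical helper: write out the whole collection, one partition at a time.
def drain[K, V](
    collection: WritablePartitionedPairCollection[K, V],
    keyComparator: Option[Comparator[K]],
    writer: DiskBlockObjectWriter): Unit = {
  val it = collection.destructiveSortedWritablePartitionedIterator(keyComparator)
  while (it.hasNext()) {
    val partitionId = it.nextPartition()
    // Records come out grouped by partition ID, so this inner loop covers one partition
    while (it.hasNext() && it.nextPartition() == partitionId) {
      it.writeNext(writer)   // writes the key and value, then advances the cursor
    }
    // the real spill code commits a file segment per partition at this point
  }
}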

/**
 * A common interface for size-tracking collections of key-value pairs that
 *
 *  - Have an associated partition for each key-value pair.
 *  - Support a memory-efficient sorted iterator
 *  - Support a WritablePartitionedIterator for writing the contents directly as bytes.
 */
private[spark] trait WritablePartitionedPairCollection[K, V] {
  /**
   * Insert a key-value pair with a partition into the collection
   */
  def insert(partition: Int, key: K, value: V): Unit

  /**
   * Iterate through the data in order of partition ID and then the given comparator. This may
   * destroy the underlying collection.
   */
  // Abstract method, implemented by the concrete collections below
  // (PartitionedAppendOnlyMap and PartitionedPairBuffer)
  def partitionedDestructiveSortedIterator(keyComparator: Option[Comparator[K]])
    : Iterator[((Int, K), V)]

  /**
   * Iterate through the data and write out the elements instead of returning them. Records are
   * returned in order of their partition ID and then the given comparator.
   * This may destroy the underlying collection.
   * It calls partitionedDestructiveSortedIterator to sort the collection's elements with the
   * given comparator, and wraps the resulting iterator so that its elements are written out
   * to a disk file instead of being returned to the caller.
   */
  def destructiveSortedWritablePartitionedIterator(keyComparator: Option[Comparator[K]])
    : WritablePartitionedIterator = {
    // The abstract method partitionedDestructiveSortedIterator(), implemented by subclasses,
    // is invoked here: an instance of the template method pattern
    val it = partitionedDestructiveSortedIterator(keyComparator)
    // Create an anonymous implementation of the WritablePartitionedIterator trait
    new WritablePartitionedIterator {
      // The current element of the iteration, or null once the iterator is exhausted
      private[this] var cur = if (it.hasNext) it.next() else null
      
      // Write the current element with the DiskBlockObjectWriter, then advance to the next one
      def writeNext(writer: DiskBlockObjectWriter): Unit = {
        writer.write(cur._1._2, cur._2)
        cur = if (it.hasNext) it.next() else null
      }

      def hasNext(): Boolean = cur != null

      def nextPartition(): Int = cur._1._1
    }
  }
}

// Companion object: its fields and methods play the role of static members
// of the same-named trait
private[spark] object WritablePartitionedPairCollection {
  /**
   * A comparator for (Int, K) pairs that orders them by only their partition ID.
   */
  def partitionComparator[K]: Comparator[(Int, K)] = new Comparator[(Int, K)] {
    override def compare(a: (Int, K), b: (Int, K)): Int = {
      a._1 - b._1
    }
  }

  /**
   * A comparator for (Int, K) pairs that orders them both by their partition ID and a key ordering.
   * In effect it decorates the given keyComparator so that the partition ID is compared first.
   */
  def partitionKeyComparator[K](keyComparator: Comparator[K]): Comparator[(Int, K)] = {
    new Comparator[(Int, K)] {
      override def compare(a: (Int, K), b: (Int, K)): Int = {
        // Compare partition IDs first; if they differ, return that result directly
        val partitionDiff = a._1 - b._1
        if (partitionDiff != 0) {
          partitionDiff
        } else {
          // Same partition: fall back to comparing the keys
          keyComparator.compare(a._2, b._2)
        }
      }
    }
  }
}
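
A small, hedged example of the two comparators (again assuming the code sits under the org.apache.spark package, since the object is private[spark]); it just sorts an in-memory array of (partition ID, key) pairs:

import org.apache.spark.util.collection.WritablePartitionedPairCollection._

val pairs: Array[(Int, String)] = Array((2, "a"), (0, "c"), (0, "b"), (1, "a"))

// Order by partition ID only: (0, "c"), (0, "b"), (1, "a"), (2, "a") -- keys left untouched
java.util.Arrays.sort(pairs, partitionComparator[String])

// Order by partition ID, then by key: (0, "b"), (0, "c"), (1, "a"), (2, "a")
java.util.Arrays.sort(pairs, partitionKeyComparator[String](Ordering[String]))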

/**
 * Iterator that writes elements to a DiskBlockObjectWriter instead of returning them. Each element
 * has an associated partition.
 */
private[spark] trait WritablePartitionedIterator {
  def writeNext(writer: DiskBlockObjectWriter): Unit

  def hasNext(): Boolean

  def nextPartition(): Int
}

PartitionedAppendOnlyMap

An implementation of WritablePartitionedPairCollection that wraps a map whose keys are (partition ID, K) tuples.

Where the parent class AppendOnlyMap stores elements as key0, value0, key1, value1, key2, value2, ...,

in PartitionedAppendOnlyMap the layout is, more concretely,

(partitionId, k)0, value0, (partitionId, k)1, value1, (partitionId, k)2, value2, ...


Accordingly, its iterator yields elements of type ((Int, K), V). A usage sketch follows.
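
A minimal sketch of map-side aggregation with PartitionedAppendOnlyMap. The class is private[spark], so assume the snippet is compiled under the org.apache.spark package; the word-count style update function mirrors what an Aggregator's createCombiner/mergeValue would do:

import org.apache.spark.util.collection.PartitionedAppendOnlyMap

val map = new PartitionedAppendOnlyMap[String, Long]
val update = (hadValue: Boolean, oldValue: Long) => if (hadValue) oldValue + 1L else 1L

map.changeValue((0, "a"), update)   // createCombiner-like path: no value yet
map.changeValue((0, "a"), update)   // mergeValue-like path: "a" in partition 0 now maps to 2
map.changeValue((1, "b"), update)

// Destructive iteration ordered by partition ID, then by key within each partition
val it = map.partitionedDestructiveSortedIterator(Some(Ordering[String]))
it.foreach { case ((partition, key), count) => println(s"p$partition $key=$count") }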

/**
 * Implementation of WritablePartitionedPairCollection that wraps a map in which the keys are tuples
 * of (partition ID, K)
 */
private[spark] class PartitionedAppendOnlyMap[K, V]
  extends SizeTrackingAppendOnlyMap[(Int, K), V] with WritablePartitionedPairCollection[K, V] {

  // Iterate through the data ordered by partition ID and then by the given comparator
  def partitionedDestructiveSortedIterator(keyComparator: Option[Comparator[K]])
    : Iterator[((Int, K), V)] = {
    // This class mixes in the WritablePartitionedPairCollection trait, so it can use the
    // partitionKeyComparator and partitionComparator methods from the companion object of
    // the same name.
    // Option.map(f): if the Option is defined, apply f to its value and return the result;
    // if it is None, return None. Here it means: if a keyComparator was supplied, decorate
    // it into a partitionKeyComparator.
    // getOrElse: if no keyComparator was supplied, fall back to partitionComparator,
    // which orders by partition ID only.
    val comparator = keyComparator.map(partitionKeyComparator).getOrElse(partitionComparator)
    // Delegate the actual sorting to destructiveSortedIterator, inherited from AppendOnlyMap
    destructiveSortedIterator(comparator)
  }
  
  def insert(partition: Int, key: K, value: V): Unit = {
    // Delegate to AppendOnlyMap.update, using (partition, key) as the composite key
    update((partition, key), value)
  }
}

PartitionedPairBuffer

An append-only buffer of key-value pairs, where each pair has an associated partition ID.

The buffer is backed by a single growable array that holds keys and values side by side, so it can be sorted conveniently with KVArraySortDataFormat; specifically, the element order is key0, value0, key1, value1, key2, value2, ...

The buffer can hold at most 1073741819 elements.

Compared with PartitionedAppendOnlyMap, the buffer only provides insert, grow, and iterate:

  • Insert: implemented by insert. The element is appended at the end of the array; the key stored next to the value is a (partition ID, K) tuple.
  • Grow: implemented by growArray. Determines the new capacity and copies the contents of the old array into the new one.
  • Iterate: implemented by partitionedDestructiveSortedIterator.

PartitionedAppendOnlyMap and PartitionedPairBuffer compare as follows (a usage sketch follows the comparison):

  • Insert: PartitionedAppendOnlyMap uses quadratic probing to find an empty slot and only inserts there; PartitionedPairBuffer appends directly at the end of the array.
  • Grow: PartitionedAppendOnlyMap allocates a larger array and must re-hash every element, re-probing its position in the new array; PartitionedPairBuffer allocates a larger array and simply copies the old contents with System.arraycopy.
  • Iterate: for PartitionedAppendOnlyMap the sorted iteration is destructive to the underlying array; for PartitionedPairBuffer it is not really destructive.
  • Update: PartitionedAppendOnlyMap supports it (quadratic probing locates the key's slot and its value is replaced); PartitionedPairBuffer does not.
  • Lookup: PartitionedAppendOnlyMap supports it (quadratic probing locates the key's slot and returns its value); PartitionedPairBuffer does not.
  • Use case: ExternalSorter uses PartitionedAppendOnlyMap when map-side aggregation is needed, and PartitionedPairBuffer when it is not.
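
And a matching sketch for PartitionedPairBuffer (same caveat: the class is private[spark], so assume the snippet lives under org.apache.spark); duplicates are simply kept, since nothing is combined:

import org.apache.spark.util.collection.PartitionedPairBuffer

val buffer = new PartitionedPairBuffer[String, Int]
buffer.insert(1, "b", 2)
buffer.insert(0, "a", 1)
buffer.insert(0, "a", 3)   // a duplicate key is appended as-is, not merged

// No key comparator supplied, so records are ordered by partition ID only
val it = buffer.partitionedDestructiveSortedIterator(None)
it.foreach { case ((partition, key), value) => println(s"p$partition $key=$value") }
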
/**
 * Append-only buffer of key-value pairs, each with a corresponding partition ID, that keeps track
 * of its estimated size in bytes.
 *
 * The buffer can support up to 1073741819 elements.
 */
private[spark] class PartitionedPairBuffer[K, V](initialCapacity: Int = 64)
  extends WritablePartitionedPairCollection[K, V] with SizeTracker
{
  import PartitionedPairBuffer._

  require(initialCapacity <= MAXIMUM_CAPACITY,
    s"Can't make capacity bigger than ${MAXIMUM_CAPACITY} elements")
  require(initialCapacity >= 1, "Invalid initial capacity")

  // Basic growable array data structure. We use a single array of AnyRef to hold both the keys
  // and the values, so that we can sort them efficiently with KVArraySortDataFormat.
  private var capacity = initialCapacity
  private var curSize = 0
  private var data = new Array[AnyRef](2 * initialCapacity)

  /** Add an element into the buffer */
  def insert(partition: Int, key: K, value: V): Unit = {
    if (curSize == capacity) {
      growArray()
    }
    data(2 * curSize) = (partition, key.asInstanceOf[AnyRef])
    data(2 * curSize + 1) = value.asInstanceOf[AnyRef]
    curSize += 1
    afterUpdate()
  }

  /** Double the size of the array because we've reached capacity */
  private def growArray(): Unit = {
    if (capacity >= MAXIMUM_CAPACITY) {
      throw new IllegalStateException(s"Can't insert more than ${MAXIMUM_CAPACITY} elements")
    }
    val newCapacity =
      if (capacity * 2 > MAXIMUM_CAPACITY) { // Overflow
        MAXIMUM_CAPACITY
      } else {
        capacity * 2
      }
    val newArray = new Array[AnyRef](2 * newCapacity)
    System.arraycopy(data, 0, newArray, 0, 2 * capacity)
    data = newArray
    capacity = newCapacity
    resetSamples()
  }

  /** Iterate through the data in a given order. For this class this is not really destructive. */
  override def partitionedDestructiveSortedIterator(keyComparator: Option[Comparator[K]])
    : Iterator[((Int, K), V)] = {
    val comparator = keyComparator.map(partitionKeyComparator).getOrElse(partitionComparator)
    new Sorter(new KVArraySortDataFormat[(Int, K), AnyRef]).sort(data, 0, curSize, comparator)
    iterator
  }

  private def iterator(): Iterator[((Int, K), V)] = new Iterator[((Int, K), V)] {
    var pos = 0

    override def hasNext: Boolean = pos < curSize

    override def next(): ((Int, K), V) = {
      if (!hasNext) {
        throw new NoSuchElementException
      }
      val pair = (data(2 * pos).asInstanceOf[(Int, K)], data(2 * pos + 1).asInstanceOf[V])
      pos += 1
      pair
    }
  }
}

// Companion object
private object PartitionedPairBuffer {
  val MAXIMUM_CAPACITY: Int = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH / 2
}

Reference: SortShuffleWriter in Spark Shuffle
