/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.util.collection
import java.util.Comparator
import com.google.common.hash.Hashing
import org.apache.spark.annotation.DeveloperApi
/**
* :: DeveloperApi ::
* A simple open hash table optimized for the append-only use case, where keys
* are never removed, but the value for each key may be changed.
*
* This implementation uses quadratic probing with a power-of-2 hash table
* size, which is guaranteed to explore all spaces for each key (see
* http://en.wikipedia.org/wiki/Quadratic_probing).
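*
* For instance, starting from an initial slot `p`, the probe sequence visits
* `p, p + 1, p + 3, p + 6, ...`, i.e. `p + i * (i + 1) / 2` (triangular numbers),
* which covers every slot exactly once when the table size is a power of 2.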
*
* The map can support up to `375809638 (0.7 * 2 ^ 29)` elements.
*
* TODO: Cache the hash values of each key? java.util.HashMap does that.
*/
@DeveloperApi
class AppendOnlyMap[K, V](initialCapacity: Int = 64)
extends Iterable[(K, V)] with Serializable {
import AppendOnlyMap._
require(initialCapacity <= MAXIMUM_CAPACITY,
s"Can't make capacity bigger than ${MAXIMUM_CAPACITY} elements")
require(initialCapacity >= 1, "Invalid initial capacity")
// Load factor that determines growThreshold, the point at which the data array grows.
private val LOAD_FACTOR = 0.7
/**
* Current capacity of the data array. The initial value of capacity is computed by taking the
* highest set bit of initialCapacity, with all lower bits cleared (call this highBit). If
* highBit equals initialCapacity, then capacity equals initialCapacity; otherwise capacity is
* highBit shifted left by one.
* Example 1: initialCapacity = 64 (binary 1000000); highBit is also 1000000, so capacity = 64.
* Example 2: initialCapacity = 72 (binary 1001000); highBit is 1000000, so capacity = 128.
*/
private var capacity = nextPowerOf2(initialCapacity)
// Bit mask used to compute an entry's position in the data array.
private var mask = capacity - 1
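// For example, with capacity = 64 the mask is 63 (binary 111111), so `rehash(h) & mask` maps
// any hash code (including negative ones) into [0, capacity); this is why capacity must be a
// power of 2.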
// Number of entries currently stored in the map (including the null key, if present).
private var curSize = 0
// Threshold on curSize at which the data array grows.
private var growThreshold = (LOAD_FACTOR * capacity).toInt
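// For example, with the default capacity of 64, the table grows once it holds more than
// (0.7 * 64).toInt = 44 entries.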
// Holds keys and values in the same array for memory locality; specifically, the order of
// elements is key0, value0, key1, value1, key2, value2, etc.
// The size of data is 2 * capacity, since each entry occupies two consecutive slots.
private var data = new Array[AnyRef](2 * capacity)
// Treat the null key differently so we can use nulls in "data" to represent empty items.
// Whether a value has been stored for the null key.
private var haveNullValue = false
private var nullValue: V = null.asInstanceOf[V]
// Triggered by destructiveSortedIterator; the underlying data array may no longer be used
private var destroyed = false
private val destructionMessage = "Map state is invalid from destructive sorting!"
/** Get the value for a given key */
def apply(key: K): V = {
assert(!destroyed, destructionMessage)
val k = key.asInstanceOf[AnyRef]
if (k.eq(null)) {
return nullValue
}
var pos = rehash(k.hashCode) & mask
var i = 1
while (true) {
val curKey = data(2 * pos)
if (k.eq(curKey) || k.equals(curKey)) {
return data(2 * pos + 1).asInstanceOf[V]
} else if (curKey.eq(null)) {
return null.asInstanceOf[V]
} else {
val delta = i
pos = (pos + delta) & mask
i += 1
}
}
null.asInstanceOf[V] // Never reached but needed to keep compiler happy
}
/** Set the value for a key */
def update(key: K, value: V): Unit = {
assert(!destroyed, destructionMessage)
// Cast key to AnyRef, since key may be a K instance or null.
val k = key.asInstanceOf[AnyRef]
// Handle the null key specially.
if (k.eq(null)) {
// No value was stored for the null key yet, so this insert adds a new entry.
if (!haveNullValue) {
incrementSize()
}
// Store the value for the null key.
nullValue = value
// Record that the null key now has a value.
haveNullValue = true
return
}
// Compute the initial probe position.
var pos = rehash(key.hashCode) & mask
var i = 1
while (true) {
val curKey = data(2 * pos)
// The slot is empty: insert the key and value here and update the size.
if (curKey.eq(null)) {
data(2 * pos) = k
data(2 * pos + 1) = value.asInstanceOf[AnyRef]
incrementSize() // Since we added a new key
return
// The slot holds the same key: overwrite its value.
} else if (k.eq(curKey) || k.equals(curKey)) {
data(2 * pos + 1) = value.asInstanceOf[AnyRef]
return
// The slot holds a different key: keep probing until an empty slot or a matching key is found.
} else {
val delta = i
pos = (pos + delta) & mask
i += 1
}
}
}
/**
* Set the value for key to updateFunc(hadValue, oldValue), where oldValue will be the old value
* for key, if any, or null otherwise. Returns the newly updated value.
*/
def changeValue(key: K, updateFunc: (Boolean, V) => V): V = {
assert(!destroyed, destructionMessage)
val k = key.asInstanceOf[AnyRef]
if (k.eq(null)) {
// No value was stored for the null key yet, so this insert adds a new entry.
if (!haveNullValue) {
incrementSize()
}
// Aggregate into the null key's value; the nullValue argument is the previous value, if any.
nullValue = updateFunc(haveNullValue, nullValue)
haveNullValue = true
return nullValue
}
// Compute the initial probe position for the key.
var pos = rehash(k.hashCode) & mask
var i = 1
while (true) {
val curKey = data(2 * pos)
if (curKey.eq(null)) {
// The slot is empty, so no previous value exists: call updateFunc with
// hadValue = false to create the initial (combiner) value.
val newValue = updateFunc(false, null.asInstanceOf[V])
// Insert the key and the new value at this position.
data(2 * pos) = k
data(2 * pos + 1) = newValue.asInstanceOf[AnyRef]
incrementSize()
return newValue
// The slot holds the same key: apply updateFunc to the existing value and store the result.
} else if (k.eq(curKey) || k.equals(curKey)) {
val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V])
data(2 * pos + 1) = newValue.asInstanceOf[AnyRef]
return newValue
// Collision with a different key: probe the next position.
} else {
val delta = i
pos = (pos + delta) & mask
i += 1
}
}
null.asInstanceOf[V] // Never reached but needed to keep compiler happy
}
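// A minimal usage sketch (hypothetical keys and values): aggregating a count per key with
// changeValue, the way a combiner-style aggregation would:
//   val counts = new AppendOnlyMap[String, Int]()
//   def addOne(hadValue: Boolean, oldValue: Int): Int = if (hadValue) oldValue + 1 else 1
//   counts.changeValue("a", addOne) // returns 1
//   counts.changeValue("a", addOne) // returns 2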
/** Iterator method from Iterable */
override def iterator: Iterator[(K, V)] = {
assert(!destroyed, destructionMessage)
new Iterator[(K, V)] {
var pos = -1
/** Get the next value we should return from next(), or null if we're finished iterating */
def nextValue(): (K, V) = {
if (pos == -1) { // Treat position -1 as looking at the null value
if (haveNullValue) {
return (null.asInstanceOf[K], nullValue)
}
pos += 1
}
while (pos < capacity) {
if (!data(2 * pos).eq(null)) {
return (data(2 * pos).asInstanceOf[K], data(2 * pos + 1).asInstanceOf[V])
}
pos += 1
}
null
}
override def hasNext: Boolean = nextValue() != null
override def next(): (K, V) = {
val value = nextValue()
if (value == null) {
throw new NoSuchElementException("End of iterator")
}
pos += 1
value
}
}
}
override def size: Int = curSize
/** Increase table size by 1, rehashing if necessary */
private def incrementSize() {
curSize += 1
if (curSize > growThreshold) {
growTable()
}
}
/**
* Re-hash a value to deal better with hash functions that don't differ in the lower bits.
*/
private def rehash(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt()
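// Without this rehash, keys whose hash codes differ only in their upper bits would collide,
// since `& mask` keeps only the low-order bits of the hash.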
/** Double the table's size and re-hash everything */
// Double the capacity of the data array. The steps are:
// 1. Allocate a new array of twice the current capacity and compute its mask.
// 2. Copy each entry of the old array into the new array at the index computed with the new
//    mask, i.e. rehash(key.hashCode) & newMask. On a collision, keep probing until an empty
//    slot is found and place the entry there.
// 3. Make the new array the map's data array.
// 4. Set capacity to the new array's capacity.
// 5. Set mask to the newly computed mask.
// 6. Recompute the map's growThreshold.
protected def growTable() {
// capacity < MAXIMUM_CAPACITY (2 ^ 29) so capacity * 2 won't overflow
val newCapacity = capacity * 2
require(newCapacity <= MAXIMUM_CAPACITY, s"Can't contain more than ${growThreshold} elements")
val newData = new Array[AnyRef](2 * newCapacity)
val newMask = newCapacity - 1 // Mask for the doubled capacity
// Insert all our old values into the new array. Note that because our old keys are
// unique, there's no need to check for equality here when we insert.
var oldPos = 0
// Scan the old array.
while (oldPos < capacity) {
// The slot in the old array holds a key.
if (!data(2 * oldPos).eq(null)) {
val key = data(2 * oldPos)
val value = data(2 * oldPos + 1)
// Compute the key's position using the new mask.
var newPos = rehash(key.hashCode) & newMask
var i = 1
var keepGoing = true
while (keepGoing) {
val curKey = newData(2 * newPos)
// The slot in the new array is empty: place the entry there.
if (curKey.eq(null)) {
newData(2 * newPos) = key
newData(2 * newPos + 1) = value
keepGoing = false
// The slot is occupied: keep probing until an empty slot is found.
} else {
val delta = i
newPos = (newPos + delta) & newMask
i += 1
}
}
}
oldPos += 1
}
data = newData
capacity = newCapacity
mask = newMask
growThreshold = (LOAD_FACTOR * newCapacity).toInt
}
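// For example, growing from the default capacity of 64: newCapacity = 128, newMask = 127,
// and growThreshold becomes (0.7 * 128).toInt = 89.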
// Returns the smallest power of 2 that is greater than or equal to n: take the highest set
// bit of n (highBit); if highBit equals n, return n, otherwise return highBit shifted left
// by one. E.g. nextPowerOf2(64) = 64, while nextPowerOf2(72) = 128.
private def nextPowerOf2(n: Int): Int = {
val highBit = Integer.highestOneBit(n)
if (highBit == n) n else highBit << 1
}
/**
* Return an iterator of the map in sorted order. This provides a way to sort the map without
* using additional memory, at the expense of destroying the validity of the map.
*/
def destructiveSortedIterator(keyComparator: Comparator[K]): Iterator[(K, V)] = {
destroyed = true
// Pack KV pairs into the front of the underlying array
// Shift the entries toward the front of the array, e.g.
// ["a","b",null,null,"c","d"] -> ["a","b","c","d",null,null]
var keyIndex, newIndex = 0
while (keyIndex < capacity) {
if (data(2 * keyIndex) != null) {
data(2 * newIndex) = data(2 * keyIndex)
data(2 * newIndex + 1) = data(2 * keyIndex + 1)
newIndex += 1
}
keyIndex += 1
}
assert(curSize == newIndex + (if (haveNullValue) 1 else 0))
// Sort the packed entries in place according to keyComparator.
new Sorter(new KVArraySortDataFormat[K, AnyRef]).sort(data, 0, newIndex, keyComparator)
// Return an iterator over the sorted entries in data.
new Iterator[(K, V)] {
var i = 0
var nullValueReady = haveNullValue
def hasNext: Boolean = (i < newIndex || nullValueReady)
def next(): (K, V) = {
if (nullValueReady) {
nullValueReady = false
(null.asInstanceOf[K], nullValue)
} else {
val item = (data(2 * i).asInstanceOf[K], data(2 * i + 1).asInstanceOf[V])
i += 1
item
}
}
}
}
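// A minimal usage sketch (hypothetical comparator): consume the map in key order without
// allocating a second array; the map must not be used again afterwards:
//   val sorted = map.destructiveSortedIterator(new Comparator[String] {
//     def compare(a: String, b: String): Int = a.compareTo(b)
//   })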
/**
* Return whether the next insert will cause the map to grow
*/
def atGrowThreshold: Boolean = curSize == growThreshold
}
private object AppendOnlyMap {
// Maximum capacity: 2^29.
val MAXIMUM_CAPACITY = (1 << 29)
}