/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.util.collection
import java.util.Comparator
import com.google.common.hash.Hashing
import org.apache.spark.annotation.DeveloperApi
/**
* :: DeveloperApi ::
* A simple open hash table optimized for the append-only use case, where keys
* are never removed, but the value for each key may be changed.
*
* This implementation uses quadratic probing with a power-of-2 hash table
* size, which is guaranteed to explore all spaces for each key (see
* http://en.wikipedia.org/wiki/Quadratic_probing).
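*
* For instance, starting from an initial slot `p`, the probe sequence visits
* `p, p + 1, p + 3, p + 6, ...`, i.e. `p + i * (i + 1) / 2` (triangular numbers),
* which covers every slot exactly once when the table size is a power of 2.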
*
* The map can support up to `375809638 (0.7 * 2 ^ 29)` elements.
*
* TODO: Cache the hash values of each key? java.util.HashMap does that.
*/
@DeveloperApi
class AppendOnlyMap[K, V](initialCapacity: Int = 64)
extends Iterable[(K, V)] with Serializable {
import AppendOnlyMap._
require(initialCapacity <= MAXIMUM_CAPACITY,
s"Can't make capacity bigger than ${MAXIMUM_CAPACITY} elements")
require(initialCapacity >= 1, "Invalid initial capacity")
// Load factor that determines growThreshold, the point at which the data array grows.
private val LOAD_FACTOR = 0.7
/**
* Current capacity of the data array. The initial value of capacity is computed by taking the
* highest set bit of initialCapacity, with all lower bits cleared (call this highBit). If
* highBit equals initialCapacity, then capacity equals initialCapacity; otherwise capacity is
* highBit shifted left by one.
* Example 1: initialCapacity = 64 (binary 1000000); highBit is also 1000000, so capacity = 64.
* Example 2: initialCapacity = 72 (binary 1001000); highBit is 1000000, so capacity = 128.
*/
private var capacity = nextPowerOf2(initialCapacity)
// Bit mask used to compute an entry's position in the data array.
private var mask = capacity - 1
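// For example, with capacity = 64 the mask is 63 (binary 111111), so `rehash(h) & mask` maps
// any hash code (including negative ones) into [0, capacity); this is why capacity must be a
// power of 2.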
// Number of entries currently stored in the map (including the null key, if present).
private var curSize = 0
// Threshold on curSize at which the data array grows.
private var growThreshold = (LOAD_FACTOR * capacity).toInt
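// For example, with the default capacity of 64, the table grows once it holds more than
// (0.7 * 64).toInt = 44 entries.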
// Holds keys and values in the same array for memory locality; specifically, the order of
// elements is key0, value0, key1, value1, key2, value2, etc.
// The size of data is 2 * capacity, since each entry occupies two consecutive slots.
private var data = new Array[AnyRef](2 * capacity)
// Treat the null key differently so we can use nulls in "data" to represent empty items.
// Whether a value has been stored for the null key.
private var haveNullValue = false
private var nullValue: V = null.asInstanceOf[V]
// Triggered by destructiveSortedIterator; the underlying data array may no longer be used
private var destroyed = false
private val destructionMessage = "Map state is invalid from destructive sorting!"
/** Get the value for a given key */
def apply(key: K): V = {
assert(!destroyed, destructionMessage)
val k = key.asInstanceOf[AnyRef]
if (k.eq(null)) {
return nullValue
}
var pos = rehash(k.hashCode) & mask
var i = 1
while (true) {
val curKey = data(2 * pos)
if (k.eq(curKey) || k.equals(curKey)) {
return data(2 * pos + 1).asInstanceOf[V]
} else if (curKey.eq(null)) {
return null.asInstanceOf[V]
} else {
val delta = i
pos = (pos + delta) & mask
i += 1
}
}
null.asInstanceOf[V] // Never reached but needed to keep compiler happy
}
/** Set the value for a key */
def update(key: K, value: V): Unit = {
assert(!destroyed, destructionMessage)
// Cast key to AnyRef, since key may be a K instance or null.
val k = key.asInstanceOf[AnyRef]
// Handle the null key specially.
if (k.eq(null)) {
// No value was stored for the null key yet, so this insert adds a new entry.
if (!haveNullValue) {
incrementSize()
}
// Store the value for the null key.
nullValue = value
// Record that the null key now has a value.
haveNullValue = true
return
}
// Compute the initial probe position.
var pos = rehash(key.hashCode) & mask
var i = 1
while (true) {
val curKey = data(2 * pos)
// The slot is empty: insert the key and value here and update the size.
if (curKey.eq(null)) {
data(2 * pos) = k
data(2 * pos + 1) = value.asInstanceOf[AnyRef]
incrementSize() // Since we added a new key
return
// The slot holds the same key: overwrite its value.
} else if (k.eq(curKey) || k.equals(curKey)) {
data(2 * pos + 1) = value.asInstanceOf[AnyRef]
return
// The slot holds a different key: keep probing until an empty slot or a matching key is found.
} else {
val delta = i
pos = (pos + delta) & mask
i += 1
}
}
}
/**
* Set the value for key to updateFunc(hadValue, oldValue), where oldValue will be the old value
* for key, if any, or null otherwise. Returns the newly updated value.
*/
def changeValue(key: K, updateFunc: (Boolean, V) => V): V = {
assert(!destroyed, destructionMessage)
val k = key.asInstanceOf[AnyRef]
if (k.eq(null)) {
// No value was stored for the null key yet, so this insert adds a new entry.
if (!haveNullValue) {
incrementSize()
}
// Aggregate into the null key's value; the nullValue argument is the previous value, if any.
nullValue = updateFunc(haveNullValue, nullValue)
haveNullValue = true
return nullValue
}
// Compute the initial probe position for the key.
var pos = rehash(k.hashCode) & mask
var i = 1
while (true) {
val curKey = data(2 * pos)
if (curKey.eq(null)) {
// The slot is empty, so no previous value exists: call updateFunc with
// hadValue = false to create the initial (combiner) value.
val newValue = updateFunc(false, null.asInstanceOf[V])
// Insert the key and the new value at this position.
data(2 * pos) = k
data(2 * pos + 1) = newValue.asInstanceOf[AnyRef]
incrementSize()
return newValue
// The slot holds the same key: apply updateFunc to the existing value and store the result.
} else if (k.eq(curKey) || k.equals(curKey)) {
val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V])
data(2 * pos + 1) = newValue.asInstanceOf[AnyRef]
return newValue
// Collision with a different key: probe the next position.
} else {
val delta = i
pos = (pos + delta) & mask
i += 1
}
}
null.asInstanceOf[V] // Never reached but needed to keep compiler happy
}
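// A minimal usage sketch (hypothetical keys and values): aggregating a count per key with
// changeValue, the way a combiner-style aggregation would:
//   val counts = new AppendOnlyMap[String, Int]()
//   def addOne(hadValue: Boolean, oldValue: Int): Int = if (hadValue) oldValue + 1 else 1
//   counts.changeValue("a", addOne) // returns 1
//   counts.changeValue("a", addOne) // returns 2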
/** Iterator method from Iterable */
override def iterator: Iterator[(K, V)] = {
assert(!destroyed, destructionMessage)
new Iterator[(K, V)] {
var pos = -1
/** Get the next value we should return from next(), or null if we're finished iterating */
def nextValue(): (K, V) = {
if (pos == -1) { // Treat position -1 as looking at the null value
if (haveNullValue) {
return (null.asInstanceOf[K], nullValue)
}
pos += 1
}
while (pos < capacity) {
if (!data(2 * pos).eq(null)) {
return (data(2 * pos).asInstanceOf[K], data(2 * pos + 1).asInstanceOf[V])
}
pos += 1
}
null
}
override def hasNext: Boolean = nextValue() != null
override def next(): (K, V) = {
val value = nextValue()
if (value == null) {
throw new NoSuchElementException("End of iterator")
}
pos += 1
value
}
}
}
override def size: Int = curSize
/** Increase table size by 1, rehashing if necessary */
private def incrementSize() {
curSize += 1
if (curSize > growThreshold) {
growTable()
}
}
/**
* Re-hash a value to deal better with hash functions that don't differ in the lower bits.
*/
private def rehash(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt()
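// Without this rehash, keys whose hash codes differ only in their upper bits would collide,
// since `& mask` keeps only the low-order bits of the hash.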
/** Double the table's size and re-hash everything */
// Double the capacity of the data array. The steps are:
// 1. Allocate a new array of twice the current capacity and compute its mask.
// 2. Copy each entry of the old array into the new array at the index computed with the new
//    mask, i.e. rehash(key.hashCode) & newMask. On a collision, keep probing until an empty
//    slot is found and place the entry there.
// 3. Make the new array the map's data array.
// 4. Set capacity to the new array's capacity.
// 5. Set mask to the newly computed mask.
// 6. Recompute the map's growThreshold.
protected def growTable() {
// capacity < MAXIMUM_CAPACITY (2 ^ 29) so capacity * 2 won't overflow
val newCapacity = capacity * 2
require(newCapacity <= MAXIMUM_CAPACITY, s"Can't contain more than ${growThreshold} elements")
val newData = new Array[AnyRef](2 * newCapacity)
val newMask = newCapacity - 1 // Mask for the doubled capacity
// Insert all our old values into the new array. Note that because our old keys are
// unique, there's no need to check for equality here when we insert.
var oldPos = 0
// Scan the old array.
while (oldPos < capacity) {
// The slot in the old array holds a key.
if (!data(2 * oldPos).eq(null)) {
val key = data(2 * oldPos)
val value = data(2 * oldPos + 1)
// Compute the key's position using the new mask.
var newPos = rehash(key.hashCode) & newMask
var i = 1
var keepGoing = true
while (keepGoing) {
val curKey = newData(2 * newPos)
// The slot in the new array is empty: place the entry there.
if (curKey.eq(null)) {
newData(2 * newPos) = key
newData(2 * newPos + 1) = value
keepGoing = false
// The slot is occupied: keep probing until an empty slot is found.
} else {
val delta = i
newPos = (newPos + delta) & newMask
i += 1
}
}
}
oldPos += 1
}
data = newData
capacity = newCapacity
mask = newMask
growThreshold = (LOAD_FACTOR * newCapacity).toInt
}
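// For example, growing from the default capacity of 64: newCapacity = 128, newMask = 127,
// and growThreshold becomes (0.7 * 128).toInt = 89.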
// Returns the smallest power of 2 that is greater than or equal to n: take the highest set
// bit of n (highBit); if highBit equals n, return n, otherwise return highBit shifted left
// by one. E.g. nextPowerOf2(64) = 64, while nextPowerOf2(72) = 128.
private def nextPowerOf2(n: Int): Int = {
val highBit = Integer.highestOneBit(n)
if (highBit == n) n else highBit << 1
}
/**
* Return an iterator of the map in sorted order. This provides a way to sort the map without
* using additional memory, at the expense of destroying the validity of the map.
*/
def destructiveSortedIterator(keyComparator: Comparator[K]): Iterator[(K, V)] = {
destroyed = true
// Pack KV pairs into the front of the underlying array
// Shift the entries toward the front of the array, e.g.
// ["a","b",null,null,"c","d"] -> ["a","b","c","d",null,null]
var keyIndex, newIndex = 0
while (keyIndex < capacity) {
if (data(2 * keyIndex) != null) {
data(2 * newIndex) = data(2 * keyIndex)
data(2 * newIndex + 1) = data(2 * keyIndex + 1)
newIndex += 1
}
keyIndex += 1
}
assert(curSize == newIndex + (if (haveNullValue) 1 else 0))
// Sort the packed entries in place according to keyComparator.
new Sorter(new KVArraySortDataFormat[K, AnyRef]).sort(data, 0, newIndex, keyComparator)
// Return an iterator over the sorted entries in data.
new Iterator[(K, V)] {
var i = 0
var nullValueReady = haveNullValue
def hasNext: Boolean = (i < newIndex || nullValueReady)
def next(): (K, V) = {
if (nullValueReady) {
nullValueReady = false
(null.asInstanceOf[K], nullValue)
} else {
val item = (data(2 * i).asInstanceOf[K], data(2 * i + 1).asInstanceOf[V])
i += 1
item
}
}
}
}
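// A minimal usage sketch (hypothetical comparator): consume the map in key order without
// allocating a second array; the map must not be used again afterwards:
//   val sorted = map.destructiveSortedIterator(new Comparator[String] {
//     def compare(a: String, b: String): Int = a.compareTo(b)
//   })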
/**
* Return whether the next insert will cause the map to grow
*/
def atGrowThreshold: Boolean = curSize == growThreshold
}
private object AppendOnlyMap {
// Maximum capacity: 2^29.
val MAXIMUM_CAPACITY = (1 << 29)
}