Rust HashMap 源码分析
1 HashMap 数据结构
A hash map implemented with quadratic probing and SIMD lookup.
/// Standard-library hash map: a wrapper whose only field is a `base::HashMap`.
/// The std-level `insert` shown later in this file simply forwards to `base`.
/// `S` is the hasher-builder type parameter and defaults to `RandomState`.
pub struct HashMap<K, V, S = RandomState> {
// The wrapped map that does the actual work.
// NOTE(review): presumably the hashbrown `HashMap` shown further down — confirm.
base: base::HashMap<K, V, S>,
}
/// Default `S` parameter of the std `HashMap` above; holds two 64-bit values.
/// NOTE(review): presumably the random keys used to seed the hasher — their
/// use is not shown in this file; confirm against the std documentation.
pub struct RandomState {
// First 64-bit key.
k0: u64,
// Second 64-bit key.
k1: u64,
}
/// The underlying map: a hasher builder paired with a raw table of `(K, V)` tuples.
pub struct HashMap<K, V, S = DefaultHashBuilder, A: Allocator + Clone = Global> {
// Passed to `make_insert_hash` / `make_hasher` by `insert` to hash keys.
pub(crate) hash_builder: S,
// Raw table that stores each entry as a `(K, V)` tuple.
pub(crate) table: RawTable<(K, V), A>,
}
/// Typed layer over `RawTableInner`. The inner table is not generic over the
/// element type, so `T` appears here only through the `PhantomData` marker.
pub struct RawTable<T, A: Allocator + Clone = Global> {
// Untyped table state: bucket mask, control pointer, counters and allocator.
table: RawTableInner<A>,
// Tell dropck that we own instances of T.
marker: PhantomData<T>,
}
/// Element-type-agnostic part of the table: the bucket mask, the pointer to
/// the control bytes, two counters, and the allocator.
struct RawTableInner<A> {
// Mask to get an index from a hash value. The value is one less than the
// number of buckets in the table.
bucket_mask: usize,
// Layout of the single allocation (elements first, control bytes after them):
// [Padding], T1, T2, ..., Tlast, C1, C2, ...
// ^ points here
// NOTE(review): whitespace was lost in this copy; the caret is presumably
// meant to sit under C1, the first control byte — confirm.
ctrl: NonNull<u8>,
// Number of elements that can be inserted before we need to grow the table
growth_left: usize,
// Number of elements in the table, only really used by len()
items: usize,
// Allocator handle; `prepare_resize` clones it to allocate the new table.
alloc: A,
}
重点关注 struct RawTableInner<A>
的前四个字段:
1.bucket_mask: 哈希表中哈希桶的数量减一。
2.ctrl: 指针,指向堆内存中 ctrl (控制字节) 区的起始位置;ctrl 区位于所有桶数据之后,即整块哈希表内存的末端。
3.growth_left: 触发扩容之前还能插入的元素个数;表中没有删除标记 (tombstone) 时等于 capacity - len (容量 - 长度)
4.items: len (长度)
use std::collections::HashMap;
// Demo: dump the raw machine words of a one-entry map so they can be matched
// against the struct fields described above.
fn main() {
let mut map = HashMap::new();
map.insert("hello", "world");
// Reinterprets the whole map as six `usize` words.
// NOTE(review): this depends on the standard library's private field layout
// and on the map being exactly six words in size; the values below come from
// one particular run and toolchain — confirm before relying on them.
let data: [usize; 6] = unsafe { std::mem::transmute(map) };
// [4096496360859545168, 8063513060474661978, 3, 2940168701456, 2, 1]
// The first two values are the two u64 fields of RandomState.
// bucket_mask: 3 (for a table this small the capacity is also 3),
// ctrl: 2940168701456, growth_left (cap - len): 2, items (len): 1
println!("{:?}", data);
}
2 扩容
桶的数量以 2 的 n 次幂的方式进行扩容,对应的容量依次为:0,3 ($2^2 - 1$),7 ($2^3 - 1$),14 ($2^4 - 2^4 \times 12.5\%$),28 ($2^5 - 2^5 \times 12.5\%$)…
/// Std-level `insert`: forwards the key and value to the wrapped `base` map
/// and returns whatever that call returns.
pub fn insert(&mut self, k: K, v: V) -> Option<V> {
self.base.insert(k, v)
}
/// Inserts the pair `(k, v)` into the map.
///
/// Returns `Some(previous_value)` when an entry for `k` was already present
/// (only its value is replaced), and `None` when a new entry was added.
pub fn insert(&mut self, k: K, v: V) -> Option<V> {
    // Hash the key once; the same hash drives both the lookup and the insertion.
    let hash = make_insert_hash::<K, S>(&self.hash_builder, &k);
    match self.table.get_mut(hash, equivalent_key(&k)) {
        // Key already present: swap the new value in and hand back the old one.
        Some((_, slot)) => Some(mem::replace(slot, v)),
        // Key absent: store a fresh entry. The hasher closure is passed along
        // so the table can re-hash existing entries if it has to grow.
        None => {
            self.table
                .insert(hash, (k, v), make_hasher::<K, _, V, S>(&self.hash_builder));
            None
        }
    }
}
/// Writes `value` into a slot chosen for `hash` and returns the bucket it was
/// written to.
///
/// `hasher` is only forwarded to `reserve`, i.e. it is used when the table
/// has to grow before the element can be stored.
pub fn insert(&mut self, hash: u64, value: T, hasher: impl Fn(&T) -> u64) -> Bucket<T> {
unsafe {
// Pick a candidate slot for the new element.
let mut index = self.table.find_insert_slot(hash);
// We can avoid growing the table once we have reached our load
// factor if we are replacing a tombstone. This works since the
// number of EMPTY slots does not change in this case.
let old_ctrl = *self.table.ctrl(index);
if unlikely(self.table.growth_left == 0 && special_is_empty(old_ctrl)) {
// Growth happens inside this call. The slot is looked up again
// afterwards because `index` was computed before `reserve` ran.
self.reserve(1, hasher);
index = self.table.find_insert_slot(hash);
}
// NOTE(review): presumably updates the control byte and the
// `items` / `growth_left` counters — helper not shown here; confirm.
self.table.record_item_insert_at(index, old_ctrl, hash);
let bucket = self.bucket(index);
bucket.write(value);
bucket
}
}
/// Makes sure at least `additional` more elements can be inserted before the
/// table needs to grow again.
///
/// `hasher` is handed to `reserve_rehash`, which is invoked only when the
/// remaining growth budget is smaller than `additional`.
#[cfg_attr(feature = "inline-more", inline)]
pub fn reserve(&mut self, additional: usize, hasher: impl Fn(&T) -> u64) {
    // Fast path: the remaining growth budget already covers the request.
    if additional <= self.table.growth_left {
        return;
    }
    // Growth happens inside `reserve_rehash`.
    // `Result::unwrap_or_else` is deliberately avoided because it bloats LLVM IR.
    let outcome = self.reserve_rehash(additional, hasher, Fallibility::Infallible);
    if outcome.is_err() {
        // NOTE(review): relies on the `Fallibility::Infallible` mode never
        // producing an `Err` — confirm against `reserve_rehash`'s helpers.
        unsafe { hint::unreachable_unchecked() }
    }
}
/// Makes room for `additional` more elements, either by rehashing in place or
/// by resizing into a larger table.
///
/// Returns the capacity-overflow error produced by `fallibility` when
/// `items + additional` overflows `usize`; otherwise returns `Ok(())` after an
/// in-place rehash, or the result of `resize`.
fn reserve_rehash(
    &mut self,
    additional: usize,
    hasher: impl Fn(&T) -> u64,
    fallibility: Fallibility,
) -> Result<(), TryReserveError> {
    // `Option::ok_or_else` is deliberately avoided because it bloats LLVM IR.
    let new_items = match self.table.items.checked_add(additional) {
        Some(total) => total,
        None => return Err(fallibility.capacity_overflow()),
    };
    let full_capacity = bucket_mask_to_capacity(self.table.bucket_mask);

    if new_items > full_capacity / 2 {
        // Growth happens here: conservatively resize to at least the next
        // size up, so that churning deletes do not cause frequent rehashes.
        return self.resize(
            usize::max(new_items, full_capacity + 1),
            hasher,
            fallibility,
        );
    }

    // Plenty of spare capacity is locked up in DELETED entries, so rehash in
    // place without re-allocating.
    self.rehash_in_place(hasher);
    Ok(())
}
/// Moves every element into a newly allocated table sized for `capacity` and
/// then swaps that table into `self`.
///
/// `hasher` recomputes the hash of each existing element. An allocation
/// failure from `prepare_resize` is propagated with `?`.
fn resize(
&mut self,
capacity: usize,
hasher: impl Fn(&T) -> u64,
fallibility: Fallibility,
) -> Result<(), TryReserveError> {
unsafe {
// Growth happens here: `prepare_resize` allocates the new table and
// returns it wrapped in a scope guard.
let mut new_table =
self.table
.prepare_resize(TableLayout::new::<T>(), capacity, fallibility)?;
// Copy all elements to the new table.
for item in self.iter() {
// This may panic.
let hash = hasher(item.as_ref());
// We can use a simpler version of insert() here since:
// - there are no DELETED entries.
// - we know there is enough space in the table.
// - all elements are unique.
let (index, _) = new_table.prepare_insert_slot(hash);
new_table.bucket(index).copy_from_nonoverlapping(&item);
}
// We successfully copied all elements without panicking. Now replace
// self with the new table. The old table will have its memory freed but
// the items will not be dropped (since they have been moved into the
// new table).
mem::swap(&mut self.table, &mut new_table);
Ok(())
}
}
/// Allocates a new, initialized table for `capacity` elements and returns it
/// wrapped in a scope guard.
///
/// The new table's `items` is set to `self.items` and its `growth_left` is
/// reduced by the same amount before any element has been copied; the caller
/// (`resize`) does the copying. Allocation errors are propagated with `?`.
unsafe fn prepare_resize(
&self,
table_layout: TableLayout,
capacity: usize,
fallibility: Fallibility,
) -> Result<crate::scopeguard::ScopeGuard<Self, impl FnMut(&mut Self)>, TryReserveError> {
debug_assert!(self.items <= capacity);
// Growth: allocate a fresh table using a clone of the current allocator.
// Allocate and initialize the new table.
let mut new_table = RawTableInner::fallible_with_capacity(
self.alloc.clone(),
table_layout,
capacity,
fallibility,
)?;
// Account up front for the elements the caller is about to copy over.
new_table.growth_left -= self.items;
new_table.items = self.items;
// The hash function may panic, in which case we simply free the new
// table without dropping any elements that may have been copied into
// it.
//
// This guard is also used to free the old table on success, see
// the comment at the end of `resize`, which swaps the old table into
// the guard.
Ok(guard(new_table, move |self_| {
// Free the buckets of whichever table the guard holds at that point,
// unless it is the empty singleton.
if !self_.is_empty_singleton() {
self_.free_buckets(table_layout);
}
}))
}
为什么容量不是 2 的整数次幂?
当桶的数量为 1/2/4/8 时,会保留一个空桶;当桶的数量大于 8 时,会保留 12.5% 的空桶。
e.g. 桶的数量为 8 时容量为 7,插入第 8 个元素时会发生一次扩容,扩容后桶的数量为 16,容量为 14 (16 - 16 * 12.5%)。
/// Converts a bucket mask (bucket count minus one) into the number of
/// elements the table may hold before it has to grow.
fn bucket_mask_to_capacity(bucket_mask: usize) -> usize {
    match bucket_mask {
        // Tables with 1/2/4/8 buckets always keep exactly one slot empty, so
        // the capacity is the bucket count minus one, i.e. the mask itself.
        0..=7 => bucket_mask,
        // Larger tables keep 12.5% (one eighth) of their slots empty.
        _ => {
            let buckets = bucket_mask + 1;
            (buckets / 8) * 7
        }
    }
}