源代码来自sgi-2.91版本stl_hashtable.h
文章目录
哈希表概念
哈希表又叫做散列表,本质来说就是充分利用空间的一种思想。
(1)一块内存分为53块,并编号#0 - #52区号,存入的object也编号,5存入#5号,25存入#25号,59 % 53= 6 , 59号object也放入#6号,108%53= 2 , 108号object也放入#2号,那如果继续存入2号、55号object呢?它们会共占一块空间。
(2)为了解决空间碰撞,我们把#0-#52当成一个指针数组,每个#号指向一条链表,每次有重复号码进入时,就把它加入链表,采用头插法。如图所示,55,2,108号object都在同一条链表上。因为链表是单向的(节点只有next指针,没有指向前一个节点的指针),所以hash表的迭代器不支持--操作。
(3)“开链法” - 如果hash表上元素过多,即所有链表上object的总数超过bucket个数(此处为53)时,就会rehashing:重新分配一个更大的指针数组,现在是#0 - #96。
rehashing标准:新容量取质数表中不小于所需元素个数的第一个质数(见下文__stl_next_prime)。质数表中相邻两项大约相差一倍,53的下一项是97(即2*53=106附近的质数),所以hash表新容量为97。(惯例做法,用质数取余通常可以减少空间碰撞)
此时原先所有数据全部打散,一一存入新的区号指向的链表中。
hashtable 模板参数理解(HashFcn)、(ExtractKey)、(EqualKey)
template <class Value, class Key, class HashFcn,
class ExtractKey, class EqualKey, class Alloc = alloc>
class hashtable;
在定义hashtable中传入了很多参数,下面来一一解释:
(1)value - 节点的实际数据类型
(2)Key - 节点的键值,也就是对应的编号
(3)HashFcn - 是个仿函数,从传入的数据类型中获得编号,传入int型自然可以把值当编号,但如果传入一个字符串该如何计算编号呢?所以需要专门的转换函数来处理
(4)ExtractKey - 从节点中取出key键值的方法(函数或者仿函数)
(5)EqualKey - 判断键值相同与否的方法(函数或者仿函数)
(6)Alloc - 内存分配器,缺省使用alloc
HashFcn深入分析
看一个源代码stl_hash_fun.h中的版本
可以发现数值类型的,hash函数的处理都是返回原值,但char*类型的进行单独处理,__stl_hash_string见下面
// Primary template is empty; hashing is provided only by the specializations below.
template <class Key> struct hash { };
// Full specializations follow, one per supported key type.
// C-string keys: the hash is computed by __stl_hash_string from the pointed-to characters.
__STL_TEMPLATE_NULL struct hash<char*>
{
size_t operator()(const char* s) const { return __stl_hash_string(s); }
};
__STL_TEMPLATE_NULL struct hash<const char*>
{
size_t operator()(const char* s) const { return __stl_hash_string(s); }
};
// Integral keys: the hash is the key's own value converted to size_t.
__STL_TEMPLATE_NULL struct hash<char> {
size_t operator()(char x) const { return x; }
};
__STL_TEMPLATE_NULL struct hash<unsigned char> {
size_t operator()(unsigned char x) const { return x; }
};
// NOTE(review): the parameter type is unsigned char although this is the
// signed char specialization, so a negative argument is converted to
// unsigned char before being widened to size_t — confirm this is intended.
__STL_TEMPLATE_NULL struct hash<signed char> {
size_t operator()(unsigned char x) const { return x; }
};
__STL_TEMPLATE_NULL struct hash<short> {
size_t operator()(short x) const { return x; }
};
__STL_TEMPLATE_NULL struct hash<unsigned short> {
size_t operator()(unsigned short x) const { return x; }
};
__STL_TEMPLATE_NULL struct hash<int> {
size_t operator()(int x) const { return x; }
};
__STL_TEMPLATE_NULL struct hash<unsigned int> {
size_t operator()(unsigned int x) const { return x; }
};
__STL_TEMPLATE_NULL struct hash<long> {
size_t operator()(long x) const { return x; }
};
__STL_TEMPLATE_NULL struct hash<unsigned long> {
size_t operator()(unsigned long x) const { return x; }
};
__stl_hash_string函数,"abc"变为5*(‘a’*5+‘b’)+‘c’,此处的编号越复杂越好,因为要避免重复
// Hashes a NUL-terminated string by folding its characters left to right:
// the running value is multiplied by 5 and the next character is added.
// An empty string hashes to 0. s must point to a valid NUL-terminated string.
inline size_t __stl_hash_string(const char* s)
{
  unsigned long acc = 0;
  while (*s != '\0') {
    acc = acc * 5 + *s;
    ++s;
  }
  return size_t(acc);
}
Hash调用,bkt_num函数过渡
下面这些包装函数最终会调用 hash(key) % n ,hash函数取编号,之后再对n取余后自然可以找到自己该放在哪个buckets中
// Maps a key to a bucket index, using the current number of buckets as the modulus.
size_type bkt_num_key(const key_type& key) const
{
return bkt_num_key(key, buckets.size());
}
// Maps a stored value to a bucket index: extracts its key with get_key and
// forwards to the key-based overload for the current bucket count.
size_type bkt_num(const value_type& obj) const
{
return bkt_num_key(get_key(obj));
}
// Maps a key to a bucket index for a table of n buckets: hash(key) modulo n.
// The other bkt_num overloads all end up here. n must be non-zero.
size_type bkt_num_key(const key_type& key, size_t n) const
{
return hash(key) % n;
}
// Maps a stored value to a bucket index for a table of n buckets.
// resize() uses this overload to place nodes into a table of a different size.
size_type bkt_num(const value_type& obj, size_t n) const
{
return bkt_num_key(get_key(obj), n);
}
ExtractKey
一个典型函数来获取key键值,传入什么就返回什么
// Function object that returns its argument unchanged. Shown here as a
// typical ExtractKey: it fits tables whose stored value is itself the key.
template <class T>
struct identity : public unary_function<T, T> {
// Returns a const reference to x itself.
const T& operator()(const T& x) const { return x; }
};
__stl_next_prime函数 找质数
质数表
// Note: assumes long is at least 32 bits.
// Number of entries in __stl_prime_list.
static const int __stl_num_primes = 28;
// Ascending table of primes used as bucket counts; each entry is roughly
// twice the previous one. __stl_next_prime searches this table.
static const unsigned long __stl_prime_list[__stl_num_primes] =
{
53, 97, 193, 389, 769,
1543, 3079, 6151, 12289, 24593,
49157, 98317, 196613, 393241, 786433,
1572869, 3145739, 6291469, 12582917, 25165843,
50331653, 100663319, 201326611, 402653189, 805306457,
1610612741, 3221225473ul, 4294967291ul
};
在质数表中寻找不小于n的第一个质数(若n比表中所有质数都大,则返回表中最大的质数)
// Returns the first entry of __stl_prime_list that is not less than n.
// When n is larger than every entry, the last (largest) entry is returned.
inline unsigned long __stl_next_prime(unsigned long n)
{
  const unsigned long* table_begin = __stl_prime_list;
  const unsigned long* table_end = table_begin + __stl_num_primes;
  // The table is sorted ascending, so a binary search finds the candidate.
  const unsigned long* hit = lower_bound(table_begin, table_end, n);
  if (hit != table_end)
    return *hit;
  return table_end[-1];
}
__hashtable_iterator迭代器
// Mutable iterator over a hashtable. It holds the current node and a pointer
// back to the table so that operator++ can continue into later buckets.
template <class Value, class Key, class HashFcn,
class ExtractKey, class EqualKey, class Alloc>
struct __hashtable_iterator {
typedef hashtable<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>
hashtable;
typedef __hashtable_iterator<Value, Key, HashFcn,
ExtractKey, EqualKey, Alloc>
iterator;
typedef __hashtable_const_iterator<Value, Key, HashFcn,
ExtractKey, EqualKey, Alloc>
const_iterator;
typedef __hashtable_node<Value> node;
typedef forward_iterator_tag iterator_category; // forward only: no operator-- is declared in this struct
typedef Value value_type;
typedef ptrdiff_t difference_type;
typedef size_t size_type;
typedef Value& reference;
typedef Value* pointer;
node* cur; // node the iterator currently refers to
hashtable* ht; // owning table; needed to jump to the next bucket at the end of a chain
__hashtable_iterator(node* n, hashtable* tab) : cur(n), ht(tab) {}
// Default construction leaves cur and ht uninitialized.
__hashtable_iterator() {}
reference operator*() const { return cur->val; } // value stored in the current node
#ifndef __SGI_STL_NO_ARROW_OPERATOR
pointer operator->() const { return &(operator*()); }
#endif /* __SGI_STL_NO_ARROW_OPERATOR */
iterator& operator++();
iterator operator++(int);
// Equality compares only the node pointer; ht is not compared.
bool operator==(const iterator& it) const { return cur == it.cur; }
bool operator!=(const iterator& it) const { return cur != it.cur; }
};
迭代器operator++ 重载
// Pre-increment: advances to the next node of the current chain, or, when the
// chain is exhausted, to the head of the next non-empty bucket. If no such
// bucket exists, cur ends up null. Returns a reference to this iterator.
template <class V, class K, class HF, class ExK, class EqK, class A>
__hashtable_iterator<V, K, HF, ExK, EqK, A>&
__hashtable_iterator<V, K, HF, ExK, EqK, A>::operator++()
{
  const node* const prev = cur;   // remembered so its bucket can be recomputed
  cur = cur->next;
  if (cur)
    return *this;

  // End of chain: start from the bucket that held the previous node and
  // scan the following buckets for the first non-null head.
  size_type idx = ht->bkt_num(prev->val);
  for (++idx; idx < ht->buckets.size(); ++idx) {
    cur = ht->buckets[idx];
    if (cur)
      break;
  }
  return *this;
}
__hashtable_node 链表节点定义
// Node of a bucket chain: a singly linked list element holding one value.
template <class Value>
struct __hashtable_node
{
__hashtable_node* next; // following node in the same bucket chain, or null at the end
Value val; // the stored value
};
new_node() 、delete_node()
新建链表节点或回收节点
// Obtains a node from node_allocator, sets its next pointer to 0 and
// constructs its val from obj. Returns the new node.
node* new_node(const value_type& obj)
{
node* n = node_allocator::allocate(); // storage obtained from the node allocator
n->next = 0;
__STL_TRY {
construct(&n->val, obj); // presumably constructs val in place from obj — construct() is defined elsewhere
return n;
}
// NOTE(review): looks like this gives the storage back when construction
// throws — confirm against the __STL_UNWIND macro definition.
__STL_UNWIND(node_allocator::deallocate(n));
}
// Destroys the value held by n and hands the node back to node_allocator.
// The node is not unlinked from any chain here; callers do that themselves.
void delete_node(node* n)
{
destroy(&n->val); // presumably runs the destructor of val — destroy() is defined elsewhere
node_allocator::deallocate(n); // return the storage to the allocator
}
hashtable类
// Hash table with separate chaining: a vector of bucket heads, each the start
// of a singly linked chain of nodes.
template <class Value, class Key, class HashFcn,
class ExtractKey, class EqualKey,
class Alloc>
class hashtable {
public:
// Public aliases for the template parameters and the usual container types.
typedef Key key_type;
typedef Value value_type;
typedef HashFcn hasher;
typedef EqualKey key_equal;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef value_type* pointer;
typedef const value_type* const_pointer;
typedef value_type& reference;
typedef const value_type& const_reference;
// Accessors returning copies of the hash and key-equality function objects.
hasher hash_funct() const { return hash; }
key_equal key_eq() const { return equals; }
private:
hasher hash; // maps a key to a hash code
key_equal equals; // tests two keys for equality
ExtractKey get_key; // extracts the key from a stored value
typedef __hashtable_node<Value> node; // chain node type
typedef simple_alloc<node, Alloc> node_allocator; // allocator used for nodes
vector<node*,Alloc> buckets; // bucket heads; sized by initialize_buckets from the constructor argument
size_type num_elements; // number of stored values
public:
typedef __hashtable_iterator<Value, Key, HashFcn, ExtractKey, EqualKey,
Alloc>
iterator; // mutable iterator type
typedef __hashtable_const_iterator<Value, Key, HashFcn, ExtractKey, EqualKey,
Alloc>
const_iterator;
// The iterators are friends because they read buckets and call bkt_num.
friend struct
__hashtable_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>;
friend struct
__hashtable_const_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>;
initialize_buckets函数
// Bucket count to use for a requested size n: whatever __stl_next_prime(n) returns.
size_type next_size(size_type n) const { return __stl_next_prime(n); }
// Fills the bucket vector with next_size(n) null chain heads and resets the
// element count to zero. Called by the constructors that take a size.
void initialize_buckets(size_type n)
{
  const size_type bucket_total = next_size(n);
  node* const empty_head = 0;
  buckets.reserve(bucket_total);                               // reserve capacity before filling
  buckets.insert(buckets.end(), bucket_total, empty_head);     // append bucket_total null heads
  num_elements = 0;
}
hash构造函数
// Constructs an empty table from a size hint n and explicit hash,
// key-equality and key-extraction function objects.
hashtable(size_type n,
const HashFcn& hf,
const EqualKey& eql,
const ExtractKey& ext)
: hash(hf), equals(eql), get_key(ext), num_elements(0)
{
initialize_buckets(n); // create the bucket array
}
// Constructs an empty table from a size hint n and explicit hash and
// key-equality function objects; the key extractor is default-constructed.
hashtable(size_type n,
const HashFcn& hf,
const EqualKey& eql)
: hash(hf), equals(eql), get_key(ExtractKey()), num_elements(0)
{
initialize_buckets(n); // create the bucket array
}
// Copy constructor: copies the three function objects from ht, then
// delegates the bucket and node copying to copy_from (defined elsewhere).
hashtable(const hashtable& ht)
: hash(ht.hash), equals(ht.equals), get_key(ht.get_key), num_elements(0)
{
copy_from(ht);
}
// Copy assignment. Self-assignment is a no-op. Otherwise the current nodes
// are released first, the function objects are copied, and copy_from
// (defined elsewhere) rebuilds the contents from ht.
// NOTE(review): clear() runs before copy_from(), so the old contents are
// already gone if copying fails — confirm copy_from's failure behavior.
hashtable& operator= (const hashtable& ht)
{
if (&ht != this) {
clear();
hash = ht.hash;
equals = ht.equals;
get_key = ht.get_key;
copy_from(ht);
}
return *this;
}
// Destructor: releases every node through clear().
~hashtable() { clear(); }
size()
// Number of stored elements; num_elements is incremented on insertion and decremented on erase.
size_type size() const { return num_elements; }
empty()
// True when the table holds no elements.
bool empty() const { return size() == 0; }
begin() end()
// Iterator constructor, repeated here for reference: stores the node and the owning table.
__hashtable_iterator(node* n, hashtable* tab) : cur(n), ht(tab) {}
// Returns an iterator to the head node of the first non-empty bucket,
// or end() when every bucket is empty.
iterator begin()
{
  const size_type bucket_total = buckets.size();
  size_type idx = 0;
  while (idx < bucket_total && !buckets[idx])
    ++idx;
  if (idx == bucket_total)
    return end();
  // The iterator keeps a pointer to this table so it can cross buckets later.
  return iterator(buckets[idx], this);
}
// Past-the-end iterator: a null node pointer paired with this table.
iterator end() { return iterator(0, this); }
// Const overload: returns a const_iterator to the head node of the first
// non-empty bucket, or end() when every bucket is empty.
const_iterator begin() const
{
  const size_type bucket_total = buckets.size();
  size_type idx = 0;
  while (idx < bucket_total && !buckets[idx])
    ++idx;
  if (idx == bucket_total)
    return end();
  return const_iterator(buckets[idx], this);
}
// Past-the-end const_iterator: a null node pointer paired with this table.
const_iterator end() const { return const_iterator(0, this); }
bucket_count()
返回buckets数组中指针个数
// Number of buckets, i.e. the size of the bucket-head vector.
size_type bucket_count() const { return buckets.size(); }
elems_in_bucket()
buckets[x],找出x号对应的链表挂了多少个元素
// Counts the nodes chained in the bucket with the given index by walking
// the chain. The index is not range-checked here.
size_type elems_in_bucket(size_type bucket) const
{
  size_type chain_length = 0;
  node* walker = buckets[bucket];
  while (walker) {
    ++chain_length;
    walker = walker->next;
  }
  return chain_length;
}
insert_unique()
// Inserts obj unless an element with an equal key already exists.
// Returns the iterator/flag pair produced by insert_unique_noresize.
pair<iterator, bool> insert_unique(const value_type& obj)
{
resize(num_elements + 1); // grow the bucket array first if one more element would exceed the bucket count
return insert_unique_noresize(obj); // perform the actual insertion
}
resize()
判断是否需要重建表(rehash):只有当元素个数(含即将插入的新元素)大于当前bucket个数时才会重建
// Bucket count to use for a requested size n: whatever __stl_next_prime(n) returns.
size_type next_size(size_type n) const { return __stl_next_prime(n); }
// Grows the bucket array when num_elements_hint exceeds the current bucket
// count. Existing nodes are relinked into the new array rather than copied.
// Does nothing if the hint fits or next_size() yields no larger bucket count.
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::resize(size_type num_elements_hint)
{
const size_type old_n = buckets.size(); // current bucket count
if (num_elements_hint > old_n) {
const size_type n = next_size(num_elements_hint); // bucket count of the new array
if (n > old_n) {
vector<node*, A> tmp(n, (node*) 0); // new bucket array holding n null heads
__STL_TRY {
// Visit every old bucket.
for (size_type bucket = 0; bucket < old_n; ++bucket) {
node* first = buckets[bucket];
// Move each node of this chain to the new array, one at a time.
while (first) {
size_type new_bucket = bkt_num(first->val, n); // index of this node in the new array
buckets[bucket] = first->next; // detach the node from the old chain
first->next = tmp[new_bucket]; // push it on the front of the new chain
tmp[new_bucket] = first;
first = buckets[bucket]; // next node still left in the old chain
}
}
buckets.swap(tmp); // buckets now owns the new array; tmp holds the emptied old one
}
# ifdef __STL_USE_EXCEPTIONS
catch(...) {
// On failure, delete the nodes that were already moved into tmp, then rethrow.
for (size_type bucket = 0; bucket < tmp.size(); ++bucket) {
while (tmp[bucket]) {
node* next = tmp[bucket]->next;
delete_node(tmp[bucket]);
tmp[bucket] = next;
}
}
throw;
}
# endif /* __STL_USE_EXCEPTIONS */
}
}
}
insert_unique_noresize
在hash表中插入新元素,键值key不允许重复
// Inserts obj without touching the bucket count. If the target chain already
// holds an element whose key compares equal, nothing is inserted and the
// result is (iterator to that element, false). Otherwise a new node is pushed
// on the front of the chain and the result is (iterator to it, true).
template <class V, class K, class HF, class Ex, class Eq, class A>
pair<typename hashtable<V, K, HF, Ex, Eq, A>::iterator, bool>
hashtable<V, K, HF, Ex, Eq, A>::insert_unique_noresize(const value_type& obj)
{
  const size_type slot = bkt_num(obj);
  node* const head = buckets[slot];

  // Reject duplicates: scan the whole chain for an equal key.
  node* walker = head;
  while (walker) {
    if (equals(get_key(walker->val), get_key(obj)))
      return pair<iterator, bool>(iterator(walker, this), false);
    walker = walker->next;
  }

  // No duplicate found: link a fresh node in front of the old head.
  node* const fresh = new_node(obj);
  fresh->next = head;
  buckets[slot] = fresh;
  ++num_elements;
  return pair<iterator, bool>(iterator(fresh, this), true);
}
find()
在hash表中找元素,根据键值查找
// Looks up the first element in key's bucket whose key compares equal.
// Returns an iterator to it, or an iterator with a null node (equal to
// end()) when there is no such element.
iterator find(const key_type& key)
{
  node* hit = buckets[bkt_num_key(key)];
  while (hit && !equals(get_key(hit->val), key))
    hit = hit->next;
  return iterator(hit, this);
}
count()
返回键值等于key的元素个数
// Returns how many elements in key's bucket have a key that compares equal.
size_type count(const key_type& key) const
{
  size_type matches = 0;
  const node* probe = buckets[bkt_num_key(key)];
  while (probe) {
    if (equals(get_key(probe->val), key))
      matches += 1;
    probe = probe->next;
  }
  return matches;
}
erase()
删除操作
// Erases every element whose key compares equal to key and returns how many
// were removed. Only the bucket that key maps to is examined.
// NOTE(review): key is taken by reference; if a caller passes a reference to
// a key stored inside this chain, it would dangle once that node is deleted
// while the loop keeps comparing against it — confirm callers never do this.
template <class V, class K, class HF, class Ex, class Eq, class A>
typename hashtable<V, K, HF, Ex, Eq, A>::size_type
hashtable<V, K, HF, Ex, Eq, A>::erase(const key_type& key)
{
const size_type n = bkt_num_key(key); // bucket that key maps to
node* first = buckets[n];
size_type erased = 0;
// Walk the chain, if the bucket is not empty.
if (first) {
node* cur = first;
node* next = cur->next;
// First pass: every node after the head, with cur as its predecessor.
while (next) {
if (equals(get_key(next->val), key)) { // matching key: unlink and delete
cur->next = next->next;
delete_node(next);
next = cur->next;
++erased;
--num_elements;
}
else {
cur = next;
next = cur->next;
}
}
// The head is examined last, so first stays valid as the anchor of the chain above.
if (equals(get_key(first->val), key)) {
buckets[n] = first->next; // the bucket now starts at the following node
delete_node(first);
++erased;
--num_elements;
}
}
return erased;
}
// Erases the single element that it refers to. An iterator with a null node
// is ignored.
// NOTE(review): assumes it refers to a node currently linked in this table;
// a stale iterator whose bucket is now empty would dereference a null head
// in the else branch — confirm callers only pass valid iterators.
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::erase(const iterator& it)
{
if (node* const p = it.cur) {
const size_type n = bkt_num(p->val); // bucket holding the node, recomputed from its value
node* cur = buckets[n];
if (cur == p) {
// The target is the head of the chain: advance the bucket head.
buckets[n] = cur->next;
delete_node(cur);
--num_elements;
}
else {
// Otherwise search the chain for p, keeping cur as the predecessor.
node* next = cur->next;
while (next) {
if (next == p) { // found: unlink and delete
cur->next = next->next;
delete_node(next);
--num_elements;
break; // an iterator designates exactly one node
}
else {
cur = next;
next = cur->next;
}
}
}
}
}
clear()
销毁并回收所有链表节点;buckets数组本身不释放,大小保持不变,只是每个指针都被置为0
// Deletes every node in every bucket and resets the element count to zero.
// The bucket vector keeps its size; each head is simply set back to null.
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::clear()
{
  const size_type bucket_total = buckets.size();
  for (size_type idx = 0; idx != bucket_total; ++idx) {
    // Release the chain front to back, saving the successor before each delete.
    for (node* victim = buckets[idx]; victim != 0; ) {
      node* const after = victim->next;
      delete_node(victim);
      victim = after;
    }
    buckets[idx] = 0;
  }
  num_elements = 0;
}