STL源码分析哈希表(hashtable)

最新推荐文章于 2022-05-24 09:34:58 发布

Mr.禾

最新推荐文章于 2022-05-24 09:34:58 发布

阅读量375

点赞数

分类专栏： STL 文章标签：链表指针数据结构 hashmap stl

本文链接：https://blog.csdn.net/qq_24447809/article/details/111327135

版权

STL 专栏收录该内容

18 篇文章 2 订阅

订阅专栏

源代码来自sgi-2.91版本stl_hashtable.h

哈希表概念

哈希表又叫做散列表，本质来说就是充分利用空间的一种思想。

（1）一块内存分为53块，并编号#0 - #52区号，存入的object也编号，5存入#5号，25存入#25号，59 % 53= 6 , 59号object也放入#6号，108%53= 2 , 108号object也放入#2号，那如果继续存入2号、55号object呢？它们会共占一块空间。

（2）为了解决空间碰撞，我们把#0-#52当成一个指针数组，每个#号指向一条链表，每次有重复号码进入时，就把它加入对应的链表，采用头插法。如图所示，55、2、108号object都在同一条链表上。因为链表是单向的（节点没有指向前一个节点的指针），所以hash表的迭代器不支持 -- 操作。

（3）“开链法” - 如果hash表中元素过多，即元素总数超过buckets的个数（这里是53）时，就会rehashing：重新分配一个更大的指针数组，新的区号是#0 - #96。

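rehashing标准：新容量大约是旧容量的两倍，2*53=106，在106附近找到质数97，所以hash表新容量为97。实际实现中并不临时计算，而是从预先准备好的质数表__stl_prime_list中取出不小于所需大小的第一个质数（见下文__stl_next_prime）。（用质数作为buckets个数，取余后元素通常分布得更均匀，可以减少空间碰撞）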

此时原先所有数据全部打散，一一存入新的区号指向的链表中。

在这里插入图片描述

hashtable 模板参数理解（HashFcn）、（ExtractKey）、（EqualKey）

// Forward declaration of the hash table template.
//   Value      - type of the data stored in each node
//   Key        - type of the key used to place and look up a node
//   HashFcn    - functor mapping a key to a hash code
//   ExtractKey - functor extracting the key from a stored value
//   EqualKey   - functor deciding whether two keys are equal
//   Alloc      - allocator type; defaults to alloc
template <class Value, class Key, class HashFcn,
          class ExtractKey, class EqualKey, class Alloc = alloc>
class hashtable;

在定义hashtable中传入了很多参数，下面来一一解释：
（1）Value - 节点的实际数据类型

（2）Key - 节点的键值，也就是对应的编号

（3）HashFcn - 是个仿函数，从传入的数据类型中获得编号，传入int型自然可以把值当编号，但如果传入一个字符串该如何计算编号呢？所以需要专门的转换函数来处理

（4）ExtractKey - 从节点中取出key键值的方法（函数或者仿函数）

（5）EqualKey - 判断键值相同与否的方法（函数或者仿函数）

（6）Alloc - 内存分配器，缺省使用alloc

HashFcn深入分析

看一个源代码stl_hash_fun.h中的版本

可以发现数值类型的，hash函数的处理都是返回原值，但char*类型的进行单独处理，__stl_hash_string见下面

template <class Key> struct hash { };		// primary template: deliberately empty, it defines no operator()

// Full specializations: each supported key type gets its own hashing rule.
// A char* key is hashed by the characters it points to, via
// __stl_hash_string, not by the pointer value.
__STL_TEMPLATE_NULL struct hash<char*>
{
  size_t operator()(const char* s) const
  {
    return __stl_hash_string(s);
  }
};

// A const char* key is hashed by the characters it points to, via
// __stl_hash_string, not by the pointer value.
__STL_TEMPLATE_NULL struct hash<const char*>
{
  size_t operator()(const char* s) const
  {
    return __stl_hash_string(s);
  }
};

// Integral keys hash to their own value, converted to size_t.
__STL_TEMPLATE_NULL struct hash<char> {
  size_t operator()(char x) const { return static_cast<size_t>(x); }
};

__STL_TEMPLATE_NULL struct hash<unsigned char> {
  size_t operator()(unsigned char x) const { return static_cast<size_t>(x); }
};

// NOTE(review): the parameter is unsigned char although this is the
// signed char specialization, so a negative key is converted to
// unsigned char before hashing. Kept as-is to preserve hash values.
__STL_TEMPLATE_NULL struct hash<signed char> {
  size_t operator()(unsigned char x) const { return static_cast<size_t>(x); }
};

__STL_TEMPLATE_NULL struct hash<short> {
  size_t operator()(short x) const { return static_cast<size_t>(x); }
};

__STL_TEMPLATE_NULL struct hash<unsigned short> {
  size_t operator()(unsigned short x) const { return static_cast<size_t>(x); }
};

__STL_TEMPLATE_NULL struct hash<int> {
  size_t operator()(int x) const { return static_cast<size_t>(x); }
};

__STL_TEMPLATE_NULL struct hash<unsigned int> {
  size_t operator()(unsigned int x) const { return static_cast<size_t>(x); }
};

__STL_TEMPLATE_NULL struct hash<long> {
  size_t operator()(long x) const { return static_cast<size_t>(x); }
};

__STL_TEMPLATE_NULL struct hash<unsigned long> {
  size_t operator()(unsigned long x) const { return static_cast<size_t>(x); }
};

__stl_hash_string函数：逐个字符计算 h = 5*h + c，例如"abc"的编号为5*('a'*5+'b')+'c'。编号应尽量分散，以减少不同字符串得到相同编号（碰撞）的概率

// Hashes a NUL-terminated C string by folding its characters into an
// accumulator: h = 5*h + c for each character c, starting from 0.
// The empty string therefore hashes to 0.
inline size_t __stl_hash_string(const char* s)
{
  unsigned long acc = 0;
  while (*s) {
    acc = 5*acc + *s;
    ++s;
  }
  return size_t(acc);		// the hash code
}

Hash调用，bkt_num函数过渡

下面这些包装函数最终会调用 hash(key) % n ，hash函数取编号，之后再对n取余后自然可以找到自己该放在哪个buckets中

// Bucket index of key in the current table: forwards to the two-argument
// overload with the current number of buckets.
size_type bkt_num_key(const key_type& key) const
  {
    const size_t bucket_total = buckets.size();
    return bkt_num_key(key, bucket_total);
  }

  // Bucket index of a stored value in the current table: extracts the
  // value's key with get_key and forwards to bkt_num_key.
  size_type bkt_num(const value_type& obj) const
  {
    return bkt_num_key(
        get_key(obj));
  }

  // Bucket index of key in a table of n buckets: the key's hash code
  // reduced modulo n.
  size_type bkt_num_key(const key_type& key, size_t n) const
  {
    return
      hash(key) % n;
  }

  // Bucket index of a stored value in a table of n buckets: extracts the
  // value's key with get_key and forwards to bkt_num_key.
  size_type bkt_num(const value_type& obj, size_t n) const
  {
    return bkt_num_key(
        get_key(obj), n);
  }

ExtractKey

一个典型函数来获取key键值，传入什么就返回什么

// Identity functor: hands back a const reference to its own argument.
template <class T>
struct identity : public unary_function<T, T> {
  const T& operator()(const T& x) const
  {
    return x;
  }
};

__stl_next_prime函数找质数

质数表

// Table of candidate bucket counts, in ascending order.
// Note: assumes long is at least 32 bits.
static const int __stl_num_primes = 28;
static const unsigned long __stl_prime_list[__stl_num_primes] =
{
  53,
  97,
  193,
  389,
  769,
  1543,
  3079,
  6151,
  12289,
  24593,
  49157,
  98317,
  196613,
  393241,
  786433,
  1572869,
  3145739,
  6291469,
  12582917,
  25165843,
  50331653,
  100663319,
  201326611,
  402653189,
  805306457,
  1610612741,
  3221225473ul,
  4294967291ul
};

在质数表中寻找不小于n的最小质数（若n大于表中最大值，则返回表中最大的质数）

// Returns the first entry of __stl_prime_list that is not less than n.
// When n is larger than every entry, the last (largest) entry is returned.
inline unsigned long __stl_next_prime(unsigned long n)
{
  const unsigned long* const table_begin = __stl_prime_list;
  const unsigned long* const table_end = table_begin + __stl_num_primes;
  // lower_bound on the ascending table finds the first value >= n.
  const unsigned long* const hit = lower_bound(table_begin, table_end, n);
  if (hit == table_end)
    return *(table_end - 1);
  return *hit;
}

__hashtable_iterator迭代器

template <class Value, class Key, class HashFcn,
          class ExtractKey, class EqualKey, class Alloc>
struct __hashtable_iterator {
  typedef hashtable<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>
          hashtable;
  typedef __hashtable_iterator<Value, Key, HashFcn, 
                               ExtractKey, EqualKey, Alloc>
          iterator;
  typedef __hashtable_const_iterator<Value, Key, HashFcn, 
                                     ExtractKey, EqualKey, Alloc>
          const_iterator;
  typedef __hashtable_node<Value> node;

  typedef forward_iterator_tag iterator_category;			//注意迭代器类型是前向迭代器，因为链表节点没有前向指针，无法重载--运算符
  typedef Value value_type;
  typedef ptrdiff_t difference_type;
  typedef size_t size_type;
  typedef Value& reference;
  typedef Value* pointer;

  node* cur;			//cur指向迭代器目前所指节点
  hashtable* ht;		//保持对容器的连接关系（走到链表尾部时需要跳到新的容器#区号去）

  __hashtable_iterator(node* n, hashtable* tab) : cur(n), ht(tab) {}
  __hashtable_iterator() {}
  reference operator*() const { return cur->val; }			//返回节点值
#ifndef __SGI_STL_NO_ARROW_OPERATOR
  pointer operator->() const { return &(operator*()); }
#endif /* __SGI_STL_NO_ARROW_OPERATOR */
  iterator& operator++();
  iterator operator++(int);
  bool operator==(const iterator& it) const { return cur == it.cur; }
  bool operator!=(const iterator& it) const { return cur != it.cur; }
};

迭代器operator++ 重载

// Pre-increment: advances to the next element and returns *this.
// Moves along the current bucket's list first; when that list is exhausted
// it scans the following buckets for the first non-empty one. If none
// exists, cur ends up null.
template <class V, class K, class HF, class ExK, class EqK, class A>
__hashtable_iterator<V, K, HF, ExK, EqK, A>&
__hashtable_iterator<V, K, HF, ExK, EqK, A>::operator++()
{
  const node* const prev = cur;		// remembered so its bucket can be recomputed
  cur = cur->next;
  if (cur)
    return *this;

  // End of this list: continue from the bucket after the one prev lives in.
  const size_type bucket_total = ht->buckets.size();
  for (size_type idx = ht->bkt_num(prev->val) + 1; idx < bucket_total; ++idx) {
    cur = ht->buckets[idx];		// null when this bucket holds nothing
    if (cur)
      break;
  }
  return *this;
}

__hashtable_node 链表节点定义

// Node of a bucket's singly linked list.
template <class Value>
struct __hashtable_node
{
  __hashtable_node* next;	// following node in the same list
  Value val;			// the stored element
};

new_node() 、delete_node()

新建链表节点或回收节点

// Allocates a node through node_allocator, sets its next pointer to 0 and
// constructs its value from obj. Returns the new node.
node* new_node(const value_type& obj)
  {
    node* n = node_allocator::allocate();		// raw storage for one node
    n->next = 0;
    __STL_TRY {
      construct(&n->val, obj);		// construct the value in place from obj
      return n;
    }
    // cleanup expression for the __STL_TRY block: give the storage back
    __STL_UNWIND(node_allocator::deallocate(n));
  }
  
  // Destroys the value held by node n, then returns the node's storage to
  // node_allocator.
  void delete_node(node* n)
  {
    destroy(&n->val);		// run the value's destructor
    node_allocator::deallocate(n);	// release the node's memory
  }

hashtable类

// Hash table built from a vector of bucket heads, each pointing at a singly
// linked list of nodes. (Only the opening part of the class is shown here.)
template <class Value, class Key, class HashFcn,
          class ExtractKey, class EqualKey,
          class Alloc>
class hashtable {
public:
// Conventional names for the template parameters
  typedef Key key_type;
  typedef Value value_type;
  typedef HashFcn hasher;
  typedef EqualKey key_equal;

  typedef size_t            size_type;
  typedef ptrdiff_t         difference_type;
  typedef value_type*       pointer;
  typedef const value_type* const_pointer;
  typedef value_type&       reference;
  typedef const value_type& const_reference;

  hasher hash_funct() const { return hash; }
  key_equal key_eq() const { return equals; }

private:
  hasher hash;			// functor producing a hash code from a key
  key_equal equals;		// functor comparing two keys
  ExtractKey get_key;	// functor extracting the key from a stored value

  typedef __hashtable_node<Value> node;			// linked-list node
  typedef simple_alloc<node, Alloc> node_allocator;		// allocator used for nodes

  vector<node*,Alloc> buckets;			// bucket array: each entry is the head of one list
  size_type num_elements;

public:
  typedef __hashtable_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, 
                               Alloc>
  iterator;		// mutable iterator type

  typedef __hashtable_const_iterator<Value, Key, HashFcn, ExtractKey, EqualKey,
                                     Alloc>
  const_iterator;

  friend struct
  __hashtable_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>;
  friend struct
  __hashtable_const_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>;

initialize_buckets函数

// Rounds a requested bucket count n by delegating to __stl_next_prime.
size_type next_size(size_type n) const
{
  return __stl_next_prime(n);
}

// Sets up the bucket array for a requested size n: the size is rounded by
// next_size, that many null list heads are appended to buckets, and the
// element count is reset to 0.
void initialize_buckets(size_type n)
  {
    const size_type bucket_total = next_size(n);
    buckets.reserve(bucket_total);		// allocate the storage up front
    buckets.insert(buckets.end(), bucket_total, static_cast<node*>(0));	// bucket_total null heads
    num_elements = 0;
  }

hash构造函数

// Constructs an empty table from a requested bucket count n, a hash functor,
// a key-equality functor and a key-extraction functor.
hashtable(size_type n,
            const HashFcn&    hf,
            const EqualKey&   eql,
            const ExtractKey& ext)
    : hash(hf), equals(eql), get_key(ext), num_elements(0)
  {
    initialize_buckets(n);		// build the bucket array
  }

  // Same as the four-argument constructor, but with a default-constructed
  // ExtractKey.
  hashtable(size_type n,
            const HashFcn&    hf,
            const EqualKey&   eql)
    : hash(hf), equals(eql), get_key(ExtractKey()), num_elements(0)
  {
    initialize_buckets(n);		// build the bucket array
  }

  // Copy constructor: copies the three functors from ht, then lets
  // copy_from duplicate its contents.
  hashtable(const hashtable& ht)
    : hash(ht.hash), equals(ht.equals), get_key(ht.get_key), num_elements(0)
  {
    copy_from(ht);		// copy the contents of ht
  }

  // Copy assignment. Self-assignment is a no-op; otherwise the current
  // nodes are released with clear(), the functors are copied from ht and
  // copy_from duplicates its contents. Returns *this.
  hashtable& operator= (const hashtable& ht)
  {
    if (&ht == this)
      return *this;

    clear();
    hash = ht.hash;
    equals = ht.equals;
    get_key = ht.get_key;
    copy_from(ht);
    return *this;
  }

  // Destructor: releases every node through clear().
  ~hashtable()
  {
    clear();
  }

size()

// Number of stored elements, as tracked by num_elements (updated by the
// insert, erase and clear operations).
size_type size() const
{
  return num_elements;
}

empty()

// True when the table holds no elements.
bool empty() const
{
  return 0 == size();
}

begin() end()

__hashtable_iterator(node* n, hashtable* tab) : cur(n), ht(tab) {}  // iterator constructor: stores the node and the owning table

// Iterator to the first element: the head of the lowest-indexed non-empty
// bucket, or end() when every bucket is empty.
iterator begin()
  { 
    const size_type bucket_total = buckets.size();
    size_type idx = 0;
    while (idx < bucket_total && !buckets[idx])
      ++idx;
    if (idx == bucket_total)
      return end();
    // The iterator keeps a pointer to this table so it can later advance
    // across buckets.
    return iterator(buckets[idx], this);
  }

  // Past-the-end iterator: a null node paired with this table.
  iterator end()
  {
    return iterator(0, this);
  }

// Const iterator to the first element: the head of the lowest-indexed
// non-empty bucket, or end() when every bucket is empty.
const_iterator begin() const
  {
    const size_type bucket_total = buckets.size();
    size_type idx = 0;
    while (idx < bucket_total && !buckets[idx])
      ++idx;
    if (idx == bucket_total)
      return end();
    return const_iterator(buckets[idx], this);
  }

  // Past-the-end const iterator: a null node paired with this table.
  const_iterator end() const
  {
    return const_iterator(0, this);
  }

bucket_count()

返回buckets数组中指针个数

// Number of buckets, i.e. the length of the buckets vector.
size_type bucket_count() const
{
  return buckets.size();
}

elems_in_bucket()

buckets[x],找出x号对应的链表挂了多少个元素

// Number of nodes in the list hanging off buckets[bucket].
// The index is used as given; no range check is performed.
size_type elems_in_bucket(size_type bucket) const
  {
    size_type length = 0;
    node* walker = buckets[bucket];
    while (walker) {		// walk the list to its end
      ++length;
      walker = walker->next;
    }
    return length;
  }

insert_unique()

// Inserts obj unless an element with an equal key already exists.
// First gives resize() the chance to grow the table for one more element,
// then delegates to insert_unique_noresize and returns its result.
pair<iterator, bool> insert_unique(const value_type& obj)
  {
    const size_type needed = num_elements + 1;
    resize(needed);
    return insert_unique_noresize(obj);
  }

resize()

判断是否需要新建表

// Rounds a requested bucket count n by delegating to __stl_next_prime.
size_type next_size(size_type n) const
{
  return __stl_next_prime(n);
}

// Grows the bucket array when num_elements_hint exceeds the current number
// of buckets, relinking every existing node into its bucket in the larger
// array. Nothing happens when the hint fits, or when next_size does not
// yield a larger size.
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::resize(size_type num_elements_hint)
{
  const size_type old_n = buckets.size();	// current number of buckets
  if (num_elements_hint > old_n) {
    const size_type n = next_size(num_elements_hint);	// number of buckets for the new array
    if (n > old_n) {
      vector<node*, A> tmp(n, (node*) 0);		// new bucket array holding n null heads
      __STL_TRY {
      	// visit every bucket of the old array
        for (size_type bucket = 0; bucket < old_n; ++bucket) {
          node* first = buckets[bucket];
          // move every node of this bucket's list
          while (first) {
            size_type new_bucket = bkt_num(first->val, n);		// the node's bucket index for n buckets
            buckets[bucket] = first->next;		// detach the node from the old list
            first->next = tmp[new_bucket];		// push it onto the front of its new list
            tmp[new_bucket] = first;
            first = buckets[bucket];          // next node still to be moved
          }
        }
        buckets.swap(tmp);		// install the new array; tmp now holds the old, emptied one
      }
#         ifdef __STL_USE_EXCEPTIONS
      catch(...) {		// on failure: delete the nodes already moved into tmp, then rethrow
        for (size_type bucket = 0; bucket < tmp.size(); ++bucket) {
          while (tmp[bucket]) {
            node* next = tmp[bucket]->next;
            delete_node(tmp[bucket]);
            tmp[bucket] = next;
          }
        }
        throw;
      }
#         endif /* __STL_USE_EXCEPTIONS */
    }
  }
}

insert_unique_noresize

在hash表中插入新元素，键值key不允许重复

// Inserts obj without resizing the table, refusing duplicate keys.
// Returns (iterator to the existing element, false) when an element with an
// equal key is already in obj's bucket; otherwise links a new node at the
// front of that bucket and returns (iterator to it, true).
template <class V, class K, class HF, class Ex, class Eq, class A>
pair<typename hashtable<V, K, HF, Ex, Eq, A>::iterator, bool> 
hashtable<V, K, HF, Ex, Eq, A>::insert_unique_noresize(const value_type& obj)
{
  const size_type slot = bkt_num(obj);		// bucket obj belongs to
  node* const head = buckets[slot];

  // Look for an element whose key equals obj's key.
  node* walker = head;
  while (walker) {
    if (equals(get_key(walker->val), get_key(obj)))
      return pair<iterator, bool>(iterator(walker, this), false);
    walker = walker->next;
  }

  // No duplicate found: push a new node onto the front of the list.
  node* const fresh = new_node(obj);
  fresh->next = head;
  buckets[slot] = fresh;
  ++num_elements;
  return pair<iterator, bool>(iterator(fresh, this), true);
}

find()

在hash表中找元素，根据键值查找

// Looks up key in its bucket. Returns an iterator to the first node whose
// key compares equal, or an iterator with a null node when there is none.
iterator find(const key_type& key) 
  {
    const size_type slot = bkt_num_key(key);		// bucket the key maps to
    node* walker = buckets[slot];
    while (walker && !equals(get_key(walker->val), key))
      walker = walker->next;
    return iterator(walker, this);
  }

count()

返回键值等于key的元素个数

// Number of elements in key's bucket whose key compares equal to key.
size_type count(const key_type& key) const
  {
    const size_type slot = bkt_num_key(key);		// bucket the key maps to
    size_type matches = 0;
    const node* walker = buckets[slot];
    while (walker) {
      if (equals(get_key(walker->val), key))
        ++matches;
      walker = walker->next;
    }
    return matches;
  }

erase()

删除操作

// Removes every element whose key compares equal to key and returns how
// many were removed. Nodes after the bucket head are handled first; the
// head is checked last, so it stays valid as an anchor during the walk.
template <class V, class K, class HF, class Ex, class Eq, class A>
typename hashtable<V, K, HF, Ex, Eq, A>::size_type
hashtable<V, K, HF, Ex, Eq, A>::erase(const key_type& key)
{
  const size_type slot = bkt_num_key(key);		// bucket the key maps to
  node* const head = buckets[slot];
  size_type erased = 0;
  if (!head)
    return erased;

  // prev always points at a surviving node; victim is the node after it.
  node* prev = head;
  for (node* victim = prev->next; victim; victim = prev->next) {
    if (equals(get_key(victim->val), key)) {
      prev->next = victim->next;		// unlink, then destroy
      delete_node(victim);
      ++erased;
      --num_elements;
    }
    else {
      prev = victim;
    }
  }

  // Finally the head itself.
  if (equals(get_key(head->val), key)) {
    buckets[slot] = head->next;
    delete_node(head);
    ++erased;
    --num_elements;
  }
  return erased;
}

// Removes the single element the iterator refers to. An iterator with a
// null node is ignored. The node's bucket is recomputed from its value and
// the list is searched for that exact node.
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::erase(const iterator& it)
{
  node* const target = it.cur;
  if (!target)
    return;

  const size_type slot = bkt_num(target->val);		// bucket holding the node
  node* prev = buckets[slot];

  // The node is the bucket head: the bucket now starts at its successor.
  if (prev == target) {
    buckets[slot] = prev->next;
    delete_node(prev);
    --num_elements;
    return;
  }

  // Otherwise find the node's predecessor and unlink it; at most one node
  // is removed.
  for (node* walker = prev->next; walker; prev = walker, walker = walker->next) {
    if (walker == target) {
      prev->next = walker->next;
      delete_node(walker);
      --num_elements;
      return;
    }
  }
}

clear()

删除所有节点并回收其内存（buckets数组本身保留，各项被置为0）

// Destroys every node in every bucket, resets each bucket head to 0 and
// sets the element count to 0. The buckets vector keeps its size.
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::clear()
{
  const size_type bucket_total = buckets.size();
  for (size_type idx = 0; idx < bucket_total; ++idx) {
    // Save each successor before the node is destroyed.
    for (node* victim = buckets[idx]; victim != 0; ) {
      node* const after = victim->next;
      delete_node(victim);		// destroys the value and frees the node
      victim = after;
    }
    buckets[idx] = 0;
  }
  num_elements = 0;
}

Mr.禾

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
STL源码分析哈希表(hashtable)

源代码来自sgi-2.91版本stl_hashtable.h哈希表概念哈希表又叫做散列表，本质来说就是充分利用空间的一种思想。（1）一块内存分为53块，并编号#0 - #52区号，存入的object也编号，5存入#5号，25存入#25号，59%53= 6,59号object也放入#6号，108%53= 2,108号object也放入#2号，那如果继续存入2号、55号object呢？它们会共占一块空间。为了解决空间碰撞，我们把#0-#52当成一个指针数组，每个#号指向一块链表，每次有重复号码进入时，就
复制链接

扫一扫