深入哈希结构

// Default hash functor: turns a key into a size_t by plain value conversion.
// The C-style cast is kept on purpose so that any key type explicitly
// convertible to size_t (integers, enums, pointers) keeps working.
// operator() is const so the functor can be invoked through a const object.
template<class K>
struct hash {
	size_t operator()(const K& key) const {
		return (size_t)key;
	}
};

// Specialization for strings: a string cannot be cast to an integer, so the
// characters are folded into one value with the BKDR scheme
// (multiply the running value by 131, then add the next character).
// Returns 0 for the empty string.
template<>
struct hash<std::string> {
	size_t operator()(const std::string& key) const {
		size_t sum = 0;
		for (const char ch : key) {
			sum = sum * 131 + ch;
		}
		return sum;
	}
};

通过提供模板仿函数，利用仿函数处理key后即可得到整型类型的数据。

具体可参考: 各种字符串Hash函数 - clq - 博客园 (cnblogs.com)

四、通过闭散列解决哈希冲突

4.1 闭散列概念

即开放定址法。当发生哈希冲突时，若哈希表未被装满，说明在哈希表中必然存在空位置，那么可将key存放到冲突位置中的"下一个"空位置中去。

4.2 基础操作

4.2.1 插入操作

1. 通过哈希函数获取待插入元素在哈希表中的位置

2. 若该位置中没有元素则直接插入新元素，若该位置中有元素发生哈希冲突，使用线性探测或者二次探测找到下一个空位置，插入新元素

4.2.2 删除操作

采用闭散列处理哈希冲突时，不能随便物理删除哈希表中已有的元素，若直接删除元素会影响其他元素的搜索。因此线性探测采用标记的伪删除法来删除一个元素。

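4.2.3 扩容机制

散列表的载荷因子 α = 表中有效元素个数 / 散列表长度。α 越大，发生哈希冲突的概率越高；因此当 α 达到阈值时需要扩容，并把旧表中的有效元素重新映射到新表中（下文线性探测的实现取阈值 0.7，二次探测的实现取阈值 0.5）。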

4.3 线性探测

从发生冲突的位置开始，依次向后探测，直到寻找到下一个空位置为止

优点: 简单且易于实现

缺点: 一旦发生哈希冲突，所有的冲突连在一起，容易产生数据"堆积"，即：不同关键码占据了可利用的空位置，使得寻找某关键码的位置需要许多次比较，导致搜索效率降低。

// Closed hashing (open addressing) hash table that resolves collisions with
// linear probing.
namespace CloseHash {
	// State of one slot. DELETE is a tombstone: the slot held an element that
	// was erased. Lookups must keep probing past it, insertions may reuse it.
	enum State {
		EMPTY,
		EXIST,
		DELETE
	};

	// One slot of the table: the stored key/value pair plus its state.
	template<class K, class V>
	struct HashData {
		std::pair<K, V> _kv;
		State _state = EMPTY;
	};

	// Default hash functor: plain value conversion of the key to size_t.
	template<class K>
	struct hash {
		size_t operator()(const K& key) const {
			return (size_t)key;
		}
	};

	// String specialization using the BKDR scheme (multiplier 131).
	template<>
	struct hash<std::string> {
		size_t operator()(const std::string& key) const {
			size_t sum = 0;
			for (const char ch : key) {
				sum = sum * 131 + ch;
			}
			return sum;
		}
	};

	// Hash table with unique keys.
	//   K    - key type, compared with operator==
	//   V    - mapped type
	//   Hash - functor mapping a K to size_t
	template<class K, class V, class Hash = hash<K>>
	class HashTable
	{
	public:
		// Inserts kv. Returns false (and changes nothing) when the key is
		// already present, true otherwise.
		bool insert(const std::pair<K, V>& kv) {
			if (find(kv.first) != nullptr) return false; // keys are unique

			// Grow when the table is empty or the load factor reaches 0.7.
			// The first table has 10 slots; later tables double in size.
			if (_table.empty() || 10 * _size / _table.size() >= 7) {
				const size_t newSize = _table.empty() ? 10 : _table.size() * 2;
				HashTable<K, V, Hash> new_table;
				new_table._table.resize(newSize);
				// Re-insert live elements only; tombstones are dropped here.
				for (const auto& slot : _table) {
					if (slot._state == EXIST) {
						new_table.insert(slot._kv);
					}
				}
				// Only the storage is taken over; _size already counts the
				// live elements of this table.
				_table.swap(new_table._table);
			}

			Hash hasher;
			size_t index = hasher(kv.first) % _table.size();
			// Linear probing: step forward (with wrap-around) to the first
			// slot that is not occupied. The load-factor check above keeps at
			// least one such slot available.
			while (_table[index]._state == EXIST) {
				index = (index + 1) % _table.size();
			}
			_table[index]._kv = kv;
			_table[index]._state = EXIST;
			++_size;
			return true;
		}

		// Removes the element with the given key by marking its slot as a
		// tombstone. Returns false when the key is not present.
		bool erase(const K& key) {
			HashData<K, V>* slot = find(key);
			if (slot == nullptr) {
				return false;
			}
			slot->_state = DELETE;
			--_size;
			return true;
		}

		// Returns a pointer to the slot holding key, or nullptr when absent.
		HashData<K, V>* find(const K& key) {
			if (_table.empty()) return nullptr;
			Hash hasher;
			const size_t start = hasher(key) % _table.size();
			size_t index = start;
			while (_table[index]._state != EMPTY) {
				if (_table[index]._state == EXIST && _table[index]._kv.first == key) {
					return &_table[index];
				}
				index = (index + 1) % _table.size();
				// A table holding only EXIST and DELETE slots has no EMPTY
				// slot to stop on; stop after one full lap instead.
				if (index == start) {
					break;
				}
			}
			return nullptr;
		}

	private:
		std::vector<HashData<K, V>> _table;
		size_t _size = 0; // number of live (EXIST) elements
	};
}

4.4 二次探测

线性探测的缺陷是产生冲突的数据堆积在一块，这与其找下一个空位置的方法有关(逐个往后去找)。
因此二次探测为了避免该问题，找下一个空位置的方法为: $H_i = (H_0 + i^2) \bmod m$，或者: $H_i = (H_0 - i^2) \bmod m$。其中: $i = 0, 1, 2, 3, \dots$（这里的 $i^2$ 表示 i 的平方，而不是 C++ 中的按位异或运算符 `^`）
H_0是通过散列函数Hash(x)对元素的关键码 key 进行计算得到的位置，m是表的大小。

// Closed hashing (open addressing) hash table that resolves collisions with
// quadratic probing: the i-th probe inspects slot (H0 + i*i) % m.
namespace CloseHash {
// NOTE(review): LINEAR is not referenced anywhere in this block; presumably a
// leftover switch between probing strategies. It is kept so the preprocessor
// state seen by later code does not change.
#define LINEAR
	// State of one slot. DELETE is a tombstone: the slot held an element that
	// was erased. Lookups must keep probing past it, insertions may reuse it.
	enum State {
		EMPTY,
		EXIST,
		DELETE
	};

	// One slot of the table: the stored key/value pair plus its state.
	template<class K, class V>
	struct HashData {
		std::pair<K, V> _kv;
		State _state = EMPTY;
	};

	// Default hash functor: plain value conversion of the key to size_t.
	template<class K>
	struct hash {
		size_t operator()(const K& key) const {
			return (size_t)key;
		}
	};

	// String specialization using the BKDR scheme (multiplier 131).
	template<>
	struct hash<std::string> {
		size_t operator()(const std::string& key) const {
			size_t sum = 0;
			for (const char ch : key) {
				sum = sum * 131 + ch;
			}
			return sum;
		}
	};

	// Hash table with unique keys.
	//   K    - key type, compared with operator==
	//   V    - mapped type
	//   Hash - functor mapping a K to size_t
	template<class K, class V, class Hash = hash<K>>
	class HashTable
	{
	public:
		// Inserts kv. Returns false (and changes nothing) when the key is
		// already present. Also returns false if no free slot is reachable,
		// which can only happen once the table can no longer grow.
		bool insert(const std::pair<K, V>& kv) {
			if (find(kv.first) != nullptr) return false; // keys are unique

			// Grow when the table is empty or the load factor reaches 0.5.
			// Sizes are primes: with a prime size m the probes i = 0..(m-1)/2
			// hit distinct slots, so a table less than half full always
			// offers a free slot. With sizes such as 20 the squares mod m
			// take only a few values and the probe loop may never find one.
			if (_table.empty() || 10 * _size / _table.size() >= 5) {
				const size_t newSize = next_prime(_table.size());
				if (newSize > _table.size()) {
					HashTable<K, V, Hash> new_table;
					new_table._table.resize(newSize);
					// Re-insert live elements only; tombstones are dropped.
					for (const auto& slot : _table) {
						if (slot._state == EXIST) {
							new_table.insert(slot._kv);
						}
					}
					// Only the storage is taken over; _size already counts
					// the live elements of this table.
					_table.swap(new_table._table);
				}
			}

			Hash hasher;
			const size_t capacity = _table.size();
			const size_t start = hasher(kv.first) % capacity;
			// Bounded quadratic probing: after `capacity` probes every slot
			// reachable from `start` has been inspected.
			for (size_t i = 0; i < capacity; ++i) {
				HashData<K, V>& slot = _table[(start + i * i) % capacity];
				if (slot._state != EXIST) {
					slot._kv = kv;
					slot._state = EXIST;
					++_size;
					return true;
				}
			}
			return false;
		}

		// Removes the element with the given key by marking its slot as a
		// tombstone. Returns false when the key is not present.
		bool erase(const K& key) {
			HashData<K, V>* slot = find(key);
			if (slot == nullptr) {
				return false;
			}
			slot->_state = DELETE;
			--_size;
			return true;
		}

		// Returns a pointer to the slot holding key, or nullptr when absent.
		HashData<K, V>* find(const K& key) {
			if (_table.empty()) return nullptr;
			Hash hasher;
			const size_t capacity = _table.size();
			const size_t start = hasher(key) % capacity;
			for (size_t i = 0; i < capacity; ++i) {
				HashData<K, V>& slot = _table[(start + i * i) % capacity];
				// Only an EMPTY slot ends the probe chain. A tombstone must be
				// skipped, otherwise keys stored behind an erased element
				// become unreachable.
				if (slot._state == EMPTY) {
					break;
				}
				if (slot._state == EXIST && slot._kv.first == key) {
					return &slot;
				}
			}
			return nullptr;
		}

	private:
		// Returns the smallest prime in the table below that is greater than
		// n, or the largest entry when n is already at or beyond it. Each
		// entry is roughly twice the previous one.
		static size_t next_prime(size_t n) {
			static const size_t primes[] =
			{
				53,         97,         193,       389,       769,
				1543,       3079,       6151,      12289,     24593,
				49157,      98317,      196613,    393241,    786433,
				1572869,    3145739,    6291469,   12582917,  25165843,
				50331653,   100663319,  201326611, 402653189, 805306457,
				1610612741, 3221225473, 4294967291
			};
			const size_t count = sizeof(primes) / sizeof(primes[0]);
			for (size_t i = 0; i < count; ++i) {
				if (primes[i] > n) return primes[i];
			}
			return primes[count - 1];
		}

		std::vector<HashData<K, V>> _table;
		size_t _size = 0; // number of live (EXIST) elements
	};
}

研究表明: 当表的长度为质数且表装载因子a不超过0.5时，新的表项一定能够插入，而且任何一个位置都不会被探查两次。
因此只要表中有一半的空位置，在搜索时可以不考虑表装满的情况，但在插入时必须确保表的装载因子a不超过0.5，若超出必须考虑增容。

五、通过开散列解决哈希冲突

5.1 概念

又称链地址法(开链法)，首先对关键码集合用散列函数计算散列地址，具有相同地址的关键码归于同一子集合。每一个子集合称为一个桶，各个桶中的元素通过一个单链表链接起来，各链表的头结点存储在哈希表中。

开散列中每个桶中放的都是发生哈希冲突的元素

5.2 扩容机制

桶的个数是一定的，随着元素的不断插入，每个桶中元素的个数不断增多。极端情况下，可能会导致一个桶中链表节点非常多，会影响哈希表的性能，因此在一定条件下需要对哈希表进行增容。那该条件怎么确认呢？开散列最好的情况是：每个哈希桶中刚好挂一个节点，再继续插入元素时，每一次都会发生哈希冲突，因此，在元素个数刚好等于桶的个数时，可以给哈希表增容。

但是该如何扩容呢？采用除留余数法的情况下，除数(即哈希表的长度)最好为质数，且每次扩容最好近似之前的两倍大小。这里采用SGI版本的方案(开散列、闭散列都可以使用该种扩容方式):

// Table of candidate bucket counts (the SGI scheme quoted in the text above):
// 28 primes in ascending order, each roughly twice the one before it.
static const int __stl_num_primes = 28;
static const unsigned long __stl_prime_list[__stl_num_primes] =
{
	53ul,         97ul,         193ul,        389ul,
	769ul,        1543ul,       3079ul,       6151ul,
	12289ul,      24593ul,      49157ul,      98317ul,
	196613ul,     393241ul,     786433ul,     1572869ul,
	3145739ul,    6291469ul,    12582917ul,   25165843ul,
	50331653ul,   100663319ul,  201326611ul,  402653189ul,
	805306457ul,  1610612741ul, 3221225473ul, 4294967291ul
};

5.3 完整代码

// Open hashing (separate chaining) hash table: every bucket is the head of a
// singly linked list holding the elements whose keys map to that bucket.
namespace OpenHash {
	// Default hash functor: plain value conversion of the key to size_t.
	template<class K>
	struct hash {
		size_t operator()(const K& key) const {
			return (size_t)key;
		}
	};

	// String specialization using the BKDR scheme (multiplier 131).
	template<>
	struct hash<std::string> {
		size_t operator()(const std::string& key) const {
			size_t sum = 0;
			for (const char ch : key) {
				sum = sum * 131 + ch;
			}
			return sum;
		}
	};

	// Node of a bucket's linked list.
	template<class K, class V>
	struct HashNode {
		HashNode() = default;
		HashNode(const std::pair<K, V>& kv) :_kv(kv), _next(nullptr) {}
		std::pair<K, V> _kv;
		// Initialised here so a default-constructed node is a valid list tail.
		HashNode<K, V>* _next = nullptr;
	};

	// Hash table with unique keys.
	//   K    - key type, compared with operator==
	//   V    - mapped type
	//   Hash - functor mapping a K to size_t
	// The table owns its nodes: they are allocated in insert() and released
	// in erase() and in the destructor.
	template<class K, class V, class Hash = hash<K>>
	class HashBucket
	{
		typedef HashNode<K, V> Node;

		// Returns the smallest prime in the table below that is greater than
		// n, or the largest entry when n is already at or beyond it (the
		// table then simply stops growing instead of requesting an
		// impossible bucket count).
		static size_t next_prime(size_t n)
		{
			static const size_t primes[] =
			{
			  53,         97,         193,       389,       769,
			  1543,       3079,       6151,      12289,     24593,
			  49157,      98317,      196613,    393241,    786433,
			  1572869,    3145739,    6291469,   12582917,  25165843,
			  50331653,   100663319,  201326611, 402653189, 805306457,
			  1610612741, 3221225473, 4294967291
			};
			const size_t count = sizeof(primes) / sizeof(primes[0]);
			for (size_t i = 0; i < count; ++i) {
				if (primes[i] > n) return primes[i];
			}
			return primes[count - 1];
		}

		// Exchanges the contents of two tables without copying any node.
		void swap(HashBucket& other) {
			_table.swap(other._table);
			std::swap(_size, other._size);
		}
	public:
		HashBucket() = default;

		// Deep copy. The class frees its nodes in the destructor, so a
		// member-wise copy would make two tables delete the same nodes.
		// The copy is built in a temporary so that an exception thrown
		// half-way releases the nodes created so far.
		HashBucket(const HashBucket& other) {
			HashBucket tmp;
			tmp._table.resize(other._table.size(), nullptr);
			for (size_t i = 0; i < other._table.size(); ++i) {
				Node** tail = &tmp._table[i]; // append to keep chain order
				for (const Node* cur = other._table[i]; cur != nullptr; cur = cur->_next) {
					*tail = new Node(cur->_kv);
					tail = &(*tail)->_next;
					++tmp._size;
				}
			}
			swap(tmp);
		}

		// Copy-and-swap assignment: `other` is already a private copy, and
		// the previous contents are released when it goes out of scope.
		HashBucket& operator=(HashBucket other) {
			swap(other);
			return *this;
		}

		// Inserts kv. Returns false (and changes nothing) when the key is
		// already present, true otherwise.
		bool insert(const std::pair<K, V>& kv) {
			Hash hasher;
			if (find(kv.first) != nullptr) return false; // keys are unique

			// Grow when the table is empty or the load factor reaches 1.
			if (_table.empty() || _size == _table.size()) {
				const size_t newCount = next_prime(_table.size());
				if (newCount > _table.size()) {
					std::vector<Node*> new_table(newCount, nullptr);
					// Relink the existing nodes into the new buckets instead
					// of allocating new ones.
					for (size_t i = 0; i < _table.size(); ++i) {
						Node* cur = _table[i];
						while (cur != nullptr) {
							Node* next = cur->_next;
							const size_t target = hasher(cur->_kv.first) % new_table.size();
							cur->_next = new_table[target]; // push front
							new_table[target] = cur;
							cur = next;
						}
						_table[i] = nullptr;
					}
					_table.swap(new_table);
				}
			}

			const size_t hashi = hasher(kv.first) % _table.size();
			Node* newNode = new Node(kv);
			newNode->_next = _table[hashi]; // push front
			_table[hashi] = newNode;
			++_size;
			return true;
		}

		// Unlinks and frees the node with the given key.
		// Returns false when the key is not present.
		bool erase(const K& key) {
			Hash hasher;

			if (_table.empty()) return false;
			const size_t hashi = hasher(key) % _table.size();
			Node* prev = nullptr;
			for (Node* cur = _table[hashi]; cur != nullptr; prev = cur, cur = cur->_next) {
				if (cur->_kv.first == key) {
					if (prev == nullptr) { // removing the head of the chain
						_table[hashi] = cur->_next;
					}
					else {
						prev->_next = cur->_next;
					}
					delete cur;
					--_size;
					return true;
				}
			}
			return false;
		}

		// Returns the node holding key, or nullptr when absent.
		Node* find(const K& key) const {
			Hash hasher;
			if (_table.empty()) return nullptr;
			const size_t hashi = hasher(key) % _table.size();
			for (Node* cur = _table[hashi]; cur != nullptr; cur = cur->_next) {
				if (cur->_kv.first == key) {
					return cur;
				}
			}
			return nullptr;
		}

		// Frees every node of every bucket.
		~HashBucket() {
			for (size_t i = 0; i < _table.size(); ++i) {
				Node* cur = _table[i];
				while (cur != nullptr) {
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				_table[i] = nullptr;
			}
		}

		// Number of stored elements.
		size_t size() const { return _size; }

		// Length of the bucket array (empty buckets included).
		size_t table_size() const
		{
			return _table.size();
		}

		// Number of non-empty buckets.
		size_t bucket_num() const {
			size_t num = 0;
			for (size_t i = 0; i < _table.size(); ++i) {
				if (_table[i]) {
					++num;
				}
			}
			return num;
		}

		// Length of the longest chain; 0 for a table without elements.
		size_t max_bucket_length() const {
			size_t maxLen = 0;
			for (size_t i = 0; i < _table.size(); ++i) {
				size_t len = 0;
				for (const Node* cur = _table[i]; cur != nullptr; cur = cur->_next) {
					++len;
				}
				if (len > maxLen) maxLen = len;
			}
			return maxLen;
		}

	private:
		std::vector<Node*> _table;
		size_t _size = 0;
	};

}