【C++】哈希表的开散列、闭散列的代码实现

最新推荐文章于 2024-09-19 13:36:34 发布

TTang-sq

最新推荐文章于 2024-09-19 13:36:34 发布

阅读量46

点赞数 1

分类专栏： C++ 文章标签： c++ 散列表哈希算法

本文链接：https://blog.csdn.net/m0_67470729/article/details/131605563

版权

C++ 专栏收录该内容

25 篇文章 2 订阅

订阅专栏

0. 概念

哈希表，又名 散列表

哈希（Hash）是一种方法：将储存元素的 key 和 存储位置 建立映射（关联）关系

建立映射关系的方法具体有：

1. 直接定址法

适用：范围比较集中，每个数据分配一个唯一位置

2. 除留余数法

适用：范围不集中，分布分散

hashi = key % 10; 存储位置由 key 对表长取模得到，因此不同的 key 可能映射到同一个位置（哈希冲突/碰撞）

如何解决哈希冲突问题？

🎯闭散列，又名开放定址法

在自己的开放空间找下一个位置

a. 线性探测，直接找下一个位置。比较暴力，一些相邻聚集的位置会连续冲突，可能形成“踩踏”

b. 二次探测，第 i 次冲突后探测的位置为 (hashi + i²) % 表的长度。可以缓解线性探测的踩踏

闭散列本质还是一个零和游戏，冲突的元素总归是占了别人的位置

🎯开散列，又名链地址法 / 开链法 / 拉链法 / 哈希桶

首先对关键码集合，用散列函数计算散列地址，具有相同地址的关键码归于同一子集合，每一个子集合称为一个桶，各个桶中的元素通过一个单链表链接起来，各链表的头结点存储在哈希表中。

1. 建立哈希表用开放定址法解决冲突

分析【线性探测】：冲突时找下一个位置

查找

从映射位置开始找，直到空结束

删除

删除后的位置怎么处理？只能单独做一个标识符，给数值和 NULL 都不对

解决：每个存储位置的状态标识【空、存在、删除】

扩容

哈希表在什么情况下扩容？怎么扩？

解决：设置负载因子 / 载荷因子（反映表存储数据量的程度、百分比），需要控制在0.7~0.8以下

ps：扩容扩的是 size，扩 capacity 没用，没被size 覆盖到的地方 [] 会报非法访问（vector 的特性）。

pps：且会导致映射关系变化，需要单独处理。

namespace OpenAddress
{
	// Occupancy state of a single slot in the closed-hashing table.
	enum State
	{
		EMPTY,		// slot has never been used; a probe sequence stops here
		EXIST,		// slot currently holds a live element
		DELETE		// tombstone left by Erase; probing has to continue past it
	};

	// One table slot: the stored key/value pair together with its state.
	template<class K, class V>
	struct HashData
	{
		std::pair<K, V> _kv;
		State _state = EMPTY;
	};

	// Hash table that resolves collisions by open addressing with linear probing.
	// The slot of a key is `key % table size`, so K must be usable as the left
	// operand of `%` with a size_t (integral keys) and must support operator==.
	template<class K, class V>
	class HashTable
	{
	public:

		// Inserts kv.
		// Returns false and leaves the table unchanged when the key is already
		// stored; returns true after the pair has been stored.
		bool Insert(const std::pair<K, V>& kv)
		{
			// Reject duplicates.
			if (Find(kv.first))
			{
				return false;
			}

			// Grow when the table is empty or the load factor reaches 0.7.
			//  - The vector has to be resized, not reserved: operator[] is only
			//    valid below size(), whatever the capacity is.
			//  - Every element must be re-mapped, because its slot depends on the
			//    table size. A second table object is filled through Insert so the
			//    probing logic is reused, then its storage is swapped in.
			if (_tables.size() == 0 || _n * 10 / _tables.size() >= 7)
			{
				const size_t newsize = _tables.size() == 0 ? 10 : _tables.size() * 2;
				HashTable<K, V> newht;
				newht._tables.resize(newsize);
				for (const auto& data : _tables)
				{
					// Only live elements migrate. Copying EMPTY slots would plant a
					// default-constructed key in the new table, and copying DELETE
					// slots would bring erased keys back.
					if (data._state == EXIST)
					{
						newht.Insert(data._kv);
					}
				}
				_tables.swap(newht._tables);
				// _n already counts exactly the live elements that were migrated.
			}

			// Home slot of the key.
			const size_t hashi = kv.first % _tables.size();

			// [linear probing] take the first slot that is not occupied; EMPTY and
			// DELETE slots are both reusable. The load factor check above
			// guarantees such a slot exists.
			size_t index = hashi;
			for (size_t i = 1; _tables[index]._state == EXIST; ++i)
			{
				index = (hashi + i) % _tables.size();	// wrap around past the end
			}

			_tables[index]._kv = kv;
			_tables[index]._state = EXIST;
			_n++;

			return true;
		}

		// Looks up key.
		// Returns a pointer to the slot holding it, or nullptr when it is absent.
		// The pointer refers into the table's storage, so a later Insert that
		// grows the table invalidates it.
		//
		// The table may contain no EMPTY slot at all (inserts and erases before a
		// growth can leave only EXIST and DELETE slots), so the probe is bounded
		// to one full lap instead of relying on an EMPTY slot to stop it.
		HashData<K, V>* Find(const K& key)
		{
			const size_t size = _tables.size();
			if (size == 0)
			{
				return nullptr;
			}

			const size_t hashi = key % size;

			// [linear probing]
			for (size_t i = 0; i < size; ++i)
			{
				HashData<K, V>& slot = _tables[(hashi + i) % size];
				if (slot._state == EMPTY)
				{
					break;	// the key would have been placed here or earlier
				}
				if (slot._state == EXIST && slot._kv.first == key)
				{
					return &slot;
				}
			}

			return nullptr;
		}

		// Removes key.
		// Returns true when the key was present, false otherwise.
		bool Erase(const K& key)
		{
			HashData<K, V>* ret = Find(key);
			if (!ret)
			{
				return false;
			}

			// Lazy deletion: the slot is only marked as a tombstone, so probe
			// sequences that pass through it stay intact.
			ret->_state = DELETE;
			--_n;
			return true;
		}

	private:
		// A hand-written array with separate size/capacity would also work; the
		// standard container is simply more convenient.
		std::vector<HashData<K, V>> _tables;
		size_t _n = 0;					// number of live (EXIST) elements
	};

	// Basic smoke test: insert, look up, erase, look up again.
	void testHashTable1()
	{
		int a[] = { 3,33,2,13,5,12,1002 };
		HashTable<int, int> ht;

		for (auto e : a)
		{
			ht.Insert(std::make_pair(e, e));
		}

		ht.Insert(std::make_pair(15, 15));

		if (ht.Find(13))
		{
			std::cout << "13在" << std::endl;
		}
		else
		{
			std::cout << "13不在" << std::endl;
		}

		ht.Erase(13);

		if (ht.Find(13))
		{
			std::cout << "13在" << std::endl;
		}
		else
		{
			std::cout << "13不在" << std::endl;
		}
	}

}

2. 建立哈希表用哈希桶解决冲突

对比开放定址，冲突得到了极大的降低

增删查改的时间复杂度：O(1)

这里是取的平均时间，为啥不取最坏了呢？因为有扩容的发生，扩容则重新建立映射关系，最坏情况几乎不会出现

Question：

如果某些桶还是特别长，怎么办？

Answer：

负载因子控制
单个桶超过一定长度，这个桶改成挂红黑树（结构体 {联合体 {红黑树的指针, 链表的指针}，长度} 解决）

namespace HashBucket
{
	// Node of a bucket's singly linked list.
	template<class K, class V>
	struct HashNode
	{
		HashNode<K, V>* _next;
		std::pair<K, V> _kv;

		HashNode(const std::pair<K, V>& kv)
			:_next(nullptr)
			,_kv(kv)
		{}

	};

	// Default hasher: turns a key into an unsigned integer that can be reduced
	// modulo the bucket count. Works for any K convertible with static_cast
	// (integral types out of the box).
	template<class K>
	struct HashFunc
	{
		// Returning size_t (instead of K) keeps the later `% bucket count` in
		// unsigned arithmetic even for negative keys or keys wider than size_t.
		size_t operator()(const K& key) const
		{
			return static_cast<size_t>(key);
		}
	};

	// Specialisation for strings.
	template<>
	struct HashFunc<std::string>
	{
		// BKDR-style hash: every character is mixed in with a multiplier, so
		// strings that are permutations of each other get different values.
		// (Hashing only s[0] would make all strings with the same first
		// character collide.)
		size_t operator()(const std::string& s) const
		{
			size_t hash = 0;
			for (auto ch : s)
			{
				hash += ch;
				hash *= 31;	// 131 or 1313 are common alternatives
			}
			return hash;
		}
	};

	// Hash table that resolves collisions by separate chaining (hash buckets).
	// Hash converts a key into an integer usable with `%`; the default supports
	// integral keys and std::string.
	//
	// The table owns its nodes through raw pointers, so copying is disabled
	// (a member-wise copy would free every node twice); moving is supported.
	template<class K, class V, class Hash = HashFunc<K>>
	class HashTable
	{
		typedef HashNode<K, V> Node;
	public:
		HashTable() = default;

		HashTable(const HashTable&) = delete;
		HashTable& operator=(const HashTable&) = delete;

		// Takes over other's buckets and leaves other empty.
		HashTable(HashTable&& other) noexcept
		{
			_tables.swap(other._tables);
			_n = other._n;
			other._n = 0;
		}

		// Exchanges contents with other; other then releases the old nodes.
		HashTable& operator=(HashTable&& other) noexcept
		{
			if (this != &other)
			{
				_tables.swap(other._tables);
				const size_t count = _n;
				_n = other._n;
				other._n = count;
			}
			return *this;
		}

		// Frees every node of every bucket.
		~HashTable()
		{
			for (Node*& head : _tables)
			{
				while (head)
				{
					Node* next = head->_next;
					delete head;
					head = next;
				}
			}
		}


		// Returns the first prime in the growth table that is greater than
		// `prime` (SGI STL style prime table). When `prime` is already at or
		// beyond the largest entry, the largest entry is returned.
		// Typical use: size_t newsize = GetNextPrime(_tables.size());
		size_t GetNextPrime(size_t prime)
		{
			const int PRIMECOUNT = 28;
			static const size_t primeList[PRIMECOUNT] =
			{
				53ul, 97ul, 193ul, 389ul, 769ul,
				1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
				49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
				1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
				50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
				1610612741ul, 3221225473ul, 4294967291ul
			};
			for (int i = 0; i < PRIMECOUNT; ++i)
			{
				if (primeList[i] > prime)
					return primeList[i];
			}
			// No larger prime available: clamp to the last entry rather than
			// reading one element past the end of the array.
			return primeList[PRIMECOUNT - 1];
		}

		// Looks up key. Returns the node holding it, or nullptr when absent.
		Node* Find(const K& key)
		{
			if (_tables.size() == 0)
				return nullptr;

			for (Node* cur = _tables[BucketIndex(key, _tables.size())]; cur; cur = cur->_next)
			{
				if (cur->_kv.first == key)
				{
					return cur;
				}
			}
			return nullptr;
		}

		// Removes key. Returns true when it was present, false otherwise.
		bool Erase(const K& key)
		{
			// An empty table has no buckets; computing `% 0` must be avoided.
			if (_tables.size() == 0)
				return false;

			const size_t hashi = BucketIndex(key, _tables.size());
			Node* prev = nullptr;
			for (Node* cur = _tables[hashi]; cur; prev = cur, cur = cur->_next)
			{
				if (cur->_kv.first == key)
				{
					// Unlink: either from the predecessor or from the bucket head.
					if (prev)
					{
						prev->_next = cur->_next;
					}
					else
					{
						_tables[hashi] = cur->_next;
					}
					--_n;
					delete cur;
					return true;
				}
			}
			return false;
		}

		// Inserts kv. Returns false when the key is already stored, true after
		// the pair has been stored.
		//
		// Load factor trade-off: a larger load factor means more collisions and
		// slower lookups but better space usage; a smaller one means the reverse.
		bool Insert(const std::pair<K, V>& kv)
		{
			if (Find(kv.first))
				return false;

			// Grow when the load factor reaches 1 (this also covers the empty
			// table). Existing nodes are re-linked into the new bucket array
			// instead of being reallocated.
			if (_n == _tables.size())
			{
				const size_t newsize = GetNextPrime(_tables.size());
				std::vector<Node*> newtables(newsize, nullptr);
				for (Node*& head : _tables)	// by reference: the old bucket ends up null
				{
					while (head)
					{
						Node* node = head;
						head = node->_next;

						// Push onto the front of its new bucket.
						const size_t hashi = BucketIndex(node->_kv.first, newsize);
						node->_next = newtables[hashi];
						newtables[hashi] = node;
					}
				}
				_tables.swap(newtables);
			}

			// Push onto the front of the bucket; order inside a bucket is irrelevant.
			const size_t hashi = BucketIndex(kv.first, _tables.size());
			Node* newnode = new Node(kv);
			newnode->_next = _tables[hashi];
			_tables[hashi] = newnode;
			++_n;
			return true;
		}

		// Returns the length of the longest bucket (0 for an empty table).
		size_t MaxBucketSize() const
		{
			size_t max = 0;
			for (size_t i = 0; i < _tables.size(); ++i)
			{
				size_t size = 0;
				for (Node* cur = _tables[i]; cur; cur = cur->_next)
				{
					++size;
				}

				if (size > max)
				{
					max = size;
				}
			}
			return max;
		}

	private:
		// Maps key to a bucket index in a table of bucketCount buckets.
		// bucketCount must be non-zero.
		size_t BucketIndex(const K& key, size_t bucketCount) const
		{
			Hash hasher;
			// The cast keeps the modulo unsigned even if a user-supplied Hash
			// returns a signed type.
			return static_cast<size_t>(hasher(key)) % bucketCount;
		}

		std::vector<Node*> _tables;	// array of bucket head pointers
		size_t _n = 0;				// number of stored elements
	};


	// Inserts enough colliding-looking keys to exercise bucket chaining.
	void testHashBucket1()
	{
		int a[] = { 3,33,2,13,5,12,1002 };
		HashTable<int, int> ht;

		for (auto e : a)
		{
			ht.Insert(std::make_pair(e, e));
		}

		ht.Insert(std::make_pair(15, 15));
		ht.Insert(std::make_pair(25, 25));
		ht.Insert(std::make_pair(35, 35));
		ht.Insert(std::make_pair(45, 45));
	}

	// Exercises Erase.
	void testHashBucket2()
	{
		int a[] = { 3,33,2,13,5,12,1002 };
		HashTable<int, int> ht;

		for (auto e : a)
		{
			ht.Insert(std::make_pair(e, e));
		}

		ht.Erase(12);
		ht.Erase(3);
	}

	// Stand-alone string hasher used to experiment with the BKDR-style hash;
	// same algorithm as HashFunc<std::string>.
	struct HashStr
	{
		size_t operator()(const std::string& s)
		{
			// Hashing only s[0] would make all strings with the same first
			// character collide, hence the per-character mixing.
			size_t hash = 0;
			for (auto ch : s)
			{
				hash += ch;
				hash *= 31;	// 131 or 1313 are common alternatives
			}
			return hash;
		}
	};


	// String keys with the default (specialised) hasher, plus a look at the
	// hash values of strings built from similar characters.
	void testHashBucket3()
	{
		//HashTable<string, string, HashStr> ht;
		HashTable<std::string, std::string> ht;
		ht.Insert(std::make_pair("string", "字符串"));
		ht.Insert(std::make_pair("tree", "树"));
		ht.Insert(std::make_pair("bucket", "桶"));
		ht.Insert(std::make_pair("sort", "排序"));
		ht.Insert(std::make_pair("", "排序"));

		// "bacd"/"abcd" are permutations and "aadd" has the same character sum;
		// a plain sum-of-characters hash would give all three the same value.
		// The multiplier in the BKDR-style hash makes the printed values differ.
		HashStr hashstr;
		std::cout << hashstr("bacd") << std::endl;
		std::cout << hashstr("abcd") << std::endl;
		std::cout << hashstr("aadd") << std::endl;
	}


	// Inserts many pseudo-random keys and prints the longest bucket length.
	void testHashBucket4()
	{
		size_t N = 100000;
		HashTable<int, int> ht;

		srand(static_cast<unsigned>(time(0)));
		for (size_t i = 0; i < N; ++i)
		{
			const int x = static_cast<int>(rand() + i);
			ht.Insert(std::make_pair(x, x));
		}

		std::cout << ht.MaxBucketSize() << std::endl;
	}
}