小谈哈希表

早睡早起身体BBBang

已于 2022-10-08 22:46:03 修改

阅读量137

点赞数

文章标签：哈希算法链表散列表 c++ hash table

于 2022-10-08 22:45:03 首次发布

本文链接：https://blog.csdn.net/qq_43406934/article/details/127217392

版权

小谈哈希表

本文共分为四个部分，哈希表的实现、哈希冲突、哈希扩容以及自定义数据是否可以做哈希表的键值。

哈希表的实现

STL中的散列表采用链表法解决哈希冲突，初始化一个数组，将数组的每一个元素位置称为桶，每个桶上存放的是映射为同一下标的不同关键值的链表的链头元素。
因此散列表的底层数据结构为数组+链表。在某些实现中（例如 Java 的 HashMap），当链表长度过长时，为避免查询时间复杂度过大，会转化成红黑树结构；C++ 标准库的 unordered_map / unordered_set 并不做这种转换。

/*
  1）什么是桶？
		STL中散列表采用链接法解决冲突。结构中维护了一个vector，vector中每一个元素称为一个桶（bucket），它包含的是一个链表的第一个节点
		在发生“哈希冲突”的情况下，单个桶会存储多个条目，这些条目必须按顺序搜索。

		简单点说，这个桶就是链表/树

*/
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

#define MAXTABLESIZE 10000 // largest table length the prime search in hashtable::get_size() will test
using namespace std;

// Fallback so the class still builds when the file-level table-size limit is absent.
#ifndef MAXTABLESIZE
#define MAXTABLESIZE 10000
#endif

/**
 * A set of ints implemented as a hash table with separate chaining.
 *
 * The bucket array is a vector of chain heads; each chain is a singly linked
 * list of heap-allocated nodes owned by the table. The bucket count is the
 * first odd prime above the current size hint, and the table is regrown
 * (hint doubled, all nodes re-bucketed) once the element count reaches half
 * of the bucket count.
 *
 * Ownership: every node is created with `new` and released with `delete`,
 * either in remove() or in the destructor. Copying performs a deep copy so
 * two tables never share nodes.
 */
class hashtable {
public:
	/// One link of a bucket chain, holding a single stored key.
	struct listNode
	{
		int val;
		listNode *next;

		listNode() : val(0), next(nullptr) {}
		listNode(int x) : val(x), next(nullptr) {}
		listNode(int x, listNode *next) : val(x), next(next) {}
	};

	/// Number of keys currently stored.
	int count = 0;
	/// Current number of buckets; this class keeps it equal to vec_Node.size().
	std::size_t cursize = 0;
	/// Bucket array: each entry is the head of a chain, or nullptr when empty.
	std::vector<listNode*> vec_Node;
	/// Size hint from which the bucket count is derived (see get_size()).
	std::size_t size;

	/**
	 * Looks up a key.
	 * @param key value to search for
	 * @return pointer to the node holding `key`, or nullptr if it is absent.
	 *         The pointer is invalidated when the key is removed or the table
	 *         is destroyed.
	 */
	listNode* find(const int key) {
		for (listNode *cur = vec_Node[fun_index(key)]; cur != nullptr; cur = cur->next) {
			if (cur->val == key)
				return cur;
		}
		return nullptr;
	}

	/**
	 * Inserts a key if it is not already present; duplicates are ignored.
	 * A new key becomes the chain head when the bucket is empty, otherwise it
	 * is linked directly after the existing head. When the element count
	 * reaches half the bucket count the table is regrown and the bucket-array
	 * capacity is printed before and after.
	 * @param key value to insert
	 */
	void insert(const int key) {
		if (find(key) != nullptr)
			return;

		listNode *&head = vec_Node[fun_index(key)];
		if (head == nullptr) {
			head = new listNode(key);
		}
		else {
			head->next = new listNode(key, head->next);
		}
		++count;

		// >= rather than == so a missed threshold can never suppress growth.
		if (static_cast<std::size_t>(count) >= cursize / 2) {
			std::cout << "扩容" << std::endl;
			std::cout << vec_Node.capacity() << std::endl;
			increaseCapicity();
			std::cout << vec_Node.capacity() << std::endl;
		}
	}

	/**
	 * Removes a key; does nothing when the key is absent.
	 * @param key value to erase
	 */
	void remove(const int key) {
		// Walk the chain through the link that points at each node, so the
		// head and interior cases are unlinked by the same assignment.
		listNode **link = &vec_Node[fun_index(key)];
		while (*link != nullptr && (*link)->val != key) {
			link = &(*link)->next;
		}
		if (*link == nullptr)
			return;

		listNode *victim = *link;
		*link = victim->next;
		delete victim; // nodes come from `new`, so they must not be free()d
		--count;
	}

	/// Prints every stored key, bucket by bucket, separated by two spaces.
	void show() const {
		for (const listNode *cur : vec_Node) {
			while (cur != nullptr) {
				std::cout << cur->val << "  ";
				cur = cur->next;
			}
		}
		std::cout << std::endl;
	}

	/// @return number of keys currently stored.
	std::size_t getNum() const {
		return static_cast<std::size_t>(count);
	}

	/**
	 * Creates an empty table.
	 * @param sz size hint; the bucket count is the first odd prime above it.
	 */
	hashtable(std::size_t sz = 0) : size(sz) {
		init_hash_table();
	}

	/// Deep copy: the new table gets its own nodes, in the same chain order.
	hashtable(const hashtable &other)
		: count(other.count),
		  cursize(other.cursize),
		  vec_Node(other.vec_Node.size(), nullptr),
		  size(other.size) {
		try {
			clone_chains(other);
		}
		catch (...) {
			// The destructor does not run for a half-built object.
			destroy_chains();
			throw;
		}
	}

	/// Deep-copy assignment; `*this` is left untouched if the copy fails.
	hashtable& operator=(const hashtable &other) {
		if (this != &other) {
			hashtable tmp(other);
			vec_Node.swap(tmp.vec_Node); // tmp's destructor frees our old nodes
			count = tmp.count;
			cursize = tmp.cursize;
			size = tmp.size;
		}
		return *this;
	}

	/// Releases every node owned by the table.
	~hashtable() {
		destroy_chains();
	}

private:

	/**
	 * Doubles the size hint, picks the matching prime bucket count and moves
	 * every existing node into its new bucket. Nodes are relinked rather than
	 * copied, so no allocation happens per element and nothing is leaked.
	 */
	void increaseCapicity() {
		const std::size_t doubled = get_size() * 2;
		const std::size_t newSize = next_prime_above(doubled);
		// Allocate first: if this throws, the table is still unchanged.
		std::vector<listNode*> newVec(newSize, nullptr);

		size = doubled;
		cursize = newSize; // fun_index() must already use the new bucket count

		for (listNode *node : vec_Node) {
			while (node != nullptr) {
				listNode *following = node->next;
				listNode *&head = newVec[fun_index(node->val)];
				if (head == nullptr) {
					node->next = nullptr;
					head = node;
				}
				else {
					// Same placement rule as insert(): directly after the head.
					node->next = head->next;
					head->next = node;
				}
				node = following;
			}
		}
		vec_Node.swap(newVec);
	}

	/// @return bucket count for the current size hint (first odd prime above `size`).
	std::size_t get_size() const {
		return next_prime_above(size);
	}

	/**
	 * Finds the first odd prime strictly greater than `n`.
	 * Candidates above MAXTABLESIZE are not tested and are returned as-is.
	 */
	static std::size_t next_prime_above(std::size_t n) {
		std::size_t p = (n % 2) ? n + 2 : n + 1; // first odd number above n
		while (p <= static_cast<std::size_t>(MAXTABLESIZE) && !is_odd_prime(p)) {
			p += 2;
		}
		return p;
	}

	/// Primality test for odd `p` by integer trial division (no floating point).
	static bool is_odd_prime(std::size_t p) {
		if (p < 3)
			return false;
		for (std::size_t i = 3; i * i <= p; i += 2) {
			if (p % i == 0)
				return false;
		}
		return true;
	}

	/// Sizes the bucket array from the initial size hint.
	void init_hash_table() {
		cursize = get_size();
		vec_Node.resize(cursize);
	}

	/// Maps a key to a bucket index; negative keys wrap through unsigned conversion.
	std::size_t fun_index(const int key) const {
		return static_cast<std::size_t>(key) % cursize;
	}

	/// Appends copies of other's chains to our (empty, equally sized) buckets.
	void clone_chains(const hashtable &other) {
		for (std::size_t b = 0; b < other.vec_Node.size(); ++b) {
			listNode **tail = &vec_Node[b];
			for (const listNode *src = other.vec_Node[b]; src != nullptr; src = src->next) {
				*tail = new listNode(src->val);
				tail = &(*tail)->next;
			}
		}
	}

	/// Deletes every node and leaves all buckets empty.
	void destroy_chains() {
		for (listNode *&head : vec_Node) {
			while (head != nullptr) {
				listNode *following = head->next;
				delete head;
				head = following;
			}
		}
	}
};


vector<int> nums;
void produceRandNumbers(vector<int> &nums, int start, int end, const int amount);
int main() {
	hashtable hash(10);
	
	produceRandNumbers(nums, 0, 30, 20);
	for (int x : nums) {
		hash.insert(x);
	}
    
	hash.show();

	return 0;
}


/**
 * Appends `amount` pseudo-random integers to `nums`.
 *
 * For end > start each value lies in [start, end). When end == start the
 * range is empty, so `start` itself is appended instead of evaluating
 * `rand() % 0` (undefined behaviour). For end < start the original
 * arithmetic `start + rand() % (end - start)` is kept unchanged.
 *
 * @param nums   destination vector; existing elements are preserved
 * @param start  inclusive lower bound
 * @param end    exclusive upper bound
 * @param amount number of values to append; nothing is appended when <= 0
 */
void produceRandNumbers(std::vector<int> &nums, int start, int end, const int amount) {
	// Seed only on the first call: reseeding from the clock on every call
	// would repeat the same sequence for calls made within the same second.
	static const bool seeded = (std::srand(static_cast<unsigned>(std::time(nullptr))), true);
	(void)seeded;

	const int span = end - start;
	if (amount > 0) {
		nums.reserve(nums.size() + static_cast<std::size_t>(amount));
	}

	for (int i = 0; i < amount; i++) {
		nums.push_back(span == 0 ? start : start + (std::rand() % span));
	}
}

为什么采用数组？
常数级别复杂度访问哈希桶。

哈希冲突

1、哈希冲突是否可以避免？
哈希冲突无法避免，只能减少。
2、STL如何解决哈希冲突？
1）开放寻址法，即当出现哈希冲突时，寻找下一个空的地址存储该元素。
2）链表法，即当出现冲突时，将所有冲突的元素使用链表组织起来，进行查表操作。（Java 的 HashMap 在链表节点数大于8时会升级为红黑树，小于6时退化成链表；C++ STL 的无序容器只使用链表。）
3、常用哈希函数？
除留余数法，即对键值直接进行取模操作。C++中提供了hash类，并且重载了可调用函数，可以直接使用。关于hash类，stl源码中对于基本数据类型做了特化处理，使得可以返回一个可以取模的size_t类型。同时对于字符数组（char*）和字符串（string）类型做了特殊处理，使得可以返回一个可以取模的size_t类型。
4、针对 string 数据类型的哈希函数特化？

// Polynomial string hash (function-body fragment): for every character the
// running value is multiplied by 131 and the character is added, so the
// result depends on character order. Overflow wraps, as size_t is unsigned.
// NOTE(review): `key` is presumably a std::string declared by the enclosing
// function, which is not shown here.
size_t hash = 0;
for (auto ch : key)
    hash = hash * 131 + ch;
return hash;

// Position-weighted string hash (function-body fragment): each character is
// multiplied by its 1-based position and the products are summed.
// The position counter must advance on every character; when it stays at 0
// every term is 0 and all keys hash to 0.
// NOTE(review): `key` is presumably a std::string declared by the enclosing
// function, which is not shown here.
size_t hash = 0;
size_t i = 0;
for (auto ch : key)
	hash += (++i) * ch;
return hash;

哈希扩容

1、扩容大小
哈希表将容量扩展为原来的两倍，并将原数组中的数据进行重新映射，放到新的数组
2、为什么扩容后需要重新映射？
哈希函数是与容量相关的，当容量发生变化后，相应的哈希函数也会发生变化，因此原数组中的所有元素需要使用新的哈希函数进行重新映射。

自定义数据类型是否可以当作哈希表的键值？

可以。但是对自定义数据有要求。

template<typename _Kty, typename _Hasher = hash<_Kty>, typename _Keyeq = equal_to<_Kty>, typename _Alloc = allocator<_Kty>> class unordered_set;

可见哈希表中有四个模板参数，第一个为插入元素的类型，第二个为hash函数（即重载函数调用运算符 operator()，返回一个可直接用于取模的哈希值），第三个为判断两个对象是否相等的条件（map、set、hashset、hashmap 中的键都不能重复，因此需要重载 == 运算符（可采用全局函数做友元）），第四个是分配器；并且后三个都提供了默认值。

因此，对于自定义数据我们需要满足两点。
1、重载 == 运算符，用于在发生哈希冲突时判断两个键值是否相等。
2、自定义哈希函数，完成键值的映射。对于哈希函数的映射我们有三种方式进行定义。
1）使用仿函数

// Functor hasher for Person: hashes firstname, lastname and age individually
// with the standard hashers and returns the sum of the three codes.
class Hasher {
public:
	size_t operator()(const Person& p) const {
		const size_t firstCode = hash<string>()(p.firstname);
		const size_t lastCode = hash<string>()(p.lastname);
		const size_t ageCode = hash<int>()(p.age);
		return firstCode + lastCode + ageCode;
	}
};

// Unordered set of Person that uses Hasher instead of std::hash<Person>.
unordered_set<Person, Hasher> uset;

2）模板特化（直接对 hash 类模板进行全特化处理，重载其（）运算符）

template<>
class hash<Person> {//偏特化（这里使用了标准库已经提供的hash偏特化类hash<string>，hash<int>()）
public:
size_t operator()(const Person& p)const {
return hash<string>()(p.firstname)+ hash<string>()(p.lastname)+ hash<int>()(p.age);
}
};

3）自定义哈希函数

提供代码如下：

//hashkey
// Example key type for hash containers: an int, a string and a char.
struct T
{
	int a;

	std::string b;
	char c;

	// Builds a key from its three parts. Members are initialized in
	// declaration order (a, b, c) so the initializer list matches what the
	// compiler actually does.
	T(int _a, char _c, const std::string& _s) : a(_a), b(_s), c(_c) {

	}

	// Equality is what lets a hash container tell apart keys that land in the
	// same bucket. Operands are taken by const reference so a comparison does
	// not copy the string member.
	friend bool operator==(const T& lhs, const T& rhs);
};

// Two keys are equal when all three members are equal.
bool operator==(const T& lhs, const T& rhs) {

	return (lhs.a == rhs.a) && (lhs.b == rhs.b) && (lhs.c == rhs.c);
}

size_t hasher(const T& p) {//hash函数，得到hash码
	return hash<int>()(p.a) + hash<char>()(p.c) + hash<string>()(p.b);
}

template<>
class hash<T> {
public:
	size_t operator()(const T &t){
		return hash<int>()(t.a) + hash<char>()(t.c) + hash<string>()(t.b);
	}
};