【散列】杜鹃散列详情与C++实现代码

zhugenmi

已于 2022-07-14 22:39:58 修改

阅读量376

点赞数

分类专栏：数据结构文章标签：数据结构哈希算法杜鹃散列

于 2022-07-12 08:00:00 首次发布

本文链接：https://blog.csdn.net/qq_51601649/article/details/125725204

版权

数据结构专栏收录该内容

23 篇文章 1 订阅

订阅专栏

导引

在球-箱问题中，如果将N项随机抛入N个箱子中，那么含球最多的箱子的期望球数为Θ(logN/log logN)。

如果在每次投掷中随机选取两个箱子，并将被投项投入（在那一刻）较空的那个箱子中，则最大箱子的球数只是Θ(log logN)，这是一个显著更小的数。基于这一思想的一种做法就是杜鹃散列(cuckoo hashing)。

概念

在杜鹃散列中，假设我们有N项，我们保持两个散列表，每个都多于半空，并且我们有两个独立的散列函数，它们可将每一项分配给每个表中的一个位置。

杜鹃散列保持下述不变性：一项总是被存储在它的两个位置之一中。

杜鹃散列的好处包括最坏情形下常数时间的查找和删除、避免懒惰删除和额外的数据，以及并行处理的可能。但杜鹃散列对散列函数的选择非常敏感，因此推荐使用较小的装填因子或多于两个的散列函数。

实现

杜鹃散列表常常作为拥有两个(或更多的)散列函数的一个大表来实现，这些散列函数探测整个大表。一些变种的做法是：如果第二个散列表中存在可用的位置，就把该项立即置入其中，而不是开始一连串的替换。
杜鹃散列算法本身很简单：要想插入新项x，首先确认它不在表中。然后使用第一个散列函数，如果这（第一）个表位置是空的，则该项即可置入。否则，就逐出占据该位置的项，把x放在那里，再为被逐出的项在它的另一个候选位置上重复同样的过程；如果替换次数过多，就更换散列函数进行再散列，必要时扩大散列表。

代码

/**
 * Generic HashFamily interface for cuckoo hashing: the shape of the object
 * that supplies a family of hash functions to the cuckoo hash table.
 *
 * Only declarations are given here; no member is defined in this block.
 */
template<typename AnyType>
class CuckooHashFamily {
public:
	/** Hash of x under hash function number `which`. */
	size_t hash(const AnyType& x, int which)const;

	/** Number of hash functions in the family; a pure query, hence const. */
	int getNumberOfFunctions()const;

	/** Picks a new set of hash functions. */
	void generateNewFunctions();
};

/**
 * Informal string hash family for cuckoo hashing.
 *
 * Keeps `count` integer multipliers, one per hash function. Hash function
 * number `which` is a polynomial hash of the string that uses
 * MULTIPLIERS[which] as its multiplier.
 *
 * @tparam count number of hash functions in the family
 */
template<int count>
class StringHashFamily {
private:
	std::vector<int> MULTIPLIERS;	// one multiplier per hash function
	UniformRandom r;				// source of new multipliers (declared elsewhere)

public:
	/** Creates `count` multiplier slots and fills them with initial values. */
	StringHashFamily() :MULTIPLIERS(count) {
		generateNewFunctions();
	}

	/** @return the number of hash functions, i.e. the template argument `count`. */
	int getNumberOfFunctions()const {
		return count;
	}

	/** Overwrites every multiplier with a new value taken from r.nextInt(). */
	void generateNewFunctions() {
		for (auto& mult : MULTIPLIERS)
			mult = r.nextInt();
	}

	/**
	 * Misspelled name of generateNewFunctions(), kept as a forwarding alias
	 * so that callers using the old spelling still compile.
	 */
	void generateNewFuntions() {
		generateNewFunctions();
	}

	/**
	 * Polynomial hash of x under hash function number `which`.
	 *
	 * @param x     string to hash
	 * @param which index of the hash function; not range-checked, the caller
	 *              must pass a value in [0, count)
	 * @return the hash value, not yet reduced to a table index
	 */
	size_t hash(const std::string& x, int which)const {
		// Do the arithmetic in size_t so overflow wraps around (unsigned).
		const size_t multiplier = static_cast<size_t>(MULTIPLIERS[which]);
		size_t hashVal = 0;
		for (auto ch : x)
			hashVal = multiplier * hashVal + ch;
		return hashVal;
	}
};

/**
 * Cuckoo hash table that works with any number of hash functions; the
 * number is dictated by the HashFamily template argument.
 *
 * HashFamily must provide hash(x, which), getNumberOfFunctions() and
 * generateNewFuntions() (the spelling this table calls).
 * AnyType must be default-constructible, movable and comparable with ==.
 */
template<typename AnyType, typename HashFamily>
class HashTable {
private:
	/** One table slot: the stored element plus an "occupied" flag. */
	struct HashEntry {
		AnyType element;
		bool isActive;

		HashEntry(const AnyType&e=AnyType(),bool a=false)
			:element{e},isActive{a}{}
		HashEntry(AnyType&&e,bool a=false)
			:element{std::move(e)},isActive{a}{}
	};

	/**
	 * Insertion routine, copying flavour: copies xx and hands the copy to
	 * the rvalue overload, which may swap it with evicted elements.
	 */
	bool insertHelper1(const AnyType& xx) {
		AnyType x = xx;
		return insertHelper1(std::move(x));
	}

	/**
	 * Insertion routine. Probes every hash position of x and stores x in
	 * the first inactive one. If all are taken, evicts the element at a
	 * randomly chosen position (trying a few times not to pick the position
	 * evicted in the previous round) and continues with the evicted element.
	 * After COUNT_LIMIT evictions the table picks new hash functions
	 * (rehash); after more than ALLOWED_REHASHES rehashes it expands.
	 *
	 * @return true once the pending element has been placed
	 */
	bool insertHelper1(AnyType&& x) {
		const int COUNT_LIMIT = 100;

		while (true) {
			int lastPos = -1;
			int pos = -1;

			for (int count = 0; count < COUNT_LIMIT; ++count) {
				// Try each candidate position of the pending element.
				for (int i = 0; i < numHashFunctions; ++i) {
					pos = static_cast<int>(myhash(x, i));

					if (!isActive(pos)) {
						array[pos] = HashEntry{ std::move(x),true };
						++currentSize;
						return true;
					}
				}

				// No free position: evict a random one. Retry at most a few
				// times to avoid evicting the slot filled in the last round.
				// NOTE(review): assumes r.nextInt(n) yields a value in [0, n) - confirm.
				int i = 0;
				do {
					pos = static_cast<int>(myhash(x, r.nextInt(numHashFunctions)));
				} while (pos == lastPos && i++ < 5);

				lastPos = pos;
				std::swap(x, array[pos].element);	// x now holds the evicted element
			}

			// Too many evictions: rebuild the table, then retry the pending x.
			if (++rehashes > ALLOWED_REHASHES) {
				expand();		// make the table bigger
				rehashes = 0;	// reset the rehash counter
			}
			else
				rehash();		// same size, new hash functions
		}
	}

	/** @return true if currentPos is not -1 and that slot holds an element. */
	bool isActive(int currentPos)const {
		return currentPos != -1 && array[currentPos].isActive;
	}

	/**
	 * Hash code of x under hash function `which`, reduced modulo the table
	 * size so that it is a legal array index.
	 */
	size_t myhash(const AnyType& x, int which)const {
		return hashFunctions.hash(x, which) % array.size();
	}

	/**
	 * Looks at every hash position of x.
	 * @return the index holding x, or -1 if x is not in the table
	 */
	int findPos(const AnyType& x)const {
		for (int i = 0; i < numHashFunctions; ++i) {
			const int pos = static_cast<int>(myhash(x, i));

			if (isActive(pos) && array[pos].element == x)
				return pos;
		}
		return -1;
	}

	/** Rebuilds into a larger table (size / MAX_LOAD), keeping the hash functions. */
	void expand() {
		rehash(static_cast<int>(array.size() / MAX_LOAD));
	}

	/** Picks new hash functions and rebuilds the table at its current size. */
	void rehash() {
		hashFunctions.generateNewFuntions();
		rehash(static_cast<int>(array.size()));
	}

	/**
	 * Rebuilds the table with nextPrime(newSize) slots and re-inserts every
	 * active element of the previous table.
	 */
	void rehash(int newSize) {
		// Build the new table (all slots default-constructed, hence inactive)
		// and swap it in; oldArray then holds the previous contents.
		std::vector<HashEntry> oldArray(nextPrime(newSize));
		oldArray.swap(array);

		// Re-insert; insert() counts currentSize back up.
		currentSize = 0;
		for (auto& entry : oldArray)
			if (entry.isActive)
				insert(std::move(entry.element));
	}

	static constexpr double MAX_LOAD = 0.4;	// maximum load factor
	static const int ALLOWED_REHASHES = 5;	// rehashes allowed before expanding

	std::vector<HashEntry> array;	// the slots
	int currentSize;				// number of active slots
	int numHashFunctions;			// taken from hashFunctions in the constructor
	int rehashes;					// rehashes since the last expansion
	UniformRandom r;				// picks the eviction victim
	HashFamily hashFunctions;

public:
	/** Creates an empty table with nextPrime(size) slots. */
	explicit HashTable(int size = 101) :array(nextPrime(size)) {
		numHashFunctions = hashFunctions.getNumberOfFunctions();
		rehashes = 0;
		makeEmpty();
	}

	/** Empties the table by marking every slot inactive. */
	void makeEmpty() {
		currentSize = 0;
		for (auto& entry : array)
			entry.isActive = false;
	}

	/** @return true if x is in the table. */
	bool contains(const AnyType& x)const {
		return findPos(x) != -1;
	}

	/**
	 * Removes x from the table.
	 * @return true if x was found and removed
	 */
	bool remove(const AnyType& x) {
		int currentPos = findPos(x);
		if (!isActive(currentPos))
			return false;

		array[currentPos].isActive = false;
		--currentSize;
		return true;
	}

	/**
	 * Inserts a copy of x.
	 * @return false if x is already present, true once it has been inserted
	 */
	bool insert(const AnyType& x) {
		if (contains(x))
			return false;

		// Grow first if the load factor limit has been reached.
		if (currentSize >= array.size() * MAX_LOAD)
			expand();

		return insertHelper1(x);
	}

	/**
	 * Inserts x by moving from it.
	 * @return false if x is already present, true once it has been inserted
	 */
	bool insert(AnyType&& x) {
		if (contains(x))
			return false;

		// Grow first if the load factor limit has been reached.
		if (currentSize >= array.size() * MAX_LOAD)
			expand();

		return insertHelper1(std::move(x));
	}

	/** @return the number of stored elements. */
	int size() const
	{
		return currentSize;
	}

	/** @return the number of slots in the table. */
	int capacity() const
	{
		return static_cast<int>(array.size());
	}
};

zhugenmi

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
2
评论
【散列】杜鹃散列详情与C++实现代码

如果在每次投掷中随机选取两个箱子且将被投项投入(在那一刻)较空的箱子中，则最大箱子的球数只是θ(log logN)，这是一个显著更小的数。其中一种做法就是杜鹃散列(cuckoo hashing)。在杜鹃散列中，假设我们有N项，我们保持两个散列表，每个都多于半空，并且我们有两个独立的散列函数，它们可将每一项分配给每个表中的一个位置。杜鹃散列保持下述不变性：一项总是被存储在它的两个位置之一中。杜鹃散列的好处包括最坏情形常数查找和删除次数，避免懒惰删除和额外的数据，以及并行处理的可能。但杜鹃散列对散列函数的选择非
复制链接

扫一扫