C++下“哈希”，“位图”，“布隆过滤器”的简单介绍-CSDN博客

本文链接：https://blog.csdn.net/sakeww/article/details/125107282

本文探讨了C++中哈希函数的模拟实现及位图（包括布隆过滤器）在处理大量不重复整数查找和数据去重中的应用，还涉及了位图的扩展形式如TwoBitSet和布隆过滤器的误判问题，以及在内存限制下寻找交集的方法。

摘要由CSDN通过智能技术生成

1. unordered系列关联式

区别：
1.unordered_XXX 遍历不按key排序，命名体现
2.unordered_XXX单向迭代器
3.unordered_XXX综合效率略胜map和set

数据重复量比较多的时候，unordered_set更好
重复量比较少的时候，有序的时候，set比较好

2. C++下“hash“的简单模拟实现以及实现中遇见的问题

链接: C++下“hash“的简单模拟实现以及实现中遇见的问题

3.哈希的应用

位图

给40亿个不重复的无符号整数，没排过序。给一个无符号整数，如何快速判断一个数是否在这40亿个数中。
解决：

遍历，时间复杂度O(N)
排序(O(NlogN))，利用二分查找: logN
位图解决
数据是否在给定的整型数据中，结果是在或者不在，刚好是两种状态，那么可以使用一个二进制比特位来代表数据是否存在的信息，如果二进制比特位为1，代表存在，为0代表不存在。比如：

位图概念
所谓位图，就是用每一位来存放某种状态，适用于海量数据，数据无重复的场景。通常是用来判断某个数据存不存在的

优点：节省空间，效率高
局限性：只能处理整数

位图模拟实现

#pragma once
#include <vector>

namespace sakeww
{
	/// Simple bitmap that stores one presence bit per non-negative integer.
	///
	/// Storage is N / 8 + 1 bytes, so every index whose byte (x / 8) lies inside
	/// that buffer is valid. An index beyond the buffer makes the accessor throw
	/// std::out_of_range (raised by std::vector::at) instead of touching memory
	/// outside the buffer.
	template<size_t N>
	class bitset
	{
	public:
		/// Creates the bitmap with every bit cleared.
		bitset()
			: _bits(N / 8 + 1)
		{
		}

		/// Marks x as present (bit 0 -> 1).
		/// @throws std::out_of_range if x / 8 is not a valid byte index.
		void set(size_t x)
		{
			_bits.at(x / 8) |= static_cast<unsigned char>(1u << (x % 8));
		}

		/// Marks x as absent (bit 1 -> 0).
		/// @throws std::out_of_range if x / 8 is not a valid byte index.
		void reset(size_t x)
		{
			_bits.at(x / 8) &= static_cast<unsigned char>(~(1u << (x % 8)));
		}

		/// Reports whether the bit for x is currently 1.
		/// @throws std::out_of_range if x / 8 is not a valid byte index.
		bool test(size_t x) const
		{
			return ((_bits.at(x / 8) >> (x % 8)) & 1u) != 0;
		}

	private:
		// Unsigned bytes keep the bit arithmetic free of sign-extension surprises.
		std::vector<unsigned char> _bits;
	};

	// Demo driver: sets a few bits, prints the state of six probe values,
	// clears three of the bits again and prints the same probes once more.
	void test_bitset()
	{
		bitset<100> bs;

		const size_t toSet[] = { 5, 4, 10, 20 };
		for (size_t value : toSet)
		{
			bs.set(value);
		}

		// Values whose bit is printed before and after the reset step.
		const size_t probes[] = { 5, 4, 10, 20, 21, 6 };
		for (size_t value : probes)
		{
			cout << bs.test(value) << endl;
		}
		cout << endl; // blank line separating the two reports

		const size_t toClear[] = { 20, 10, 5 };
		for (size_t value : toClear)
		{
			bs.reset(value);
		}

		for (size_t value : probes)
		{
			cout << bs.test(value) << endl;
		}

		// Left disabled in the original: a bitmap instantiated with N = 0xffffffff.
		//bitset<0xffffffff> bs;
	}
}

布隆过滤器

一个值映射多个位置

Set

#pragma once
#include<iostream>
using namespace std;
#include<bitset>
#include<string>

// BKDR string hash: folds the characters left to right with multiplier 31.
struct BKDRHash
{
	// Returns the hash of s; an empty string hashes to 0.
	size_t operator()(const std::string& s)
	{
		size_t acc = 0;
		for (size_t pos = 0; pos < s.size(); ++pos)
		{
			acc = acc * 31 + s[pos];
		}
		return acc;
	}
};

// AP string hash: alternates between two mixing formulas depending on
// whether the character sits at an even or an odd position.
struct APHash
{
	// Returns the hash of s; an empty string hashes to 0.
	// Declared const so the functor can be invoked through a const object.
	size_t operator()(const std::string& s) const
	{
		size_t hash = 0;
		// size_t index: avoids comparing a signed counter with s.size().
		for (size_t i = 0; i < s.size(); ++i)
		{
			if ((i & 1) == 0)
			{
				// even position
				hash ^= ((hash << 7) ^ s[i] ^ (hash >> 3));
			}
			else
			{
				// odd position
				hash ^= (~((hash << 11) ^ s[i] ^ (hash >> 5)));
			}
		}
		return hash;
	}
};

// DJB string hash: starts from 5381 and, for every character, adds
// (hash << 5) plus the character to the running value.
struct DJBHash
{
	// Returns the hash of s; an empty string hashes to 5381.
	size_t operator()(const std::string& s)
	{
		size_t h = 5381;
		for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
		{
			const size_t shifted = h << 5;
			h = h + shifted + *it;
		}
		return h;
	}
};

// Bloom filter skeleton: every key is mapped to three positions of an
// N-bit std::bitset, one position per hash functor.
//
// N          number of bits in the filter (must be non-zero)
// K          key type accepted by Set
// HashFunc1..3  default-constructible functors callable as size_t(const K&)
template<size_t N,
	class K = std::string,
	class HashFunc1 = BKDRHash,
	class HashFunc2 = APHash,
	class HashFunc3 = DJBHash>
class BloomFilter
{
	static_assert(N > 0, "BloomFilter needs at least one bit");

public:
	// Records key by setting its three hashed bit positions.
	void Set(const K& key)
	{
		// Reduce every hash into [0, N): std::bitset::set throws
		// std::out_of_range for a position that is not inside the bitset.
		const size_t index1 = HashFunc1()(key) % N;
		const size_t index2 = HashFunc2()(key) % N;
		const size_t index3 = HashFunc3()(key) % N;

		_bs.set(index1);
		_bs.set(index2);
		_bs.set(index3);
	}

private:
	std::bitset<N> _bs;
};

// Demo driver: inserts three sample keys into a 100-slot BloomFilter.
void test1()
{
	BloomFilter<100> bf;

	const char* const samples[] = { "aadd", "abcd", "acbd" };
	for (const char* sample : samples)
	{
		bf.Set(sample);
	}
}

N的大小

在这里插入图片描述

删除

不能删除

如果硬要？
每个标记位使用多个比特位，存储引用计数（有几个值映射了当前位置）
也可以对特殊位置增加map存储，
支持删除，整体而言消耗空间变多了，布隆过滤器的优势下降了

需求

数据量大，节省空间，允许误判

举个栗子：
昵称：注册信息的时候，判断昵称有没有人用过？
可以将数据库中的昵称，放到布隆过滤器
在是存在误判的，不在是不存在误判的
昵称判断在的情况下，可以再在数据库中查一下，这样对用户更加负责

垃圾邮件：
每个邮件都有一个发件地址，可以制作一个黑名单，
可以对垃圾邮件进行标记，将其放进布隆过滤器中，
下一封邮件，如果不在布隆过滤器中，那么一定不是垃圾邮件。

利用布隆过滤器减少磁盘IO或者网络请求，因为一旦一个值必定不存在的话，我们可以不用进行后续昂贵的查询请求

布隆过滤器代码

#pragma once
#include<iostream>
using namespace std;
#include<bitset>
#include<string>

// BKDR string hash: multiply the running value by 31, then add the next character.
struct BKDRHash
{
	// Returns the hash of s; an empty string hashes to 0.
	size_t operator()(const std::string& s)
	{
		size_t result = 0;
		std::string::const_iterator cur = s.begin();
		while (cur != s.end())
		{
			result = 31 * result + *cur;
			++cur;
		}
		return result;
	}
};

// AP string hash: characters at even positions and at odd positions are
// mixed into the running value with two different formulas.
struct APHash
{
	// Returns the hash of s; an empty string hashes to 0.
	// const-qualified so that a const functor object can be called.
	size_t operator()(const std::string& s) const
	{
		size_t hash = 0;
		// Unsigned index keeps the loop condition free of a signed/unsigned mix.
		for (size_t i = 0; i < s.size(); ++i)
		{
			const bool evenPosition = (i & 1) == 0;
			if (evenPosition)
			{
				hash ^= ((hash << 7) ^ s[i] ^ (hash >> 3));
			}
			else
			{
				hash ^= (~((hash << 11) ^ s[i] ^ (hash >> 5)));
			}
		}
		return hash;
	}
};

// DJB string hash seeded with 5381; each character contributes
// (hash << 5) + character on top of the current value.
struct DJBHash
{
	// Returns the hash of s; an empty string hashes to 5381.
	size_t operator()(const std::string& s)
	{
		size_t acc = 5381;
		for (size_t pos = 0; pos < s.size(); ++pos)
		{
			acc = acc + ((acc << 5) + s[pos]);
		}
		return acc;
	}
};

// Bloom filter over an (N * X)-bit std::bitset. Every key is mapped to three
// bit positions, one per hash functor, each reduced modulo the bit count.
//
// N          sizing parameter, multiplied by X to get the bit count
// X          multiplier applied to N (defaults to 4)
// K          key type
// HashFunc1..3  default-constructible functors callable as size_t(const K&)
template<size_t N,
	size_t X = 4,
	class K = std::string,
	class HashFunc1 = BKDRHash,
	class HashFunc2 = APHash,
	class HashFunc3 = DJBHash>
class BloomFilter
{
	// A zero-bit filter would make the modulo below a division by zero.
	static_assert(N * X > 0, "BloomFilter needs at least one bit");

public:
	// Records key by setting its three hashed bit positions.
	void Set(const K& key)
	{
		_bs.set(HashFunc1()(key) % kBitCount);
		_bs.set(HashFunc2()(key) % kBitCount);
		_bs.set(HashFunc3()(key) % kBitCount);
	}

	// Returns false as soon as one of the three positions is unset, in which
	// case the key was never passed to Set. A true result may be a false
	// positive, because other keys can have set the same three bits.
	bool Test(const K& key) const
	{
		// && short-circuits, so later hashes are only computed when needed.
		return _bs.test(HashFunc1()(key) % kBitCount)
			&& _bs.test(HashFunc2()(key) % kBitCount)
			&& _bs.test(HashFunc3()(key) % kBitCount);
	}

	// Deletion is not supported: clearing a bit could also remove other keys
	// that share it.
	//void Reset(const K& key);

private:
	// Single definition of the filter length shared by Set, Test and _bs.
	static constexpr size_t kBitCount = N * X;

	std::bitset<kBitCount> _bs;
};

// Demo driver: feeds three sample keys to a BloomFilter<100>.
void test1()
{
	BloomFilter<100> bf;

	const char* const keys[] = { "aadd", "abcd", "acbd" };
	const size_t keyCount = sizeof(keys) / sizeof(keys[0]);
	for (size_t i = 0; i < keyCount; ++i)
	{
		bf.Set(keys[i]);
	}
}

4.海量数据处理面试题

位图应用

给定100亿个整数，设计算法找到只出现一次的整数？

// Tracks, for every value, a two-bit occurrence state spread over two bitmaps
// (_bs1 holds the high bit, _bs2 the low bit):
//   00 = never seen, 01 = seen once, 10 = seen twice or more.
template<size_t N>
class TwoBitSet
{
public:
	// Registers one occurrence of x and advances its state: 00 -> 01 -> 10.
	void Set(size_t x)
	{
		const bool highBit = _bs1.test(x);
		const bool lowBit = _bs2.test(x);

		if (highBit)
		{
			// Already at "twice or more": nothing to update.
			return;
		}

		if (lowBit)
		{
			// 01 -> 10
			_bs1.set(x);
			_bs2.reset(x);
		}
		else
		{
			// 00 -> 01
			_bs2.set(x);
		}
	}

	// Prints, one per line, every value in [0, N) whose state is 01.
	void PrintOnceNum()
	{
		for (size_t value = 0; value < N; ++value)
		{
			const bool seenExactlyOnce = !_bs1.test(value) && _bs2.test(value);
			if (!seenExactlyOnce)
			{
				continue;
			}
			cout << value << endl;
		}
	}
private:
	sakeww::bitset<N> _bs1;
	sakeww::bitset<N> _bs2;
};

// Demo driver: feeds a sample array to TwoBitSet<100> and then asks it to
// print the values it recorded exactly once.
void TestTwoBitSet()
{
	int a[] = { 1,2,3,4,5,6,7,8,9,4,5,6,3,2,1,6 };
	const size_t count = sizeof(a) / sizeof(a[0]);

	TwoBitSet<100> bs;
	for (size_t i = 0; i < count; ++i)
	{
		bs.Set(a[i]);
	}

	bs.PrintOnceNum();
}

给两个文件，分别有100亿个整数，我们只有1G内存，如何找到两个文件交集？

一个文件中整数，set到一个位图，读取第二个文件中的整数判断在不在位图，在就是交集，不在就不是
缺陷：交集会把重复值找出来，多次出现
改进：
一个文件中整数，set到一个位图bs1，另一个文件中整数，set到一个位图bs2
a.遍历bs2中值，看在不在bs1，在就是交集
b.bs1中的值依次跟bs2中的值与一下，再去看与完是1的位置值就是交集