位图和布隆过滤器

最新推荐文章于 2024-08-15 21:42:12 发布

Sunlight..

最新推荐文章于 2024-08-15 21:42:12 发布

阅读量126

点赞数

文章标签： c++ 大数据

本文链接：https://blog.csdn.net/weixin_43688483/article/details/106664155

版权

文章目录

1、位图
1.1 位图模拟实现
2、布隆过滤器
2.1 模拟实现

1、位图

位图，就是用每一位来存放某种状态，适用于海量数据，数据无重复的场景。通常是用来判断某个数据存不存在的。可以用来1. 快速查找某个数据是否在一个集合中 2. 排序 3. 求两个集合的交集、并集等 4. 操作系统中磁盘块标记。
其内在的数据结构其实就是一个连续的二进制位数组：某一位为0代表该下标对应的元素不存在，为1代表存在。
在C++中，只要包含<bitset>头文件，就可以使用标准库提供的位图std::bitset。

#include<iostream>
#include<bitset>
using namespace std;
// Walks through the basic std::bitset operations on a 10-bit set.
// The trailing comments show the text each statement prints.
int main()
{
	std::bitset<10> bt(100);                  // 10 bits initialised with the value 100
	std::cout << bt << '\n';                  // 0001100100
	bt.flip();                                // invert every bit
	std::cout << bt << '\n';                  // 1110011011
	bt.set(2);                                // set bit 2 to 1
	std::cout << bt << '\n';                  // 1110011111
	bt.set(3, 0);                             // set bit 3 to 0
	std::cout << bt << '\n';                  // 1110010111
	// Number of bits that are 1.
	std::cout << bt.count() << '\n';          // 7
	// any() yields 1 when at least one bit is set.
	std::cout << bt.any() << '\n';            // 1
	// reset() would clear every bit, after which any() yields 0:
	//bt.reset();                             // 0000000000
	//std::cout << bt.any() << '\n';          // 0
	// test(3) yields 1 when bit 3 is set; it was cleared above.
	std::cout << bt.test(3) << '\n';          // 0
	// The bit pattern converted to an unsigned integer.
	std::cout << bt.to_ulong() << '\n';       // 919
	// operator~ returns an inverted copy and leaves bt unchanged.
	std::cout << ~bt << '\n';                 // 0001101000
	return 0;
}

1.1 位图模拟实现

我们知道，位图底层是用数组实现的，每个整型（int）数组元素可以存放32个比特位，所以必须通过除法和取模运算来定位并修改某个比特位：pos / 32 得到数组下标，pos % 32 得到该元素内的位偏移，具体的原理如下。
在这里插入图片描述
bt.set(35)就相当于对数组_Array[1]操作，随后_Array[1]|=(1<<3)

// Fixed-size bitmap holding N bits, packed 32 per word in a std::vector.
// Bit `pos` lives in word `pos / 32` at offset `pos % 32`.
// Assumes `unsigned int` has at least 32 bits.
template <std::size_t N>
class Bitset
{
private:
	typedef unsigned int Word;
	static const std::size_t kBitsPerWord = 32;

public:
	// Creates a bitmap with every bit cleared.
	// (N + 31) / 32 rounds up without underflowing when N is 0.
	Bitset() : m_bit((N + kBitsPerWord - 1) / kBitsPerWord, 0u), m_size(N)
	{}

	// Sets bit `pos` to 1. Positions >= N are silently ignored.
	void set(std::size_t pos)
	{
		if (pos >= m_size)
			return;
		// Shift an unsigned 1 so that offset 31 does not overflow a signed int.
		m_bit[pos / kBitsPerWord] |= Word(1) << (pos % kBitsPerWord);
	}

	// Returns true when bit `pos` is 1; false when it is 0 or `pos` >= N.
	// Read-only: the stored bits are never modified.
	bool test(std::size_t pos) const
	{
		if (pos >= m_size)
			return false;
		return ((m_bit[pos / kBitsPerWord] >> (pos % kBitsPerWord)) & Word(1)) != 0;
	}

	// Number of bits this bitmap holds (N).
	std::size_t size() const
	{
		return m_size;
	}

	// Returns how many bits are set to 1.
	// Each word is consumed one byte at a time; bitsInByte[b] is the number
	// of 1 bits in the byte value b (table technique from "The Beauty of
	// Programming"), so a 32-bit word needs at most four lookups.
	std::size_t count() const
	{
		// static: built once instead of on every call.
		static const unsigned char bitsInByte[256] = {
			0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
			1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
			1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
			2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
			1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
			2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
			2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
			3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
			1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
			2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
			2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
			3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
			2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
			3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
			3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
			4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 };

		std::size_t total = 0;
		for (std::size_t i = 0; i < m_bit.size(); ++i)
		{
			// Unsigned shift: stops as soon as no set bits remain in the word.
			for (Word word = m_bit[i]; word != 0; word >>= 8)
				total += bitsInByte[word & 0xFFu];
		}
		return total;
	}

private:
	std::vector<Word> m_bit;   // packed bits, lowest position in the lowest bit of word 0
	std::size_t m_size;        // number of valid bits (N)
};

// Writes the bitmap as N characters '0'/'1', highest position first
// (the same layout std::bitset uses). Works for any N and needs no
// friend access because it only uses the public interface.
template <std::size_t N>
std::ostream& operator<<(std::ostream &out, const Bitset<N> &bt)
{
	for (std::size_t i = bt.size(); i-- > 0; )
		out << (bt.test(i) ? '1' : '0');
	return out;
}

// Exercises the hand-written Bitset: prints the empty bitmap, sets bits 2
// and 1, then prints the result of test(3) and the number of set bits.
int main()
{
	Bitset<10> bt;
	std::cout << bt << std::endl;
	bt.set(2);
	bt.set(1);
	std::cout << bt.test(3) << std::endl;
	std::cout << bt.count() << std::endl;
	return 0;
}

2、布隆过滤器

布隆过滤器实质上就是利用位图，查看大数据中的某个元素是否存在过。优点是查找速度快，但是布隆过滤器只能判断数据是否一定不存在，而无法判断数据是否一定存在。其原理是通过多个hash算法，将数据映射到位图中的多个位置；再次查找某个元素时，运行相同的hash算法，检查这些位置是否全部为1。但由于存在hash冲突，而且数据量巨大而位图较小，不存在的数据经过hash计算后有可能刚好全部映射到之前已经被置为1的位置。所以无法判断数据是否一定存在，只能判断数据是否一定不存在。
在这里插入图片描述

2.1 模拟实现


// Hash functor implementing the BKDR string hash (multiplier 131).
struct StrToInt1
{
	// Hashes the NUL-terminated string `str`; the empty string hashes to 0.
	// Arithmetic is done in size_t, so overflow wraps around.
	std::size_t BKDRHash(const char *str) const
	{
		std::size_t hash = 0;
		// Go through unsigned char so bytes >= 0x80 are not sign-extended
		// on platforms where plain char is signed.
		while (std::size_t ch = static_cast<unsigned char>(*str++))
		{
			hash = hash * 131 + ch;
		}
		return hash;
	}

	// Hashes the characters of `str` up to its first NUL character.
	std::size_t operator()(const std::string &str) const
	{
		return BKDRHash(str.c_str());
	}
};

// Hash functor implementing the SDBM string hash (multiplier 65599).
struct StrToInt2
{
	// Hashes the NUL-terminated string `str`; the empty string hashes to 0.
	// Arithmetic is done in size_t, so overflow wraps around.
	std::size_t SDBMHash(const char *str) const
	{
		std::size_t hash = 0;
		// Go through unsigned char so bytes >= 0x80 are not sign-extended
		// on platforms where plain char is signed.
		while (std::size_t ch = static_cast<unsigned char>(*str++))
		{
			hash = 65599 * hash + ch;
		}
		return hash;
	}

	// Hashes the characters of `str` up to its first NUL character.
	std::size_t operator()(const std::string &str) const
	{
		return SDBMHash(str.c_str());
	}
};
// Hash functor implementing the RS string hash: the multiplier starts at
// 63689 and is itself multiplied by 378551 after every character.
struct StrToInt3
{
	// Hashes the NUL-terminated string `str`; the empty string hashes to 0.
	// Arithmetic is done in size_t, so overflow wraps around.
	std::size_t RSHash(const char *str) const
	{
		std::size_t hash = 0;
		std::size_t magic = 63689;
		// Go through unsigned char so bytes >= 0x80 are not sign-extended
		// on platforms where plain char is signed.
		while (std::size_t ch = static_cast<unsigned char>(*str++))
		{
			hash = hash * magic + ch;
			magic *= 378551;
		}
		return hash;
	}

	// Hashes the characters of `str` up to its first NUL character.
	std::size_t operator()(const std::string &str) const
	{
		return RSHash(str.c_str());
	}
};
// Bloom filter
#define _N 1000
// Approximate membership set backed by a bitmap of _N * 5 bits and three
// hash functors. Test() can report an element that was never inserted
// (all three of its bits happen to be set), but it never reports an
// inserted element as absent.
template<class T>
class BloomFilter
{
public:
	BloomFilter() : m_size(0)
	{}

public:
	// Records `str` by setting the bit selected by each hash functor.
	void Insert(const T &str)
	{
		const std::size_t slots = m_bmp.size();
		m_bmp.set(HashFunc1(str) % slots);
		m_bmp.set(HashFunc2(str) % slots);
		m_bmp.set(HashFunc3(str) % slots);
		++m_size;
	}

	// Returns false as soon as one of the three bits for `str` is clear
	// (definitely never inserted); returns true when all three are set
	// (possibly inserted).
	bool Test(const T &str)
	{
		const std::size_t slots = m_bmp.size();
		return m_bmp.test(HashFunc1(str) % slots)
			&& m_bmp.test(HashFunc2(str) % slots)
			&& m_bmp.test(HashFunc3(str) % slots);
	}

private:
	std::bitset<_N * 5> m_bmp;   // the bitmap itself
	StrToInt1 HashFunc1;
	StrToInt3 HashFunc3;
	StrToInt2 HashFunc2;

	std::size_t m_size;          // number of Insert() calls so far
};