大数据处理算法--Bloom Filter

最新推荐文章于 2020-08-29 23:59:14 发布

db199410

最新推荐文章于 2020-08-29 23:59:14 发布

阅读量570

点赞数 3

分类专栏：数据结构文章标签： hash 算法 filter 大数据

本文链接：https://blog.csdn.net/db199410/article/details/52453998

版权

数据结构专栏收录该内容

9 篇文章 0 订阅

订阅专栏

一、布隆过滤器（Bloom Filter）的定义

布隆过滤器可以用来检测数据是否存在于一个集合中。它是hash的扩展，底层就是一个位数组，每一个bit位可以表示一个数字，所以布隆过滤器是基于位图来实现的。

二、布隆过滤器的原理

1、插入数据

在位图中，每一个bit对应一个数字，出现一个数字就可以在相应的位上置1。但是布隆过滤器不一样，它要处理的不仅仅是整型还有其他如string类型的数据，因此，当大量的string类信息需要处理的时候，难免会引起大量的冲突。布隆过滤器在这里的处理是用多个bit位来表示string类型的数据，这多个bit位通过不同的hash函数得出，这样冲突的概率就减少了。

2、删除数据

布隆过滤器不支持删除数据，因为一个数据是由好几个bit位来表示的，难免几个数据的bit位会重叠，如果把其中一个bit位清零，其他的数据也可能受到影响。那么如何实现布隆过滤器的删除呢？这里可以引入引用计数的概念，将位数组扩展为整型数组，数组的下标对应数据通过hash函数得出的数，数组中存放的是该下标被置位的次数，即引用计数。

3、检测数据

同插入数据时一样，这里的数据需要用几个hash函数得出多个bit位，如果这些bit位在位数组中全部为1，那么这个数据可能存在。为什么说是可能存在呢，因为有可能这个数据的几个bit位是由其他数据置为1的。如果几个bit中有一个为0，那么这个数据一定就不存在了。

三、布隆过滤器的特点

1、优点：它相比于hash，红黑树等结构更加节省空间，而且插入效率和查找效率都远远超过一般算法

2、缺点：不支持删除操作；查找的结果不一定准确（结果为不存在时是准确的，结果为存在时可能是误判）；

四、布隆过滤器的应用

像网易，QQ这样的公众电子邮件（email）提供商，总是需要过滤来自发送垃圾邮件的人（spammer）的垃圾邮件。

一个办法就是记录下那些发垃圾邮件的 email地址。由于那些发送者不停地在注册新的地址，全世界少说也有几十亿个发垃圾邮件的地址，将他们都存起来则需要大量的网络服务器。

如果用哈希表，每存储一亿个 email地址，就需要 1.6GB的内存（用哈希表实现的具体办法是将每一个 email地址对应成一个八字节的信息指纹，然后将这些信息指纹存入哈希表，由于哈希表的存储效率一般只有 50%，因此一个email地址需要占用十六个字节。一亿个地址大约要 1.6GB，即十六亿字节的内存）。因此存贮几十亿个邮件地址可能需要上百 GB的内存。

而Bloom Filter只需要哈希表 1/8到 1/4 的大小就能解决同样的问题。

BloomFilter决不会漏掉任何一个在黑名单中的可疑地址。而至于误判问题，常见的补救办法是再建立一个小的白名单，存储那些可能被误判的邮件地址。

五、布隆过滤器的实现

#include<iostream>
#include "bitmap.h"
using namespace std;

// First hash functor used by the Bloom filter: a multiplicative string hash
// with multiplier 131 whose accumulator starts at 1.
struct HashFunc1
{
	// Hashes the NUL-terminated string `str`.
	// Each character is folded in as `acc = acc * 131 + c` using unsigned int
	// arithmetic; the top bit of the result is cleared before returning.
	static size_t BKDRHash(const char *str)
	{
		const unsigned int multiplier = 131;
		unsigned int acc = 1;
		for (const char *cursor = str; *cursor != '\0'; ++cursor)
		{
			acc = acc * multiplier + *cursor;
		}

		return acc & 0x7fffffff;
	}

	// Functor entry point: hashes the character data of `str` up to its
	// first NUL (the string is passed on via c_str()).
	size_t operator()(const std::string &str)
	{
		const char *raw = str.c_str();
		return BKDRHash(raw);
	}
};
// Second hash functor used by the Bloom filter: a multiplicative string hash
// with multiplier 131, accumulator starting at 0, computed in size_t.
struct HashFunc2
{
	// Hashes the NUL-terminated string `str` as `hash = hash * 131 + ch`.
	// Returns 0 for the empty string.
	// The `register` specifier was dropped: it was removed from the language
	// in C++17 and had no effect on the result.
	static size_t BKDRHash(const char *str)
	{
		size_t hash = 0;
		// The char is converted straight to size_t, so on platforms where
		// plain char is signed, bytes >= 0x80 are sign-extended.
		while (size_t ch = static_cast<size_t>(*str++))
		{
			hash = hash * 131 + ch;
		}
		return hash;
	}

	// Functor entry point: hashes the character data of `str` up to its
	// first NUL (the string is passed on via c_str()).
	size_t operator()(const std::string &str) const
	{
		return BKDRHash(str.c_str());
	}
};
// Third hash functor used by the Bloom filter: a shift/xor string hash seeded
// with 1315423911. NOTE(review): despite the member name, the recurrence looks
// like the one commonly published as "JS hash" - confirm before renaming.
struct HashFunc3
{
	// Hashes the NUL-terminated string `str`.
	// For every character: hash ^= (hash << 5) + ch + (hash >> 2).
	// The `register` specifier was dropped: it was removed from the language
	// in C++17 and had no effect on the result.
	static size_t BKDRHash(const char *str)
	{
		// Added by the original author so that the empty string hashes to 0
		// instead of the seed value.
		if (!*str)
			return 0;
		size_t hash = 1315423911;
		// The char is converted straight to size_t, so on platforms where
		// plain char is signed, bytes >= 0x80 are sign-extended.
		while (size_t ch = static_cast<size_t>(*str++))
		{
			hash ^= ((hash << 5) + ch + (hash >> 2));
		}
		return hash;
	}

	// Functor entry point: hashes the character data of `str` up to its
	// first NUL (the string is passed on via c_str()).
	size_t operator()(const std::string &str) const
	{
		return BKDRHash(str.c_str());
	}
};
// Fourth hash functor used by the Bloom filter: a multiplicative string hash
// whose multiplier itself changes after every character.
// NOTE(review): despite the member name, the constants 63689 / 378551 look
// like the ones commonly published as "RS hash" - confirm before renaming.
struct HashFunc4
{
	// Hashes the NUL-terminated string `str`.
	// For every character: hash = hash * magic + ch, then magic *= 378551,
	// with magic starting at 63689. Returns 0 for the empty string.
	// The `register` specifier was dropped: it was removed from the language
	// in C++17 and had no effect on the result.
	static size_t BKDRHash(const char *str)
	{
		size_t hash = 0;
		size_t magic = 63689;
		// The char is converted straight to size_t, so on platforms where
		// plain char is signed, bytes >= 0x80 are sign-extended.
		while (size_t ch = static_cast<size_t>(*str++))
		{
			hash = hash * magic + ch;
			magic *= 378551;   // unsigned arithmetic: wraps, never overflows
		}
		return hash;
	}

	// Functor entry point: hashes the character data of `str` up to its
	// first NUL (the string is passed on via c_str()).
	size_t operator()(const std::string &str) const
	{
		return BKDRHash(str.c_str());
	}
};
// Fifth hash functor used by the Bloom filter: a shift/xor string hash that
// alternates between two mixing steps on even and odd character positions.
// NOTE(review): despite the member name, the recurrence looks like the one
// commonly published as "AP hash" - confirm before renaming.
struct HashFunc5
{
	// Hashes the NUL-terminated string `str`.
	//   even index: hash ^=  ((hash << 7)  ^ ch ^ (hash >> 3))
	//   odd index : hash ^= ~((hash << 11) ^ ch ^ (hash >> 5))
	// Returns 0 for the empty string.
	// The `register` specifier was dropped: it was removed from the language
	// in C++17 and had no effect on the result.
	static size_t BKDRHash(const char *str)
	{
		size_t hash = 0;
		size_t ch;
		// The loop ends when the terminating NUL is read. The char is
		// converted straight to size_t, so on platforms where plain char is
		// signed, bytes >= 0x80 are sign-extended.
		for (long i = 0; (ch = static_cast<size_t>(*str++)) != 0; i++)
		{
			if ((i & 1) == 0)
			{
				hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
			}
			else
			{
				hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
			}
		}
		return hash;
	}

	// Functor entry point: hashes the character data of `str` up to its
	// first NUL (the string is passed on via c_str()).
	size_t operator()(const std::string &str) const
	{
		return BKDRHash(str.c_str());
	}
};
// Bloom filter: a probabilistic set-membership structure backed by a `bitmap`.
//
// Every key is mapped to five bit positions, one per hash functor, each
// reduced modulo the bitmap size. Set() turns those bits on; Test() reports
// true only if all five are on. A true answer can therefore be a false
// positive (the bits may have been set by other keys); a false answer is
// definitive.
//
// Template parameters:
//   K            key type accepted by Set()/Test() (std::string by default)
//   Hash1..Hash5 default-constructible functors callable as `size_t h(key)`.
//                They are now actually used; previously the class ignored
//                them and always called HashFunc1..HashFunc5 on a string.
template<class K = std::string,
class Hash1 = HashFunc1,
class Hash2 = HashFunc2,
class Hash3 = HashFunc3,
class Hash4 = HashFunc4,
class Hash5 = HashFunc5>
class Bloom
{
public:
	// Creates a filter whose bitmap is constructed with `size`; `size` is
	// also the modulus applied to every hash value.
	Bloom(size_t size)
		:_map(size)
	{}

	// Inserts `key` by setting its five hashed bit positions.
	// A filter created with size 0 has no addressable bits, so this is a
	// no-op there (avoids a modulo-by-zero).
	void Set(const K &key)
	{
		const size_t bits = _map.Size();
		if (bits == 0)
			return;

		_map.Set(Hash1()(key) % bits);
		_map.Set(Hash2()(key) % bits);
		_map.Set(Hash3()(key) % bits);
		_map.Set(Hash4()(key) % bits);
		_map.Set(Hash5()(key) % bits);
	}

	// Intentionally does nothing: bits are shared between keys, so clearing
	// them for one key could erase others. Kept so existing callers compile.
	void Unset()
	{

	}

	// Returns false if `key` was definitely never inserted, true if it was
	// possibly inserted. Hashing stops at the first bit found clear.
	// A filter created with size 0 contains nothing and always returns false.
	bool Test(const K &key)
	{
		const size_t bits = _map.Size();
		if (bits == 0)
			return false;

		// && short-circuits, so later hashes are only computed when every
		// earlier bit was set.
		return _map.test(Hash1()(key) % bits)
			&& _map.test(Hash2()(key) % bits)
			&& _map.test(Hash3()(key) % bits)
			&& _map.test(Hash4()(key) % bits)
			&& _map.test(Hash5()(key) % bits);
	}

private:
	bitmap _map;   // bit storage; its Size() is the hash modulus
};

以下是位图的实现

#pragma once
#include<iostream>
#include <vector>
using namespace std;

// Fixed-capacity bit set addressed by bit number, stored 32 bits per word.
//
// Fixes relative to the previous version:
//   * storage is zero-initialised in full (the old memset used
//     sizeof(pointer) and cleared only the first few bytes);
//   * bit `num` lives at position `num % 32` of its word (the old code
//     shifted by `num % 32 - 1`, i.e. by -1 for every multiple of 32);
//   * storage is a std::vector, so copying a bitmap is safe and no manual
//     destructor is needed;
//   * word indices are size_t instead of int.
class bitmap
{
public:
	// Creates a bitmap able to hold bits [0, size). (size / 32) + 1 words
	// are allocated, all cleared. `size` is what Size() reports.
	bitmap(size_t size)
		:_words((size >> 5) + 1, 0u)
		, _size(size)
	{
	}

	// Turns bit `num` on. Positions beyond the allocated words are ignored.
	void Set(size_t num)
	{
		const size_t index = num >> 5;
		if (index >= _words.size())
			return;
		_words[index] |= (1u << (num & 31));
	}

	// Turns bit `num` off. Positions beyond the allocated words are ignored.
	void Unset(size_t num)
	{
		const size_t index = num >> 5;
		if (index >= _words.size())
			return;
		_words[index] &= ~(1u << (num & 31));
	}

	// Returns true if bit `num` is on; false if it is off or lies beyond
	// the allocated words.
	bool test(size_t num) const
	{
		const size_t index = num >> 5;
		if (index >= _words.size())
			return false;
		return ((_words[index] >> (num & 31)) & 1u) == 1u;
	}

	// Returns the size passed to the constructor.
	size_t Size() const
	{
		return _size;
	}
private:
	// One element per 32 bits. Assumes unsigned int has at least 32 bits,
	// as the original int-based storage did.
	std::vector<unsigned int> _words;
	size_t _size;
};