Bloom Filter 是一种空间效率很高的随机数据结构,Bloom filter 可以看做是对 bit-map 的扩展, 它的基本思想是:
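如果要判断一个元素是不是在一个集合里,一般想到的是将所有元素保存起来,然后通过比较确定;但是随着集合元素的增加,需要的存储空间越来越大,检索的速度也会越来越慢,这时我们会想到散列表(哈希表,hash table)这种数据结构:通过一个 hash 函数将一个元素映射到一个位阵列中的一个点,这样只需要看映射到的这个位置是不是 1,就可以判断这个元素在不在这个集合中了。由于不同的元素可能被映射到同一个位置(哈希冲突),Bloom Filter 会使用多个 hash 函数把一个元素映射到多个位:只要其中有一位是 0,该元素一定不在集合中;所有位都是 1 时,该元素只是"可能"在集合中(存在一定的误判率)。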
布隆过滤器的简单实现:
common.h
/// BKDR string hash.
///
/// Folds every character of the NUL-terminated string `str` into a 32-bit
/// accumulator using a multiplicative scheme (acc = acc * 131 + c) and
/// returns the accumulator with its top bit cleared.
///
/// @param str NUL-terminated input; must not be null.
/// @return a value in the range [0, 0x7FFFFFFF]; 0 for the empty string.
size_t BKDRHash(const char *str)
{
    const unsigned int multiplier = 131; // other common choices: 31 131 1313 13131 131313
    unsigned int acc = 0;
    for (const char *cursor = str; *cursor; ++cursor)
    {
        // Unsigned arithmetic: overflow wraps around instead of being undefined.
        acc = acc * multiplier + (*cursor);
    }
    // Drop the top bit so the result always fits in 31 bits.
    return (acc & 0x7FFFFFFF);
}
/// SDBM string hash.
///
/// Computes hash = 65599 * hash + c over every character of the
/// NUL-terminated string `str`, in size_t arithmetic (wraps on overflow).
/// Multiplying by 65599 is the same as (hash << 6) + (hash << 16) - hash.
///
/// Each character is converted with a plain cast to size_t, so where `char`
/// is signed, bytes >= 0x80 are sign-extended before being added.
///
/// @param str NUL-terminated input; must not be null.
/// @return the hash value; 0 for the empty string.
size_t SDBMHash(const char* str)
{
    // No `register` here: that storage class was removed in C++17.
    size_t hash = 0;
    for (; *str != '\0'; ++str)
    {
        hash = 65599 * hash + static_cast<size_t>(*str);
    }
    return hash;
}
/// RS string hash.
///
/// Computes hash = hash * magic + c over every character of the
/// NUL-terminated string `str`; the multiplier `magic` starts at 63689 and
/// is itself multiplied by 378551 after each character. All arithmetic is
/// done in size_t and wraps on overflow.
///
/// Each character is converted with a plain cast to size_t, so where `char`
/// is signed, bytes >= 0x80 are sign-extended before being added.
///
/// @param str NUL-terminated input; must not be null.
/// @return the hash value; 0 for the empty string.
size_t RSHash(const char *str)
{
    // No `register` here: that storage class was removed in C++17.
    size_t hash = 0;
    size_t magic = 63689;
    for (; *str != '\0'; ++str)
    {
        hash = hash * magic + static_cast<size_t>(*str);
        magic *= 378551; // the multiplier changes for every position
    }
    return hash;
}
/// AP string hash (XOR variant).
///
/// Walks the NUL-terminated string `str` and alternates between two mixing
/// steps depending on whether the character index is even or odd. Both
/// steps fold the current character into the hash.
///
/// The even-index step must include `ch`: without it every even-positioned
/// character (including the first one) has no influence on the result, so
/// e.g. all one-character strings would hash to 0 and "ab" would collide
/// with "xb".
///
/// Each character is converted with a plain cast to size_t, so where `char`
/// is signed, bytes >= 0x80 are sign-extended before being mixed in.
///
/// @param str NUL-terminated input; must not be null.
/// @return the hash value; 0 for the empty string.
size_t APHash(const char* str)
{
    // No `register` here: that storage class was removed in C++17.
    size_t hash = 0;
    size_t ch;
    for (long i = 0; (ch = static_cast<size_t>(*str++)) != 0; i++)
    {
        if (0 == (i & 1))
        {
            // Even position.
            hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
        }
        else
        {
            // Odd position.
            hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
        }
    }
    return hash;
}
/// JS string hash.
///
/// Starts from the seed 1315423911 and, for every character of the
/// NUL-terminated string `str`, computes
/// hash ^= (hash << 5) + c + (hash >> 2) in size_t arithmetic.
/// The empty string is special-cased and hashes to 0 rather than the seed.
///
/// Each character is converted with a plain cast to size_t, so where `char`
/// is signed, bytes >= 0x80 are sign-extended before being mixed in.
///
/// @param str NUL-terminated input; must not be null.
/// @return the hash value; 0 for the empty string.
size_t JSHash(const char* str)
{
    if (!*str)
        return 0;
    // No `register` here: that storage class was removed in C++17.
    size_t hash = 1315423911;
    for (; *str != '\0'; ++str)
    {
        hash ^= ((hash << 5) + static_cast<size_t>(*str) + (hash >> 2));
    }
    return hash;
}
/// Hash functor that forwards to BKDRHash.
///
/// @tparam K key type; must provide `c_str()` yielding a NUL-terminated
///           `const char*` (e.g. std::string).
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept because other code refers to it.
template<class K>
struct __HashFunc1
{
    /// @return BKDRHash of the key's character data.
    /// Const-qualified so the functor can be invoked through const objects.
    size_t operator()(const K& key) const
    {
        return BKDRHash(key.c_str());
    }
};
/// Hash functor that forwards to SDBMHash.
///
/// @tparam K key type; must provide `c_str()` yielding a NUL-terminated
///           `const char*` (e.g. std::string).
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept because other code refers to it.
template<class K>
struct __HashFunc2
{
    /// @return SDBMHash of the key's character data.
    /// Const-qualified so the functor can be invoked through const objects.
    size_t operator()(const K& key) const
    {
        return SDBMHash(key.c_str());
    }
};
/// Hash functor that forwards to RSHash.
///
/// @tparam K key type; must provide `c_str()` yielding a NUL-terminated
///           `const char*` (e.g. std::string).
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept because other code refers to it.
template<class K>
struct __HashFunc3
{
    /// @return RSHash of the key's character data.
    /// Const-qualified so the functor can be invoked through const objects.
    size_t operator()(const K& key) const
    {
        return RSHash(key.c_str());
    }
};
/// Hash functor that forwards to APHash.
///
/// @tparam K key type; must provide `c_str()` yielding a NUL-terminated
///           `const char*` (e.g. std::string).
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept because other code refers to it.
template<class K>
struct __HashFunc4
{
    /// @return APHash of the key's character data.
    /// Const-qualified so the functor can be invoked through const objects.
    size_t operator()(const K& key) const
    {
        return APHash(key.c_str());
    }
};
/// Hash functor that forwards to JSHash.
///
/// @tparam K key type; must provide `c_str()` yielding a NUL-terminated
///           `const char*` (e.g. std::string).
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept because other code refers to it.
template<class K>
struct __HashFunc5
{
    /// @return JSHash of the key's character data.
    /// Const-qualified so the functor can be invoked through const objects.
    size_t operator()(const K& key) const
    {
        return JSHash(key.c_str());
    }
};
bitmap.h
#define _CRT_SECURE_NO_WARNINGS
#include<stdio.h>
#include<stdlib.h>
#include<iostream>
#include<vector>
using namespace std;
/// A growable bit set addressed by non-negative integer index.
///
/// Bits are packed into size_t words, using every bit of each word.
/// Negative indices can never be stored: set() and reset() ignore them and
/// Find() reports them as absent.
class BitMap
{
public:
    /// Creates an empty map; it grows on the first set().
    BitMap()
    {}

    /// Creates a map with room for at least `size` bits, all cleared.
    /// (Left non-explicit so existing implicit conversions keep compiling.)
    BitMap(size_t size)
        : _table(size / kBitsPerWord + 1)
    {}

    /// Sets bit `data` to 1, growing the storage if the index lies beyond
    /// the current size. Negative indices are ignored.
    void set(int data)
    {
        if (data < 0)
            return;
        const size_t bit = static_cast<size_t>(data);
        const size_t word = bit / kBitsPerWord; // which word holds the bit
        if (word >= _table.size())
            _table.resize(word + 1);            // new words are zero-filled
        _table[word] |= maskOf(bit);
    }

    /// Clears bit `data`. Indices that are negative or beyond the current
    /// storage are already "not set", so they are left alone.
    void reset(int data)
    {
        if (!inRange(data))
            return;
        const size_t bit = static_cast<size_t>(data);
        _table[bit / kBitsPerWord] &= ~maskOf(bit);
    }

    /// @return true if bit `data` is set; false if it is cleared, negative,
    ///         or beyond the current storage.
    bool Find(int data) const
    {
        if (!inRange(data))
            return false;
        const size_t bit = static_cast<size_t>(data);
        return (_table[bit / kBitsPerWord] & maskOf(bit)) != 0;
    }

private:
    // Number of bits stored per word; assumes 8-bit bytes.
    static const size_t kBitsPerWord = sizeof(size_t) * 8;

    // Single-bit mask for `bit` inside its word. The shift is done on an
    // unsigned size_t so the highest bit of the word is handled without
    // touching a sign bit.
    static size_t maskOf(size_t bit)
    {
        return static_cast<size_t>(1) << (bit % kBitsPerWord);
    }

    // True when `data` addresses a bit inside the allocated storage.
    bool inRange(int data) const
    {
        return data >= 0
            && static_cast<size_t>(data) / kBitsPerWord < _table.size();
    }

    std::vector<size_t> _table; // packed bit storage
};
bloomFilter.cpp
#include"bitmap.h"
#include"common.h"
/// A Bloom filter over keys of type K using five hash functors.
///
/// Every key is mapped to five bit positions (one per hash functor, each
/// reduced modulo the capacity). set() turns those bits on; Test() reports
/// a key as present only when all five of its bits are on. Different keys
/// can share bit positions, so Test() may return true for a key that was
/// never added (false positive), but it never returns false for a key that
/// was added and not reset.
///
/// @tparam K     key type accepted by the hash functors.
/// @tparam Fun1..Fun5 default-constructible functors callable as
///         `size_t f(const K&)`.
// NOTE(review): bit positions are handed to BitMap as `int`, so capacities
// above INT_MAX are not supported.
template<class K , class Fun1 = __HashFunc1<std::string>
,class Fun2 = __HashFunc2<std::string>
,class Fun3 = __HashFunc3<std::string>
,class Fun4 = __HashFunc4<std::string>
,class Fun5 = __HashFunc5<std::string>>
class BloomFilter
{
public:
    /// @param capacity number of bits in the filter. A capacity of 0 is
    ///        treated as 1 so that the modulo reduction is always defined.
    BloomFilter(size_t capacity=100)
        : _bitmap(capacity == 0 ? 1 : capacity)
        , _capacity(capacity == 0 ? 1 : capacity)
    {}

    /// Adds `key` to the filter and prints its five bit positions.
    void set(const K& key)
    {
        size_t idx[kHashCount];
        computeIndices(key, idx);
        for (size_t i = 0; i < kHashCount; ++i)
            _bitmap.set(static_cast<int>(idx[i]));
        printIndices(idx);
    }

    /// Clears the five bits belonging to `key` and prints their positions.
    ///
    /// WARNING: bits may be shared with other keys, so resetting one key
    /// can make Test() return false for other keys that are still meant to
    /// be in the filter. A plain Bloom filter cannot delete safely; use
    /// this only when that is acceptable.
    void reset(const K& key)
    {
        size_t idx[kHashCount];
        computeIndices(key, idx);
        for (size_t i = 0; i < kHashCount; ++i)
            _bitmap.reset(static_cast<int>(idx[i]));
        printIndices(idx);
    }

    /// @return false if `key` is definitely not in the filter; true if all
    ///         five of its bits are set, i.e. the key is possibly present.
    bool Test(const K& key)
    {
        size_t idx[kHashCount];
        computeIndices(key, idx);
        // Every probe must hit: a single cleared bit proves absence.
        for (size_t i = 0; i < kHashCount; ++i)
        {
            if (!_bitmap.Find(static_cast<int>(idx[i])))
                return false;
        }
        return true;
    }

private:
    static const size_t kHashCount = 5; // number of hash functors / probes

    // Fills out[0..4] with the bit positions of `key`, one per functor.
    void computeIndices(const K& key, size_t* out) const
    {
        out[0] = Fun1()(key) % _capacity;
        out[1] = Fun2()(key) % _capacity;
        out[2] = Fun3()(key) % _capacity;
        out[3] = Fun4()(key) % _capacity;
        out[4] = Fun5()(key) % _capacity;
    }

    // Writes the five positions to standard output as "a,b,c,d,e\n".
    static void printIndices(const size_t* idx)
    {
        std::cout << idx[0] << "," << idx[1] << "," << idx[2] << "," << idx[3] << "," << idx[4] << "\n";
    }

    BitMap _bitmap;   // underlying bit storage
    size_t _capacity; // number of addressable bits; never 0
};
void test()
{
BloomFilter<string> bf;
bf.set("it's not fair");
bf.set("everything in the world");
bf.set("just open your eyes");
cout<<bf.Test("aaaa")<<endl;
cout<<bf.Test("just open your eyes")<<endl;
}
/// Program entry point: runs the Bloom filter demo, then waits for the user.
int main()
{
    test();

    // Runs the shell command "pause" so the output stays visible.
    // NOTE(review): presumably aimed at a Windows console; confirm whether
    // other platforms need to be supported.
    system("pause");

    return EXIT_SUCCESS;
}