布隆过滤器:
1、用途:本质相当于一个集合,用于快速判断一个元素是否在集合之内
2、实现原理:
2.1、自行准备若干hash函数
2.2、准备一个足量的数组创建bitmap,足量与否根据hash值范围
2.3、插入时:用所有hash函数对key取hash值,均插入bitmap
2.4、判断时:用所有hash函数对key取hash值,判断bitmap是否均已set
2.5、注意:已经插入的数据,判断时一定会被判定为存在;也就是说,如果bloomfilter判定某个元素不存在,那它必定不存在,不会冤枉它。但偶尔会出现并不存在的数据也被判定为存在的情况,即对“存在”的判定可能有误判(假阳性),而且随着插入的数据越来越多,误判几率会升高,但总体几率很低。
3、优点:哈希表(hash)也可以实现这个功能,但哈希表的实现往往内存占用很大,并且随着数据越来越多需要不断rehash、占用越来越大;bloomfilter的bitmap大小固定,不会增大
缺点:误判;另外当想删除已插入的元素时比较麻烦(也可以实现如rocksdb但比较麻烦)
4、实际应用:除了“海量网址判重”、“邮件黑白名单过滤”等这些典型例子外,bloomfilter在rocksdb中被用于优化读取,属rocksdb重要改进点之一。
代码:
#include <memory.h>

#include <iostream>
#include <memory>
#include <random>
#include <set>
#include <string>
// Multiplicative string hash ("RS" variant) over the first `len` bytes of `str`.
//
// @param str  Pointer to at least `len` readable bytes (no terminator needed).
// @param len  Number of bytes to hash; 0 yields a hash of 0.
// @return     The hash value, computed modulo 2^N for an N-bit unsigned int.
//
// All arithmetic is done on unsigned int so that the intended wrap-around is
// well defined; the previous signed accumulators overflowed (undefined
// behaviour) as soon as the multiplier was advanced.
unsigned int RSHash(const char* str, unsigned int len) {
    const unsigned int b = 378551;
    unsigned int a = 63689;
    unsigned int hash = 0;
    for (unsigned int i = 0; i < len; ++i) {
        // The byte goes through plain `char` first, so a negative char is
        // sign-extended exactly as it was in the signed formulation.
        hash = hash * a + static_cast<unsigned int>(str[i]);
        a *= b;
    }
    return hash;
}
// Shift/xor string hash ("JS" variant) over the first `len` bytes of `str`.
//
// @param str  Pointer to at least `len` readable bytes (no terminator needed).
// @param len  Number of bytes to hash; 0 yields the seed 1315423911.
// @return     The hash value.
//
// The left shift and the additions are performed on unsigned int so that they
// wrap instead of overflowing a signed int (undefined behaviour). The right
// shift is still applied to the signed view of the value: the original code
// shifted a signed int, which propagates the sign bit on mainstream
// compilers, and switching to a logical shift would change every hash whose
// top bit is set.
unsigned int JSHash(const char* str, unsigned int len) {
    unsigned int hash = 1315423911;
    for (unsigned int i = 0; i < len; ++i) {
        const unsigned int shifted_up = hash << 5;
        const unsigned int shifted_down =
            static_cast<unsigned int>(static_cast<int>(hash) >> 2);
        hash ^= shifted_up + static_cast<unsigned int>(str[i]) + shifted_down;
    }
    return hash;
}
// PJW-style string hash over the first `len` bytes of `str`.
//
// Each byte is shifted in one eighth of a word at a time; whenever the top
// eighth of the word becomes non-zero it is folded back into the lower bits
// and cleared.
//
// @param str  Pointer to at least `len` readable bytes (no terminator needed).
// @param len  Number of bytes to hash; 0 yields a hash of 0.
// @return     The hash value; the top eighth of its bits is always zero.
//
// The accumulator and the mask are unsigned so that the shifts are well
// defined (the signed version left-shifted a negative mask and overflowed the
// accumulator). The fold still shifts the *signed* view of the extracted high
// bits, because the original code did so and a logical shift would change the
// result whenever the top bit is set.
unsigned int PJWHash(const char* str, unsigned int len) {
    constexpr unsigned int kBits = static_cast<unsigned int>(sizeof(unsigned int) * 8);
    constexpr unsigned int kThreeQuarters = (kBits * 3) / 4;
    constexpr unsigned int kOneEighth = kBits / 8;
    constexpr unsigned int kHighBits = 0xFFFFFFFFu << (kBits - kOneEighth);

    unsigned int hash = 0;
    for (unsigned int i = 0; i < len; ++i) {
        hash = (hash << kOneEighth) + static_cast<unsigned int>(str[i]);
        const unsigned int test = hash & kHighBits;
        if (test != 0) {
            const unsigned int folded =
                static_cast<unsigned int>(static_cast<int>(test) >> kThreeQuarters);
            hash = (hash ^ folded) & ~kHighBits;
        }
    }
    return hash;
}
// Fixed-size bit set addressing bit indexes 0..max, packed 32 bits per word.
//
// Storage is owned through std::unique_ptr, so the class needs no destructor
// and cannot be copied by accident (the raw-pointer version would have freed
// the same buffer twice if an instance was ever copied). Indexes are not
// range-checked: callers must pass d <= max.
class Bitmap {
    // Words are unsigned so that setting or testing bit 31 does not overflow
    // a signed int.
    std::unique_ptr<unsigned int[]> data;
public:
    // Allocates max/32 + 1 zero-initialised words.
    Bitmap(int max) : data(new unsigned int[max / 32 + 1]()) {}

    // Sets bit `d`. Also writes `d` to std::cout, as the original did.
    // NOTE(review): the trace looks like leftover debugging output; it is kept
    // so that the program's output does not change - confirm before removing.
    void Set(unsigned int d) {
        std::cout << d << std::endl;
        data[d / 32] |= (1u << (d % 32));
    }

    // Returns true when bit `d` has been set.
    bool Get(unsigned int d) const {
        return (data[d / 32] & (1u << (d % 32))) != 0;
    }
};
// Bloom filter over int keys backed by a single Bitmap.
//
// A key is rendered as its decimal string and hashed with RSHash, JSHash and
// PJWHash; each hash is reduced modulo kBitCount to pick one bit. Set() turns
// the three bits on, Get() reports whether all three are on. A key that was
// Set() is therefore always reported by Get(); a key that was never Set() can
// still be reported if other keys happened to turn on the same three bits.
class BloomFilter {
    // Number of addressable bits; every hash is reduced modulo this value.
    static constexpr unsigned int kBitCount = 1000 * 1000 * 100;

    std::unique_ptr<Bitmap> bmap;

    // The three bit positions derived from one key.
    struct BitIndexes {
        unsigned int rs;
        unsigned int js;
        unsigned int pjw;
    };

    // Builds the decimal string once and hashes it with all three functions,
    // instead of re-creating the string for every argument of every call.
    static BitIndexes IndexesFor(int d) {
        const std::string key = std::to_string(d);
        const char* text = key.c_str();
        const unsigned int len = static_cast<unsigned int>(key.length());
        BitIndexes idx;
        idx.rs = RSHash(text, len) % kBitCount;
        idx.js = JSHash(text, len) % kBitCount;
        idx.pjw = PJWHash(text, len) % kBitCount;
        return idx;
    }
public:
    // The bitmap covers bit indexes 0..kBitCount-1, matching the modulus above.
    BloomFilter() : bmap(new Bitmap(kBitCount - 1)) {}

    // Records `d` in the filter.
    void Set(int d) {
        const BitIndexes idx = IndexesFor(d);
        bmap->Set(idx.rs);
        bmap->Set(idx.js);
        bmap->Set(idx.pjw);
    }

    // Returns true when `d` may have been inserted, false when it definitely
    // has not.
    bool Get(int d) const {
        const BitIndexes idx = IndexesFor(d);
        return bmap->Get(idx.rs) && bmap->Get(idx.js) && bmap->Get(idx.pjw);
    }
};
// Demo: inserts random keys into a BloomFilter, checks that every inserted key
// is reported as present, then measures how often keys that were never
// inserted are reported as present.
int main () {
    const int kKeyRange = 1000 * 1000 * 100;
    const int kInsertCount = 100;

    // First insert kInsertCount random keys (duplicates collapse in the set).
    std::set<int> inserted;
    std::random_device rd;
    BloomFilter blf;
    for (int i = 0; i < kInsertCount; i++) {
        // The modulus must be parenthesised: `rd() % 1000 * 1000 * 100` parses
        // as `(rd() % 1000) * 100000` and only ever yields 1000 distinct keys.
        const int cur = static_cast<int>(rd() % kKeyRange);
        inserted.insert(cur);
        blf.Set(cur);
    }

    // Every inserted key must be found again.
    for (const int key : inserted) {
        if (!blf.Get(key)) {
            std::cout << key << ": fail" << std::endl;
        }
    }

    // False-positive rate: probe every key in range that was NOT inserted and
    // count how many the filter nevertheless reports as present. Inserted keys
    // are skipped because a hit on them is correct; counting them (as the
    // earlier version did) puts a floor of about 100 / 1e8 = 0.000001 under
    // the reported rate even when there are no false positives at all.
    int fail = 0, succ = 0;
    for (int i = 0; i < kKeyRange; i++) {
        if (inserted.count(i) != 0) {
            continue;
        }
        if (blf.Get(i)) {
            ++fail;
        } else {
            ++succ;
        }
    }
    std::cout << std::fixed << double(fail)/double(fail + succ) << std::endl;
    return 0;
}