Bloom filter 利用多个哈希函数把每个 key 映射到位数组中的若干个 bit 上,以允许少量误判(false positive)为代价,大幅节省存储空间。
例如,搜索引擎的爬虫在判断某个页面是否已经爬取过时,就可以使用 Bloom filter。
具体介绍可以参考这篇博客:http://www.cnblogs.com/heaad/archive/2011/01/02/1924195.html
下面是我的实现:
#include "stdafx.h"
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
using namespace std;
/**
 * Bloom filter over C strings, backed by a heap-allocated bit array.
 *
 * Each key is mapped to two bit positions (hash1 and hash2). The object owns
 * the `bitmap` buffer, which is allocated in the constructor and released in
 * the destructor, so copying is disabled to prevent a double delete[].
 */
class bloomfilter
{
private:
    unsigned int m;  // width of the bit array, in bits
    unsigned int k;  // number of hash functions in use
    double f;        // false-positive rate
    unsigned int n;  // number of keys
    char* bitmap;    // owned bit array; bit i lives in byte i / 8
private:
    // Non-copyable: declared private and intentionally left undefined, so an
    // accidental copy fails at compile time (or link time from inside the
    // class) instead of deleting the same buffer twice.
    bloomfilter(const bloomfilter&);
    bloomfilter& operator=(const bloomfilter&);

    // Hash functions: each maps a C string to a bit index in [0, m).
    int hash1(char* str);
    int hash2(char* str);
    // Returns 1 when bit `nn` of the bit array is set, 0 otherwise.
    int getbit(const int nn);
    // Sets bit `nn` of the bit array.
    void setbit(const int nn);
public:
    // Creates an empty filter; N is recorded as the number of keys.
    bloomfilter(const int N);
    // Test-and-insert: returns true only when both of the key's bits were
    // already set; any bit that was not set is set before returning.
    bool find(char* str);
    ~bloomfilter();
};
// Creates an empty filter with every bit cleared.
//
// N is stored as the key count but does not influence the sizing: the bit
// array width and the number of hash functions are fixed constants here.
//
// Notes kept from the original author:
//   - the number of hash functions needed for a false-positive rate f is
//     k = -ln(f) / ln(2), i.e. k = -log(f) / log(2.0);
//   - f, n, m and k are actually related (this implementation takes a shortcut):
//     n = m ln(0.6185) / ln(f), i.e. m = (n + 1)*log(f) / log(0.6185);
//   - m counts bits, and one char holds 8 of them.
bloomfilter::bloomfilter(const int N)
    : m(1000000),
      k(2),
      f(0.00001),
      n(N),
      // One spare byte covers the bits left over when m is not a multiple of 8;
      // the trailing () value-initializes the array, so every bit starts at 0.
      bitmap(new char[m / 8 + 1]())
{
}
// Frees the bit array that the constructor allocated with new[].
bloomfilter::~bloomfilter()
{
    delete[] bitmap;
}
// Test-and-insert for one key.
//
// Returns true only when both bit positions derived from `str` were already
// set. Any position that was not set is set as a side effect, so a repeated
// call with the same key returns true.
bool bloomfilter::find(char*str)
{
    const int positions[2] = { hash1(str), hash2(str) };

    int alreadySet = 0;
    // Positions are handled strictly in order: when both hashes land on the
    // same bit, the second check sees the bit written by the first.
    for (int i = 0; i < 2; ++i)
    {
        if (getbit(positions[i]))
            ++alreadySet;
        else
            setbit(positions[i]);
    }
    return alreadySet == 2;
}
// Bit operation: returns 1 when bit `nn` of the bit array is set, 0 otherwise.
//
// `nn` is a bit index: the byte is nn / 8 and the bit inside it is nn % 8.
int bloomfilter::getbit(const int nn)
{
    const int byteIndex = nn >> 3;
    const int bitOffset = nn & 7;
    // Read the byte as unsigned: with a signed char, a byte whose top bit is
    // set is negative, and shifting it right then taking % 2 yields -1 rather
    // than 1, which made set bits look unset.
    const unsigned char byte = static_cast<unsigned char>(bitmap[byteIndex]);
    return (byte >> bitOffset) & 1;
}
// Bit operation: sets bit `nn` of the bit array.
//
// `nn` is a bit index: the byte is nn / 8 and the bit inside it is nn % 8.
// Setting a bit that is already set leaves the byte unchanged.
void bloomfilter::setbit(const int nn)
{
    const int byteIndex = nn >> 3;
    const int bitOffset = nn & 7;
    // OR the mask in rather than adding it: addition on an already-set bit
    // would carry into (and corrupt) the neighbouring bits.
    bitmap[byteIndex] |= static_cast<char>(1u << bitOffset);
}
// The hash functions can be swapped for any others that suit the data.
//
// First hash: multiplicative hash (h = 31 * h + c) over the characters of the
// NUL-terminated string `str`, reduced modulo m to give a bit index in [0, m).
int bloomfilter::hash1(char*str)
{
    unsigned int h = 0;
    // Stop at the string terminator. Testing for '\n' instead would run past
    // the end of any string that does not contain a newline.
    for (const char* p = str; *p != '\0'; ++p)
    {
        h = 31 * h + *p;  // unsigned arithmetic: wrap-around is well defined
    }
    return h % m;
}
// Second hash: folds every character of the NUL-terminated string `str` into
// an accumulator using acc = c + (acc << 6) + (acc << 16) - acc, then reduces
// the result modulo m to give a bit index in [0, m).
int bloomfilter::hash2(char*str)
{
    unsigned int acc = 0;
    for (const char* cursor = str; *cursor != '\0'; ++cursor)
    {
        acc = (*cursor) + (acc << 6) + (acc << 16) - acc;
    }
    return acc % m;
}
// Demo driver: queries the filter with two URLs, repeating the first one, and
// prints each result without separators. The first query of a key reports 0
// and the repeated query reports 1 (the middle result is 0 unless the second
// URL happens to collide with the first on both hash positions).
int _tmain(int argc, _TCHAR* argv[])
{
    bloomfilter bf(1000);

    // find() takes a non-const char*, and a string literal cannot be converted
    // to char* in standard C++11 and later, so the keys live in mutable arrays.
    char google[] = "www.google.com";
    char baidu[] = "www.baidu.com";

    cout << bf.find(google);
    cout << bf.find(baidu);
    cout << bf.find(google);

    system("pause");  // keep the console window open when launched from the IDE
    return 0;
}