#ifndef __BLOOM_FILTER_HPP__
#define __BLOOM_FILTER_HPP__
#include <cstdlib>
#include <cstring>
/*
 * Default hash: the value itself, implicitly converted to unsigned int.
 * A type that has no such conversion needs its own specialization.
 */
template <typename T>
unsigned int GetHash(const T & value)
{
    const unsigned int hash = value;
    return hash;
}
/*
 * Multiplier table: entry i scales the hash used for bit table i.
 * Its length is also the upper bound on the number of tables a filter may use.
 */
const int prime[] = {
      3,   5,   7,  11,  13,  17,  19,  23,
     29,  31,  37,  41,  43,  47,  53,  59,
     61,  67,  71,  73,  79,  83,  89,  97,
    101, 103, 107, 109, 113, 127, 131, 137,
    139, 149, 151, 157, 163, 167, 173, 179
};
/*
 * Bloom filter over values of type T, stored as m_k separate bit tables.
 * For table i the bit position is GetHash(value) * prime[i], reduced modulo
 * the per-table bit count.
 */
template <typename T>
class BloomFilter
{
public:
    /*
     * count: number of bit positions in each table (0 aborts the program).
     * k:     number of tables; clamped to [2, number of entries in prime[]].
     */
    BloomFilter(unsigned int count, unsigned int k = 10);
    ~BloomFilter();

    /* Sets the bit selected for value in every table. */
    void set(const T & value);

    /*
     * Returns false as soon as one table has value's bit unset, true when all
     * probed bits are set (which can also happen for a value never passed to
     * set(), since different values may select the same bits).
     */
    bool test(const T & value);

private:
    /* Copying is disabled: declared private and never defined in this header. */
    BloomFilter(const BloomFilter &);
    BloomFilter & operator = (const BloomFilter &);

    /* Resets the bit selected for value in every table; not part of the public API. */
    void clear(const T & value);

    unsigned int m_k;          /* number of bit tables in use */
    unsigned int m_size;       /* bytes allocated for each table */
    unsigned int m_count;      /* bit positions per table; modulus applied to keys */
    unsigned char ** m_filter; /* m_k pointers, one per bit table */
};
/*
 * Builds a filter made of k bit tables, each holding `count` bit positions.
 *
 * count  number of bit positions per table; keys are reduced modulo this
 *        value, so zero is rejected with abort().
 * k      requested number of tables; clamped to [2, number of entries in
 *        prime[]] because each table needs its own multiplier.
 *
 * Allocation failure propagates as whatever operator new throws
 * (std::bad_alloc by default). Tables allocated before the failure are
 * released first, because the destructor does not run for an object whose
 * constructor exits with an exception.
 */
template <typename T>
BloomFilter<T>::BloomFilter(unsigned int count, unsigned int k)
    : m_k(k), m_size(0), m_count(count), m_filter(NULL)
{
    const unsigned int primeCount = sizeof(prime) / sizeof(prime[0]);

    if (m_count == 0) {
        abort();
    }
    if (primeCount < 2) {
        abort();
    }
    if (m_k > primeCount) {
        m_k = primeCount;
    }
    else if (m_k < 2) {
        m_k = 2;
    }

    /*
     * Keys lie in [0, m_count), so the highest byte ever addressed is
     * (m_count - 1) >> 3. Written this way (rather than (m_count + 7) / 8)
     * so it cannot overflow; m_count >= 1 is guaranteed above.
     */
    m_size = ((m_count - 1) >> 3) + 1;

    typedef unsigned char * ucharptr;
    m_filter = new ucharptr[m_k];

    /* `built` counts fully allocated tables so a failure can undo exactly those. */
    unsigned int built = 0;
    try {
        for (; built < m_k; ++built) {
            m_filter[built] = new unsigned char[m_size];
            memset(m_filter[built], 0, m_size);
        }
    }
    catch (...) {
        while (built > 0) {
            --built;
            delete[] m_filter[built];
        }
        delete[] m_filter;
        throw;
    }
}
/* Frees every one of the m_k bit tables, then the array that held their pointers. */
template <typename T>
BloomFilter<T>::~BloomFilter()
{
    unsigned int remaining = m_k;
    while (remaining > 0) {
        --remaining;
        delete[] m_filter[remaining];
    }
    delete[] m_filter;
}
/*
 * Records value: for each table, scales the value's hash by that table's
 * multiplier, reduces it modulo m_count, and turns the resulting bit on.
 */
template <typename T>
void BloomFilter<T>::set(const T & value)
{
    const unsigned int base = GetHash(value);
    for (unsigned int table = 0; table != m_k; ++table) {
        const unsigned int position = (base * prime[table]) % m_count;
        const unsigned int byteIndex = position >> 3;      /* 8 bits per byte */
        const unsigned char mask =
            static_cast<unsigned char>(0x01 << (position & 0x07));
        m_filter[table][byteIndex] |= mask;
    }
}
/*
 * Turns off, in every table, the bit that set() would turn on for value.
 * Any other value that selects the same bit in a table loses it as well.
 */
template <typename T>
void BloomFilter<T>::clear(const T & value)
{
    const unsigned int base = GetHash(value);
    for (unsigned int table = 0; table != m_k; ++table) {
        const unsigned int position = (base * prime[table]) % m_count;
        const unsigned int byteIndex = position >> 3;      /* 8 bits per byte */
        const unsigned char keep =
            static_cast<unsigned char>(~(0x01 << (position & 0x07)));
        m_filter[table][byteIndex] &= keep;
    }
}
/*
 * Checks the bit selected for value in each table, using the same key
 * computation as set(). Returns false at the first table whose bit is off;
 * returns true only when the bit is on in all m_k tables.
 */
template <typename T>
bool BloomFilter<T>::test(const T & value)
{
    const unsigned int base = GetHash(value);
    for (unsigned int table = 0; table != m_k; ++table) {
        const unsigned int position = (base * prime[table]) % m_count;
        const unsigned int mask = 0x01u << (position & 0x07);
        if ((m_filter[table][position >> 3] & mask) == 0) {
            return false;
        }
    }
    return true;
}
#endif
#include <vector>
#include <string>
#include <iostream>
using namespace std;
#include "BloomFilter.hpp"
/*
 * Hash for strings: FNV-1a over the bytes of value, using the 32-bit
 * offset basis and prime.
 *
 * Each byte is mixed in with xor-then-multiply, so the result depends on
 * character order (permutations of the same characters no longer share a
 * hash) and is not confined to multiples of a fixed factor. Bytes are read
 * as unsigned char so the result does not depend on whether plain char is
 * signed on the target platform.
 */
template <>
unsigned int GetHash(const string & value)
{
    unsigned int hash = 2166136261u;               /* FNV-1a 32-bit offset basis */
    for (string::size_type i = 0; i < value.size(); ++i) {
        hash ^= static_cast<unsigned char>(value[i]);
        hash *= 16777619u;                         /* FNV 32-bit prime */
    }
    return hash;
}
int main(int argc, char ** argv)
{
int array[] = { 9, 5, 4, 6, 7, 8, 0, 1, 55, -100 };
const int size = sizeof(array)/sizeof(array[0]);
int min = array[0];
int max = array[0];
cout << "array: ";
for (int i = 0; i < size; ++i) {
if (array[i] > max) {
max = array[i];
}
else if (array[i] < min) {
min = array[i];
}
cout << array[i] << ' ';
}
cout << endl;
BloomFilter<int> filter1(size);
BloomFilter<int> filter2(5 * size);
BloomFilter<int> filter3(10 * size);
/* must be right, but BitMap can do it */
BloomFilter<int> filter4(max - min);
for (int i = 0; i < size; ++i) {
filter1.set(array[i]);
filter2.set(array[i]);
filter3.set(array[i]);
filter4.set(array[i]);
}
cout << "sorted1: ";
for (int value = min; value <= max; ++value) {
if (filter1.test(value)) {
cout << value << ' ';
}
}
cout << endl;
cout << "sorted2: ";
for (int value = min; value <= max; ++value) {
if (filter2.test(value)) {
cout << value << ' ';
}
}
cout << endl;
cout << "sorted3: ";
for (int value = min; value <= max; ++value) {
if (filter3.test(value)) {
cout << value << ' ';
}
}
cout << endl;
cout << "sorted4: ";
for (int value = min; value <= max; ++value) {
if (filter4.test(value)) {
cout << value << ' ';
}
}
cout << endl;
/* ------------------------------------------ */
const char * const url[] = {
"www.google.com.hk",
"www.bing.com.cn",
"www.baidu.com",
"www.manmankan.com",
"www.csdn.net"
};
BloomFilter<string> filter(500);
for (int i = 0; i < sizeof(url)/sizeof(url[0]); ++i) {
filter.set(url[i]);
}
const char * const check[] = {
"www.google.com.hk",
"www.bing.com.cn",
"www.baidu.com",
"www.manmankan.com",
"www.csdn.net",
"www.hao123.com",
"www.sohu.com",
"www.soso.com",
"www.sina.com",
"www.nosuchurl.com"
};
for (int i = 0; i < sizeof(check)/sizeof(check[0]); ++i) {
if (filter.test(check[i])) {
cout << check[i] << " is exist" << endl;
}
else {
cout << check[i] << " is not exist" << endl;
}
}
return(0);
}
代码中的位数组大小（m_size，受传入的 m_count 影响）、哈希函数个数（m_k）以及哈希函数的构造（GetHash）都不太恰当。