BloomFilter简易实现

最新推荐文章于 2022-07-11 07:36:00 发布

ken_scott

最新推荐文章于 2022-07-11 07:36:00 发布

阅读量947

点赞数

分类专栏：c++、数据结构与算法、工具　文章标签：filter、iterator、delete、url、null、string

本文链接：https://blog.csdn.net/cxxmaker/article/details/7471884

版权

c++ 同时被 3 个专栏收录

100 篇文章 0 订阅

订阅专栏

数据结构与算法

42 篇文章 0 订阅

订阅专栏

工具

22 篇文章 0 订阅

订阅专栏

#ifndef __BLOOM_FILTER_HPP__
#define __BLOOM_FILTER_HPP__


#include <cstdlib>
#include <cstring>

/*
 * Default hash: the value itself, implicitly converted to unsigned int.
 * Only usable for types that convert implicitly to unsigned int; other
 * types need their own specialization.
 */
template <typename T>
unsigned int GetHash(const T & value)
{
    const unsigned int hashed = value;
    return hashed;
}

/*
 * Multipliers applied to the base hash: the i-th hash function uses
 * prime[i]. The table holds the 40 odd primes from 3 up to 179.
 */
const int prime[] = {
      3,   5,   7,  11,  13,  17,  19,  23,
     29,  31,  37,  41,  43,  47,  53,  59,
     61,  67,  71,  73,  79,  83,  89,  97,
    101, 103, 107, 109, 113, 127, 131, 137,
    139, 149, 151, 157, 163, 167, 173, 179
};

/*
 * Bloom filter over values of type T.
 *
 * Each value is hashed once with GetHash(); every one of the m_k hash
 * functions then owns a separate bit array (one row of m_filter) in which
 * it sets or checks a single bit.
 */
template <typename T>
class BloomFilter
{
public:
    /*
     * count: number of bit positions each hash function maps into.
     * k:     requested number of hash functions (defaults to 10).
     */
    BloomFilter(unsigned int count, unsigned int k = 10);
    ~BloomFilter();

    /* Marks `value` as present. */
    void set(const T & value);

    /* Returns false if any bit for `value` is unset, true otherwise. */
    bool test(const T & value);

private:
    /* Copy operations are private: instances cannot be copied by users. */
    BloomFilter(const BloomFilter &);
    BloomFilter & operator = (const BloomFilter &);

    /*
     * Clears the bits of `value`. Kept private -- presumably because the
     * same bits may belong to other stored values (TODO confirm intent).
     */
    void clear(const T & value);

    unsigned int     m_k;       /* number of hash functions / rows     */
    unsigned int     m_size;    /* bytes allocated for each row        */
    unsigned int     m_count;   /* modulus applied to every bit index  */
    unsigned char ** m_filter;  /* m_k rows of m_size bytes each       */
};

/*
 * Builds an empty filter.
 *
 * count: number of bit positions per hash function. It is later used as a
 *        modulus, so a value of 0 aborts the process.
 * k:     requested number of hash functions; clamped to the range
 *        [2, number of entries in prime[]].
 *
 * Every row is zero-initialised. If an allocation fails, std::bad_alloc
 * propagates to the caller after everything allocated so far has been
 * released (the destructor does not run for a partially built object).
 */
template <typename T>
BloomFilter<T>::BloomFilter(unsigned int count, unsigned int k)
 : m_k(k), m_size(0), m_count(count), m_filter(NULL)
{
    const unsigned int prime_count = sizeof(prime) / sizeof(prime[0]);

    if (m_count == 0) {  /* a zero modulus would divide by zero later */
        std::abort();
    }

    if (prime_count < 2) {
        std::abort();
    }

    if (m_k > prime_count) {
        m_k = prime_count;
    }
    else if (m_k < 2) {
        m_k = 2;
    }

    /*
     * Bit indices lie in [0, m_count), so the highest byte touched is
     * (m_count - 1) >> 3. (m_count >> 3) + 1 bytes always covers that and
     * cannot overflow, unlike (m_count + 7) >> 3.
     */
    m_size = (m_count >> 3) + 1;

    typedef unsigned char * ucharptr;
    m_filter = new ucharptr[m_k];

    /* Plain new reports failure by throwing, never by returning NULL. */
    unsigned int built = 0;
    try {
        for (; built < m_k; ++built) {
            m_filter[built] = new unsigned char[m_size];
            std::memset(m_filter[built], 0, m_size);
        }
    }
    catch (...) {
        /* Release the rows that were successfully allocated, then the table. */
        while (built > 0) {
            --built;
            delete[] m_filter[built];
        }
        delete[] m_filter;
        m_filter = NULL;
        throw;
    }
}

/* Frees every row of the filter, then the table of row pointers. */
template <typename T>
BloomFilter<T>::~BloomFilter()
{
    unsigned char ** const rows_end = m_filter + m_k;
    for (unsigned char ** row = m_filter; row != rows_end; ++row) {
        delete[] *row;
    }
    delete[] m_filter;
}

/*
 * Records `value`: for each hash function, sets the bit at position
 * (GetHash(value) * prime[n]) % m_count in that function's own row.
 */
template <typename T>
void BloomFilter<T>::set(const T & value)
{
    const unsigned int base = GetHash(value);
    for (unsigned int n = 0; n < m_k; ++n) {
        const unsigned int position = (base * prime[n]) % m_count;
        const unsigned int byte_index = position >> 3;   /* position / 8 */
        const unsigned char mask =
            static_cast<unsigned char>(1u << (position & 7u));  /* position % 8 */
        m_filter[n][byte_index] |= mask;
    }
}

/*
 * Erases `value`: for each hash function, clears the bit at position
 * (GetHash(value) * prime[n]) % m_count in that function's own row.
 */
template <typename T>
void BloomFilter<T>::clear(const T & value)
{
    const unsigned int base = GetHash(value);
    for (unsigned int n = 0; n < m_k; ++n) {
        const unsigned int position = (base * prime[n]) % m_count;
        const unsigned int byte_index = position >> 3;   /* position / 8 */
        const unsigned char mask =
            static_cast<unsigned char>(1u << (position & 7u));  /* position % 8 */
        m_filter[n][byte_index] &= static_cast<unsigned char>(~mask);
    }
}

/*
 * Membership query. Returns false as soon as one hash function finds its
 * bit for `value` unset; returns true only when all m_k bits are set.
 */
template <typename T>
bool BloomFilter<T>::test(const T & value)
{
    const unsigned int base = GetHash(value);
    for (unsigned int n = 0; n < m_k; ++n) {
        const unsigned int position = (base * prime[n]) % m_count;
        const unsigned int byte_index = position >> 3;   /* position / 8 */
        const unsigned char mask =
            static_cast<unsigned char>(1u << (position & 7u));  /* position % 8 */
        if ((m_filter[n][byte_index] & mask) == 0) {
            return false;
        }
    }
    return true;
}


#endif

#include <vector>
#include <string>
#include <iostream>
using namespace std;
#include "BloomFilter.hpp"

/*
 * Hash for strings: 32-bit FNV-1a over the bytes of `value`.
 *
 * Replaces a plain sum of characters, which gave the same hash to any two
 * strings made of the same characters in a different order and depended on
 * whether plain char is signed. Each byte is read as unsigned char so the
 * result does not depend on the platform's char signedness.
 *
 * Returns the FNV offset basis (2166136261) for an empty string.
 */
template <>
unsigned int GetHash(const std::string & value)
{
    const unsigned int fnv_offset_basis = 2166136261u;
    const unsigned int fnv_prime = 16777619u;

    unsigned int hash = fnv_offset_basis;
    typedef std::string::const_iterator iterator;
    for (iterator iter = value.begin(); iter != value.end(); ++iter) {
        hash ^= static_cast<unsigned char>(*iter);
        hash *= fnv_prime;  /* unsigned arithmetic wraps, which is intended */
    }
    return hash;
}

/*
 * Prints `label`, then every integer in [low, high] that `filter` reports
 * as present (each followed by a space), then ends the line.
 */
template <typename Filter>
static void print_matches(const char * label, Filter & filter, int low, int high)
{
    cout << label;
    for (int candidate = low; candidate <= high; ++candidate) {
        if (filter.test(candidate)) {
            cout << candidate << ' ';
        }
    }
    cout << endl;
}

/*
 * Demo driver: feeds a small integer array into four filters of different
 * sizes and prints what each one reports, then does the same for a list
 * of URLs.
 */
int main(int argc, char ** argv)
{
    int samples[] = { 9, 5, 4, 6, 7, 8, 0, 1, 55, -100 };
    const int sample_count = sizeof(samples)/sizeof(samples[0]);

    /* Echo the input while tracking its smallest and largest element. */
    int lowest = samples[0];
    int highest = samples[0];
    cout << "array:   ";
    for (int idx = 0; idx < sample_count; ++idx) {
        const int current = samples[idx];
        if (current > highest) {
            highest = current;
        }
        if (current < lowest) {
            lowest = current;
        }
        cout << current << ' ';
    }
    cout << endl;

    BloomFilter<int> filter1(sample_count);
    BloomFilter<int> filter2(5 * sample_count);
    BloomFilter<int> filter3(10 * sample_count);
    /* Sized to the value span; the author expects exact results here,
       noting that a plain bitmap could do the same job. */
    BloomFilter<int> filter4(highest - lowest);

    for (int idx = 0; idx < sample_count; ++idx) {
        const int current = samples[idx];
        filter1.set(current);
        filter2.set(current);
        filter3.set(current);
        filter4.set(current);
    }

    print_matches("sorted1: ", filter1, lowest, highest);
    print_matches("sorted2: ", filter2, lowest, highest);
    print_matches("sorted3: ", filter3, lowest, highest);
    print_matches("sorted4: ", filter4, lowest, highest);

    /* ------------------------------------------ */

    const char * const url[] = {
        "www.google.com.hk",
        "www.bing.com.cn",
        "www.baidu.com",
        "www.manmankan.com",
        "www.csdn.net"
    };
    const std::size_t url_count = sizeof(url)/sizeof(url[0]);

    BloomFilter<string> filter(500);
    for (std::size_t idx = 0; idx < url_count; ++idx) {
        filter.set(url[idx]);
    }

    /* The first five entries were inserted above; the last five were not. */
    const char * const check[] = {
        "www.google.com.hk",
        "www.bing.com.cn",
        "www.baidu.com",
        "www.manmankan.com",
        "www.csdn.net",

        "www.hao123.com",
        "www.sohu.com",
        "www.soso.com",
        "www.sina.com",
        "www.nosuchurl.com"
    };
    const std::size_t check_count = sizeof(check)/sizeof(check[0]);

    for (std::size_t idx = 0; idx < check_count; ++idx) {
        const bool present = filter.test(check[idx]);
        cout << check[idx]
             << (present ? "    is exist" : "    is not exist")
             << endl;
    }

    return 0;
}

说明：代码中位数组的大小（m_size，由传入的 m_count 决定）、哈希函数的个数（m_k）以及哈希函数的构造方式（GetHash）都选得不太恰当。