c++实现布隆过滤器

hashfuntion.h(常用字符串Hash函数

unsigned int SDBMHash(const char *str);
unsigned int RSHash(const char *str);
unsigned int JSHash(const char *str);
unsigned int PJWHash(const char *str);
unsigned int APHash(const char *str);
unsigned int DJBHash(const char *str);
unsigned int ELFHash(const char *str);
unsigned int BKDRHash(const char *str);

hashfuntion.cpp

#include "hashfunction.h"

unsigned int SDBMHash(const char *str)
{
    unsigned int hash = 0;
    while (*str)
    {
        // equivalent to: hash = 65599*hash + (*str++);
        hash = (*str++) + (hash << 6) + (hash << 16) - hash;
    }
    return (hash & 0x7FFFFFFF);
}

//RS Hash Function
unsigned int RSHash(const char *str)
{
    unsigned int b = 378551;
    unsigned int a = 63689;
    unsigned int hash = 0;
    while (*str)
    {
        hash = hash * a + (*str++);
        a *= b;
    }
    return (hash & 0x7FFFFFFF);
}

//JS Hash Function
unsigned int JSHash(const char *str)
{
    unsigned int hash = 1315423911;
    while (*str)
    {
        hash ^= ((hash << 5) + (*str++) + (hash >> 2));
    }
    return (hash & 0x7FFFFFFF);
}

//P. J. Weinberger Hash Function
unsigned int PJWHash(const char *str)
{
    unsigned int BitsInUnignedInt = (unsigned int)(sizeof(unsigned int) * 8);
    unsigned int ThreeQuarters    = (unsigned int)((BitsInUnignedInt  * 3) / 4);
    unsigned int OneEighth        = (unsigned int)(BitsInUnignedInt / 8);
    unsigned int HighBits         = (unsigned int)(0xFFFFFFFF) << (BitsInUnignedInt - OneEighth);
    unsigned int hash             = 0;
    unsigned int test             = 0;
    while (*str)
    {
        hash = (hash << OneEighth) + (*str++);
        if ((test = hash & HighBits) != 0)
        {
            hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
        }
    }
    return (hash & 0x7FFFFFFF);
}

//AP Hash Function
unsigned int APHash(const char *str)
{
    unsigned int hash = 0;
    int i;
    for (i=0; *str; i++)
    {
        if ((i & 1) == 0)
        {
            hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));
        }
        else
        {
            hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));
        }
    }
    return (hash & 0x7FFFFFFF);
}

//DJB Hash Function
unsigned int DJBHash(const char *str)
{
    unsigned int hash = 5381;
    while (*str)
    {
        hash += (hash << 5) + (*str++);
    }
    return (hash & 0x7FFFFFFF);
}

// ELF Hash Function
unsigned int ELFHash(const char *str)
{
    unsigned int hash = 0;
    unsigned int x    = 0;
    while (*str)
    {
        hash = (hash << 4) + (*str++);
        if ((x = hash & 0xF0000000L) != 0)
        {
            hash ^= (x >> 24);
            hash &= ~x;
        }
    }
    return (hash & 0x7FFFFFFF);
}

//BKDR Hash Function
unsigned int BKDRHash(const char *str)
{
    unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
    unsigned int hash = 0;
    while (*str)
    {
        hash = hash * seed + (*str++);
    }
    return (hash & 0x7FFFFFFF);
}

bloomfilter.h

#ifndef BLOOMFILTER_H
#define BLOOMFILTER_H

#include <stdio.h>
#include <iostream>
#include <string>
#include <math.h>
#include <vector>
#include <fstream>
#include "hashfunction.h"

#define INT_SIZE 32
#define BUFFER_SIZE 1024

class BloomFilter{
public:
    //样本个数n,期望的失误率p,传入样本的文档路径
    BloomFilter(double errRate,int sampleNum);
    ~BloomFilter();
    void filterInit();                      //初始化布隆过滤器
    void arrayGenerate(std::string path);   //打开path路径的文档,计算每一行样本到array中
    bool isContain(const char* str);        //查看字符串是否在样本中存在
    void storeArray(std::string path);      //把array存储在指定路径文件path里
    void restoreArray(std::string path);    //把array从指定路径文件path里恢复载入
    int getHashFunNum();                    //返回需要的哈希函数的个数k
    int getIntNum();                        //返回需要的内存的int数
private:
    int hashtableInit();    //把几个哈希函数加入到hastable中
    std::string path;       //传入样本的文档路径
    double errRate;         //样本失误率p
    int sampleNum;          //样本个数n
    int bitNum;             //需要的二进制位数m
    int intNum;             //需要申请内存的int数
    int hashFunNum;         //需要的哈希函数的个数k,注意计算得到的哈希函数个数k应该<=hashtable.size();
    int *array;             //内存
    std::vector<unsigned int (*)(const char*)> hashtable;    //存放计算字符串哈希值的哈希函数
};

#endif // BLOOMFILTER_H

bloomfilter.cpp

#include "bloomfilter.h"

BloomFilter::BloomFilter(double errRate,int sampleNum)
{
    this->errRate=errRate;
    this->sampleNum=sampleNum;
    this->bitNum=-((this->sampleNum*log(this->errRate))/(log(2)*log(2)));
    this->hashFunNum=0.7*(this->bitNum/this->sampleNum);
    this->intNum=this->bitNum/INT_SIZE+1;
    array=new int[this->intNum];
}

BloomFilter::~BloomFilter()
{
    delete []array;
}

void BloomFilter::filterInit()
{
    hashtableInit();
    if(this->hashFunNum>(int)hashtable.size())
    {
        std::cout<<"哈希函数不足,请添加"<<std::endl;
        return;
    }
}

int BloomFilter::hashtableInit()
{
    hashtable.push_back(*PJWHash);
    hashtable.push_back(*JSHash);
    hashtable.push_back(*RSHash);
    hashtable.push_back(*SDBMHash);
    hashtable.push_back(*APHash);
    hashtable.push_back(*DJBHash);
    hashtable.push_back(*BKDRHash);
    hashtable.push_back(*ELFHash);
    return hashtable.size();
}

void BloomFilter::arrayGenerate(std::string path)
{
    int hashval;
    std::fstream fs;
    fs.open(path.c_str(),std::ios::in|std::ios::out);
    if(!fs)
    {
        perror("open file error");
        return;
    }
    fs.seekp(std::ios::beg);
    char buf[BUFFER_SIZE]={0};
    fs.getline(buf,BUFFER_SIZE);
    while(!fs.eof())
    {
        for(int i=0;i!=this->hashFunNum;i++)
        {
            hashval=hashtable[i](buf);
            hashval=hashval%(this->intNum*INT_SIZE);
            this->array[hashval/INT_SIZE]|=(0x1<<(hashval%INT_SIZE));
        }
        fs.getline(buf,BUFFER_SIZE);
    }
    fs.clear();
    fs.close();
}

bool BloomFilter::isContain(const char* str)
{
    int hashval;
    for(int i=0;i!=this->hashFunNum;i++)
    {
        hashval=hashtable[i](str);
        hashval=hashval%(this->intNum*INT_SIZE);
        if(array[hashval/INT_SIZE]&(0x1<<(hashval%INT_SIZE)))
            continue;
        else
            return false;
    }
    return true;
}

void BloomFilter::storeArray(std::string path)
{
    std::fstream fs;
    fs.open(path.c_str(),std::ios::in|std::ios::out);
    if(!fs)
    {
        fs.open(path.c_str(),std::ios::app|std::ios::in|std::ios::out);
        for(int k=0;k<this->intNum;k++)
            fs<<array[k]<<std::endl;
    }
    else
    {
        fs.seekp(std::ios::beg);
        for(int k=0;k<this->intNum;k++)
            fs<<array[k]<<std::endl;
    }
    fs.clear();
    fs.close();
}

void BloomFilter::restoreArray(std::string path)
{
    std::fstream fs;
    fs.open(path.c_str(),std::ios::in|std::ios::out);
    if(!fs)
    {
        perror("open file error");
        return;
    }
    fs.seekp(std::ios::beg);
    char buf[INT_SIZE]={0};
    fs.getline(buf,INT_SIZE);
    int i=0;
    while(!fs.eof())
    {
        sscanf(buf,"%d\n",&this->array[i++]);
        fs.getline(buf,INT_SIZE);
    }
    fs.clear();
    fs.close();
}

int BloomFilter::getHashFunNum()
{
    return this->hashFunNum;
}

int BloomFilter::getIntNum()
{
    return this->intNum;
}

测试:

main.cpp

加载文件生成array并保存:

#include "tool/bloomfilter.h"
using namespace std;

int main(int argc, char *argv[])
{
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.arrayGenerate("../tool/redlist.txt");
    cout<<"intNum:"<<mybloom.getIntNum()<<",hashFunNum:"<<mybloom.getHashFunNum()<<endl;
    cout<<"www.dubai.com在我的集合中吗:"<<(mybloom.isContain("www.dubai.com")?"在":"不在")<<endl;
    cout<<"www.qq.com在我的集合中吗:"<<(mybloom.isContain("www.qq.com")?"在":"不在")<<endl;
    mybloom.storeArray("../tool/bloom_array.txt");
    return 0;
}

从文件中读取恢复array:

#include "tool/bloomfilter.h"
using namespace std;

int main(int argc, char *argv[])
{
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.restoreArray("../tool/bloom_array.txt");
    cout<<"www.dubai.com在我的集合中吗:"<<(mybloom.isContain("www.dubai.com")?"在":"不在")<<endl;
    cout<<"www.qq.com在我的集合中吗:"<<(mybloom.isContain("www.qq.com")?"在":"不在")<<endl;
    return 0;
}
  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值