布隆过滤器

最新推荐文章于 2023-12-14 08:45:00 发布

adorable_

最新推荐文章于 2023-12-14 08:45:00 发布

阅读量159

点赞数

分类专栏：数据结构　文章标签：布隆过滤器、数据结构

本文链接：https://blog.csdn.net/adorable_/article/details/79678156

版权

数据结构专栏收录该内容

27 篇文章 1 订阅

订阅专栏

布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都远远超过一般的算法，缺点是有一定的误识别率和删除困难。

基本思想：

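如果想要判断一个元素是不是在一个集合里，一般想到的是将所有元素保存起来，然后通过比较确定。链表、树等数据结构都是这种思路。但是随着集合中元素的增加，我们需要的存储空间越来越大，检索速度也越来越慢（O(n)、O(logn)）。不过世界上还有一种叫作散列表（又叫哈希表，Hash table）的数据结构。它可以通过一个Hash函数将一个元素映射成一个位阵列（Bit array）中的一个点。这样一来，我们只要看看这个点是不是1就可以知道集合中有没有它了。这就是布隆过滤器的基本思想。不过只用一个Hash函数时，不同的元素很容易映射到同一个点上（哈希冲突），因此布隆过滤器会使用多个Hash函数把一个元素映射到位阵列中的多个点：查询时只要其中有一个点不是1，该元素就一定不在集合中；如果所有的点都是1，则该元素很可能在集合中（存在一定的误判率）。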

具体代码：

①bloom_filter.h

#pragma once

#include <stdint.h>

#include "F:\VS2013_RTM_ULT_CHS\作业代码\数据结构\BitMap\BitMap\bit_map.h"

// Number of hash-function slots stored in every BloomFilter.
#define HashFuncMaxSize 2 
// Capacity passed to BitMapInit for the filter's bitmap; the filter also
// reduces every hash value modulo this number to obtain a bit index.
#define BitMapCapacity 1024 

// Signature of a string hash function: takes a NUL-terminated string and
// returns its hash as a size_t.
typedef size_t(*HashFunc)(const char*);

// A Bloom filter: one bitmap plus the hash functions used to pick the bits
// that represent a string.
typedef struct BloomFilter {
    BitMap bitmap;                        // bit storage (type declared in bit_map.h)
    HashFunc hash_func[HashFuncMaxSize];  // one entry per hash function
} BloomFilter;

// Initializes the filter's bitmap with BitMapCapacity and installs the hash
// functions. Must be called before any other BloomFilter* function.
void BloomFilterInit(BloomFilter* bf);

// Records str in the filter by setting one bit per hash function.
// Does nothing when bf or str is NULL.
void BloomFilterInsert(BloomFilter* bf, const char* str);

// Returns 1 when every bit selected by the hash functions for str is set
// (str was probably inserted), 0 when at least one bit is clear or when
// bf or str is NULL.
int BloomFilterIsExist(BloomFilter* bf, const char* str);

// Destroys the filter's bitmap and resets the stored hash-function pointers.
// Does nothing when bf is NULL.
void BloomFilterDestroy(BloomFilter* bf);

// With the current design, deleting an element is not supported.

②bloom_filter.c

#include "bloom_filter.h"

// Multiplicative string hash with multiplier 131 (the scheme commonly called
// BKDRHash).
//
// str: NUL-terminated string; it is dereferenced without a NULL check.
// Returns the hash of all bytes before the terminator (0 for an empty
// string). The arithmetic is done in size_t and wraps around on overflow.
size_t HashFunc0(const char* str)
{
    // Read the bytes as unsigned char. Plain char may be signed, and a
    // negative char converted directly to size_t is sign-extended to a huge
    // value, which would make the hash of non-ASCII input platform-dependent.
    const unsigned char* p = (const unsigned char*)str;
    size_t hash = 0;
    size_t ch;
    while ((ch = *p++) != 0)
    {
        hash = hash * 131 + ch;   // multipliers 31, 131, 1313, 13131, 131313.. are also used

        // Original author's notes:
        // Some people suggest replacing the multiplication with shifts and
        // additions, i.e. hash = (hash << 7) + (hash << 1) + hash + ch;
        // (the parentheses are required: + binds tighter than <<).
        // On Intel platforms the two forms were measured to take practically
        // the same time over 10 billion iterations (in a Debug build the
        // shift form was even about 1/3 slower).
        // Not measured on ARM or other RISC systems. The author states that
        // ARM uses Booth's algorithm for 32-bit multiplication, so the cost
        // depends on the multiplier:
        //   bits 8-31 all 1 or all 0  -> 1 clock cycle
        //   bits 16-31 all 1 or all 0 -> 2 clock cycles
        //   bits 24-31 all 1 or all 0 -> 3 clock cycles
        //   otherwise                 -> 4 clock cycles
        // and therefore expects no significant difference there either.
    }
    return hash;

}

// Multiplicative string hash with multiplier 65599 (the scheme commonly
// called SDBM hash).
//
// str: NUL-terminated string; it is dereferenced without a NULL check.
// Returns the hash of all bytes before the terminator (0 for an empty
// string). The arithmetic is done in size_t and wraps around on overflow.
size_t HashFunc1(const char* str)
{
    // Read the bytes as unsigned char so that bytes >= 0x80 contribute their
    // value 128..255 regardless of whether plain char is signed.
    const unsigned char* p = (const unsigned char*)str;
    size_t hash = 0;
    size_t ch;
    while ((ch = *p++) != 0)
    {
        // Equivalent shift form: ch + (hash << 6) + (hash << 16) - hash
        hash = 65599 * hash + ch;
    }
    return hash;

}

// Prepares a Bloom filter for use: initializes its bitmap with
// BitMapCapacity and installs HashFunc0 and HashFunc1 as its hash functions.
//
// bf: filter to initialize; a NULL pointer is ignored.
void BloomFilterInit(BloomFilter* bf)
{
    // Same NULL guard as BloomFilterInsert/IsExist/Destroy, so the whole API
    // treats a NULL filter as a no-op instead of dereferencing it.
    if (bf == NULL)
    {
        return;
    }
    BitMapInit(&bf->bitmap, BitMapCapacity);
    bf->hash_func[0] = HashFunc0;
    bf->hash_func[1] = HashFunc1;
}

// Inserts str into the filter.
//
// For each of the filter's hash functions, the hash of str is reduced modulo
// BitMapCapacity and the bit at that position is set in the bitmap.
// The call is a no-op when bf or str is NULL.
void BloomFilterInsert(BloomFilter* bf, const char* str)
{
    if (bf != NULL && str != NULL)
    {
        size_t k;
        for (k = 0; k < HashFuncMaxSize; ++k)
        {
            HashFunc hash = bf->hash_func[k];
            size_t bit = hash(str) % BitMapCapacity;
            BitMapSet(&bf->bitmap, bit);
        }
    }
}

// Membership query.
//
// Returns 0 when bf or str is NULL, or as soon as one of the bits selected
// by the hash functions (hash % BitMapCapacity) is found to be clear.
// Returns 1 when all selected bits are set, which means the string can be
// considered to be (approximately) contained in the set.
int BloomFilterIsExist(BloomFilter* bf, const char* str)
{
    size_t k = 0;

    if (bf == NULL || str == NULL)
    {
        return 0;
    }

    while (k < HashFuncMaxSize)
    {
        size_t bit = bf->hash_func[k](str) % BitMapCapacity;
        if (BitMapTest(&bf->bitmap, bit) == 0)
        {
            // One missing bit is enough to reject.
            return 0;
        }
        ++k;
    }

    // Every hash result is present in the bitmap.
    return 1;
}

// Tears down a Bloom filter: destroys its bitmap and resets every stored
// hash-function pointer to NULL.
//
// bf: filter to destroy; a NULL pointer is ignored.
void BloomFilterDestroy(BloomFilter* bf)
{
    size_t i;

    if (bf == NULL)
    {
        return;
    }
    BitMapDestroy(&bf->bitmap);
    // Iterate over the array size instead of hard-coding indices 0 and 1, so
    // the cleanup stays correct if HashFuncMaxSize is changed.
    for (i = 0; i < HashFuncMaxSize; ++i)
    {
        bf->hash_func[i] = NULL;
    }
}

③bit_map.c


    }
    if (index >= bm->capacity)
    {
        return;
    }
    size_t n, offset;        //64个bit位表示一个数，n为数字的下标；offset表示具体的bit位
    GetOffset(index, &n, &offset);
    bm->data[n] &= ~(1ul << offset);        //将指定位设置为0
}

// Tests whether bit `index` is set.
//
// Returns 1 when the bit is 1. Returns 0 when the bit is 0, when bm is NULL,
// or when index is not below bm->capacity.
int BitMapTest(BitMap* bm, size_t index)
{
    if (bm == NULL)
    {
        return 0;
    }
    if (index >= bm->capacity)
    {
        return 0;
    }
    size_t n, offset;        // n: index of the word in data; offset: bit position inside that word
    GetOffset(index, &n, &offset);
    // Build the mask in 64 bits. The words are handled as uint64_t, so offset
    // presumably ranges over 0..63 (GetOffset is not visible here). A `1ul`
    // constant is only 32 bits wide where unsigned long is 32-bit (e.g. on
    // Windows), and shifting it by 32 or more is undefined behavior.
    uint64_t mask = (uint64_t)1 << offset;
    return (bm->data[n] & mask) != 0 ? 1 : 0;
}

// Sets every bit of the bitmap to 0.
//
// The call is a no-op when bm is NULL or when the bitmap has no storage.
void BitMapClear(BitMap* bm)
{
    // BitMapDestroy leaves data == NULL; passing NULL to memset is undefined
    // behavior, so a destroyed bitmap is skipped.
    if (bm == NULL || bm->data == NULL)
    {
        return;
    }
    memset(bm->data, 0, sizeof(uint64_t)*DataSize(bm->capacity));
}

// Sets every bit of the bitmap to 1.
//
// The call is a no-op when bm is NULL or when the bitmap has no storage.
void BitMapFill(BitMap* bm)
{
    // BitMapDestroy leaves data == NULL; passing NULL to memset is undefined
    // behavior, so a destroyed bitmap is skipped.
    if (bm == NULL || bm->data == NULL)
    {
        return;
    }
    // memset works byte by byte; 0xff sets all 8 bits of each byte to 1.
    memset(bm->data, 0xff, sizeof(uint64_t)*DataSize(bm->capacity));
}

// Releases the bitmap's storage and leaves it empty: data is freed and set
// to NULL, capacity is reset to 0. A NULL bm is ignored.
void BitMapDestroy(BitMap* bm)
{
    if (bm != NULL)
    {
        free(bm->data);
        bm->data = NULL;
        bm->capacity = 0;
    }
}

④test.c

#include "bloom_filter.h"

#include <stdio.h>
#include <windows.h>

// Prints a banner line containing the name of the enclosing function
// (via __FUNCTION__); used at the start of each test function.
#define TEST_HEADER printf("\n=================================%s=============================\n",__FUNCTION__)

// Smoke test: inserts "hello" into a fresh filter, queries it back, and
// prints the expected and actual result of the query.
void TestBloomFilter()
{
    TEST_HEADER;
    BloomFilter bloom_filter;
    BloomFilterInit(&bloom_filter);
    BloomFilterInsert(&bloom_filter, "hello");
    int ret = BloomFilterIsExist(&bloom_filter, "hello");
    printf("ret expect 1, actual %d\n", ret);
    // Release the bitmap storage owned by the filter; without this the test
    // leaks whatever BloomFilterInit allocated.
    BloomFilterDestroy(&bloom_filter);
}

// Test driver: runs the Bloom filter test, then keeps the console window
// open by running the shell's "pause" command.
int main()
{
    TestBloomFilter();

    // The command's exit status is not needed.
    (void)system("pause");

    return 0;
}

测试结果：

adorable_

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
布隆过滤器

布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都远远超过一般的算法，缺点是有一定的误识别率和删除困难。基本思想：如果想要判断一个元素是不是在一个集合里，一般想到的是将所有元素保存起来，然后通过比较确定。链表，树等等数据结构都是这种思路。但是随着集合中元素的增加，我们需要的存储空间越来越大，检索速度也越来越慢(O(n),O(logn))。
复制链接

扫一扫