哈希扩展——位图&布隆过滤器

最新推荐文章于 2022-10-21 17:31:05 发布

Mr_Garfield__

最新推荐文章于 2022-10-21 17:31:05 发布

阅读量393

点赞数

分类专栏：数据结构算法文章标签：哈希扩展位图布隆过滤器

本文链接：https://blog.csdn.net/Mr_Garfield__/article/details/79452824

版权

数据结构同时被 2 个专栏收录

11 篇文章 0 订阅

订阅专栏

算法

3 篇文章 0 订阅

订阅专栏

位图

来看一道题：

给40亿个不重复的无符号整数，没排过序。给一个无符号整数，如何快速判断一个数是否在这40亿个数中。

要快速判断一个数是否在一组数中，比较快的算法有：平衡搜索树、哈希表等。但如果直接建立一个哈希表，每个节点存一个数据，40亿个数据就需要40亿个整型单元：仅数据本身就约占16GB（40亿×4字节），再加上节点指针等额外开销会达到数十GB，显然这种方法不可行。
那么，我们便想到：能不能只用一个bit位来表示一个数“存在”或“不存在”的状态呢？
这，就是位图。32位无符号整数一共有2^32种取值，每个取值占一个bit，总共只需要2^32 bit = 512MB内存。
这里写图片描述

位图的代码实现：
BitMap.h

#pragma once
#include <assert.h>
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
#include <stdio.h>

// Bit set indexed by unsigned integer value: one flag bit per value.
typedef struct BitMap
{
    size_t* _bits;   // heap array of words; only the low 32 bits of each word hold flags
    size_t _range;   // the range value that was handed to BitMapInit
}BitMap;

// Allocates zeroed storage for the values 0..range and records range.
void BitMapInit(BitMap* bm, size_t range);
// Marks x as present.
void BitMapSet(BitMap* bm, size_t x);
// Marks x as absent.
void BitMapReset(BitMap* bm, size_t x);
// Returns 0 when x is present, -1 when it is absent.
int BitMapTest(BitMap* bm,size_t x);
// Releases the storage owned by bm. This prototype was missing from the list.
void BitMapDestroy(BitMap* bm);

// Prepares bm to hold one flag for every value in 0..range (inclusive).
//
// bm    - bitmap to initialize; must not be NULL.
// range - largest value that will be stored.
//
// All flags start cleared. The process is aborted if the storage cannot be
// allocated, so on return bm->_bits is always valid.
void BitMapInit(BitMap* bm, size_t range)
{
    assert(bm);

    // Every word carries 32 flags, so 0..range needs (range >> 5) + 1 words.
    size_t words = (range >> 5) + 1;

    bm->_range = range;
    // calloc returns zero-filled memory, which replaces the malloc + memset pair.
    bm->_bits = (size_t*)calloc(words, sizeof(size_t));
    if (bm->_bits == NULL)
    {
        // Checked explicitly: assert() is compiled out under NDEBUG, which
        // would let a failed allocation slip through to the first access.
        fprintf(stderr, "BitMapInit: out of memory\n");
        abort();
    }
}

// Marks value x as present in the bitmap.
//
// bm - initialized bitmap; must not be NULL.
// x  - value to record; must fall inside the words allocated by BitMapInit.
void BitMapSet(BitMap* bm, size_t x)
{
    assert(bm);
    assert(bm->_bits);

    size_t index = (x >> 5);   // word that holds the flag
    size_t num = x % 32;       // position of the flag inside that word

    // BitMapInit allocates (range >> 5) + 1 words; anything beyond is out of bounds.
    assert(index <= (bm->_range >> 5));

    // The mask is built from a size_t one: a plain int `1 << 31` overflows a
    // signed int (undefined behaviour) and would sign-extend into the upper
    // half of a 64-bit word.
    bm->_bits[index] |= ((size_t)1 << num);
}

// Marks value x as absent from the bitmap.
//
// bm - initialized bitmap; must not be NULL.
// x  - value to clear; must fall inside the words allocated by BitMapInit.
//
// Clearing a value that is already absent leaves the bitmap unchanged.
void BitMapReset(BitMap* bm, size_t x)
{
    assert(bm);
    assert(bm->_bits);

    size_t index = (x >> 5);   // word that holds the flag
    size_t num = x % 32;       // position of the flag inside that word

    // BitMapInit allocates (range >> 5) + 1 words; anything beyond is out of bounds.
    assert(index <= (bm->_range >> 5));

    // AND with the inverted mask forces the flag to 0. The previous XOR only
    // toggled it, so resetting an absent value actually marked it present.
    // The mask uses a size_t one to avoid the signed `1 << 31` overflow.
    bm->_bits[index] &= ~((size_t)1 << num);
}

// Reports whether value x is present in the bitmap.
//
// bm - initialized bitmap; must not be NULL.
// x  - value to look up; must fall inside the words allocated by BitMapInit.
//
// Returns 0 when x is present and -1 when it is absent.
int BitMapTest(BitMap* bm, size_t x)
{
    assert(bm);
    assert(bm->_bits);

    size_t index = (x >> 5);   // word that holds the flag

    // BitMapInit allocates (range >> 5) + 1 words; anything beyond is out of bounds.
    assert(index <= (bm->_range >> 5));

    // size_t one instead of int one: `1 << 31` on a signed int is undefined.
    if (bm->_bits[index] & ((size_t)1 << (x % 32)))
    {
        return 0;
    }
    else
        return -1;
}

// Releases the word array owned by bm.
//
// bm - bitmap to tear down; must not be NULL.
//
// The pointer is cleared afterwards so that calling BitMapDestroy again is a
// harmless free(NULL) instead of a double free.
void BitMapDestroy(BitMap* bm)
{
    assert(bm);
    free(bm->_bits);
    bm->_bits = NULL;
}

布隆过滤器 (bloom filter)

如果想要判断一个元素是不是在一个集合里，一般想到的是将所有元素保存起来，然后通过比较确定。链表、树等数据结构都是这种思路。但是随着集合中元素的增加，我们需要的存储空间越来越大，检索速度也越来越慢（O(n)、O(log n)）。不过世界上还有一种叫作散列表（又叫哈希表，Hash table）的数据结构，它可以通过一个Hash函数将一个元素映射成一个位阵列（Bit array）中的一个点。这样一来，我们只要看看这个点是不是1就可以知道集合中有没有它了。这就是布隆过滤器的基本思想。
Hash面临的问题就是冲突。假设Hash函数是良好的，如果我们的位阵列长度为m个点，那么如果我们想将冲突率降低到例如1%，这个散列表就只能容纳m / 100个元素。显然这就谈不上空间高效（Space-efficient）了。解决方法也简单，就是使用多个Hash函数：如果其中有一个说元素不在集合中，那它肯定就不在；如果它们都说在，虽然仍有一定可能是误判（假阳性），不过直觉上这种情况发生的概率是比较低的。
这里写图片描述

布隆过滤器实现：
BloomFilter.h

#pragma once
#include "BitMap.h"

// Keys are NUL-terminated C strings.
typedef char* KeyType;
// Signature shared by the string hash functions plugged into the filter.
typedef size_t(*HASH_FUNC)(KeyType str);
/// @brief BKDR hash function.
/// @detail Named after Brian Kernighan and Dennis Ritchie, whose book
/// "The C Programming Language" presents it. A simple and fast string hash;
/// per the original note, Java's string hash follows the same scheme with a
/// multiplier of 31.
size_t BKDRHash(KeyType str)
{
    const size_t seed = 131;   // 31, 131, 1313, 13131, 131313 ... are also usable
    size_t hash = 0;

    // Fold in one character per step until the terminating NUL.
    for (; *str != '\0'; ++str)
    {
        hash = hash * seed + (size_t)*str;
    }
    return hash;
}
/// @brief SDBM hash function.
/// @detail Named after the open-source SDBM project (a simple database
/// engine) that uses it. Same idea as BKDRHash with a different multiplier.
size_t SDBMHash(KeyType str)
{
    size_t hash = 0;
    size_t pos = 0;

    while (str[pos] != '\0')
    {
        // 65599 == (1 << 6) + (1 << 16) - 1, so this equals
        // ch + (hash << 6) + (hash << 16) - hash.
        hash = 65599 * hash + (size_t)str[pos];
        ++pos;
    }
    return hash;
}

/// @brief RS hash function.
/// @detail Named after Robert Sedgwick, who presents it in "Algorithms in C".
size_t RSHash(KeyType str)
{
    size_t hash = 0;
    size_t multiplier = 63689;

    for (;;)
    {
        size_t ch = (size_t)*str++;
        if (ch == 0)
        {
            break;   // reached the terminating NUL
        }
        hash = hash * multiplier + ch;
        multiplier *= 378551;   // the multiplier itself changes after every character
    }
    return hash;
}

// Bloom filter: a bitmap plus three string hash functions. Each key is
// mapped to three bit positions, one per hash function.
typedef struct BloomFilter
{
    BitMap _bm;            // underlying bit storage

    HASH_FUNC hashfunc1;   // BloomFilterInit installs BKDRHash here
    HASH_FUNC hashfunc2;   // BloomFilterInit installs SDBMHash here
    HASH_FUNC hashfunc3;   // BloomFilterInit installs RSHash here
}BloomFilter;

// Initializes a Bloom filter backed by a bitmap of the given range.
//
// bf    - filter to initialize; must not be NULL.
// range - number of bit positions the hashes are reduced to; must be non-zero
//         because BloomFilterSet and BloomFilterTest compute `hash % range`.
void BloomFilterInit(BloomFilter* bf,size_t range)
{
    assert(bf);
    // A zero range would turn every later `hash % range` into a division by zero.
    assert(range > 0);
    BitMapInit(&bf->_bm,range);

    bf->hashfunc1 = BKDRHash;
    bf->hashfunc2 = SDBMHash;
    bf->hashfunc3 = RSHash;
}
// Inserts key into the filter by setting the three bit positions selected by
// the filter's hash functions.
//
// bf  - initialized filter; must not be NULL.
// key - NUL-terminated string to insert.
void BloomFilterSet(BloomFilter* bf, KeyType key)
{
    assert(bf);

    const size_t nbits = bf->_bm._range;
    size_t slots[3];
    size_t i;

    // Hash first, then touch the bitmap: all three hashes are computed before
    // any bit is set.
    slots[0] = bf->hashfunc1(key);
    slots[1] = bf->hashfunc2(key);
    slots[2] = bf->hashfunc3(key);

    for (i = 0; i < 3; ++i)
    {
        BitMapSet(&bf->_bm, slots[i] % nbits);
    }
}
// Looks key up in the filter.
//
// bf  - initialized filter; must not be NULL.
// key - NUL-terminated string to look up.
//
// Returns 0 when all three hashed bit positions are set (key may be present)
// and -1 as soon as one of them is clear (key is definitely absent).
int BloomFilterTest(BloomFilter* bf, KeyType key)
{
    assert(bf);

    BitMap* bits = &bf->_bm;
    const size_t nbits = bits->_range;

    // && short-circuits, so later hashes are only evaluated while every
    // earlier probe has found its bit set.
    int everyBitSet =
        BitMapTest(bits, bf->hashfunc1(key) % nbits) != -1 &&
        BitMapTest(bits, bf->hashfunc2(key) % nbits) != -1 &&
        BitMapTest(bits, bf->hashfunc3(key) % nbits) != -1;

    return everyBitSet ? 0 : -1;
}

// Releases the bitmap storage owned by the filter.
//
// bf - filter to tear down; must not be NULL.
void BloomFilterDestroy(BloomFilter* bf)
{
    // Same precondition check the other BloomFilter functions perform.
    assert(bf);
    BitMapDestroy(&bf->_bm);
}

// Smoke test: inserts two keys, prints the lookup result for "sort"
// (0 means "present"), then releases the filter.
void TestBloomFilter(void)
{
    BloomFilter bf;
    BloomFilterInit(&bf, 10000);
    BloomFilterSet(&bf, "sort");

    BloomFilterSet(&bf, "insert");
    printf("%d\n",BloomFilterTest(&bf,"sort"));

    // The filter owns heap memory; without this call the bitmap is leaked.
    BloomFilterDestroy(&bf);
}