python布隆过滤器实现去重详解

## 声明计算偏移量所需要的函数,来源:http://www.partow.net/programming/hashfunctions/index.html
def rs_hash(key):
    a = 378551
    b = 63689
    hash_value = 0
    for i in range(len(key)):
        hash_value = hash_value * a + ord(key[i])
        a = a * b
    return hash_value


def js_hash(key):
    hash_value = 1315423911
    for i in range(len(key)):
        hash_value ^= ((hash_value << 5) + ord(key[i]) + (hash_value >> 2))
    return hash_value


def pjw_hash(key):
    bits_in_unsigned_int = 4 * 8
    three_quarters = (bits_in_unsigned_int * 3) / 4
    one_eighth = bits_in_unsigned_int / 8
    high_bits = 0xFFFFFFFF << int(bits_in_unsigned_int - one_eighth)
    hash_value = 0
    test = 0

    for i in range(len(key)):
        hash_value = (hash_value << int(one_eighth)) + ord(key[i])
        test = hash_value & high_bits
    if test != 0:
        hash_value = ((hash_value ^ (test >> int(three_quarters))) & (~high_bits))
    return hash_value & 0x7FFFFFFF


def elf_hash(key):
    hash_value = 0
    for i in range(len(key)):
        hash_value = (hash_value << 4) + ord(key[i])
        x = hash_value & 0xF0000000
        if x != 0:
            hash_value ^= (x >> 24)
        hash_value &= ~x
    return hash_value


def bkdr_hash(key):
    seed = 131  # 31 131 1313 13131 131313 etc..
    hash_value = 0
    for i in range(len(key)):
        hash_value = (hash_value * seed) + ord(key[i])
    return hash_value


def sdbm_hash(key):
    hash_value = 0
    for i in range(len(key)):
        hash_value = ord(key[i]) + (hash_value << 6) + (hash_value << 16) - hash_value;
    return hash_value


def djb_hash(key):
    hash_value = 5381
    for i in range(len(key)):
        hash_value = ((hash_value << 5) + hash_value) + ord(key[i])
    return hash_value


def dek_hash(key):
    hash_value = len(key);
    for i in range(len(key)):
        hash_value = ((hash_value << 5) ^ (hash_value >> 27)) ^ ord(key[i])
    return hash_value


def bp_hash(key):
    hash_value = 0
    for i in range(len(key)):
        hash_value = hash_value << 7 ^ ord(key[i])
    return hash_value


def fnv_hash(key):
    fnv_prime = 0x811C9DC5
    hash_value = 0
    for i in range(len(key)):
        hash_value *= fnv_prime
        hash_value ^= ord(key[i])
    return hash_value


def ap_hash(key):
    hash_value = 0xAAAAAAAA
    for i in range(len(key)):
        if (i & 1) == 0:
            hash_value ^= ((hash_value << 7) ^ ord(key[i]) * (hash_value >> 3))
        else:
            hash_value ^= (~((hash_value << 11) + ord(key[i]) ^ (hash_value >> 5)))
    return hash_value

## 实现去重过滤类
class BloomFilterRedis:
    hash_list = [rs_hash, js_hash, pjw_hash, elf_hash, bkdr_hash,
                 sdbm_hash, djb_hash, dek_hash]

    def __init__(self, key, host='127.0.0.1', port=6379, hash_list=hash_list):
        # redis-bitmap的key
        self.key = key
        # redis连接信息
        self.pool = redis.ConnectionPool(host=host, port=port)
        self.handle = redis.StrictRedis(connection_pool=self.pool, charset='utf-8')
        # 哈希函数列表
        self.hash_list = hash_list

    @classmethod
    def random_generator(cls, hash_value):
        '''
        将hash函数得出的函数值映射到[0, 2^32-1]区间内
        '''
        return hash_value % (1 << 32)

    def do_filter(self, item):
        '''
        检查是否是新的条目,是新条目则更新bitmap并返回True,是重复条目则返回False
        '''
        flag = False
        for hash_func in self.hash_list:
            # 获得到hash函数对象
            # hash_func = getattr(GeneralHashFunctions, hash_func_str)
            print(hash_func)
            # 计算hash值
            hash_value = hash_func(item)
            # 将hash值映射到[0, 2^32]区间
            real_value = BloomFilterRedis.random_generator(hash_value)
            print(real_value)
            # bitmap中对应位是0,则置为1,并说明此条目为新的条目
            if self.handle.getbit(self.key, real_value) == 0:
                self.handle.setbit(self.key, real_value, 1)
                flag = True
        # 当所有hash值在bitmap中对应位都是1,说明此条目重复,返回False
        return flag


if __name__ == '__main__':
    bloomFilterRedis = BloomFilterRedis("bloom")
    bloomFilterRedis.do_filter("one item to check")```

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值