Python crawler URL deduplication: using a Bloom filter backed by Redis to deduplicate URLs

#!/usr/bin/python3
# encoding=utf-8

import redis
from hashlib import md5


class SimpleHash(object):
    """A simple multiplicative hash whose result is bounded by the bitmap capacity."""

    def __init__(self, cap, seed):
        self.cap = cap
        self.seed = seed

    def hash(self, value):
        ret = 0
        for i in range(len(value)):
            ret += self.seed * ret + ord(value[i])
        # Mask the result so it always falls inside the bitmap
        return (self.cap - 1) & ret


class BloomFilter(object):

    def __init__(self, host='localhost', port=6379, db=0, blockNum=1, key='bloomfilter'):
        """
        :param host: the host of Redis
        :param port: the port of Redis
        :param db: which db in Redis
        :param blockNum: one block covers roughly 90,000,000 strings; if you have more strings to filter, increase it.
        :param key: the key's name in Redis
        """
        self.server = redis.Redis(host=host, port=port, db=db)
        # A Redis string can hold at most 512 MB; 1 << 31 bits = 256 MB is used here
        self.bit_size = 1 << 31
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        self.key = key
        self.blockNum = blockNum
        self.hashfunc = []
        for seed in self.seeds:
            self.hashfunc.append(SimpleHash(self.bit_size, seed))

    def isContains(self, str_input):
        if not str_input:
            return False
        m5 = md5()
        m5.update(str_input.encode("utf8"))
        str_input = m5.hexdigest()
        ret = True
        # Pick the block by the first two hex digits of the md5 digest
        name = self.key + str(int(str_input[0:2], 16) % self.blockNum)
        for f in self.hashfunc:
            loc = f.hash(str_input)
            ret = ret & self.server.getbit(name, loc)
        return ret

    def insert(self, str_input):
        m5 = md5()
        m5.update(str_input.encode("utf8"))
        str_input = m5.hexdigest()
        name = self.key + str(int(str_input[0:2], 16) % self.blockNum)
        for f in self.hashfunc:
            loc = f.hash(str_input)
            self.server.setbit(name, loc, 1)
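
A minimal usage sketch of the filter in a crawl loop follows. It assumes a Redis server is reachable on localhost:6379; the seed_urls list and its entries are placeholders for whatever URLs your crawler discovers, not part of the original code.

# Usage sketch: check each URL against the Bloom filter before fetching it,
# and record it afterwards so duplicates are skipped on later encounters.
bf = BloomFilter(host='localhost', port=6379, db=0, blockNum=1, key='bloomfilter')

seed_urls = [
    'http://example.com/page/1',
    'http://example.com/page/2',
    'http://example.com/page/1',  # duplicate, will be skipped
]

for url in seed_urls:
    if bf.isContains(url):
        print('already crawled, skip:', url)
    else:
        bf.insert(url)
        print('new url, fetch it:', url)
        # download and parse the page here, e.g. with requests

Because a Bloom filter can return false positives but never false negatives, a URL reported as "already crawled" may occasionally be a new one that is skipped, while a URL reported as new is guaranteed not to have been inserted before.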
