python3 - bloomfilter(布隆筛选器) - pybloom_live 使用
一、安装
#安装
pip3 install pybloom_live
#卸载
pip3 uninstall pybloom_live
二、筛选器类型
BloomFilter(定容)
ScalableBloomFilter(可伸缩的)
三、使用实例
#!/usr/bin/env python3
# coding=utf-8
import codecs
import os
import re
import sys
import time
import requests
from pyquery import PyQuery as pq
from pybloom_live.pybloom import (BloomFilter,
ScalableBloomFilter,
make_hashfuncs)
# Directory that holds the persisted bloom-filter key file.
savePath = "/Users/download/"
# Name of the text file inside savePath; it stores one key per line.
bloomfilterFileName = "bloomfilter_urls.txt"
def getBloomFilter():
    """Build a fixed-capacity BloomFilter pre-loaded from the key file.

    The key file is ``savePath``/``bloomfilterFileName`` and holds one key
    per line. Lines starting with ``#`` are echoed and skipped. All
    whitespace is removed from every other line, and lines that end up empty
    are ignored so the empty string never becomes a key. If the key file does
    not exist yet, an empty filter is returned instead of raising.

    Returns:
        BloomFilter: a filter created with ``capacity=100000`` and
        ``error_rate=0.0001`` containing every key read from the file.
    """
    fileFullName = os.path.join(savePath, bloomfilterFileName)
    # capacity is the number of keys the filter is sized for; error_rate is
    # the tolerated false-positive rate.
    bFilter = BloomFilter(capacity=100000, error_rate=0.0001)
    print('初始化布隆过滤器,{}'.format(fileFullName))

    try:
        keyFile = open(fileFullName, 'r', encoding='utf-8')
    except FileNotFoundError:
        # Nothing has been persisted yet; start with an empty filter.
        print('bloom file not found, starting empty: {}'.format(fileFullName))
        return bFilter

    # Compiled once, outside the loop; raw string avoids the invalid "\s"
    # escape in a normal string literal.
    whitespace = re.compile(r'\s+')
    totalCount = 0
    with keyFile:
        for line in keyFile:
            if line.startswith('#'):
                print('bloom filter content:%s' % line)
                continue
            txt = whitespace.sub('', line)
            if not txt:
                # Blank line: do not pollute the filter with an empty key.
                continue
            # add() returns True if the key was already present, False if it
            # was newly inserted. Kept in its own name so the file handle is
            # not shadowed.
            existed = bFilter.add(txt)
            print('bloom add [%s],size=%s' % (existed, len(bFilter)))
            totalCount += 1
    print('布隆过滤器,初始化完成\t成功={}/{}'.format(bFilter.count, totalCount))
    return bFilter
def setBloomFilter(bFilter, content):
    """Add *content* to the filter and persist it when it is new.

    Args:
        bFilter: filter object whose ``add(key)`` returns True when the key
            was already present and False when it was newly inserted.
        content: the key to record; written verbatim followed by a newline.

    Returns:
        bool: True when *content* was new (added to the filter and appended
        to ``savePath``/``bloomfilterFileName``); False when the filter
        reported it as already present and nothing was written.
    """
    fileFullName = os.path.join(savePath, bloomfilterFileName)
    print("更新 bloom file:", fileFullName)

    alreadyPresent = bFilter.add(content)
    result = not alreadyPresent
    if result:
        # Append as UTF-8 so the file can be read back with the same
        # encoding regardless of the platform's default.
        with open(fileFullName, 'a', encoding='utf-8') as keyFile:
            keyFile.write('{}\n'.format(content))
    print("bloom add [%s]" % result)
    return result
def bloomForTxtFile():
    """Demo: load the file-backed filter and try to record one sample URL.

    Prints '新增' when the URL was newly added and '存在' when the filter
    already contained it.
    """
    bfilter = getBloomFilter()
    url = "thunder://QUFodHRwOi8vZG93bi5kZGxvYWRhYmMuY29tOjU3NTc1L3pmL015RnJpZW5kc0hvdE1vbS1CcmlhbmEgQmFua3MubXA0Wlo="
    added = setBloomFilter(bfilter, url)
    # setBloomFilter returns True for a newly added key and False for an
    # existing one, so True must map to "added", not "exists".
    print('bloom filter test [%s]' % ('新增' if added else '存在'))
def bloomSimple():
    """Demo: membership of one key before and after adding it.

    Uses a fixed-size filter created with ``capacity=100`` and
    ``error_rate=0.0001`` and prints the key's status twice.
    """
    sample = '123'
    bloom = BloomFilter(capacity=100, error_rate=0.0001)

    def report():
        # Print the key followed by whether the filter reports it present.
        if sample in bloom:
            status = '存在'
        else:
            status = '不存在'
        print('%s %s' % (sample, status))

    report()
    # add() returns False when the key was newly inserted, True if it was
    # already present; the result is not needed here.
    bloom.add(sample)
    report()
def main():
    """Run the demos in order: in-memory usage, then the txt-file-backed one."""
    for demo in (bloomSimple, bloomForTxtFile):
        demo()