安装
pip install pybloom
该模块包含两个实现布隆过滤器功能的类:BloomFilter 是定容的,ScalableBloomFilter 可以自动扩容。
使用
>>> from pybloom import BloomFilter
>>> f = BloomFilter(capacity=1000, error_rate=0.001)  # capacity 是容量,error_rate 是能容忍的误报率
>>> f.add('Traim304')  # 当不存在该元素时,返回 False
False
>>> f.add('Traim304')  # 若已存在,返回 True
True
>>> 'Traim304' in f  # 值得注意的是,若返回 True,该元素可能存在,也可能不存在;过滤器容许存在一定的误报
True
>>> 'Jacob' in f  # 但若返回 False,则必定不存在
False
>>> len(f)  # 当前已添加的元素个数
1

>>> f = BloomFilter(capacity=1000, error_rate=0.001)
>>> from pybloom import ScalableBloomFilter
>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> # sbf.add() 的用法与 BloomFilter 相同
超过容量(无法再保证误报率)时抛出异常
>>> f = BloomFilter(capacity=1000, error_rate=0.0000001)
>>> for a in range(1000):
...     _ = f.add(a)
...
>>> len(a)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: object of type 'int' has no len()
>>> len(f)
1000
>>> f.add(1000)
False
>>> f.add(1001)  # 当元素数量超过 capacity、无法再保证 error_rate 时会报错
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python2.7/site-packages/pybloom/pybloom.py", line 182, in add
    raise IndexError("BloomFilter is at capacity")
IndexError: BloomFilter is at capacity