一.什么是布隆过滤器
利用内存中的一个长度是m的位数组B,对其中所有位都置0,位数组的初始状态是每个位的值都是0。然后根据k个不同的散列函数,对每个遍历过的URL执行散列,每次散列的结果都是不大于m的一个整数a。根据散列得到的数在位数组B对应的位上置1,也就是让B[a]=1。
每次插入一个爬过的URL,也执行k次散列,只有当全部位都已经置1了才认为这个URL已经遍历过。
二.代码实现(Java)
BloomFilter.java
package com.rumo.foundation;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;
import java.util.concurrent.atomic.AtomicInteger;
public class BloomFilter implements Serializable{
private static final long serialVersionUID = 923464509229483197L;
private final int[] seeds;
private final int size;
private final BitSet bitset;
private final MisJudgeRate rate;
private final AtomicInteger useCount = new AtomicInteger(0);
private final Double autoClearRate;
/**
* 默认中等程序的误判率:MisjudgmentRate.MIDDLE 以及不自动清空数据
* @param dataCount
*/
public BloomFilter(int dataCount){
this(MisJudgeRate.MIDDLE, dataCount, null);
}
/**
*
* @param rate 枚举类型的误判率
* @param dataCount 预期处理的数据规模
* @param autoClearRate
* 自动清空过滤器内部信息的使用比率,传null则表示不会自动清理,
* 当过滤器使用率达到100%时,则无论传入什么数据,都会认为在数据已经存在,
* 当希望过滤器使用率达到80%时自动清空重新使用,则传入0.8
*/
public BloomFilter(MisJudgeRate rate,int dataCount,Double autoClearRate){
long bitSize = rate.seeds.length*dataCount;
if(bitSize<0 || bitSize>Integer.MAX_VALUE)
throw new RuntimeException("DataCount too large,Overflow!");
this.rate = rate;
this.seeds = rate.getSeeds();
this.size = (int)bitSize;
this.bitset = new BitSet(size);
this.autoClearRate = autoClearRate;
}
/**
* 存在返回true,不存在记录返回false
* @param data
* @return
*/
public boolean addIfNotExist(String data){
checkNeedClear();// 检查是否需要清空重新使用
int[] indexs = new int[this.seeds.length];
boolean exist = true;// assign it exist
int index;
for (int i = 0; i < this.seeds.length; i++) {
index = hash(data, this.seeds[i]);
indexs[i] = index;
if(exist){
if(!bitset.get(index)){
// 有一个不存在则认为第一次出现
exist = false;
for (int j = 0; j <= i; j++) {
setTrue(indexs[j]);
}
}
}else{
setTrue(index);
}
}
return exist;
}
public void setTrue(int index){
useCount.incrementAndGet();
bitset.set(index,true);
}
private void checkNeedClear(){
if(this.autoClearRate!=null){
if(getUseRate() >= this.autoClearRate){
synchronized (this) {
if(getUseRate() >= this.autoClearRate){
bitset.clear();
useCount.set(0);
}
}
}
}
}
private int hash(String data,int seeds){
char[] value = data.toCharArray();
int hash = 0;
if(value.length>0){
for (int i = 0; i < value.length; i++) {
hash = i*hash + value[i];
}
}
hash = hash*seeds%this.size;
return Math.abs(hash);
}
public void add(String data){
checkNeedClear();
for (int i = 0; i < this.seeds.length; i++) {
int index = hash(data, this.seeds[i]);
setTrue(index);
}
}
public boolean check(String data){
for (int i = 0; i < this.seeds.length; i++) {
int index = hash(data, this.seeds[i]);
if(!this.bitset.get(index))
return false;
}
return true;
}
public double getUseRate(){
return (double)useCount.intValue() / (double)size;
}
/**
* 清空过滤器中记录信息
*/
public void clear(){
this.useCount.set(0);
this.bitset.clear();
}
public void saveFilter(String path){
try {
ObjectOutputStream oos =
new ObjectOutputStream(new FileOutputStream(path));
oos.writeObject(this);
} catch (Exception e) {
throw new RuntimeException("Save FilterObject to file Failure!");
}
}
public static BloomFilter readFilter(String path){
try {
ObjectInputStream ois =
new ObjectInputStream(new FileInputStream(path));
return (BloomFilter) ois.readObject();
} catch (Exception e) {
throw new RuntimeException("Read Filter from ObjectFile Failure!");
}
}
public MisJudgeRate getRate() {
return rate;
}
public enum MisJudgeRate{
/**
* 分配4个位,误判率约为:0.14689159766308
*/
VERY_SMALL(new int[]{2,3,5,7}),
/**
* 分配8个位,误判率约为:0.02157714146322
*/
SMALL(new int[]{2,3,5,7,11,13,17,19}),
/**
* 分配16个位,误判率约为:0.00046557303372
*/
MIDDLE(new int[]{2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53}),
/**
* 分配32个位,误判率约为:0.00000021167340
*/
HIGH(new int[]{2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,
59,61,67,71,73,79,83,89,97,101,193,107,109,113,127,131});
private int[] seeds;
private MisJudgeRate(int[] seeds){
this.seeds = seeds;
}
public int[] getSeeds(){
return seeds;
}
public void setSeeds(int[] seeds){
this.seeds = seeds;
}
}
public static void main(String[] args) {
BloomFilter urlBloomFilter = new BloomFilter(5);
System.out.println(urlBloomFilter.addIfNotExist("www.rumoss.cn"));
System.out.println(urlBloomFilter.addIfNotExist("www.sohu.com"));
System.out.println(urlBloomFilter.addIfNotExist("www.sina.com"));
System.out.println(urlBloomFilter.getUseRate());
System.out.println(urlBloomFilter.addIfNotExist("www.qq.cn"));
System.out.println(urlBloomFilter.getUseRate());
String savePath = "D:\\bloomFilter.obj";
urlBloomFilter.saveFilter(savePath);
urlBloomFilter = readFilter(savePath);
System.out.println(urlBloomFilter.getUseRate());
System.out.println(urlBloomFilter.addIfNotExist("www.qq.cn"));
}
}
测试结果:
false
false
true
0.4
false
0.6
0.6
true