用Java实现Bloom Filter

正好在“问答”和“论坛”中看到关于 Bloom Filter 的帖子,学习研究了一番,自娱自乐地写了一种实现。不多说,直接上代码;代码尽量写得具备可读性,就不多解释了。关于 Bloom Filter 的原理,可以参考:http://www.google.com.hk/ggblog/googlechinablog/2007/07/bloom-filter_7469.html




import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * A Bloom filter backed by a {@link BitSet}.
 *
 * <p>Every key is mapped to {@code hashSize} bit positions by the supplied
 * {@link HashGenerator}. {@link #add(Serializable)} sets those bits and
 * {@link #contains(Serializable)} reports whether all of them are set, so a
 * {@code true} answer may be a false positive while a {@code false} answer is
 * definitive for keys added through this instance.
 *
 * <p>The bit set is transient and is written gzip-compressed by the custom
 * serialization hooks below.
 *
 * @param <T> the key type
 */
public class BloomFilter<T extends Serializable> implements Serializable {
    private static final long serialVersionUID = -4322599722288348992L;

    /** Number of expected keys used when the caller does not supply a capacity. */
    private static final int DEFAULT_CAPACITY = 1 << 16;

    /** Bit storage; restored by {@link #readObject(ObjectInputStream)}. */
    private transient BitSet filter;

    /** Supplies the per-index hash value and the hash-to-bit-index mapping. */
    private final HashGenerator<T> hashGenerator;

    /** Number of bit positions set/tested per key. */
    private final int hashSize;

    /** Total number of addressable bits ({@code capacity * hashSize * 2}). */
    private final int nbits;

    /**
     * Creates a filter with the default capacity, using {@code hashGenerator.size()}
     * hash functions.
     *
     * @param hashGenerator hash strategy, must not be {@code null}
     */
    public BloomFilter(HashGenerator<T> hashGenerator) {
        this(hashGenerator, hashGenerator.size(), DEFAULT_CAPACITY);
    }

    /**
     * Creates a filter sized for {@code capacity} keys, using
     * {@code hashGenerator.size()} hash functions.
     *
     * @param hashGenerator hash strategy, must not be {@code null}
     * @param capacity      expected number of keys, must be positive
     */
    public BloomFilter(HashGenerator<T> hashGenerator, int capacity) {
        this(hashGenerator, hashGenerator.size(), capacity);
    }

    /**
     * Creates a filter with {@code capacity * hashSize * 2} bits.
     *
     * @param hashGenerator hash strategy, must not be {@code null}
     * @param hashSize      number of hash functions applied per key, must be positive
     * @param capacity      expected number of keys, must be positive
     * @throws NullPointerException     if {@code hashGenerator} is {@code null}
     * @throws IllegalArgumentException if {@code hashSize} or {@code capacity} is not
     *                                  positive, or the resulting bit count does not
     *                                  fit in an {@code int}
     */
    public BloomFilter(HashGenerator<T> hashGenerator, int hashSize, int capacity) {
        super();
        if (hashGenerator == null) {
            throw new NullPointerException("hashGenerator");
        }
        if (hashSize <= 0) {
            throw new IllegalArgumentException("hashSize must be positive: " + hashSize);
        }
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        // Compute in long: the int product silently wraps for large capacities.
        long bits = (long) capacity * hashSize * 2L;
        if (bits > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "capacity * hashSize * 2 exceeds Integer.MAX_VALUE: " + bits);
        }
        this.hashGenerator = hashGenerator;
        this.hashSize = hashSize;
        this.nbits = (int) bits;
        this.filter = new BitSet(this.nbits);
    }

    /**
     * Writes the gzip-compressed bit set followed by the default (non-transient) fields.
     *
     * <p>NOTE(review): the serialization specification expects
     * {@code defaultWriteObject()} to be called before any optional data is written.
     * The existing order is kept here so that previously written files stay readable;
     * confirm before changing it.
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        // Compress the bit set into an in-memory buffer.
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        ObjectOutputStream objOut = new ObjectOutputStream(new GZIPOutputStream(buf));
        try {
            objOut.writeObject(filter);
        } finally {
            // Closing also finishes the gzip stream so buf holds a complete archive.
            objOut.close();
        }
        out.writeObject(buf.toByteArray());
        out.defaultWriteObject();
    }

    /**
     * Reads the gzip-compressed bit set, then the default fields, mirroring
     * {@link #writeObject(ObjectOutputStream)}.
     *
     * @throws java.io.InvalidObjectException if the stream carries no bit set
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        byte[] buf = (byte[]) in.readObject();
        if (buf == null) {
            throw new java.io.InvalidObjectException("missing compressed bit set");
        }
        ObjectInputStream objIn = new ObjectInputStream(new GZIPInputStream(
                new ByteArrayInputStream(buf)));
        BitSet bits;
        try {
            bits = (BitSet) objIn.readObject();
        } finally {
            objIn.close();
        }
        if (bits == null) {
            throw new java.io.InvalidObjectException("bit set is null");
        }
        filter = bits;
        in.defaultReadObject();
    }

    /**
     * Records {@code key} by setting one bit per hash function.
     *
     * @param key the key to add
     */
    public void add(T key) {
        for (int i = 0; i < hashSize; i++) {
            long hashCode = hashGenerator.getHashCode(key, i);
            int index = hashGenerator.getBitIndex(hashCode, nbits);
            filter.set(index);
        }
    }

    /**
     * Tests whether every bit associated with {@code key} is set.
     *
     * @param key the key to look up
     * @return {@code false} as soon as one of the key's bits is clear, {@code true}
     *         otherwise
     */
    public boolean contains(T key) {
        for (int i = 0; i < hashSize; i++) {
            long hashCode = hashGenerator.getHashCode(key, i);
            int index = hashGenerator.getBitIndex(hashCode, nbits);
            if (!filter.get(index)) {
                return false;
            }
        }
        return true;
    }
}




import java.io.Serializable;

/**
 * Strategy that turns a key into a series of hash values and maps each hash
 * value to a position in a bit array.
 *
 * @param <T> the key type
 */
public interface HashGenerator<T> extends Serializable {

    /**
     * Returns the number of hash functions this generator offers.
     *
     * @return the hash function count
     */
    int size();

    /**
     * Computes the hash value of {@code key} for the hash function selected by
     * {@code index}.
     *
     * @param key   the key to hash
     * @param index which hash function to apply
     * @return the hash value
     */
    long getHashCode(T key, int index);

    /**
     * Maps a hash value to a bit position.
     *
     * @param hashCode a value produced by {@link #getHashCode(Object, int)}
     * @param maxIndex bound for the returned position; presumably exclusive, as
     *                 callers pass the bit count here (confirm per implementation)
     * @return the bit position for {@code hashCode}
     */
    int getBitIndex(long hashCode, int maxIndex);
}




import java.util.Random;

/**
 * Base class for {@link HashGenerator} implementations that stores the number
 * of hash functions and maps a hash value to a bit index by using it as the
 * seed of a {@link Random}.
 *
 * @param <T> the key type
 */
public abstract class AbstractHashGenerator<T> implements HashGenerator<T> {
    private static final long serialVersionUID = 1918866698987940799L;

    /** Number of hash functions reported by {@link #size()}. */
    private final int size;

    /**
     * @param size the number of hash functions this generator reports
     */
    public AbstractHashGenerator(int size) {
        super();
        this.size = size;
    }

    /**
     * Maps {@code hashCode} to a value in {@code [0, maxIndex)} by drawing the
     * first {@code nextInt(maxIndex)} from a generator seeded with {@code hashCode}.
     *
     * <p>A fresh {@link Random} is created per call. Reseeding one shared
     * instance (as was done before) lets concurrent callers interleave between
     * {@code setSeed} and {@code nextInt} and receive an index that belongs to
     * another caller's seed. {@code new Random(seed)} is specified to behave like
     * {@code setSeed(seed)}, so the returned indexes are unchanged.
     *
     * @param hashCode the seed
     * @param maxIndex exclusive upper bound, must be positive
     * @return the bit index derived from {@code hashCode}
     * @throws IllegalArgumentException if {@code maxIndex} is not positive
     */
    public int getBitIndex(long hashCode, int maxIndex) {
        return new Random(hashCode).nextInt(maxIndex);
    }

    /**
     * @return the number of hash functions given at construction
     */
    public int size() {
        return size;
    }
}





/**
 * {@link AbstractHashGenerator} that derives each hash value from the key's
 * {@code hashCode()} and the hash-function index, followed by an integer
 * shift/add/xor mixing step.
 *
 * @param <T> the key type
 */
public class SimpleHashGenerator<T> extends AbstractHashGenerator<T> {

    private static final long serialVersionUID = -6971076063651082178L;

    /** Number of hash functions used by the no-argument constructor. */
    private static final int DEFAULT_HASH_COUNT = 8;

    /**
     * @param size the number of hash functions to report
     */
    public SimpleHashGenerator(int size) {
        super(size);
    }

    /** Creates a generator reporting {@value #DEFAULT_HASH_COUNT} hash functions. */
    public SimpleHashGenerator() {
        this(DEFAULT_HASH_COUNT);
    }

    /**
     * Computes the hash value of {@code key} for hash function {@code index}.
     *
     * <p>NOTE(review): {@code index} only offsets the input by a multiple of 31
     * before mixing, so two keys whose {@code hashCode()} values differ by 31
     * yield the same value at adjacent indexes. The formula is left untouched
     * because changing it would move the bits of filters that were already
     * persisted; consider a stronger per-index mix for new data.
     *
     * @param key   the key to hash, must not be {@code null}
     * @param index which hash function to apply
     * @return the mixed 32-bit hash, sign-extended to {@code long}
     */
    public long getHashCode(T key, int index) {
        int h = index * 31 + key.hashCode();
        h += ~(h << 9);
        h ^= (h >>> 14);
        h += (h << 4);
        h ^= (h >>> 10);
        return h;
    }
}



小小测试:


import java.io.File;
import java.io.Serializable;

import bluechip.io.SerializeUtils;

/**
 * Manual smoke test for {@code BloomFilter}: loads a filter from disk (or builds
 * and saves one holding one million "Jim" keys), then prints every false
 * positive among "aaa" keys and every false negative among "Jim" keys.
 */
public class BloomFilterTest {

    /** Simple two-field key used to populate and probe the filter. */
    static class Key implements Serializable {
        private static final long serialVersionUID = 7503732767154152820L;
        final String name;
        final String id;

        public Key(String name, String id) {
            super();
            this.name = name;
            this.id = id;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((id == null) ? 0 : id.hashCode());
            result = prime * result + ((name == null) ? 0 : name.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            Key other = (Key) obj;
            return sameText(id, other.id) && sameText(name, other.name);
        }

        /** Null-safe string equality. */
        private static boolean sameText(String a, String b) {
            return (a == null) ? (b == null) : a.equals(b);
        }

        @Override
        public String toString() {
            return "Key [id=" + id + ", name=" + name + "]";
        }
    }

    /**
     * Runs the smoke test.
     *
     * @param args unused
     * @throws Exception if building or saving the filter fails
     */
    public static void main(String[] args) throws Exception {
        File file = new File("d:/filter.dat");
        int n = 1000000;
        BloomFilter<Key> bf = null;
        try {
            bf = SerializeUtils.readObject(file);
        } catch (Exception ex) {
            // Loading is best-effort (the file is absent on the first run), but the
            // reason is reported instead of being silently discarded.
            System.err.println("Could not load " + file + ", rebuilding filter: " + ex);
        }

        if (bf == null) {
            bf = new BloomFilter<Key>(new SimpleHashGenerator<Key>(), n);
            for (int i = 0; i < n; i++) {
                bf.add(new Key("Jim", String.valueOf(i)));
            }
            SerializeUtils.writeObject(bf, file);
        }

        // Keys that were never added: anything printed here is a false positive.
        System.out.println("==================");
        for (int i = 0; i < n; i++) {
            Key k = new Key("aaa", String.valueOf(i));
            if (bf.contains(k)) {
                System.out.println(k);
            }
        }

        // Keys that were added: anything printed here is a false negative.
        System.out.println("==================");
        for (int i = 0; i < n; i++) {
            Key k = new Key("Jim", String.valueOf(i));
            if (!bf.contains(k)) {
                System.out.println(k);
            }
        }
    }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值