正好在“问答”和“论坛”中看到关于Bloom Filter的帖子,学习研究了一把,自娱自乐就写了一种实现。不多说,直接上代码,代码尽量写得具备可读性,不多解释了。关于Bloom Filter可以参考[url]http://www.google.com.hk/ggblog/googlechinablog/2007/07/bloom-filter_7469.html[/url]
小小测试:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class BloomFilter<T extends Serializable> implements Serializable {
/**
*
*/
private static final long serialVersionUID = -4322599722288348992L;
private static final int DEFAULT_CAPACITY = 1 << 16;
private transient BitSet filter;
private HashGenerator<T> hashGenerator;
private int hashSize;
private int nbits;
public BloomFilter(HashGenerator<T> hashGenerator) {
this(hashGenerator, hashGenerator.size(), DEFAULT_CAPACITY);
}
public BloomFilter(HashGenerator<T> hashGenerator, int capacity) {
this(hashGenerator, hashGenerator.size(), capacity);
}
public BloomFilter(HashGenerator<T> hashGenerator, int hashSize, int capacity) {
super();
this.hashGenerator = hashGenerator;
this.hashSize = hashSize;
this.nbits = capacity * hashSize * 2;
filter = new BitSet(nbits);
}
private void writeObject(ObjectOutputStream out) throws IOException {
// 压缩
ByteArrayOutputStream buf = new ByteArrayOutputStream();
ObjectOutputStream objOut = new ObjectOutputStream(new GZIPOutputStream(buf));
objOut.writeObject(filter);
objOut.close();
out.writeObject(buf.toByteArray());
out.defaultWriteObject();
}
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
byte[] buf = (byte[]) in.readObject();
ObjectInputStream objIn = new ObjectInputStream(new GZIPInputStream(
new ByteArrayInputStream(buf)));
filter = (BitSet) objIn.readObject();
objIn.close();
in.defaultReadObject();
}
public void add(T key) {
for (int i = 0; i < hashSize; i++) {
long hashCode = hashGenerator.getHashCode(key, i);
int index = hashGenerator.getBitIndex(hashCode, nbits);
filter.set(index);
}
}
public boolean contains(T key) {
for (int i = 0; i < hashSize; i++) {
long hashCode = hashGenerator.getHashCode(key, i);
int index = hashGenerator.getBitIndex(hashCode, nbits);
if (!filter.get(index))
return false;
}
return true;
}
}
import java.io.Serializable;
public interface HashGenerator<T> extends Serializable {
public int getBitIndex(long hashCode,int maxIndex);
public long getHashCode(T key, int index);
public int size();
}
import java.util.Random;
public abstract class AbstractHashGenerator<T> implements HashGenerator<T> {
/**
*
*/
private static final long serialVersionUID = 1918866698987940799L;
private static final Random rand = new Random();
private int size;
public AbstractHashGenerator(int size) {
super();
this.size = size;
}
public int getBitIndex(long hashCode, int maxIndex) {
rand.setSeed(hashCode);
return rand.nextInt(maxIndex);
}
public int size() {
return size;
}
}
public class SimpleHashGenerator<T> extends AbstractHashGenerator<T> {
/**
*
*/
private static final long serialVersionUID = -6971076063651082178L;
public SimpleHashGenerator(int size) {
super(size);
}
public SimpleHashGenerator() {
this(8);
}
private long getHashCode2(T key, int index) {
int h = index * key.hashCode();
h ^= (h >>> 20) ^ (h >>> 12);
return h ^ (h >>> 7) ^ (h >>> 4);
}
private long getHashCode1(T key, int index) {
int h = index * 31 + key.hashCode();
h += ~(h << 9);
h ^= (h >>> 14);
h += (h << 4);
h ^= (h >>> 10);
return h;
}
public long getHashCode(T key, int index) {
return getHashCode1(key, index);
// return getHashCode2(key, index);
// if ((index & 1) == 0) {
// return getHashCode1(key, index);
// } else {
// return getHashCode2(key, index);
// }
}
}
小小测试:
import java.io.File;
import java.io.Serializable;
import bluechip.io.SerializeUtils;
public class BloomFilterTest {
static class Key implements Serializable {
/**
*
*/
private static final long serialVersionUID = 7503732767154152820L;
String name;
String id;
public Key(String name, String id) {
super();
this.name = name;
this.id = id;
}
/*
* (non-Javadoc)
*
* @see java.lang.Object#hashCode()
*/
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((id == null) ? 0 : id.hashCode());
result = prime * result + ((name == null) ? 0 : name.hashCode());
return result;
}
/*
* (non-Javadoc)
*
* @see java.lang.Object#equals(java.lang.Object)
*/
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Key other = (Key) obj;
if (id == null) {
if (other.id != null)
return false;
} else if (!id.equals(other.id))
return false;
if (name == null) {
if (other.name != null)
return false;
} else if (!name.equals(other.name))
return false;
return true;
}
/*
* (non-Javadoc)
*
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return "Key [id=" + id + ", name=" + name + "]";
}
}
/**
* @param args
*/
public static void main(String[] args) throws Exception {
File file = new File("d:/filter.dat");
int n = 1000000;
BloomFilter<Key> bf = null;
try {
bf = SerializeUtils.readObject(file);
} catch (Exception ex) {
bf = new BloomFilter<Key>(new SimpleHashGenerator<Key>(), n);
for (int i = 0; i < n; i++) {
bf.add(new Key("Jim", String.valueOf(i)));
}
SerializeUtils.writeObject(bf, new File("d:/filter.dat"));
}
System.out.println("==================");
for (int i = 0; i < n; i++) {
Key k = new Key("aaa", String.valueOf(i));
if (bf.contains(k)) {
System.out.println(k);
}
}
System.out.println("==================");
for (int i = 0; i < n; i++) {
Key k = new Key("Jim", String.valueOf(i));
if (!bf.contains(k)) {
System.out.println(k);
}
}
}
}