介绍
布鲁姆过滤器(Bloom Filter,又译布隆过滤器)常用于判定元素是否属于某个集合:当过滤器判定元素不在集合中时,就可以跳过在集合中查找对应元素的过程。本文将介绍一种最简单的基于比特向量的布鲁姆过滤器,并提供Java代码。
实现方法
插入:
1、初始化空集
2、插入集合元素a,经过4个不同哈希函数映射到4个位置,将其置1(此处以4个哈希函数为例,下文的Java代码使用6个)
3、插入元素b操作同上
4、插入元素c操作同上
查询:
1、查询元素c
2、经过4个不同的哈希函数映射到4个不同的位置
3、若发现其中任意1处为0,则该元素一定不在集合中;否则(全部为1)只能说明元素可能在集合中(存在假阳性),需进行进一步的查找确认。
代码实现(Java)
package org.hash.bloom;
/**
 * A key that can expose itself as a raw byte signature.
 */
public interface HashKey {

    /**
     * Returns the byte signature of this key.
     *
     * @return the bytes representing this key
     */
    byte[] getSig();
}
package org.hash.bloom;
import org.hash.util.H3HashFunction;
import org.hash.util.HashFunctions;
/**
 * Bit-vector Bloom filter used to decide that an element is definitely NOT in a set.
 *
 * <p>Every added element is mapped to {@code H3_HASH_NUM} bit positions of the filter's
 * bit vector, and those bits are set. A lookup that finds any of its bits cleared proves
 * the element was never added; a lookup that finds all bits set only means the element
 * may be present (false positives are possible).
 *
 * <p>The hash functions are created per filter instance via
 * {@code new H3HashFunction(H3_HASH_NUM)}, so bit positions are only meaningful within
 * the instance that produced them.
 */
public class BloomFilter {
    /** Number of hash functions applied to every key. */
    private static final int H3_HASH_NUM = 6;

    /** Length of the bit vector, in bits. */
    private int size;

    /** Number of {@link #add(HashKey)} calls made so far (duplicates are counted). */
    private int currentNum;

    /** Bit vector, packed 8 bits per byte. */
    private byte bitVector[];

    private H3HashFunction h3HashFunction = new H3HashFunction(H3_HASH_NUM);

    /**
     * Returns how many times {@link #add(HashKey)} has been called.
     *
     * @return the number of insertions performed on this filter
     */
    public int getCurrentNum() {
        return this.currentNum;
    }

    /**
     * Creates an empty filter.
     *
     * @param size length of the bit vector in bits; must be positive
     * @throws IllegalArgumentException if {@code size} is zero or negative
     */
    public BloomFilter(int size) {
        if (size <= 0) {
            throw new IllegalArgumentException("size must be positive: " + size);
        }
        this.size = size;
        this.currentNum = 0;
        // Round up to whole bytes so that bit (size - 1) always has a backing byte,
        // even when size is not a multiple of 8. The unsigned shift keeps the result
        // correct when size + 7 overflows int.
        this.bitVector = new byte[(size + 7) >>> 3];
    }

    /**
     * Tests whether the key may have been added.
     *
     * @param key the key to look up
     * @return {@code false} if the key was definitely never added; {@code true} if all of
     *         its bits are set, i.e. it may have been added
     */
    public boolean contains(HashKey key) {
        int signature = HashFunctions.bob(key.getSig()); // 32-bit hash signature of the key
        for (int i = 0; i < H3_HASH_NUM; i++) {
            int index = bitIndex(signature, i);
            if ((bitVector[index >> 3] & (1 << (index & 7))) == 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Adds the key by setting each of its bit positions.
     *
     * @param key the key to insert
     */
    public void add(HashKey key) {
        this.currentNum++;
        int signature = HashFunctions.bob(key.getSig());
        for (int i = 0; i < H3_HASH_NUM; i++) {
            int index = bitIndex(signature, i);
            bitVector[index >> 3] |= 1 << (index & 7);
        }
    }

    /** Maps the key signature to a bit position in [0, size) using the given hash function. */
    private int bitIndex(int signature, int hashIndex) {
        int index = h3HashFunction.h3Hash(32, signature, hashIndex) % this.size;
        // Java's % keeps the sign of the dividend; normalise so a negative hash value
        // can never produce a negative array index.
        if (index < 0) {
            index += this.size;
        }
        return index;
    }
}
文中使用了H3哈希函数和普通哈希函数,其中H3哈希函数是一种通用型哈希函数,这里不再赘述,可参考博主的另外一篇博文。
H3哈希函数
package org.hash.util;
import java.util.Random;
/**
 * H3 hash function family.
 *
 * <p>Holds a number of independent random matrices, each with 32 columns of 32 bits.
 * A 32-bit input code is hashed by XOR-ing together the columns selected by the set bits
 * of the input, and the result is reduced to the requested number of bits.
 *
 * <p>The matrices are filled from an unseeded {@link Random}, so hash values differ
 * between instances and between program runs.
 */
public class H3HashFunction {

    /** One random matrix: a 32-bit column value for each of the 32 input bits. */
    private static class H3Matrix {
        public int col[] = new int[32];
    }

    private int h3_hash_num;
    private H3Matrix h3_matrix[];

    /**
     * Creates a family of {@code h3_hash_num} independent hash functions.
     *
     * @param h3_hash_num number of hash functions (matrices) to generate
     */
    public H3HashFunction(int h3_hash_num) {
        this.h3_hash_num = h3_hash_num;
        this.initH3Matrix(h3_hash_num);
    }

    /**
     * (Re)generates the random matrices, discarding any previous ones.
     *
     * @param h3_hash_num number of matrices to generate
     */
    public void initH3Matrix(int h3_hash_num) {
        this.h3_matrix = new H3Matrix[h3_hash_num];
        Random random = new Random();
        for (int i = 0; i < h3_hash_num; i++) {
            H3Matrix matrix = new H3Matrix();
            for (int j = 0; j < 32; j++) {
                // Every one of the 32 bits of a column must be random. (Building the
                // value from nextInt()*256 left the low 8 bits of every column zero.)
                matrix.col[j] = random.nextInt();
            }
            this.h3_matrix[i] = matrix;
        }
    }

    /**
     * Hashes a 32-bit code with one of the matrices.
     *
     * @param h3_row number of output bits requested, from 0 to 32
     * @param h3_hash_code the 32-bit input code
     * @param matrix_index which hash function (matrix) to use
     * @return a non-negative hash value below {@code 2^h3_row}; because the result is a
     *         non-negative {@code int}, requests for 31 or 32 bits both yield a 31-bit value
     * @throws IllegalArgumentException if {@code h3_row} is outside 0..32
     */
    public int h3Hash(int h3_row, int h3_hash_code, int matrix_index) {
        if (h3_row < 0 || h3_row > 32) {
            throw new IllegalArgumentException("h3_row must be in [0, 32]: " + h3_row);
        }
        int[] columns = h3_matrix[matrix_index].col;
        if (h3_row == 0) {
            return 0; // zero output bits: the only possible value
        }
        int shift = 32 - h3_row; // keep only the top h3_row bits of every column
        int acc = 0xffffffff;
        for (int i = 0; i < 32; i++) {
            if ((h3_hash_code & (1 << i)) != 0) {
                acc ^= columns[i] >>> shift;
            }
        }
        // Mask instead of abs-then-modulo: (int) Math.pow(2, 32) saturates to
        // Integer.MAX_VALUE and -Integer.MIN_VALUE is still negative, so the former
        // reduction could use the wrong modulus and return a negative value.
        int bits = Math.min(h3_row, 31);
        int mask = (int) ((1L << bits) - 1);
        return acc & mask;
    }

    /** Entry point that only constructs an instance. */
    public static void main(String[] args) {
        H3HashFunction f = new H3HashFunction(6);
    }
}
普通哈希函数:
package org.hash.util;
/**
 * Collection of simple non-cryptographic hash functions: 16-bit XOR folding, Bob Jenkins'
 * hash ({@link #bob(byte[])}), Jenkins' one-at-a-time hash, and additive / rotating
 * string hashes.
 *
 * <p>Bytes are always treated as unsigned values (0..255) by the byte-array hashes.
 *
 * @author Lu Yao (路遥)
 */
public class HashFunctions {

    /**
     * Folds the buffer into 16 bits by XOR-ing consecutive little-endian byte pairs.
     * A trailing odd byte is XOR-ed into the low byte.
     *
     * @param buf the bytes to hash
     * @return the 16-bit XOR fold of {@code buf}
     */
    public static short xor16(byte buf[]) {
        int hash = 0;
        int i = 0; // int, not short: a short index would overflow on buffers over 32767 bytes
        for (; i + 2 <= buf.length; i += 2) {
            // Mask to 0..255 so a byte >= 0x80 does not sign-extend into the other byte.
            hash ^= (buf[i + 1] & 0xFF) << 8;
            hash ^= buf[i] & 0xFF;
        }
        if (i < buf.length) {
            hash ^= buf[i] & 0xFF;
        }
        return (short) hash;
    }

    /**
     * Retained for source compatibility only; has no effect.
     *
     * <p>Java passes {@code int} arguments by value, so this method can never change the
     * caller's variables. {@link #bob(byte[])} uses an internal array-based mixing step
     * instead.
     *
     * @param a ignored
     * @param b ignored
     * @param c ignored
     */
    public static void mix(int a, int b, int c) {
        // Intentionally empty: any work done on the by-value copies would be discarded.
    }

    /** Mixes the three 32-bit state words {@code s[0..2]} in place. */
    private static void mixState(int[] s) {
        int a = s[0];
        int b = s[1];
        int c = s[2];
        // Right shifts are unsigned (>>>) so the sign bit is not smeared into the result.
        a -= b; a -= c; a ^= (c >>> 13);
        b -= c; b -= a; b ^= (a << 8);
        c -= a; c -= b; c ^= (b >>> 13);
        a -= b; a -= c; a ^= (c >>> 12);
        b -= c; b -= a; b ^= (a << 16);
        c -= a; c -= b; c ^= (b >>> 5);
        a -= b; a -= c; a ^= (c >>> 3);
        b -= c; b -= a; b ^= (a << 10);
        c -= a; c -= b; c ^= (b >>> 15);
        s[0] = a;
        s[1] = b;
        s[2] = c;
    }

    /** Reads byte {@code index} of {@code buf} as an unsigned value in 0..255. */
    private static int u8(byte[] buf, int index) {
        return buf[index] & 0xFF;
    }

    /** Reads four bytes starting at {@code offset} as a little-endian 32-bit word. */
    private static int word(byte[] buf, int offset) {
        return u8(buf, offset)
                + (u8(buf, offset + 1) << 8)
                + (u8(buf, offset + 2) << 16)
                + (u8(buf, offset + 3) << 24);
    }

    /**
     * Bob Jenkins' 32-bit hash over a byte array, with an initial value of 0.
     *
     * @param buf the key bytes
     * @return the 32-bit hash of {@code buf}
     */
    public static int bob(byte buf[]) {
        // State words a, b, c. a and b start at the golden ratio (an arbitrary value).
        int[] s = {0x9e3779b9, 0x9e3779b9, 0};
        int remaining = buf.length;
        int i = 0;

        /*---------------------------------------- handle most of the key */
        while (remaining >= 12) {
            s[0] += word(buf, i);
            s[1] += word(buf, i + 4);
            s[2] += word(buf, i + 8);
            mixState(s);
            i += 12;
            remaining -= 12;
        }

        /*------------------------------------- handle the last 11 bytes */
        s[2] += buf.length;
        switch (remaining) { /* all the case statements fall through */
            case 11: s[2] += u8(buf, i + 10) << 24;
            case 10: s[2] += u8(buf, i + 9) << 16;
            case 9 : s[2] += u8(buf, i + 8) << 8;
            /* the first byte of c is reserved for the length */
            case 8 : s[1] += u8(buf, i + 7) << 24;
            case 7 : s[1] += u8(buf, i + 6) << 16;
            case 6 : s[1] += u8(buf, i + 5) << 8;
            case 5 : s[1] += u8(buf, i + 4);
            case 4 : s[0] += u8(buf, i + 3) << 24;
            case 3 : s[0] += u8(buf, i + 2) << 16;
            case 2 : s[0] += u8(buf, i + 1) << 8;
            case 1 : s[0] += u8(buf, i);
            default: break; /* case 0: nothing left to add */
        }
        mixState(s);

        /*-------------------------------------------- report the result */
        return s[2];
    }

    /**
     * Jenkins's one-at-a-time hash.
     *
     * @param buf the key bytes
     * @return the 32-bit hash of {@code buf}
     */
    public static int oaat(byte buf[]) {
        int hash = 0;
        for (int i = 0; i < buf.length; ++i) {
            hash += buf[i] & 0xFF;
            hash += (hash << 10);
            hash ^= (hash >>> 6);
        }
        hash += (hash << 3);
        hash ^= (hash >>> 11);
        hash += (hash << 15);
        return hash;
    }

    /**
     * Additive hash: the key length plus the sum of all characters, folded with two
     * right shifts.
     *
     * @param key the string to hash
     * @return the hash of {@code key}
     */
    public static int additiveHash(String key) {
        int n = key.length();
        int hash = n;
        for (int i = 0; i < n; i++) {
            hash += key.charAt(i);
        }
        return hash ^ (hash >> 10) ^ (hash >> 20);
    }

    /**
     * Rotating hash: for every character, combines the hash shifted left by 4, the hash
     * shifted right by 28 (arithmetic shift) and the character with XOR.
     *
     * @param key the string to hash
     * @return a non-negative hash of {@code key} (the sign bit is cleared)
     */
    public static int rotatingHash(String key) {
        int n = key.length();
        int hash = n;
        for (int i = 0; i < n; i++) {
            hash = (hash << 4) ^ (hash >> 28) ^ key.charAt(i);
        }
        return (hash & 0x7FFFFFFF);
    }
}
总结
本文简要介绍了BloomFilter的原理,引出了BF的一个实现案例,最后提供了BF的Java版本的实现。