Bloom Filter的原理及实现

最新推荐文章于 2023-12-30 16:04:29 发布

laozhao02

最新推荐文章于 2023-12-30 16:04:29 发布

阅读量628

点赞数

文章标签： java 大数据

Bloom Filter：是一个比特数组，表示具有一定误报率的集合。主要优势在于其大小(比特位个数)为常数且在初始化时被设置，增加更多的元素到一个Bloom Filter 中不会增加它的大小，仅增加误报的概率。一般包含两个方法：add()，contains()。

误报率： $r = (1 - e^{-kn/m})^{k}$；当散列函数个数取最优值 $k = \ln 2 \cdot (m/n)$ 时，误报率最小，此时 $r \approx 0.6185^{m/n}$

——k,散列函数个数

——m,比特个数

——n,被添加的元素个数

比如，存储一千万条URL的集合(n = 10 000 000)，每个URL分配8个比特(m/n = 8)，将需要10M的Bloom Filter（m = 80 000 000）,误报率约为2%。若用Set存储，需要1G的空间。

Bloom Filter的内在表现为一个m个比特位的数组。有k个独立的散列函数，每个散列函数的输入为一个对象，而输出为介于0到m-1之间的一个整数。使用这个输出的整数作为位数组的索引。当添加一个元素到Bloom Filter时，使用散列函数来生成位数组的k个索引。

上图（画的图真难看，不知道什么工具比较好？）是使用三个散列函数的Bloom Filter中添加了几个对象(x,y,z)的过程。无论以前的状态是什么，比特位都被设置为1，在位数组中的1的个数只能增加。对象(如x,y,z)被确定地散列到数组中的位上，而这些位被设置为1，通过散列并检查那些位置上的比特值，可以查看一个对象是否在这个集合中。

当有一个对象到来时，若要检查它是否已经被加入到Bloom Filter中，则使用与在添加对象时相同的k个散列函数来生成位数组的k个索引。现在检查比特数组中这k个比特是否均为1，是则返回true，否则返回false。若该对象已被添加，则一定返回true；不过，即使此对象从未被添加到这个集合中，与所查询相对应的k个比特也可能都为1，这是因为其他对象的增加会设置这些位，从而导致误报。

用java实现的一个Bloom Filter(Hadoop in Action一书中的实现)。

package cn.zhf.test;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
/**
 * A fixed-size Bloom filter: a bit array that represents a set with a
 * non-zero false-positive rate and no false negatives.
 *
 * <p>Each element is mapped to {@code numHashFunc} bit positions. Adding an
 * element sets those bits; a membership query reports {@code true} only when
 * all of them are set. The bit array never grows, so adding more elements
 * only raises the false-positive probability.
 *
 * <p>This class performs no synchronization.
 *
 * @param <E> element type; elements are hashed through their
 *            {@link Object#toString()} representation
 */
public class BloomFilter<E> {
    /** Bit count used by the no-argument constructor. */
    private static final int DEFAULT_BIT_ARRAY_SIZE = 100000000;
    /** Number of bit positions per element used by the no-argument constructor. */
    private static final int DEFAULT_NUM_HASH_FUNC = 6;
    /**
     * Number of leading MD5 digest bytes folded into the Random seed.
     * Six bytes give 48 bits, the seed width documented for java.util.Random.
     */
    private static final int SEED_BYTES = 6;
    /** Fixed charset so the same element hashes identically on every platform. */
    private static final Charset HASH_CHARSET = Charset.forName("UTF-8");

    private final BitSet bs;
    private final int bitArraySize;
    private final int numHashFunc;

    /** Creates a filter with 100,000,000 bits and 6 bit positions per element. */
    public BloomFilter() {
        this(DEFAULT_BIT_ARRAY_SIZE, DEFAULT_NUM_HASH_FUNC);
    }

    /**
     * Creates a filter with an explicit geometry.
     *
     * @param bitArraySize number of bits in the filter; must be positive
     * @param numHashFunc  number of bit positions derived per element; must be positive
     * @throws IllegalArgumentException if either argument is not positive
     */
    public BloomFilter(int bitArraySize, int numHashFunc) {
        if (bitArraySize <= 0) {
            throw new IllegalArgumentException("bitArraySize must be positive: " + bitArraySize);
        }
        if (numHashFunc <= 0) {
            throw new IllegalArgumentException("numHashFunc must be positive: " + numHashFunc);
        }
        this.bitArraySize = bitArraySize;
        this.numHashFunc = numHashFunc;
        this.bs = new BitSet(bitArraySize);
    }

    /**
     * Adds an element by setting every bit position it maps to.
     *
     * @param obj element to add; must not be null
     */
    public void add(E obj) {
        for (int index : getHashIndexes(obj)) {
            bs.set(index);
        }
    }

    /**
     * Tests whether an element may have been added.
     *
     * @param obj element to look up; must not be null
     * @return {@code false} if the element was definitely never added;
     *         {@code true} if it was added or if all of its bits happen to
     *         have been set by other elements (false positive)
     */
    public boolean contains(E obj) {
        for (int index : getHashIndexes(obj)) {
            if (!bs.get(index)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Merges another filter into this one by OR-ing the bit arrays.
     *
     * @param other filter to merge; must have the same bit count and the same
     *              number of bit positions per element as this filter
     * @throws IllegalArgumentException if the two filters differ in geometry,
     *                                  since their bit positions would not be comparable
     */
    public void union(BloomFilter<E> other) {
        if (other.bitArraySize != bitArraySize || other.numHashFunc != numHashFunc) {
            throw new IllegalArgumentException(
                    "Cannot union filters of different geometry: "
                            + bitArraySize + "/" + numHashFunc + " vs "
                            + other.bitArraySize + "/" + other.numHashFunc);
        }
        bs.or(other.bs);
    }

    /**
     * Computes the bit positions for an element.
     *
     * <p>Rough implementation: the MD5 digest of the element's string form
     * seeds a {@link Random}, and the first {@code numHashFunc} values drawn
     * from it are used as indexes.
     *
     * @param obj element to hash; must not be null
     * @return {@code numHashFunc} indexes, each in {@code [0, bitArraySize)}
     * @throws NullPointerException  if {@code obj} is null
     * @throws IllegalStateException if the MD5 algorithm is unavailable
     */
    public int[] getHashIndexes(E obj) {
        if (obj == null) {
            throw new NullPointerException("obj must not be null");
        }
        MessageDigest md;
        try {
            md = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            // Falling back to a constant seed would map every element to the
            // same bits, so fail loudly instead.
            throw new IllegalStateException("MD5 digest is not available", e);
        }
        byte[] digest = md.digest(obj.toString().getBytes(HASH_CHARSET));

        // Pack the leading digest bytes little-endian into the seed.
        long seed = 0;
        for (int i = 0; i < SEED_BYTES; i++) {
            seed ^= ((long) (digest[i] & 0xFF)) << (8 * i);
        }

        Random gen = new Random(seed);
        int[] indexes = new int[numHashFunc];
        for (int i = 0; i < numHashFunc; i++) {
            indexes[i] = gen.nextInt(bitArraySize);
        }
        return indexes;
    }

    /**
     * Writes the bit array as raw bytes. Bit {@code 8*i + j} of the filter is
     * stored in bit {@code j} (least significant first) of byte {@code i}.
     * The byte count is {@code bitArraySize / 8} rounded up.
     *
     * @param out destination
     * @throws IOException if writing fails
     */
    public void write(DataOutput out) throws IOException {
        byte[] byteArray = new byte[byteArraySize()];
        for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
            byteArray[bit / 8] |= (byte) (1 << (bit % 8));
        }
        out.write(byteArray);
    }

    /**
     * Reads a bit array in the layout produced by {@link #write(DataOutput)}
     * and sets the corresponding bits. Bits already set in this filter stay
     * set; the incoming bits are OR-ed in.
     *
     * @param in source; must supply {@code bitArraySize / 8} (rounded up) bytes
     * @throws IOException if reading fails or the input ends early
     */
    public void readFields(DataInput in) throws IOException {
        byte[] byteArray = new byte[byteArraySize()];
        in.readFully(byteArray);
        for (int i = 0; i < byteArray.length; i++) {
            int nextByte = byteArray[i];
            for (int j = 0; j < 8; j++) {
                if ((nextByte & (1 << j)) != 0) {
                    bs.set(8 * i + j);
                }
            }
        }
    }

    /**
     * Misspelled alias kept for existing callers.
     *
     * @param in source, see {@link #readFields(DataInput)}
     * @throws IOException if reading fails or the input ends early
     * @deprecated use {@link #readFields(DataInput)}
     */
    @Deprecated
    public void readFileds(DataInput in) throws IOException {
        readFields(in);
    }

    /**
     * Reads a text file into a map from zero-based line number to line text,
     * decoding with the platform default charset.
     *
     * <p>Best effort: on an I/O error the stack trace is printed and the
     * lines read so far (possibly none) are returned.
     *
     * @param filePath path of the file to read
     * @return map of line number to line content; never null
     */
    public Map<Integer, String> readFile(String filePath) {
        Map<Integer, String> map = new HashMap<Integer, String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(filePath)));
            int i = 0;
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                map.put(i++, line);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close even when readLine() failed part-way through.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return map;
    }

    /** Number of bytes needed to hold {@code bitArraySize} bits. */
    private int byteArraySize() {
        return (bitArraySize + 7) / 8;
    }

    /**
     * Demo: loads every line of a file into a filter and prints whether a
     * probe string is reported as present.
     *
     * @param args optional; {@code args[0]} overrides the input file path and
     *             {@code args[1]} overrides the probe string
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "C:\\Users\\zhf\\Desktop\\test.txt";
        String probe = args.length > 1 ? args[1] : "15";

        BloomFilter<String> bf = new BloomFilter<String>();
        for (String line : bf.readFile(path).values()) {
            bf.add(line);
        }
        System.out.println(bf.contains(probe));
    }
}