The most common use of a BloomFilter is to test whether an element is in a set. Its two key methods are add() and contains(). Its defining property is that it never produces a false negative: if contains() returns false, the element is definitely not in the set. It can, however, produce a small number of false positives: if contains() returns true, the element is only probably in the set.
We can therefore store the keys of the small table in a BloomFilter and use it to filter the large table in the map phase. A few records whose keys are not in the small table may slip through unfiltered (records whose keys are in the small table will never be filtered out), but that is fine; it merely adds a small amount of extra network I/O.
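As a minimal standalone sketch of this behavior (the keys and sizes here are illustrative, not from the original text), Hadoop's own org.apache.hadoop.util.bloom.BloomFilter exposes add() and membershipTest(), the latter playing the role of contains():

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class BloomFilterSketch {
    public static void main(String[] args) {
        // 1,000,000 bits and 6 hash functions, matching the job later in this post.
        BloomFilter filter = new BloomFilter(1000000, 6, Hash.JENKINS_HASH);

        // Pretend these are the join keys of the small table.
        for (String k : new String[] {"1001", "1002", "1003"}) {
            filter.add(new Key(k.getBytes()));
        }

        // Keys that were added always test true (no false negatives).
        System.out.println(filter.membershipTest(new Key("1001".getBytes()))); // true

        // A key that was never added usually tests false, but may
        // occasionally test true (a false positive).
        System.out.println(filter.membershipTest(new Key("9999".getBytes())));
    }
}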
A Bloom filter is a very space-efficient probabilistic data structure: it represents a set compactly with a bit array and can test whether an element belongs to that set. Its main advantage is that its size (the number of bits) is a constant fixed at initialization. Adding more elements to a Bloom filter does not grow it; it only raises the false-positive probability, and that probability stays very small.
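The false-positive probability follows the standard estimate (1 - e^(-kn/m))^k for m bits, k hash functions and n inserted elements. The snippet below evaluates it for the parameters used in the job later in this post, with an assumed element count of 100,000:

public class FalsePositiveEstimate {
    public static void main(String[] args) {
        int m = 1000000;  // bits in the filter (fixed at construction)
        int k = 6;        // number of hash functions
        int n = 100000;   // elements inserted (assumed, for illustration)

        // Standard false-positive estimate: (1 - e^(-k*n/m))^k
        double p = Math.pow(1 - Math.exp(-(double) k * n / m), k);
        System.out.println("estimated false-positive rate: " + p); // below 1% for these numbers
    }
}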
For more background on Bloom filters, see: http://blog.csdn.net/jiaomeng/article/details/1495500
Implementing a Bloom filter:
The following is taken from Hadoop in Action; it is simple and intuitive.
When merging two sets, the union() method provides a neat implementation: each mapper builds a Bloom filter from its own data split, and all of these Bloom filters are sent to a single reducer, which merges them and writes the final output. Because the Bloom filters are shuffled along with the mapper output, the Bloom filter class must implement the Writable interface, i.e. the write() and readFields() methods. These methods convert between the internal BitSet representation and a byte array so that the data can be serialized to DataInput/DataOutput. (Adapted from Hadoop in Action.)
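A small round-trip outside MapReduce illustrates this Writable contract; it is only a sketch of write()/readFields(), not code from the book:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class BloomFilterRoundTrip {
    public static void main(String[] args) throws IOException {
        BloomFilter original = new BloomFilter(1000000, 6, Hash.JENKINS_HASH);
        original.add(new Key("1001".getBytes()));

        // Serialize: Writable.write() turns the internal bit vector into bytes.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize: readFields() rebuilds an equivalent filter from those bytes.
        BloomFilter copy = new BloomFilter(1000000, 6, Hash.JENKINS_HASH);
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy.membershipTest(new Key("1001".getBytes()))); // true
    }
}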
A MapReduce program that generates the Bloom filter:
package com.hadoop.datajoin.bloomfilter;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class BloomFilterDemo extends Configured implements Tool {

    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, BloomFilter> {

        // One local Bloom filter per mapper: 1,000,000 bits, 6 hash functions.
        BloomFilter bloomFilter = new BloomFilter(1000000, 6, Hash.JENKINS_HASH);
        OutputCollector<Text, BloomFilter> ct = null;

        @Override
        public void map(Text key, Text value,
                OutputCollector<Text, BloomFilter> output, Reporter reporter)
                throws IOException {
            if (ct == null) {
                ct = output;  // remember the collector so close() can emit the filter
            }
            System.out.println(key.toString());
            bloomFilter.add(new Key(key.toString().getBytes()));
        }

        @Override
        public void close() {
            try {
                // Emit the whole filter once, after all records of this split are added.
                ct.collect(new Text("testKey"), bloomFilter);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, BloomFilter, Text, Text> {

        JobConf job = null;
        BloomFilter bf = new BloomFilter(1000000, 6, Hash.JENKINS_HASH);

        @Override
        public void configure(JobConf job) {
            this.job = job;
        }

        @Override
        public void reduce(Text key, Iterator<BloomFilter> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            // Union all per-mapper filters into a single filter.
            while (values.hasNext()) {
                bf.or(values.next());
            }
        }

        @Override
        public void close() throws IOException {
            // Write the merged filter to HDFS next to the job output.
            Path file = new Path(job.get("mapred.output.dir") + "/bloomfilter");
            FSDataOutputStream out = file.getFileSystem(job).create(file);
            bf.write(out);
            out.close();
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        JobConf job = new JobConf(conf, BloomFilterDemo.class);
        Path in = new Path("/user/Administrator/input7");
        Path out = new Path("/user/Administrator/output7");
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setJobName("BloomFilterDemo");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);
        job.setNumReduceTasks(1);
        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BloomFilter.class);
        job.set("key.value.separator.in.input.line", ",");
        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) {
        int res;
        try {
            res = ToolRunner.run(new Configuration(), new BloomFilterDemo(), args);
            System.exit(res);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
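To complete the picture sketched at the top of this post, the follow-up job over the large table would load the /bloomfilter file in its mapper and drop records whose keys cannot be in the small table. The class below is only an assumed sketch of that second job's mapper; the path and key/value types are illustrative, not from the original post:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class FilteringMapper extends MapReduceBase
        implements Mapper<Text, Text, Text, Text> {

    private BloomFilter filter = new BloomFilter(1000000, 6, Hash.JENKINS_HASH);

    @Override
    public void configure(JobConf job) {
        try {
            // Load the filter written by BloomFilterDemo's reducer (path is illustrative).
            Path path = new Path("/user/Administrator/output7/bloomfilter");
            FSDataInputStream in = FileSystem.get(job).open(path);
            filter.readFields(in);
            in.close();
        } catch (IOException e) {
            throw new RuntimeException("could not load Bloom filter", e);
        }
    }

    @Override
    public void map(Text key, Text value,
            OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        // Keep only large-table records whose key might exist in the small table;
        // a few false positives may pass, but no small-table key is ever dropped.
        if (filter.membershipTest(new Key(key.toString().getBytes()))) {
            output.collect(key, value);
        }
    }
}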