大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器

导读:
  原文:Bloom Filters in C#
  http://www.devsource.com/article2/0,1895,2113495,00.asp
  想象一下:你有一个非常大的无序数据集(URL 链接),并且要保证同一条链接不会在其它地方再次出现.
  这些数据是实时收集的,在不断增加的数据当中,你没有办法预先防止两个相同的 URL 出现.
  当数据量较少的时候,你可以轻松地创建一个 list(dictionary、hashtable 或者你自己的数据结构),然后遍历它,看这条链接是不是已经存在于这个 list 当中,
  不过遍历所花的时间是非常多的,
  但是 当这些数据的长度超过可用的内存的时候? hashtable可以帮我加快速度,但是存...
   上面的废话比较多,看起来累,又拗口,继续不下去了.
  得出的方案是用 bit数组来节省空间
  a hash table capable of holding m items will take m/8 bytes of memory (one bit per item)
  接下来是实际的解决方法
  .NET 里面有一个类 BitArray,它可以帮我们很容易地创建出一个 hashtable
  创建一个简单的 HashingTable
  
  
  /// <summary>
  /// A hash table that stores a single bit per slot: each string is mapped to
  /// one slot via <see cref="string.GetHashCode"/> and the slot's bit records
  /// whether any string hashing there has been added. Distinct strings that
  /// map to the same slot are indistinguishable, so a "present" answer may be
  /// a false positive.
  /// </summary>
  public class SimpleHashTable
  {
      // One bit per slot; true means some added string hashed to that slot.
      private BitArray hashbits = null;

      /// <summary>Creates a table with <paramref name="tableSize"/> one-bit slots, all clear.</summary>
      /// <param name="tableSize">Number of slots; must be positive.</param>
      /// <exception cref="ArgumentOutOfRangeException"><paramref name="tableSize"/> is zero or negative.</exception>
      public SimpleHashTable(int tableSize)
      {
          // A zero-sized table would make every lookup divide by zero.
          if (tableSize <= 0)
          {
              throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
          }

          hashbits = new BitArray(tableSize, false);
      }

      /// <summary>Reports whether the slot for <paramref name="str"/> is set.</summary>
      /// <param name="str">The string to look up.</param>
      /// <returns>true if the slot's bit is set; otherwise false.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
      public bool Test(string str)
      {
          return hashbits[SlotFor(str)];
      }

      /// <summary>Sets the slot for <paramref name="str"/> and reports its previous state.</summary>
      /// <param name="str">The string to add.</param>
      /// <returns>true if the slot was already set before this call; false if this call set it.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
      public bool Add(string str)
      {
          int hash = SlotFor(str);
          bool rslt = hashbits[hash];
          if (!rslt)
          {
              hashbits[hash] = true;
          }

          return rslt;
      }

      // Maps a string to a slot index in [0, hashbits.Count).
      private int SlotFor(string str)
      {
          if (str == null)
          {
              throw new ArgumentNullException("str");
          }

          // Take the remainder BEFORE the absolute value: Math.Abs(int.MinValue)
          // throws OverflowException, whereas the remainder is always strictly
          // between -Count and Count. For every other hash value the resulting
          // index equals Math.Abs(hash) % Count.
          return Math.Abs(str.GetHashCode() % hashbits.Count);
      }
  }
  Add:表中原先没有这个值(对应的位未被置位)时返回 false,并把该位置位;该位已被置位时返回 true.
  Test:对应的位已被置位时返回 true,否则返回 false.
  
  
  /// <summary>
  /// Driver that feeds every line of "urls.txt" into a SimpleHashTable and
  /// reports how often Add claims the line was already present.
  /// </summary>
  class Program
  {
      /// <summary>
      /// Reads "urls.txt" line by line, adds each line to a 1,000,000-slot
      /// table, and prints running and final collision statistics.
      /// </summary>
      /// <param name="args">Command-line arguments (not used).</param>
      static void Main(string[] args)
      {
          int urlsRead = 0;
          int collisions = 0;

          SimpleHashTable hashTable = new SimpleHashTable(1000000);

          using (StreamReader sr = new StreamReader("urls.txt"))
          {
              string url;
              while ((url = sr.ReadLine()) != null)
              {
                  urlsRead++;

                  // Add returns true when the slot was already occupied.
                  // NOTE(review): the count is reported as a false-positive
                  // rate, which assumes the input lines are all unique.
                  bool rslt = hashTable.Add(url);
                  if (rslt)
                  {
                      collisions++;
                  }

                  // Progress line every 10,000 inputs.
                  if ((urlsRead % 10000) == 0)
                  {
                      Console.WriteLine("{0} {1} {2}%",
                          urlsRead, collisions, 100 * (double)collisions / urlsRead);
                  }
              }
          }

          Console.WriteLine("{0} urls read", urlsRead);
          Console.WriteLine("{0} collisions", collisions);
          // With an empty input file the division is 0.0 / 0 and prints NaN.
          Console.WriteLine("False positive rate = {0}%",
              100 * (double)collisions / urlsRead);
      }
  }
  
  
  The output from that program, run against the 100,000 unique URLs in my file, is:
  10000 44 0.44%
  20000 187 0.935%
  30000 423 1.41%
  40000 753 1.8825%
  50000 1200 2.4%
  60000 1753 2.92166666666667%
  70000 2375 3.39285714285714%
  80000 3123 3.90375%
  90000 3946 4.38444444444444%
  100000 4834 4.834%
  100000 urls read
  4834 collisions
  False positive rate = 4.834%
  然后写了一个简单的测试类,我们可以看到它的碰撞(冲突)还是比较明显的
  
  接下来就是如何继续去解决这样的问题
  
  创建一个新的 Hash算法函数 HashString
  
  $h_i(x) = (f_1(x) + i \cdot f_2(x)) \bmod m$
  然后提供了一个减少碰撞的结构:hashKeys 保存每个字符串对应的几个位位置(下面的测试中是三个).
  
  
  /// <summary>
  /// A Bloom filter over strings. Each string is mapped to <c>nKeys</c> bit
  /// positions derived from two hash values; a string is reported as present
  /// only when all of its positions are set. Answers of "present" may be
  /// false positives.
  /// </summary>
  public class BloomFilter
  {
      // The bit table.
      private BitArray hashbits;

      // Number of bit positions computed per string.
      private int numKeys;

      // Scratch buffer holding the positions of the most recently hashed
      // string. It is overwritten by every Test/Add call.
      private int[] hashKeys;

      /// <summary>Creates an empty filter.</summary>
      /// <param name="tableSize">Number of bits in the table; must be positive.</param>
      /// <param name="nKeys">Number of bit positions per string; must be at least 1.</param>
      /// <exception cref="ArgumentOutOfRangeException">
      /// <paramref name="tableSize"/> is not positive or <paramref name="nKeys"/> is less than 1.
      /// </exception>
      public BloomFilter(int tableSize, int nKeys)
      {
          // A zero-sized table would make every hash computation divide by zero.
          if (tableSize <= 0)
          {
              throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
          }

          // CreateHashes always writes hashKeys[0], so at least one key is required.
          if (nKeys < 1)
          {
              throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
          }

          numKeys = nKeys;
          hashKeys = new int[numKeys];
          hashbits = new BitArray(tableSize);
      }

      // Second hash function: mixes each character into the accumulator with
      // shifts, adds and xors, then applies a final mixing step.
      private int HashString(string s)
      {
          int hash = 0;

          // Wrap-around on overflow is intended, so make it explicit in case
          // the project is compiled with overflow checking enabled.
          unchecked
          {
              for (int i = 0; i < s.Length; i++)
              {
                  hash += s[i];
                  hash += (hash << 10);
                  hash ^= (hash >> 6);
              }

              hash += (hash << 3);
              hash ^= (hash >> 11);
              hash += (hash << 15);
          }

          return hash;
      }

      // Fills hashKeys with the bit positions for str:
      // position i = |(hash1 + i * hash2) mod table size|, for i = 0 .. numKeys - 1.
      private void CreateHashes(string str)
      {
          if (str == null)
          {
              throw new ArgumentNullException("str");
          }

          int hash1 = str.GetHashCode();
          int hash2 = HashString(str);

          // The remainder is taken before Math.Abs, so the argument is always
          // strictly between -Count and Count and Math.Abs cannot overflow.
          hashKeys[0] = Math.Abs(hash1 % hashbits.Count);
          for (int i = 1; i < numKeys; i++)
          {
              hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % hashbits.Count);
          }
      }

      /// <summary>Reports whether <paramref name="str"/> may have been added.</summary>
      /// <param name="str">The string to look up.</param>
      /// <returns>false if any of the string's bits is clear; true if all are set.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
      public bool Test(string str)
      {
          CreateHashes(str);

          // Test each hash key. Return false if any one of the bits is not set.
          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  return false;
              }
          }

          // All bits set. The item is there.
          return true;
      }

      /// <summary>Sets all bits for <paramref name="str"/>.</summary>
      /// <param name="str">The string to add.</param>
      /// <returns>
      /// true if every bit was already set before this call (the string looked
      /// present); false if at least one bit had to be set.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
      public bool Add(string str)
      {
          // Initially assume that the item is in the table.
          bool rslt = true;
          CreateHashes(str);

          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  // One of the bits wasn't set, so show that the item
                  // wasn't in the table, and set that bit.
                  rslt = false;
                  hashbits[hash] = true;
              }
          }

          return rslt;
      }
  }
  测试:
  
  
  /// <summary>
  /// Driver that feeds every line of "urls.txt" into both a SimpleHashTable
  /// and a BloomFilter and prints their collision counts side by side.
  /// </summary>
  class Program
  {
      /// <summary>
      /// Reads "urls.txt" line by line, adds each line to a 1,000,000-slot
      /// SimpleHashTable and to a 480,833-bit, 3-key BloomFilter, and prints
      /// running and final statistics for both.
      /// </summary>
      /// <param name="args">Command-line arguments (not used).</param>
      static void Main(string[] args)
      {
          int urlsRead = 0;
          int hashCollisions = 0;
          int bloomCollisions = 0;

          SimpleHashTable hashTable = new SimpleHashTable(1000000);
          BloomFilter bloom = new BloomFilter(480833, 3);

          using (StreamReader sr = new StreamReader("urls.txt"))
          {
              string url;
              while ((url = sr.ReadLine()) != null)
              {
                  urlsRead++;

                  // Each Add returns true when the structure already
                  // considered the line present.
                  // NOTE(review): the counts are reported as false-positive
                  // rates, which assumes the input lines are all unique.
                  bool rslt = hashTable.Add(url);
                  if (rslt)
                  {
                      hashCollisions++;
                  }

                  rslt = bloom.Add(url);
                  if (rslt)
                  {
                      bloomCollisions++;
                  }

                  // Progress line every 10,000 inputs.
                  if ((urlsRead % 10000) == 0)
                  {
                      Console.WriteLine("{0} {1} {2}% {3} {4}%", urlsRead,
                          hashCollisions, 100 * (double)hashCollisions / urlsRead,
                          bloomCollisions, 100 * (double)bloomCollisions / urlsRead);
                  }
              }
          }

          // With an empty input file the rate divisions are 0.0 / 0 and print NaN.
          Console.WriteLine("{0} urls read", urlsRead);
          Console.WriteLine("{0} hash collisions", hashCollisions);
          Console.WriteLine("False positive rate (hash) = {0}%",
              100 * (double)hashCollisions / urlsRead);
          Console.WriteLine("{0} Bloom collisions", bloomCollisions);
          Console.WriteLine("False positive rate (Bloom) = {0}%",
              100 * (double)bloomCollisions / urlsRead);
      }
  }
  
  
  10000 44 0.44% 1 0.01%
  20000 187 0.935% 10 0.05%
  30000 423 1.41% 38 0.126666666666667%
  40000 753 1.8825% 118 0.295%
  50000 1200 2.4% 262 0.524%
  60000 1753 2.92166666666667% 517 0.861666666666667%
  70000 2375 3.39285714285714% 866 1.23714285714286%
  80000 3123 3.90375% 1352 1.69%
  90000 3946 4.38444444444444% 2118 2.35333333333333%
  100000 4834 4.834% 2966 2.966%
  100000 urls read
  4834 hash collisions
  False positive rate (hash) = 4.834%
  2966 Bloom collisions
  False positive rate (Bloom) = 2.966%
  在添加数据的时候,它会检查这个值对应的几个位置(本例中是 3 个):只要其中有一个位置还没有被置位,就把它置位,并认为这个值之前不存在;如果这几个位置都已经被置位了,那就发生碰撞(误判)了.
  实际总的容量是有限的.
  还有泛型和它的例子
  
  
  /// <summary>
  /// Generic Bloom filter base class. Derived classes supply two hash
  /// functions for <typeparamref name="TValue"/>; the base class combines
  /// them into <c>nKeys</c> bit positions per value and maintains the bit
  /// table. A value is reported as present only when all of its positions
  /// are set, so "present" answers may be false positives.
  /// </summary>
  /// <typeparam name="TValue">Type of the values stored in the filter.</typeparam>
  public abstract class BloomFilter<TValue>
  {
      // The bit table.
      private BitArray hashbits;

      // Number of bit positions computed per value.
      private int numKeys;

      // Scratch buffer holding the positions of the most recently hashed
      // value. It is overwritten by every Test/Add call.
      protected int[] hashKeys;

      /// <summary>Creates an empty filter.</summary>
      /// <param name="tableSize">Number of bits in the table; must be positive.</param>
      /// <param name="nKeys">Number of bit positions per value; must be at least 1.</param>
      /// <exception cref="ArgumentOutOfRangeException">
      /// <paramref name="tableSize"/> is not positive or <paramref name="nKeys"/> is less than 1.
      /// </exception>
      public BloomFilter(int tableSize, int nKeys)
      {
          // A zero-sized table would make every hash computation divide by zero.
          if (tableSize <= 0)
          {
              throw new ArgumentOutOfRangeException("tableSize", "Table size must be positive.");
          }

          // CreateHashes always writes hashKeys[0], so at least one key is required.
          if (nKeys < 1)
          {
              throw new ArgumentOutOfRangeException("nKeys", "At least one hash key is required.");
          }

          numKeys = nKeys;
          hashKeys = new int[numKeys];
          hashbits = new BitArray(tableSize);
      }

      /// <summary>Reports whether <paramref name="val"/> may have been added.</summary>
      /// <param name="val">The value to look up.</param>
      /// <returns>false if any of the value's bits is clear; true if all are set.</returns>
      public bool Test(TValue val)
      {
          CreateHashes(val);

          // Test each hash key. Return false if any one of the bits is not set.
          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  return false;
              }
          }

          // All bits set. The item is there.
          return true;
      }

      /// <summary>Sets all bits for <paramref name="val"/>.</summary>
      /// <param name="val">The value to add.</param>
      /// <returns>
      /// true if every bit was already set before this call (the value looked
      /// present); false if at least one bit had to be set.
      /// </returns>
      public bool Add(TValue val)
      {
          // Initially assume that the item is in the table.
          bool rslt = true;
          CreateHashes(val);

          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  // One of the bits wasn't set, so show that the item
                  // wasn't in the table, and set that bit.
                  rslt = false;
                  hashbits[hash] = true;
              }
          }

          return rslt;
      }

      /// <summary>
      /// Fills <see cref="hashKeys"/> with the bit positions for
      /// <paramref name="val"/>: position i = |(hash1 + i * hash2) mod table size|
      /// for i = 0 .. nKeys - 1.
      /// </summary>
      /// <param name="val">The value to hash.</param>
      protected virtual void CreateHashes(TValue val)
      {
          int hash1 = CreateHash1(val);
          int hash2 = CreateHash2(val);

          // The remainder is taken before Math.Abs, so the argument is always
          // strictly between -Count and Count and Math.Abs cannot overflow.
          hashKeys[0] = Math.Abs(hash1 % hashbits.Count);
          for (int i = 1; i < numKeys; i++)
          {
              hashKeys[i] = Math.Abs(unchecked(hash1 + (i * hash2)) % hashbits.Count);
          }
      }

      /// <summary>Computes the first hash of <paramref name="val"/>.</summary>
      protected abstract int CreateHash1(TValue val);

      /// <summary>Computes the second hash of <paramref name="val"/>.</summary>
      protected abstract int CreateHash2(TValue val);
  }

  /// <summary>
  /// Bloom filter for strings: the first hash is <see cref="string.GetHashCode"/>,
  /// the second is a character-by-character shift/add/xor mixing hash.
  /// </summary>
  class StringBloomFilter : BloomFilter<string>
  {
      /// <summary>Creates an empty string filter.</summary>
      /// <param name="tableSize">Number of bits in the table.</param>
      /// <param name="nKeys">Number of bit positions per string.</param>
      public StringBloomFilter(int tableSize, int nKeys)
          : base(tableSize, nKeys)
      {
      }

      /// <summary>First hash: the string's built-in hash code.</summary>
      /// <exception cref="ArgumentNullException"><paramref name="val"/> is null.</exception>
      protected override int CreateHash1(string val)
      {
          if (val == null)
          {
              throw new ArgumentNullException("val");
          }

          return val.GetHashCode();
      }

      /// <summary>
      /// Second hash: mixes each character into the accumulator with shifts,
      /// adds and xors, then applies a final mixing step.
      /// </summary>
      /// <exception cref="ArgumentNullException"><paramref name="val"/> is null.</exception>
      protected override int CreateHash2(string val)
      {
          if (val == null)
          {
              throw new ArgumentNullException("val");
          }

          int hash = 0;

          // Wrap-around on overflow is intended, so make it explicit in case
          // the project is compiled with overflow checking enabled.
          unchecked
          {
              for (int i = 0; i < val.Length; i++)
              {
                  hash += val[i];
                  hash += (hash << 10);
                  hash ^= (hash >> 6);
              }

              hash += (hash << 3);
              hash ^= (hash >> 11);
              hash += (hash << 15);
          }

          return hash;
      }
  }
  
  
  
  
  让我们的蜘蛛跑得更快吧
  Google 的《数学之美》那边也介绍到了.
  http://googlechinablog.com/2007/07/bloom-filter.html

本文转自
http://www.cnblogs.com/lovebanyi/archive/2007/07/06/808736.html
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值