大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器

最新推荐文章于 2023-05-17 16:52:11 发布

rcyl2003

最新推荐文章于 2023-05-17 16:52:11 发布

阅读量1.2k

点赞数

分类专栏：网络文章标签： filter string class url null 数据结构

网络专栏收录该内容

28 篇文章 0 订阅

订阅专栏

大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器

原文:Bloom Filters in C#
http://www.devsource.com/article2/0,1895,2113495,00.asp

想象一下：如果你有一个非常大的无序数据集（URL 链接），并且要保证同一条链接不会在其中再次出现。
你是实时收集这些数据的，因此在不断增长的数据当中，没有办法预先防止两个相同的 URL 出现。

当数据量较少的时候，你可以轻松地创建一个 list（dictionary、hashtable 或者你自己的数据结构），然后遍历它，看某条链接是不是已经存在于这个 list 当中，
不过遍历所花的时间是非常多的。
但是当这些数据的大小超过可用内存的时候呢？hashtable 可以帮我们加快查找速度，但是存...

上面的废话比较多，看起来太累，就不继续逐句翻译下去了。

得出的方案是用 bit数组来节省空间
a hash table capable of holding M items will take M/8 bytes of memory
接下来是实际的解决方法
.NET 里面有一个类 BitArray，它可以帮我们很容易地创建出这样一个 hashtable

创建一个简单的 HashingTable

/// <summary>
/// A single-hash bit table: every string maps to exactly one bit of a
/// fixed-size <see cref="BitArray"/>. Two different strings can share a bit,
/// so a "present" answer may be a false positive; an "absent" answer is exact.
/// </summary>
public class SimpleHashTable
{
    private readonly BitArray hashbits;

    /// <summary>Creates a table of <paramref name="tableSize"/> bits, all cleared.</summary>
    /// <param name="tableSize">Number of bits in the table; must be at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="tableSize"/> is less than 1.
    /// </exception>
    public SimpleHashTable(int tableSize)
    {
        // A zero-sized table would make every later lookup divide by zero.
        if (tableSize < 1)
        {
            throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
        }

        hashbits = new BitArray(tableSize, false);
    }

    /// <summary>Reports whether the bit for <paramref name="str"/> is set.</summary>
    /// <param name="str">The string to look up.</param>
    /// <returns><c>true</c> if the bit is set; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public bool Test(string str)
    {
        return hashbits[IndexOf(str)];
    }

    /// <summary>Sets the bit for <paramref name="str"/>.</summary>
    /// <param name="str">The string to record.</param>
    /// <returns>
    /// <c>true</c> if the bit was already set before this call (a collision or a
    /// repeat); <c>false</c> if this call set it.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public bool Add(string str)
    {
        int index = IndexOf(str);
        bool alreadySet = hashbits[index];
        hashbits[index] = true;
        return alreadySet;
    }

    // Maps a string to its bit index in [0, hashbits.Count).
    private int IndexOf(string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        // Widen to long before taking the absolute value: Math.Abs(int) throws
        // OverflowException for int.MinValue, which GetHashCode may return.
        long magnitude = Math.Abs((long)str.GetHashCode());
        return (int)(magnitude % hashbits.Count);
    }
}

Add：如果表里原来没有这个值（对应的位尚未置位），返回 false 并把该位置位；否则返回 true。
Test：如果表里（可能）有这个值，即对应的位已经置位，返回 true；否则返回 false。

/// <summary>
/// Demo driver: feeds every line of "urls.txt" into a <c>SimpleHashTable</c> and
/// reports how many lines landed on a bit that was already set.
/// </summary>
class Program
{
    /// <summary>
    /// Reads "urls.txt" line by line, prints a progress row every 10000 lines,
    /// then prints the totals once after the whole file has been read.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        int urlsRead = 0;
        int collisions = 0;

        SimpleHashTable hashTable = new SimpleHashTable(1000000);

        using (StreamReader sr = new StreamReader("urls.txt"))
        {
            string url;
            while ((url = sr.ReadLine()) != null)
            {
                urlsRead++;

                // Add returns true when the bit was already set.
                bool rslt = hashTable.Add(url);
                if (rslt)
                {
                    collisions++;
                }

                if ((urlsRead % 10000) == 0)
                {
                    Console.WriteLine("{0} {1} {2}%",
                        urlsRead, collisions, 100*
                        (double)collisions / urlsRead);
                }
            }
        }

        // Summary is printed once, after the loop and the reader are closed.
        Console.WriteLine("{0} urls read", urlsRead);
        Console.WriteLine("{0} collisions", collisions);
        Console.WriteLine("False positive rate = {0}%",
            100*(double)collisions / urlsRead);
    }
}

The output from that program, run against the 100,000 unique URLs in my file, is:

 10000  44  0.44%
20000  187  0.935%
30000  423  1.41%
40000  753  1.8825%
50000  1200  2.4%
60000  1753  2.92166666666667%
70000  2375  3.39285714285714%
80000  3123  3.90375%
90000  3946  4.38444444444444%
100000  4834  4.834%
100000 urls read
4834 collisions
False positive rate = 4.834%
 然后写了一个简单的测试类,我们可以看到它的碰撞(冲突)还是比较明显的
 
接下来就是如何继续去解决这样的问题
创建一个新的 Hash算法函数 HashString
$h_i(x) = \left(f_1(x) + i \cdot f_2(x)\right) \bmod m$
 然后提供了一个 防止碰撞的结构. hashkeys 保存这个hash的三个位置
 
  
   
    
  /// <summary>
  /// A Bloom filter over strings. Each string is mapped to <c>nKeys</c> bit
  /// positions derived from two hashes (position i = hash1 + i * hash2, modulo
  /// the table size). A string is reported as present only when all of its
  /// positions are set, so false positives are possible.
  /// </summary>
  public class BloomFilter
  {
      private readonly BitArray hashbits;
      private readonly int numKeys;

      // Scratch buffer refilled by CreateHashes on every Test/Add call.
      private readonly int[] hashKeys;

      /// <summary>Creates an empty filter.</summary>
      /// <param name="tableSize">Number of bits in the table; must be at least 1.</param>
      /// <param name="nKeys">Number of bit positions per item; must be at least 1.</param>
      /// <exception cref="ArgumentOutOfRangeException">
      /// <paramref name="tableSize"/> or <paramref name="nKeys"/> is less than 1.
      /// </exception>
      public BloomFilter(int tableSize, int nKeys)
      {
          // tableSize 0 would divide by zero in CreateHashes; nKeys 0 would leave
          // hashKeys empty and make the first lookup index out of range.
          if (tableSize < 1)
          {
              throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
          }
          if (nKeys < 1)
          {
              throw new ArgumentOutOfRangeException("nKeys", "Number of keys must be at least 1.");
          }

          numKeys = nKeys;
          hashKeys = new int[numKeys];
          hashbits = new BitArray(tableSize);
      }

      // Second hash: per-character add/shift/xor mixing followed by a final
      // avalanche step. Overflow is intentional, hence the unchecked block.
      private int HashString(string s)
      {
          unchecked
          {
              int hash = 0;

              for (int i = 0; i < s.Length; i++)
              {
                  hash += s[i];
                  hash += (hash << 10);
                  hash ^= (hash >> 6);
              }
              hash += (hash << 3);
              hash ^= (hash >> 11);
              hash += (hash << 15);
              return hash;
          }
      }

      // Fills hashKeys with the numKeys bit positions for str.
      private void CreateHashes(string str)
      {
          if (str == null)
          {
              throw new ArgumentNullException("str");
          }

          int hash1 = str.GetHashCode();
          int hash2 = HashString(str);

          unchecked
          {
              // i == 0 yields hash1 alone, so one loop covers every key.
              // Abs is applied to the remainder, whose magnitude is below
              // hashbits.Count, so it cannot overflow.
              for (int i = 0; i < numKeys; i++)
              {
                  hashKeys[i] = Math.Abs((hash1 + (i * hash2))
                      % hashbits.Count);
              }
          }
      }

      /// <summary>Reports whether <paramref name="str"/> may have been added.</summary>
      /// <param name="str">The string to look up.</param>
      /// <returns>
      /// <c>false</c> if any of the item's bits is clear; <c>true</c> if all are set.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
      public bool Test(string str)
      {
          CreateHashes(str);
          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  return false;
              }
          }
          return true;
      }

      /// <summary>Sets all bits for <paramref name="str"/>.</summary>
      /// <param name="str">The string to record.</param>
      /// <returns>
      /// <c>true</c> if every bit was already set before this call; <c>false</c>
      /// if at least one bit had to be set.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
      public bool Add(string str)
      {
          // Assume present until a clear bit proves otherwise.
          bool rslt = true;
          CreateHashes(str);
          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  rslt = false;
                  hashbits[hash] = true;
              }
          }
          return rslt;
      }
  }
 
 测试:
 
  
   
    
  /// <summary>
  /// Demo driver that runs the same URL list through a <c>SimpleHashTable</c>
  /// and a <c>BloomFilter</c> and prints their collision counts side by side.
  /// </summary>
  class Program
  {
      /// <summary>
      /// Reads "urls.txt" line by line, prints a progress row every 10000 lines,
      /// then prints the totals for both structures.
      /// </summary>
      /// <param name="args">Command-line arguments; not used.</param>
      static void Main(string[] args)
      {
          SimpleHashTable hashTable = new SimpleHashTable(1000000);
          BloomFilter bloom = new BloomFilter(480833, 3);

          int urlsRead = 0;
          int hashCollisions = 0;
          int bloomCollisions = 0;

          using (StreamReader reader = new StreamReader("urls.txt"))
          {
              for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
              {
                  urlsRead++;

                  // Add returns true when the structure already considered the line present.
                  if (hashTable.Add(line))
                  {
                      hashCollisions++;
                  }
                  if (bloom.Add(line))
                  {
                      bloomCollisions++;
                  }

                  if (urlsRead % 10000 != 0)
                  {
                      continue;
                  }

                  Console.WriteLine("{0}  {1}  {2}%  {3}  {4}%", urlsRead,
                      hashCollisions, Percent(hashCollisions, urlsRead),
                      bloomCollisions, Percent(bloomCollisions, urlsRead));
              }
          }

          Console.WriteLine("{0} urls read", urlsRead);
          Console.WriteLine("{0} hash collisions", hashCollisions);
          Console.WriteLine("False positive rate (hash) = {0}%",
              Percent(hashCollisions, urlsRead));
          Console.WriteLine("{0} Bloom collisions", bloomCollisions);
          Console.WriteLine("False positive rate (Bloom) = {0}%",
              Percent(bloomCollisions, urlsRead));
      }

      // Share of hits in total as a percentage, computed in double precision.
      private static double Percent(int hits, int total)
      {
          return 100 * (double)hits / total;
      }
  }
  
 
 
 10000  44  0.44%  1  0.01%
20000  187  0.935%  10  0.05%
30000  423  1.41%  38  0.126666666666667%
40000  753  1.8825%  118  0.295%
50000  1200  2.4%  262  0.524%
60000  1753  2.92166666666667%  517  0.861666666666667%
70000  2375  3.39285714285714%  866  1.23714285714286%
80000  3123  3.90375%  1352  1.69%
90000  3946  4.38444444444444%  2118  2.35333333333333%
100000  4834  4.834%  2966  2.966%
100000 urls read
4834 hash collisions
False positive rate (hash) = 4.834%
2966 Bloom collisions
False positive rate (Bloom) = 2.966% 
 在添加数据的时候，它会先算出这条数据对应的几个位置（hashKeys），把其中还没有被置位的位置全部置位；只有当这几个位置在添加之前就已经全部被占用时，才算发生了碰撞（也就是误判为“已经存在”）。
实际总的容量是有限的.
还有泛型和它的例子 
  
   
  /// <summary>
  /// Base class for a Bloom filter over values of type <typeparamref name="TValue"/>.
  /// Subclasses supply two hash functions; bit position i is derived as
  /// hash1 + i * hash2, modulo the table size.
  /// </summary>
  /// <typeparam name="TValue">Type of the items stored in the filter.</typeparam>
  public abstract class BloomFilter<TValue>
  {
      private readonly BitArray hashbits;
      private readonly int numKeys;

      // Scratch buffer refilled by CreateHashes on every Test/Add call.
      protected int[] hashKeys;

      /// <summary>Creates an empty filter.</summary>
      /// <param name="tableSize">Number of bits in the table; must be at least 1.</param>
      /// <param name="nKeys">Number of bit positions per item; must be at least 1.</param>
      /// <exception cref="ArgumentOutOfRangeException">
      /// <paramref name="tableSize"/> or <paramref name="nKeys"/> is less than 1.
      /// </exception>
      public BloomFilter(int tableSize, int nKeys)
      {
          // tableSize 0 would divide by zero in CreateHashes; nKeys 0 would leave
          // hashKeys empty and make the first lookup index out of range.
          if (tableSize < 1)
          {
              throw new ArgumentOutOfRangeException("tableSize", "Table size must be at least 1.");
          }
          if (nKeys < 1)
          {
              throw new ArgumentOutOfRangeException("nKeys", "Number of keys must be at least 1.");
          }

          numKeys = nKeys;
          hashKeys = new int[numKeys];
          hashbits = new BitArray(tableSize);
      }

      /// <summary>Reports whether <paramref name="val"/> may have been added.</summary>
      /// <param name="val">The value to look up.</param>
      /// <returns>
      /// <c>false</c> if any of the value's bits is clear; <c>true</c> if all are set.
      /// </returns>
      public bool Test(TValue val)
      {
          CreateHashes(val);
          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  return false;
              }
          }
          return true;
      }

      /// <summary>Sets all bits for <paramref name="val"/>.</summary>
      /// <param name="val">The value to record.</param>
      /// <returns>
      /// <c>true</c> if every bit was already set before this call; <c>false</c>
      /// if at least one bit had to be set.
      /// </returns>
      public bool Add(TValue val)
      {
          // Assume present until a clear bit proves otherwise.
          bool rslt = true;
          CreateHashes(val);
          foreach (int hash in hashKeys)
          {
              if (!hashbits[hash])
              {
                  rslt = false;
                  hashbits[hash] = true;
              }
          }
          return rslt;
      }

      /// <summary>Fills <c>hashKeys</c> with the bit positions for <paramref name="val"/>.</summary>
      /// <param name="val">The value to hash.</param>
      protected virtual void CreateHashes(TValue val)
      {
          int hash1 = CreateHash1(val);
          int hash2 = CreateHash2(val);

          unchecked
          {
              // i == 0 yields hash1 alone, so one loop covers every key.
              // Abs is applied to the remainder, whose magnitude is below
              // hashbits.Count, so it cannot overflow.
              for (int i = 0; i < numKeys; i++)
              {
                  hashKeys[i] = Math.Abs((hash1 + (i * hash2)) %
                      hashbits.Count);
              }
          }
      }

      /// <summary>Returns the first hash of <paramref name="val"/>.</summary>
      protected abstract int CreateHash1(TValue val);

      /// <summary>Returns the second hash of <paramref name="val"/>, used as the step between keys.</summary>
      protected abstract int CreateHash2(TValue val);
  }
  

 
 
   
  
   
  /// <summary>
  /// Bloom filter for strings: the first hash is <see cref="string.GetHashCode()"/>,
  /// the second is a per-character add/shift/xor mixing hash.
  /// </summary>
  class StringBloomFilter : BloomFilter<string>
  {
      /// <summary>Creates an empty string filter.</summary>
      /// <param name="tableSize">Number of bits in the table.</param>
      /// <param name="nKeys">Number of bit positions per item.</param>
      public StringBloomFilter(int tableSize, int nKeys)
          : base(tableSize, nKeys)
      {
      }

      /// <summary>Returns the built-in hash code of <paramref name="val"/>.</summary>
      protected override int CreateHash1(string val)
      {
          return val.GetHashCode();
      }

      /// <summary>
      /// Returns a second hash of <paramref name="val"/>, computed by mixing each
      /// character into an accumulator and applying a final avalanche step.
      /// </summary>
      protected override int CreateHash2(string val)
      {
          // Overflow is part of the mixing, so keep it explicit even when the
          // project is compiled with overflow checking enabled.
          unchecked
          {
              int hash = 0;

              for (int i = 0; i < val.Length; i++)
              {
                  hash += val[i];
                  hash += (hash << 10);
                  hash ^= (hash >> 6);
              }
              hash += (hash << 3);
              hash ^= (hash >> 11);
              hash += (hash << 15);
              return hash;
          }
      }
  }
  

 
 
 让我们的蜘蛛跑得更快吧
Google《数学之美》那边也介绍到了。

rcyl2003

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器

大数据量的过滤 (用于爬虫,蜘蛛) Bloom Filter 布隆过滤器原文:Bloom Filters in C#http://www.devsource.com/article2/0,1895,2113495,00.asp想像一下.如果你有一个非常大的无序的数据(url连接) 并且你要保证同样的一条连接不会在其它地方再次出现你实时的收集哪些数据,你没有办法来预防两个相同的url出现,再不
复制链接

扫一扫

专栏目录