HashTable，位图，BloomFilter分析（简单粗暴）

最新推荐文章于 2024-03-12 13:58:26 发布

angry_youth

最新推荐文章于 2024-03-12 13:58:26 发布

阅读量711

点赞数 1

分类专栏：数据结构 c++ 文章标签： hashtable 位图数据结构散列函数

本文链接：https://blog.csdn.net/angry_youth/article/details/76945553

版权

数据结构同时被 2 个专栏收录

20 篇文章 0 订阅

订阅专栏

c++

14 篇文章 0 订阅

订阅专栏

HashTable:（散列表/哈希表）

**
是根据关键字key而直接访问在内存存储位置的数据结构；它通过一个关键值的函数将所需的数据映射到表中的位置来访问数据，这个映射函数叫做散列函数，存放记录的数组叫做散列表；
构造哈希表的几种方法：
1)直接定址法：取关键字的某个线性函数为散列地址；（但数值之间相差较大，该方法不可用）
2)除留余数法：取关键值被某个不大于散列表长m的数p除后的所得的余数为散列地址；（会出现不同值会映射到相同的位置—哈希冲突）
处理哈西冲突的闭散列方法—–开放定址法
a)线性探测:一个位置有了就往后放，直到遇到空后放在空位置（且表示没有该值）
访问h+1 -> h+2 -> h+3…
b)二次探测：+平方
访问h+1^2 -> h+2^2 -> h+3^3 ->…
h = 插入的值%散列表长度（得到数组的下标）
负载因子=元素个数/散列表长度
对于“开放定址法”，负载因子决定着重要作用，当负载因子达到某一个固定值得时候就要扩增散列表的长度，这样才能缓解哈希冲突；
使用素数作为哈希表的容量可以减少哈希冲突；
处理哈希冲突的方法：开链法/拉链法（哈希桶）

3)平方取中法;
4).折叠法;
5).随机数法;
6).数学分析法;

代码示例：


#include<vector>
#include<string>

//线性探测和二次方探测

//enum State
//{
//  EXIST,DELETE,EMPTY
//};
//
//定义一个外部函数,创建一个素数表，每次扩容的大小从这里面选取
//当素数做除数的时候，产生的哈希冲突会减少
static size_t GetPrime(size_t cap)
{
    const int _PriListLen = 28;
    static const unsigned long _PriList[_PriListLen] = 
    {
        53ul,         97ul,         193ul,       389ul,       769ul,
        1543ul,       3079ul,       6151ul,      12289ul,     24593ul,
        49157ul,      98317ul,      196613ul,    393241ul,    786433ul,
        1572869ul,    3145739ul,    6291469ul,   12582917ul,  25165843ul,
        50331653ul,   100663319ul,  201326611ul, 402653189ul, 805306457ul,
        1610612741ul, 3221225473ul, 4294967291ul
    };
    for(size_t i = 0; i < _PriListLen; i++)
    {
        if(_PriList[i] > cap)
        {
            return _PriList[i];
        }
    }
    return _PriList[_PriListLen-1];
}

//
//template <class K,class V>
//struct HashNode
//{
//  pair<K,V> _kv;
//  State _state;
//
//  HashNode(const K& key,const V& value)
//      :_kv(key,value),_state(EMPTY)
//  {}
//  HashNode()
//      :_state(EMPTY)
//  {}
//};
//
//template <class K>
//struct _GetFunc
//{
//  size_t operator()(const K& key)
//  {
//      return key;
//  }
//};
//
如果是字符创作为键值的话应该重新定义
//template <class K = string>
//struct _GetFuncString
//{
//  static size_t _getfunc(const char* key)
//  {
//      signed int seek = 131;//31 131 1313 13131 131313 
//      signed int hash = 0;
//      while(*key)
//      {
//          hash = hash*seek + (*key++);
//      }
//      //类似于整数“1234”的保存方式是1*10^3+2*10^2+3*10^1+4*10^0这样的把字符拆开一个一个算总和存储
//      return (hash & 0x7FFFFFFF);
//  }
//  size_t operator()(const string& key)
//  {
//      return _getfunc(key.c_str());
//  }
//};
//
//
//template <class K,class V,class _GetFunc = _GetFunc<K>>
//class HashTable
//{
//public: 
//  typedef HashNode<K,V> Node;
//protected:
//  vector<HashNode<K,V>> _table;
//  size_t _size;//表示实际元素的个数
//public:
//  HashTable()
//      :_size(0)
//  {}
//  
//  //获取key键值，求出映射的下标
//  size_t GetFunc(const K& kv)
//  {
//      _GetFunc hash;
//      return hash(kv) % _table.size();
//  }
//  //插入一个值
//  //当节点状态是“空/删除”状态时是可以插入节点的
//  bool Insert(pair<K,V>& kv)
//  {
//      _checkCapacity();//检查容量
//      size_t index = GetFunc(kv.first);
//      size_t i = 1;
//      while(_table[index]._state == EXIST)
//      {
//          if(_table[index]._kv.first == kv.first)
//          {
//              return false;//不允许出现相同的键值
//          }
//          //二次方探测
//          index += i * i;
//          index = index % _table.size();
//          ++i;
//          //线性探测
//          /*++index;
//          if(index == _table.size())
//          {
//              index = 0;
//          }*/
//      }
//      _table[index]._kv = kv;
//      _table[index]._state = EXIST;
//      ++_size;//哈希表中的元素个数增加
//      return true;
//  }
//  //查找某个节点
//  //只有当节点状态是存在时我们的查找才有效
//  Node* Find(const K& key)
//  {
//      size_t index = GetFunc(key);
//      //如果遇到空位/删除还是没有找到，则说明该节点不存在
//      while(_table[index]._state == EXIST)
//      {
//          if(_table[index]._kv.first == key)
//          {
//              return &_table[index];
//          }
//          ++index;
//          if(index == _table.size())
//          {
//              index = 0;
//          }
//      }
//      return NULL;
//  }
//  //删除一个节点，对于数组来讲删除一个节点话费太大，
//  //不能真的删除，我们只需要将节点的状态置为DELETE,_table的大小--即可，下次就可以插入新节点
//  bool Remove(const K& key)
//  {
//      if(_size == 0)
//      {
//          return false;
//      }
//      Node* cur = Find(key);
//      if(cur)
//      {
//          cur->_state = DELETE;
//          --_size;
//          return true;
//      }
//      return false;
//      /*size_t index = GetFunc(key);
//      while(_table[index]._state == EXIST)
//      {
//          if(_table[index]._kv.first == key)
//          {
//              _table[index]._state = DELETE;
//              --_size;
//              return true;
//          }
//          ++index;
//          if(index == _table.size())
//          {
//              index = 0;
//          }
//      }*/
//  }
//
//private:
//  void _checkCapacity()
//  {
//      if(_table.empty())
//      {
//          _table.resize(GetPrime(0));
//          return;
//      }
//      //检查是否需要扩容
//      if(((_size*10)/_table.size()) >= 7)//负载因子设定为0.7
//      {
//          //需要开始扩容
//          size_t newSize = GetPrime(_table.size());
//          HashTable<K,V> newHash;
//          newHash._table.resize(newSize);
//          for(size_t i = 0; i < _table.size(); i++)
//          {
//              if(_table[i]._state == EXIST)
//              {
//                  newHash.Insert(_table[i]._kv);
//              }
//          }
//          _table.swap(newHash._table);
//
//      }
//  }
//};
//




//开链法（哈希桶）

template <class K,class V>
struct HashNode
{
    pair<K,V> _kv;
    HashNode<K,V>* _next;//指向下一个节点

    HashNode(const pair<K,V>& kv)
        :_kv(kv),_next(NULL)
    {}
};


template<class K>
struct _GetFunc
{
    size_t operator()(const K& key)
    {
        return key;
    }
};


struct _GetFuncString
{
    size_t _GetFunc(const char* key)
    {
        signed int seek = 131;
        signed int hash = 0;
        while(*key)
        {
            hash = hash*seek + (*key++);
        }
        return (hash & 0x7FFFFFFF);
    }

    size_t operator()(const string key)
    {
        return _GetFunc(key.c_str());
    }
};

template<class K, class V, class GetFunc>
class HashTable;//先声明一下

template <class K,class V,class GetFunc=_GetFunc<K>>
struct HashTableIterator
{
    typedef HashNode<K,V> Node;
    typedef HashTableIterator<K,V,GetFunc> Self;
    typedef HashTable<K,V,GetFunc> HashTable;
protected:
    Node* _node;
    HashTable* _hash;

    size_t getFunc(const K& key)
    {
        GetFunc hash;
        return hash(key) % (_hash->_table.size());
    }

public:
    HashTableIterator(Node* node,HashTable* hash)
        :_node(node),_hash(hash)
    {}
    Self& operator=(const Self& h)
    {
        if(this != &h)
        {
            this->_node = h._node;
            this->_hash = h._hash;
        }
        return *this;
    }

    pair<K,V>& operator*()
    {
        return _node->_kv;
    }
    pair<K,V>* operator->()
    {
        return &_node->_kv;
    }
    Self& operator++()//前置++
    {
        if(_node->_next)
        {
            _node = _node->_next;
        }
        else//可能是某个位置的最后一个结点了
        {
            size_t index = getFunc(_node->_kv.first);
            _node = NULL;
            for(size_t i = index+1; i < (_hash->_table.size()); i++)
            {
                if(_hash->_table[i])
                {
                    _node = _hash->_table[i];
                    break;
                }
            }
        }
        return *this;
    }
    Self operator++(int)
    {
        Self cur = *this;
        ++(*this);
        return cur;
    }
    bool operator!=(const Self& h) const 
    {
        return _node != h._node;
    }

};

template <class K,class V,class GetFunc=_GetFuncString>
class HashTable
{
public:
    typedef HashNode<K,V> Node;
    typedef HashTableIterator<K,V,GetFunc> Iterator;
    friend struct HashTableIterator<K,V,GetFunc>;//因为_table是保护成员，外部对象不可以直接访问
protected:
    vector<HashNode<K,V>*> _table;//数组中放的应该是节点
    size_t _size;//有效数据个数
public:
    HashTable()
        :_size(0)
    {}
    ~HashTable()
    {
        for(size_t i = 0; i < _table.size(); i++)
        {
            Node* del = _table[i];
            while(del)
            {
                Node* next = del->_next;
                delete del;
                del = next;
            }
            _table[i] = NULL;
        }
        /*_size = 0;*/
    }
    //获取下标
    size_t getFunc(const K& key)
    {
        GetFunc hash;
        return hash(key) % _table.size();
    }
    //插入数据
    bool Insert(const pair<K,V>& kv)
    {
        //先检查容量
        _checkCapacity();
        //求下标
        size_t index = getFunc(kv.first);
        //头插，先判断插入的结点是否有效
        Node* cur = _table[index];
        while(cur)
        {
            if(cur->_kv.first == kv.first)
            {
                return false;
            }
            cur = cur->_next;
        }
        cur = new Node(kv);
        cur->_next = _table[index];
        _table[index] = cur;
        ++_size;
        return true;
    }
    pair<Iterator,bool> InsertPair(const pair<K,V>& kv)
    {
        //先检查容量
        _checkCapacity();
        //求下标
        size_t index = getFunc(kv.first);
        //头插，先判断插入的结点是否有效
        Node* cur = _table[index];
        while(cur)
        {
            if(cur->_kv.first == kv.first)
            {
                return make_pair(Iterator(cur,this),false);
            }
            cur = cur->_next;
        }
        cur = new Node(kv);
        cur->_next = _table[index];
        _table[index] = cur;
        ++_size;
        return make_pair(Iterator(cur,this),true);
    }

    V& operator[](const K& key)
    {
        pair<Iterator,bool> tmp = this->InsertPair(make_pair(key,V()));
        //如果没有就插入，如果有就直接返回并且迭代器指向它
        return tmp.first->second;
    }
    //找到一个结点
    Node* Find(const K& key)
    {
        if(_size == 0)
        {
            return NULL;
        }
        size_t index = getFunc(key);
        Node* cur = _table[index];
        while(cur)
        {
            if(cur->_kv.first == key)
            {
                return cur;
            }
            cur = cur->_next;
        }
        return NULL;
    }
    //删除一个结点
    bool Remove(const K& key)
    {
        if(_size == 0)
        {
            return false;
        }
        size_t index = getFunc(key);
        Node* prev = NULL;
        Node* del = _table[index];
        while(del)
        {
            if(del->_kv.first == key)
            {
                if(prev == NULL)
                {
                    _table[index] = del->_next;
                }
                else
                {
                    prev->_next = del->_next;
                }
                delete del;
                --_size;
                return true;
            }
            prev = del;
            del = del->_next;
        }
        return false;
    }

    void Resize(size_t size)
    {
        _table.resize(GetPrime(size));
    }

    Iterator Begin()
    {
        if(_size == 0)
        {
            return Iterator(NULL,this);
        }
        Node* cur = NULL;
        for(size_t i = 0; i < _table.size(); i++)
        {
            if(_table[i])
            {
                cur = _table[i];
                break;
            }
        }
        return Iterator(cur,this);
    }

    Iterator End()
    {
        return Iterator(NULL,this);
    }

private:
    void _checkCapacity()
    {
        if(_table.size() == _size)
        {
            size_t newsize = GetPrime(_size);
            //扩容的时候理应重新排一下位置
            vector<Node*> newtable;
            newtable.resize(newsize);
            for(size_t i = 0; i < _size; i++)
            {
                Node* cur = _table[i];
                while(cur)
                {
                    Node* next = cur->_next;//保存下一个节点
                    size_t index = getFunc(cur->_kv.first);//重新计算下标
                    cur->_next = newtable[index];
                    newtable[index] = cur;
                    cur = next;
                }
                //这里为什么不用直接插入的方法？因为插入需要重新new新节点，这样一来花费太大
                //为什么线性探测需要用插入呢？因为线性探测数组中每一个元素就放一个数据，
                //不需要new节点，直接赋值运算，所有这其实是一个重复的动作，所以调用插入函数
                _table[i] = NULL;
            }
            _table.swap(newtable);
        }
    }

};

//make_pair的模板函数
//template <class K,class V>
//pair<K,V> make_pair(const K& key,const V& value)
//{
//   return pair<K,V>(key,value);
//}

应用：
**

位图：

**
是一个数组，数组中的每个数据的每个二进制位都表示一个数据，数据存在表示0，数据不存在表示1；
下标表示除的结果，二进制的位置表示除的余数；

面试题：存储一个40亿的无符号整数，没有排序
思路：
1G大约等于10亿个byte(字节)
1byte = 8 bit(8个比特位)（每一个比特位可以标志一个数）
存放40亿个数需要40亿个位置
1G大概可以存放80亿个数，所以存放40亿个数需要500MB左右；
**

布隆过滤器：

**
哈希+位图
当一个元素被加入数组时，通过 K 个 HashFunc 函数将这个元素映射成一个位阵列中的 K 个点，把它们置为 1(和位图思想一样，只是它要占用多个二进制位)。检索时，我们只要看看这些点是不是都是 1 就（大约）知道集合中有没有它了：
如果这些点有任何一个 0，则被检索元素一定不在；
如果都是 1，则被检索元素很可能在。（记住是可能）
优点是空间效率很高，布隆过滤器存储空间和插入 / 查询时间都是常数O(k)。
缺点
误算率是其中之一。随着存入的元素数量增加，误算率随之增加。但是如果元素数量太少，则使用散列表足矣。误判补救方法是：再建立一个小的白名单，存储那些可能被误判的信息。
注：要保证安全地删除元素并不简单。首先我们必须保证删除的元素的确在布隆过滤器里面，这一点单凭过滤器是无法保证的，而且计数器回绕也会造成问题。

位图和布隆过滤器的区别：
位图——>利用哈希直接定址法；优点是节省空间，快；缺点是只能运用于整形；
布隆过滤器——->哈希+位图；优点是节省空间，快，可以标记任意类型；缺点是会产生误判；

代码示例：

//BitMap位图
//利用哈希的直接定址法，用来表示一个整形存在或者不存在的数组

#include<vector>
#include<string>

typedef size_t MyType;

class BitMap
{
protected:
    vector<char> _bitMap;
public:
    BitMap(MyType range)
    {
        //给数组开空间
        _bitMap.resize(range/8+1,0);

    }
    //将数据置入数组中
    void Set(size_t value)
    {
        size_t index = value/8;//找到数组下标
        size_t pos = value%8;//找到元素中的二进制的位置
        _bitMap[index] |= (1<<pos);
    }
    //将数据从数组中删除
    void ReSet(size_t value)
    {
        size_t index = value>>3;
        size_t pos = value%8;
        _bitMap[index] &= (~(1<<pos));
    }
    //判断某个数在不在
    bool Test(size_t value)
    {
        size_t index = value>>3;
        size_t pos = value%8;
        return (_bitMap[index] & (1<<pos));
    }

};


void TestBitMap()
{
    //BitMap bm(1024*1024*1024*4);
    BitMap bm(-1);
    bm.Set(1101);
    bm.Set(1001);
    bm.Set(1001111);
    bm.Set(1002221);

    cout<<bm.Test(1101)<<endl;
    cout<<bm.Test(1102)<<endl;
    cout<<bm.Test(1001)<<endl;
    cout<<bm.Test(1002221)<<endl;
}


//布隆过滤器-------->哈希+位图（可以标记任意类型）

struct _GetFunc1
{
    size_t GetFunc(const char* value)
    {
        size_t seek = 131;
        register size_t hash = 0;
        while(*value)
        {
            hash = hash*seek + (*value++);
        }
        return hash;
    }
    size_t operator()(const string& key)
    {
        return GetFunc(key.c_str());
    }
};

struct _GetFunc2
{
    size_t GetFunc(const char* value)
    {
        register size_t hash = 0;
        size_t seek = 63689;
        while(*value)
        {
            hash = hash*seek + (*value++);
            seek *= 378551;
        }
        return hash;
    }
    size_t operator()(const string& key)
    {
        return GetFunc(key.c_str());
    }
};

struct _GetFunc3
{
    size_t GetFunc(const char* value)
    {
        if(!*value)    
            return 0;  
        register size_t hash = 1315423911;  
        while (size_t ch = (size_t)*value++)  
        {  
            hash ^= ((hash << 5) + ch + (hash >> 2));  
        }  
        return hash;  
    }
    size_t operator()(const string& key)
    {
        return GetFunc(key.c_str());
    }
};

template <class K=string,class HashFunc1=_GetFunc1,
          class HashFunc2=_GetFunc2,class HashFunc3=_GetFunc3>
class BloomFilter
{
protected:
    BitMap _BM;
    size_t _range;
public:
    BloomFilter(size_t range)
        :_range(range),_BM(range)
    {
    }
    void Set(const K key)
    {
        //将传进来的参数转化成size_t,但是转化后的大小还不能超过范围
        size_t hash1 = HashFunc1()(key) % _range;
        //这里为什么要%_range,如果这个数据转化成size_t类型变得超级大了，
        //那么就无法正确定位到有效的下标
        size_t hash2 = HashFunc2()(key) % _range;
        size_t hash3 = HashFunc3()(key) % _range;
        _BM.Set(hash1);
        _BM.Set(hash2);
        _BM.Set(hash3);
    }
    bool Test(const K key)
    {
        size_t hash1 = HashFunc1()(key) % _range;
        if(_BM.Test(hash1) == false)
        {
            return false;
        }
        size_t hash2 = HashFunc2()(key) % _range;
        if(_BM.Test(hash2) == false)
        {
            return false;
        }
        size_t hash3 = HashFunc3()(key) % _range;
        if(_BM.Test(hash3) == false)
        {
            return false;
        }
        return true;
    }
    //这里为什么不写删除？因为布隆过滤器任意删除一个数据都可能会影响到其它的数据，比较复杂，可以自己研究

};