暴雪hash来作整数的hash

最新推荐文章于 2023-07-17 22:01:00 发布

夕阳那边

最新推荐文章于 2023-07-17 22:01:00 发布

阅读量623

点赞数

分类专栏：算法文章标签：暴雪 hash

本文链接：https://blog.csdn.net/iihtd/article/details/51693216

版权

算法专栏收录该内容

17 篇文章 0 订阅

订阅专栏

问题提出

我们常常在项目中会用到探测一个数组B中是不是有数组A中的数据。
方案1：将B中的数据放到一个位图中，然后用A中的数据探测。
缺点：如果B中的数据比较稀疏，比如[1,2,900,800]等，位图开销大，因为要为没有出现的数也需要有一个bit来标识。
方案2：将B中的数据存到一个hash表中，然后用A中的数据作探测。
暴雪hash的原理见我之前转载的文章暴雪hash
以unordered_set为例，将所有的数据存在set中会占用大量的空间。如果采用改版后的暴雪hash算法，那么可以采用一个bool字段来标识这个整数。
源码参考来源
 整数hash函数选取的来源

源码如下：

//头文件
#ifndef _Uint64Hash_H
#define _Uint64Hash_H
#include <string>

using namespace std;
#define MAXTABLELEN 1024    // 默认哈希索引表大小 
//////////////////////////////////////////////////////////////////////////  
// 哈希索引表定义  
typedef struct  _HASHTABLE
{  
    uint64_t nHashA;  
    uint64_t nHashB;  
    bool bExists;  
}HASHTABLE, *PHASHTABLE ;  

class Uint64Hash
{
public:
    Uint64Hash(const long nTableLength = MAXTABLELEN);
    ~Uint64Hash(void);
private:  
    uint64_t cryptTable[0x500];  
    uint64_t m_tablelength;    // 哈希索引表长度  
    HASHTABLE *m_HashIndexTable; 
private:
    void InitCryptTable();                                               // 对哈希索引表预处理
    uint64_t HashUint64(uint64_t key); 
    uint64_t HashUint64A(uint64_t key); 
    uint64_t HashUint64B(uint64_t key); 
public:
    uint64_t Hash(uint64_t val);
    bool Hashed(uint64_t val);    // 检测url是否被hash过

};
#endif


#include<iostream>
#include "Uint64Hash.h"
//#include <boost/timer.hpp>
#include <string>
#include <stdlib.h>
using namespace std;

Uint64Hash::Uint64Hash(const long nTableLength /*= MAXTABLELEN*/)
{
    InitCryptTable();  
    m_tablelength = nTableLength;  
    //初始化hash表
    m_HashIndexTable = new HASHTABLE[nTableLength];  
    for ( int i = 0; i < nTableLength; i++ )  
    {  
        m_HashIndexTable[i].nHashA = -1;  
        m_HashIndexTable[i].nHashB = -1;  
        m_HashIndexTable[i].bExists = false;  

    }          
}

Uint64Hash::~Uint64Hash(void)
{
    //清理内存
    if ( NULL != m_HashIndexTable )  
    {  
        delete []m_HashIndexTable;  
        m_HashIndexTable = NULL;  
        m_tablelength = 0;  
    }  
}

/************************************************************************/
/*函数名：InitCryptTable
*功  能：对哈希索引表预处理  
*返回值：无
************************************************************************/
void Uint64Hash::InitCryptTable()  
{   
    uint64_t seed = 0x00100001, index1 = 0, index2 = 0, i;  

    for( index1 = 0; index1 < 0x100; index1++ )  
    {   
        for( index2 = index1, i = 0; i < 5; i++, index2 += 0x100 )  
        {   
            uint64_t temp1, temp2;  
            seed = (seed * 125 + 3) % 0x2AAAAB;  
            temp1 = (seed & 0xFFFF) << 0x10;  
            seed = (seed * 125 + 3) % 0x2AAAAB;  
            temp2 = (seed & 0xFFFF);  
            cryptTable[index2] = ( temp1 | temp2 );   
        }   
    }   
}  

/************************************************************************/
/*函数名：HashUint64
*功  能：求取哈希值   
*返回值：返回hash值
************************************************************************/
uint64_t Uint64Hash::HashUint64( uint64_t key)  
{   
    key = (~key) + (key << 21); // key = (key << 21) - key - 1; 
  key = key ^ (key >> 24); 
  key = (key + (key << 3)) + (key << 8); // key * 265 
  key = key ^ (key >> 14); 
  key = (key + (key << 2)) + (key << 4); // key * 21 
  key = key ^ (key >> 28); 
  key = key + (key << 31); 
  return key; 


}  

uint64_t Uint64Hash::HashUint64A( uint64_t key)  
{   
  key = (~key) + (key << 18); // key = (key << 18) - key - 1; 
  key = key ^ (key >> 31); 
  key = key * 21; // key = (key + (key << 2)) + (key << 4); 
  key = key ^ (key >> 11); 
  key = key + (key << 6); 
  key = key ^ (key >> 22); 
  return key; 

}  

uint64_t Uint64Hash::HashUint64B( uint64_t key)  
{   

 key = ~key + (key << 15); // key = (key << 15) - key - 1; 
  key = key ^ (key >> 12); 
  key = key + (key << 2); 
  key = key ^ (key >> 4); 
  key = key * 2057; // key = (key + (key << 3)) + (key << 11); 
  key = key ^ (key >> 16); 
  return key; 

}  

/************************************************************************/
/*函数名：Hashed
*功  能：检测一个字符串是否被hash过
*返回值：如果存在，1；否则，返回0
************************************************************************/
bool Uint64Hash::Hashed( uint64_t key )  

{   
    //不同的字符串三次hash还会碰撞的几率无限接近于不可能
    uint64_t nHash = HashUint64(key);  
    uint64_t nHashA = HashUint64A(key);  
    uint64_t nHashB = HashUint64B(key);  
    uint64_t nHashStart = nHash % m_tablelength,  
    nHashPos = nHashStart;  

    while ( m_HashIndexTable[nHashPos].bExists)  
    {   
        if ( m_HashIndexTable[nHashPos].nHashA == nHashA && m_HashIndexTable[nHashPos].nHashB == nHashB)    
            return 1;   
        else   
            nHashPos = (nHashPos + 1) % m_tablelength;  

        if (nHashPos == nHashStart)   
            break;   
    }  

    return 0; //没有找到  
}  

/************************************************************************/
/*函数名：Hash
*功  能：hash一个字符串 
*返回值：成功，返回位置；失败，返回-1
************************************************************************/
uint64_t Uint64Hash::Hash(uint64_t key)
{  
    uint64_t nHash = HashUint64(key);  
    uint64_t nHashA = HashUint64A(key);  
    uint64_t nHashB = HashUint64B(key);  
    uint64_t nHashStart = nHash % m_tablelength, 
        nHashPos = nHashStart;  

    while ( m_HashIndexTable[nHashPos].bExists)  
    {   
        nHashPos = (nHashPos + 1) % m_tablelength;  
        if (nHashPos == nHashStart) //一个轮回  
        {  
            //hash表中没有空余的位置了,无法完成hash
            return -1;   
        }  
    }  
    m_HashIndexTable[nHashPos].bExists = true;  
    m_HashIndexTable[nHashPos].nHashA = nHashA;  
    m_HashIndexTable[nHashPos].nHashB = nHashB;  

    return nHashPos;  
}
int main(){
//    boost::timer t;
    const int   SIZE = 5000000;
    Uint64Hash s( SIZE*2 ) ;
    for(int i = 0 ; i < SIZE ; i++){
        s.Hash(i );
    }
    for(int i = 0 ; i < SIZE ; i++){
        if(  !s.Hashed(i )){
            cout<<"ERROR"<<endl;
            break;
        }
    }
    cout<<t.elapsed()<<endl;

    for(int i = SIZE ; i < 2*SIZE ; i++){
        if(  s.Hashed(i )){
            cout<<"ERROR " << i<<endl;
        }
    }
 //   cout<<t.elapsed()<<endl;

}

编译命令是：
g++ -O3 –std=c++11
如果需要用到boost 中的timer来统计时间的话，编译命令是：
g++ -O3 –std=c++11 *.cpp -lboost_system

和stl中unordered_set比较

unordered_set的代码为：

#include<iostream>
#include <unordered_set> 
#include <boost/timer.hpp>

using namespace std;
int main(){
    boost::timer t;
    const int SIZE = 5000000;

    unordered_set<int> s(SIZE);
    for(int i = 0 ; i < SIZE ; i++){
        s.insert( i );
    }
    for(int i = 0 ; i < SIZE ; i++){
        if( s.end()  == s.find( i )){
            cout<<"ERROR"<<endl;
            break;
        }
    }
    cout<<t.elapsed()<<endl;
}

时间上面，暴雪的hash函数在时间效率上慢很多，在都加上了O3优化选项后，在500W数量上面，暴雪的插入和探测时间是1.11s，unordered_set耗时：0.46s。

夕阳那边

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
暴雪hash来作整数的hash

暴雪hash来作整数的hash我们常常在项目中会用到探测一个数组B中是不是有数组A中的数据。方案1：将B中的数据放到一个位图中，然后用A中的数据探测。缺点：如果B中的数据比较稀疏，比如[1,2,900,800]等，位图开销大，因为要为没有出现的数也需要有一个bit来标识。方案2：将B中的数据存到一个hash表中，然后用A中的数据作探测。暴雪hash的原理见我之前转载的文章暴雪hash
复制链接

扫一扫