zz 简单的HashTable

最新推荐文章于 2024-10-12 10:36:40 发布

strollerV

最新推荐文章于 2024-10-12 10:36:40 发布

阅读量611

点赞数

文章标签： table 数据结构 insert 数据库 null string

本文链接：https://blog.csdn.net/strollerv/article/details/2125653

版权

近期工作中要处理100万条记录。前一位同事使用SQLite数据库存储数据,通过数据库查询一条记录,时间当然不成问题。后来,我将数据库中的数据导出来,发现数据由原来的150M多变成了20M多,而装载数据库需要25秒,这种用空间换时间的做法未免牺牲太大。所以我想到不用数据库,而用普通的文件存储;要解决查询一条记录的问题,肯定要使用某种数据结构。首先想到的是STL中的hash_set,它采用hash_table作为底层数据结构,查询肯定不成问题,可惜初始化所需时间无法接受。我想可能是hash_table的内部表格重建过于复杂,而我所需要的并不复杂,于是我想写一个简单的hash_table,只要能进行插入、查询就可以,而且只针对字符串就行了。所以查阅了一些资料,写了一个简单的Hash_Table。

从底层来看,Hash_Table与STL中的hash_table的区别在于内存管理不一样:hash_table采用std::alloc,Hash_Table直接使用new。还有一点不同是表格重建的策略不一样。

// Separate-chaining hash table supporting insert, lookup and removal.
//
// NOTE(review): the template parameter list was lost when this listing was
// published (the source showed a bare `template`). It is reconstructed from
// the three dependent names the body uses; the parameter ORDER is an
// assumption -- confirm against existing instantiations.
//
//   Value    - element type stored in the table (copied into each node).
//   HashFun  - default-constructible functor; HashFun()(value) yields the hash.
//   EqualKey - default-constructible functor; EqualKey()(a, b) tests equality.
//
// The bucket count is taken from a caller-supplied table of sizes (`boxSize`),
// indexed by the current "level". A bucket is selected with
// `key & (bucketCount - 1)`, so every entry of `boxSize` presumably has to be
// a power of two -- TODO confirm with the code that builds that table.
template <class Value, class HashFun, class EqualKey>
class CMyHashTable
{
public:
    typedef const Value* ValPointer;
    typedef const Value& ValRef;

    // One chain node: the stored value, its full (unmasked) hash, and the
    // link to the next node in the same bucket.
    struct ValNode
    {
        Value    Data;
        DWORD    Key;
        ValNode* pNext;
    };

    typedef ValNode*       NodePointer;
    typedef const ValNode* NodeCPointer;
    typedef ValNode&       NodeRef;
    typedef const ValNode& NodeCRef;

protected:
    const int*   m_boxSize;   // caller-owned array of bucket counts, one per level
    int          m_boxCount;  // number of entries in m_boxSize
    int          m_level;     // index into m_boxSize currently in use
    int          m_nbox;      // number of buckets (source comment: initially 1024)
    int          m_limh;      // grow when m_size reaches this value
    int          m_liml;      // shrink when m_size drops below this value
    NodePointer* m_boxes;     // bucket array; each entry heads a singly linked chain
    int          m_size;      // number of stored elements

public:
    CMyHashTable(int boxCount, const int* boxSize, int initLevel);
    ~CMyHashTable();

public:
    // Returns a pointer to the stored element equal to `value`, or NULL when
    // there is none. The miss case is checked explicitly: the previous code
    // formed `&_find(s)->Data` on a NULL result, which is undefined behaviour.
    ValPointer find(ValRef value) const
    {
        ValNode probe;
        _fill_rec(probe, value);

        NodeCPointer hit = _find(probe);
        if (hit == NULL)
            return NULL;
        return &hit->Data;
    }

    // Inserts `value` unless an equal element is already present.
    // Returns true when the element was inserted, false when it was a duplicate.
    bool insert(ValRef value)
    {
        ValNode probe;
        _fill_rec(probe, value);

        if (_find(probe) != NULL)
            return false;

        _insert_rec(probe);
        return true;
    }

    // Inserts `value` without checking for an existing equal element.
    void insert_force(ValRef value)
    {
        ValNode probe;
        _fill_rec(probe, value);
        _insert_rec(probe);
    }

    // Removes one element equal to `value`. Returns true when something was
    // removed. May shrink the bucket array when the size falls below m_liml.
    bool remove(ValRef value)
    {
        ValNode probe;
        _fill_rec(probe, value);

        const bool removed = _find_remove(probe);
        if (removed)
        {
            --m_size;
            if (m_size < m_liml)
                decrease_level();
        }
        return removed;
    }

    int size() const { return m_size; }

    // Returns the current growth threshold (m_limh), not the bucket count.
    int capacity() const { return m_limh; }

    // Declared here; no definition is visible in this listing.
    void for_each(void (*fp)(ValRef));

protected:
    // Maps a full hash to a bucket index for the current bucket count.
    inline DWORD _short_key(DWORD key) const
    {
        return key & (m_nbox - 1);
    }

    // Same mapping for an explicit bucket count (used while rehashing).
    inline static DWORD _short_key(DWORD key, DWORD nbox)
    {
        return key & (nbox - 1);
    }

    NodeCPointer _find(NodeCRef) const;
    bool _find_remove(NodeCRef);

    // Links a heap copy of `rec` at the head of its bucket and grows the table
    // once the size reaches m_limh.
    void _insert_rec(NodeCRef rec)
    {
        // Copy-construct inside the new-expression so the storage is released
        // automatically if copying Value throws; m_size is only bumped after
        // the node is actually linked.
        NodePointer pnew = new ValNode(rec);

        const DWORD short_key = _short_key(rec.Key);
        pnew->pNext = m_boxes[short_key];
        m_boxes[short_key] = pnew;
        ++m_size;

        if (m_size >= m_limh)
            increase_level();
    }

    void switch_level(int new_level);

    // Moves to the next larger bucket count, or, at the last level, raises the
    // growth threshold so that no further growth is attempted.
    void increase_level()
    {
        if (m_level < m_boxCount - 1)
            switch_level(m_level + 1);
        else
            m_limh = 0x20000000; // Unable to increase level
    }

    // Moves to the next smaller bucket count, or, at level 0, clears the
    // shrink threshold so that no further shrinking is attempted.
    void decrease_level()
    {
        if (m_level > 0)
            switch_level(m_level - 1);
        else
            m_liml = 0; // Unable to decrease level
    }

    // Fills a scratch node with a copy of `value` and its hash.
    static void _fill_rec(NodeRef s, ValRef value)
    {
        s.Data  = value;
        s.Key   = HashFun()(value);
        s.pNext = NULL;
    }

private:
    // The table owns m_boxes and every node; a member-wise copy would make two
    // tables delete the same memory. Copying is therefore disabled (declared,
    // intentionally never defined).
    CMyHashTable(const CMyHashTable&);
    CMyHashTable& operator=(const CMyHashTable&);
};

template
CMyHashTable ::CMyHashTable(int boxCount, const int* boxSize, int initLevel)
:m_boxCount(boxCount), m_boxSize(boxSize)
{
m_level = initLevel;

m_limh = m_nbox = m_boxSize[m_level];
m_liml = initLevel > 0 ? m_boxSize[initLevel - 1] / 2 : 0;

m_boxes = new NodePointer[m_nbox];
m_size = 0;
memset(m_boxes, 0, sizeof(NodePointer) * m_nbox);
}
template
CMyHashTable ::~CMyHashTable()
{
for (int i = 0; i < m_nbox; ++i)
{
  NodePointer p = m_boxes[i];
  while (p != NULL)
  {
   NodePointer q = p->pNext;
   delete p;
   p = q;
  }
}
delete[] m_boxes;
}

template
typename CMyHashTable ::NodeCPointer CMyHashTable ::_find(NodeCRef r) const
{
DWORD short_key = _short_key(r.Key);
NodePointer p = m_boxes[short_key];

while (p != NULL)
{
  if (p->Key == r.Key)
  {
   if (EqualKey()(p->Data, r.Data))
    return p;
  }
  p = p->pNext;
}
return NULL;
}
template
bool CMyHashTable ::_find_remove(NodeCRef r)
{
DWORD  short_key = _short_key(r.Key);
NodePointer* p = &m_boxes[short_key];

while (*p != NULL)
{
  if ((*p)->Key == r.Key)
  {
   if (EqualKey()((*p)->value, r.value))
   {
    NodePointer q = *p;
    *p = q->next;
    delete q;
    return true;
   }
  }
  p = &(*p)->next;
}
return false;
}

template
void CMyHashTable ::switch_level(int new_level)
{
int new_n = m_boxSize[m_level = new_level];
NodePointer* new_boxes = new NodePointer[new_n];
memset(new_boxes, 0, sizeof(NodePointer) * new_n);

for (int i = 0; i < m_nbox; ++i)
{
  NodePointer  p = m_boxes[i], q;
  while (p != NULL)
  {
   q = p->pNext;
   DWORD sht_key = _short_key(p->Key, new_n);
   p->pNext = new_boxes[sht_key];
   new_boxes[sht_key] = p;

p = q;
}
}

m_limh = m_nbox = new_n;
m_liml = m_level > 0 ? m_boxSize[m_level - 1] / 2 : 0;
delete[] m_boxes;
m_boxes = new_boxes;
}

另外,使用这个哈希表时还需要下面两个仿函数:

// Equality functor for std::string.
//
// Compares with std::string's own operator== so that the whole contents
// take part in the comparison. The previous strcmp() on c_str() stopped at
// the first NUL byte and therefore reported strings that differ only after
// an embedded NUL as equal.
struct EPSTR
{
    bool operator()(const std::string& s1, const std::string& s2) const
    {
        return s1 == s2;
    }
};

// Hash functor for std::string: a multiply-then-xor hash over the bytes of
// the string, starting from 0 and multiplying by 16777619 (the 32-bit FNV
// prime) at each step.
//
// Fix: each byte is now read as an unsigned char. The previous code cast the
// char to `unsigned short&`, which read two bytes through a mis-typed (and
// possibly misaligned) reference, mixing the following byte into every step
// and making the result depend on byte order.
//
// Hashing deliberately stops at the first NUL byte, as before: any two
// strings that compare equal -- whether by operator== or by strcmp() -- are
// then guaranteed to produce the same hash.
struct HASH_FUN
{
    size_t operator()(const std::string& str) const
    {
        unsigned long hash = 0;
        for (const char* p = str.c_str(); *p != 0; ++p)
        {
            hash *= 16777619;
            hash ^= static_cast<unsigned long>(static_cast<unsigned char>(*p));
        }
        return hash;
    }
};

我使用这个Hash_Table装载100万条数据只需16秒,查询一条记录在1毫秒以内,应该说比使用数据库要好得多。