在《算法精解:C语言描述》中看到两个hash表的实现,一个是链式hash表,一个是开地址hash表,我在平时工作的项目中用到了开地址hash表,也做了一些修改和优化,在这里分享一下,原理不想重复讲了,网上或者书上都有,这里只讲我修改的代码。下面是全部源码:
/*
* Copyright (c) 2014, Jusse Wang <wanyco@gmail.com>.
* All rights reserved.
*
* @date: 2014-11-07
* @file ohtbl.h
* @brief open-addressed hash tables header file
*
*/
/* ohtbl.h - open-addressed hash tables */
/* */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2, or (at your option) */
/* any later version. */
/* */
/* This program is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public License */
/* along with this program; if not, write to the Free Software */
/* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA */
/* 02111-1307, USA. */
/* */
/* */
/* Double hashing: */
/* -------------- */
/* */
/* The double hashing function is */
/* */
/* h(k,i) = (h1(k) + i*h2(k)) % m */
/* */
/* m is prime, and the two hash functions can be: */
/* */
/* h1(k) = k % m; */
/* h2(k) = 1 + (k % (m - 1)); */
/* */
/* Obs: h2(k) never can be 0! */
/* */
/* */
/* The First 1,000 Primes */
/* (the 1,000th is 7919) */
/* For more information on primes see http://www.utm.edu/research/primes */
/* */
/* code is based on "Mastering Algorithms with C", Kyle Loudon */
#ifndef __OHTBL_H__
#define __OHTBL_H__
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
/*
 * One slot of the hash table. Besides the stored element, every slot carries
 * the links of the doubly linked list that ohtbl_insert() threads through the
 * occupied slots, so the stored elements can be walked without scanning the
 * whole table.
 */
struct ohtbl_item {
/* Previous slot in the linked list of occupied slots */
struct ohtbl_item *prev;
/* Next slot in the linked list of occupied slots */
struct ohtbl_item *next;
/* Stored element; NULL for a never-used slot, the table's vacated marker for a removed one */
void *data;
};
/*
 * Open-addressed hash table using double hashing, plus a linked list of the
 * occupied slots for traversal.
 */
typedef struct ohtbl {
/* Marker address written into the data pointer of a slot whose element was removed */
void *vacated;
/* First hash function: selects the initial probe position (taken modulo size) */
unsigned int (*h1)(const void *key);
/* Second hash function: multiplied by the probe number to step through the table */
unsigned int (*h2)(const void *key);
/* Returns non-zero when the stored element (key1) matches the searched one (key2) */
int (*match)(const void *key1,const void *key2);
/* Called by ohtbl_destroy() on the remaining elements and on the slot array */
void (*destroy)(void *data);
/* Maximum number of elements; ohtbl_insert() refuses to add more */
unsigned int tolerance;
/* Number of elements currently stored */
unsigned int count;
/* Number of hash slots; the slot array holds one extra entry at index size */
unsigned int size;
/* Incremented by ohtbl_insert() when an element lands away from its h1 slot */
unsigned int conflict;
/* Traversal cursor used by ohtbl_flush() / ohtbl_traverse() */
struct ohtbl_item *cur;
/* List sentinel: the extra entry at table[size]; it never holds data */
struct ohtbl_item *head;
/* Last item appended to the list; equals head while the list is empty */
struct ohtbl_item *tail;
/* Slot array of size + 1 entries obtained from the alloc callback */
struct ohtbl_item *table;
} ohtbl_t;
/**
* @function: ohtbl_init
* @brief: open-addressed hash table initialization
* @param: htbl: the hash table structure to initialize; the caller provides its storage.
* size: the number of slots in the hash table
* h1: first hash function; selects the initial probe position
* h2: second hash function; gives the step between successive probes
* match: returns non-zero when a stored element (first argument) matches elem (second argument)
* alloc: allocates the slot array; called once with the byte count for size + 1 slots
* destroy: used by ohtbl_destroy on each remaining element and on the slot array
* tolerance: the maximum number of elements the table accepts; a value >= size
* is replaced by 80% of size
* @return: 1 if everything ok, otherwise return 0 (e.g. alloc returned NULL).
*/
int ohtbl_init(ohtbl_t *htbl, unsigned int size, unsigned int (*h1)(const void *elem),
unsigned int (*h2)(const void *elem), int (*match)(const void *ohtbl_elem, const void *elem),
void *(*alloc)(size_t), void (*destroy)(void *data), unsigned int tolerance);
/**
* @function: ohtbl_reset
* @brief: zero every slot of the hash table and empty its element list. The destroy
* callback is NOT invoked, so the caller has to manage the element pointers
* itself (e.g. reuse/free).
* @param: htbl: the open-addressed hash table to reset; NULL is ignored.
* @return: void
*/
void ohtbl_reset(ohtbl_t *htbl);
/**
* @function: ohtbl_destroy
* @brief: destroy the hash table: the destroy callback given to ohtbl_init is invoked
* on every element still stored and then on the slot array itself; finally the
* ohtbl_t structure is zeroed.
* @param: htbl: the open-addressed hash table to destroy.
* @return: void
*/
void ohtbl_destroy(ohtbl_t *htbl);
/**
* @function: ohtbl_insert
* @brief: insert elem into hash table
* @param: htbl: the hash table to insert into
* elem: hash table element
* @return: 1 if the element was inserted, 0 if a matching element is already in the
* table (nothing is changed), -1 if the element could not be inserted because
* the element count has reached the tolerance.
*/
int ohtbl_insert(ohtbl_t *htbl, void *elem);
/**
* @function: ohtbl_remove
* @brief: remove the elem from hash table; the destroy callback is not invoked,
* the stored pointer is handed back to the caller instead
* @param: htbl: the hash table
* elem: the element to remove (compared with the match callback)
* @return: the stored element pointer that was removed, or NULL if no match was found
*/
void *ohtbl_remove(ohtbl_t *htbl, const void *elem);
/**
* @function: ohtbl_flush
* @brief: rewind the traversal cursor to the head of the element list, so that the
* next ohtbl_traverse call returns the first element
* @param: htbl: the hash table
* @return: void
*/
void ohtbl_flush(ohtbl_t *htbl);
/**
* @function: ohtbl_traverse
* @brief: advance the traversal cursor by one element and return that element.
* Call ohtbl_flush first to start from the beginning. Elements are linked
* in the order they were inserted.
* @param: htbl: the hash table
* @return: the next hash table element pointer, or NULL when the cursor is at the last element
*/
void *ohtbl_traverse(ohtbl_t *htbl);
/**
* @function: ohtbl_foreach
* @brief: traverse the hash table from the first element and call the callback
* function on each element; does nothing when htbl or callback is NULL
* @param: htbl: the hash table
* callback: the callback function pointer; receives the element and data
* data: passed unchanged as the second argument of every callback call
* @return: void
*/
void ohtbl_foreach(ohtbl_t *htbl, void (*callback)(void *ohtbl_elem, void *data), void *data);
/**
* @function: __ohtbl_lookup
* @brief: look up whether elem is in the hash table
* @param: htbl: the hash table
* elem: the element to look for (compared with the match callback)
* ekey: optional (may be NULL). On a hit it receives the slot index of the
* matching element. On a miss it receives the index of the first vacated or
* empty slot met while probing, i.e. the slot an insertion can use; it is
* left untouched when no such slot was met.
* @return: the stored hash table element pointer, or NULL if not found
*/
void *__ohtbl_lookup(ohtbl_t *htbl, const void *elem, unsigned int *ekey);
/* Look up data in htbl without asking for the slot index. */
#define ohtbl_lookup(htbl, data) __ohtbl_lookup((htbl), (data), NULL)

/* Number of elements currently stored in htbl. */
#define ohtbl_elem_count(htbl) ((htbl)->count)

/*
 * Iterate over every stored element of htbl, assigning each one to elm in turn;
 * follow the macro with the loop body. After a complete pass over a table that
 * has ever held an element, elm is NULL.
 *
 * The macro declares the cursor variable ohtbl_cur in the enclosing scope (kept
 * that way so it still builds in pre-C99 modes). Consequently it can be used
 * only once per scope and must not be the unbraced body of an if/else/loop.
 * Both arguments are evaluated several times: pass expressions without side
 * effects.
 */
#define ohtbl_for_each_entry(elm, htbl) \
    struct ohtbl_item *ohtbl_cur = (htbl)->head->next; \
    for (; \
         ohtbl_cur != NULL && ((elm) = ohtbl_cur->data) != NULL && ohtbl_cur != (htbl)->head; \
         ohtbl_cur = ohtbl_cur->next)
#ifdef __cplusplus
}
#endif
#endif
/*
* Copyright (c) 2014, Jusse Wang <wanyco@gmail.com>.
* All rights reserved.
*
* @date: 2014-11-07
* @file ohtbl.c
* @brief open-addressed hash tables implementation file
*
*/
/* ohtbl.c - open-addressed hash tables */
/* */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2, or (at your option) */
/* any later version. */
/* */
/* This program is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public License */
/* along with this program; if not, write to the Free Software */
/* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA */
/* 02111-1307, USA. */
/* code is based on "Mastering Algorithms with C", Kyle Loudon */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <linux/types.h>
#include <assert.h>
#include "ohtbl.h"
/**
* Marker object for removed slots.
* Only its address is used: ohtbl_init() stores it in htbl->vacated, and
* ohtbl_remove() writes that address into the data pointer of a slot whose
* element was removed, so that probing can tell a removed slot apart from a
* never-used one (NULL) and keep searching past it.
*/
static char vacated;
/**
 * @function: ohtbl_init
 * @brief: open-addressed hash table initialization
 * @param: htbl: the hash table structure to initialize; the caller provides its storage.
 *         size: the number of slots in the hash table
 *         h1: first hash function; selects the initial probe position
 *         h2: second hash function; gives the step between successive probes
 *         match: returns non-zero when a stored element (first argument) matches
 *                elem (second argument)
 *         alloc: allocates the slot array; called once with the byte count for
 *                size + 1 slots (the extra slot is the list sentinel)
 *         destroy: used by ohtbl_destroy on each remaining element and on the
 *                  slot array; may be NULL
 *         tolerance: the maximum number of elements the table accepts; a value
 *                    >= size is replaced by 80% of size
 * @return: 1 if everything ok; 0 if a required argument is NULL, the slot array
 *          size does not fit in size_t, or alloc fails.
 */
int ohtbl_init(ohtbl_t *htbl, unsigned int size, unsigned int (*h1)(const void *elem),
               unsigned int (*h2)(const void *elem), int (*match)(const void *ohtbl_elem, const void *elem),
               void *(*alloc)(size_t), void (*destroy)(void *data), unsigned int tolerance)
{
    size_t hash_size_byte;

    /* These callbacks are invoked unconditionally later on, so refuse NULL here */
    if (htbl == NULL || h1 == NULL || h2 == NULL || match == NULL || alloc == NULL) {
        return 0;
    }

    /* size + 1 slots are allocated; make sure the byte count cannot wrap */
    if ((size_t)size > ((size_t)-1) / sizeof(struct ohtbl_item) - 1) {
        return 0;
    }
    hash_size_byte = ((size_t)size + 1) * sizeof(struct ohtbl_item);

    htbl->table = (struct ohtbl_item *)alloc(hash_size_byte);
    if (htbl->table == NULL) {
        return 0;
    }

    /* Zeroing marks every slot as never used and clears all list links */
    memset(htbl->table, 0, hash_size_byte);
    htbl->size = size;

    /* The extra slot at index size is the sentinel of the (still empty) list */
    htbl->head = &htbl->table[size];
    htbl->tail = htbl->head;
    /* Start with a valid cursor so a traversal without a prior flush is harmless */
    htbl->cur = htbl->head;

    /* Set the vacated member to the sentinel memory address reserved for this */
    htbl->vacated = &vacated;

    /* Encapsulate the functions */
    htbl->h1 = h1;
    htbl->h2 = h2;
    htbl->match = match;
    htbl->destroy = destroy;

    /* Initialize the number of elements in the table */
    htbl->count = 0;
    htbl->conflict = 0;

    /* A tolerance that would allow a completely full table is cut down to 80% */
    htbl->tolerance = tolerance >= size ? (unsigned int)(size * 0.8) : tolerance;

    return 1;
}
/**
 * @function: ohtbl_reset
 * @brief: zero every slot of the hash table and empty its element list. The
 *         destroy callback is NOT invoked, so the caller has to manage the
 *         element pointers itself (e.g. reuse/free).
 * @param: htbl: the open-addressed hash table to reset; NULL, or a table
 *         without a slot array, is ignored.
 * @return: void
 */
void ohtbl_reset(ohtbl_t *htbl)
{
    size_t hash_size_byte;

    if (htbl == NULL || htbl->table == NULL) {
        return;
    }

    /* Same size as the allocation made at init time: size slots + 1 sentinel */
    hash_size_byte = ((size_t)htbl->size + 1) * sizeof(struct ohtbl_item);
    memset(htbl->table, 0, hash_size_byte);

    htbl->head = &htbl->table[htbl->size];
    htbl->tail = htbl->head;
    /* The old cursor may point at a slot whose links were just zeroed */
    htbl->cur = htbl->head;

    htbl->count = 0;
    htbl->conflict = 0;
}
/**
 * @function: ohtbl_destroy
 * @brief: destroy the hash table. When a destroy callback was supplied it is
 *         invoked on every element still stored and then on the slot array
 *         itself. Without a destroy callback nothing is released: a caller
 *         owning the slot array must save htbl->table before this call,
 *         because the structure is zeroed afterwards.
 * @param: htbl: the open-addressed hash table to destroy; NULL is ignored.
 * @return: void
 */
void ohtbl_destroy(ohtbl_t *htbl)
{
    unsigned int i;

    if (htbl == NULL) {
        return;
    }

    if (htbl->destroy != NULL && htbl->table != NULL) {
        /* Call a user-defined function to free dynamically allocated data */
        for (i = 0; i < htbl->size; i++) {
            void *stored = htbl->table[i].data;

            /* Skip never-used slots and slots whose element was removed */
            if (stored != NULL && stored != htbl->vacated) {
                htbl->destroy(stored);
            }
        }

        /* Free the storage allocated for the hash table */
        htbl->destroy(htbl->table);
    }

    /* No operations are allowed now, but clear the structure as a precaution */
    memset(htbl, 0, sizeof(ohtbl_t));
}
/**
 * @function: ohtbl_insert
 * @brief: insert elem into hash table
 * @param: htbl: the hash table to insert into
 *         elem: hash table element; must not be NULL
 * @return: 1 if the element was inserted, 0 if a matching element is already in
 *          the table (nothing is changed), -1 if the element could not be
 *          inserted: invalid argument, element count has reached the
 *          tolerance, or the probe sequence met no free slot.
 */
int ohtbl_insert(ohtbl_t *htbl, void *elem)
{
    unsigned int key;
    struct ohtbl_item *slot;

    /* NULL and the vacated marker are slot states, they cannot be stored as data */
    if (htbl == NULL || elem == NULL || elem == htbl->vacated) {
        return -1;
    }

    /* Do not exceed the tolerance in the table */
    if (htbl->count >= htbl->tolerance) {
        return -1;
    }

    /*
     * The lookup stores the first reusable slot of the probe sequence in key.
     * Start from an out-of-range value to notice when it found none.
     */
    key = htbl->size;

    /* Do nothing if the data is already in the table */
    if (__ohtbl_lookup(htbl, elem, &key) != NULL) {
        return 0;
    }

    /* The probe sequence met neither an empty nor a vacated slot */
    if (key >= htbl->size) {
        return -1;
    }

    /* Insert the data into the table */
    slot = &htbl->table[key];
    slot->data = elem;
    htbl->count++;

    /* Append the slot to the circular list, right before the head sentinel */
    slot->next = htbl->head;
    slot->prev = htbl->tail;
    htbl->tail->next = slot;
    htbl->head->prev = slot;
    htbl->tail = slot;

    /* Count elements that could not be stored in their first-choice slot */
    if (key != htbl->h1(elem) % htbl->size) {
        htbl->conflict++;
    }

    return 1;
}
/**
 * @function: ohtbl_remove
 * @brief: remove the elem from hash table. The destroy callback is not invoked;
 *         the stored pointer is handed back to the caller instead.
 * @param: htbl: the hash table
 *         elem: the element to remove (compared with the match callback)
 * @return: the stored element pointer that was removed, or NULL if the table is
 *          NULL/empty or no matching element was found
 */
void *ohtbl_remove(ohtbl_t *htbl, const void *elem)
{
    unsigned int i;
    unsigned int key;
    unsigned int hash1;
    unsigned int hash2;
    struct ohtbl_item *slot;
    void *ret;

    if (htbl == NULL || htbl->count == 0) {
        return NULL;
    }

    /* Both hashes are the same for every probe, compute them only once */
    hash1 = htbl->h1(elem);
    hash2 = htbl->h2(elem);

    for (i = 0; i < htbl->size; i++) {
        key = (hash1 + i * hash2) % htbl->size;
        slot = &htbl->table[key];

        if (slot->data == NULL) {
            /* A never-used slot ends the probe sequence: elem is not stored */
            return NULL;
        }
        if (slot->data == htbl->vacated) {
            /* Search beyond vacated positions */
            continue;
        }
        if (!htbl->match(slot->data, elem)) {
            continue;
        }

        /* Pass back the elem from the table */
        ret = slot->data;
        htbl->count--;

        /* Unlink the slot from the list of occupied slots */
        if (htbl->tail == slot) {
            htbl->tail = slot->prev;
        }
        slot->prev->next = slot->next;
        slot->next->prev = slot->prev;

        /*
         * Mark the slot as vacated. Its own prev/next links are left untouched
         * so a traversal cursor resting on this slot can still move on.
         */
        slot->data = htbl->vacated;

        /*
         * Mirror the accounting of ohtbl_insert(): an element counts as a
         * conflict exactly when it is stored away from its first-choice slot.
         */
        if (key != hash1 % htbl->size && htbl->conflict > 0) {
            htbl->conflict--;
        }

        return ret;
    }

    return NULL;
}
/**
 * @function: ohtbl_flush
 * @brief: rewind the traversal cursor to the head of the element list, so that
 *         the next ohtbl_traverse call returns the first element
 * @param: htbl: the hash table; NULL is ignored
 * @return: void
 */
void ohtbl_flush(ohtbl_t *htbl)
{
    if (htbl == NULL) {
        return;
    }

    htbl->cur = htbl->head;
}
/**
 * @function: ohtbl_traverse
 * @brief: advance the traversal cursor by one element and return that element.
 *         Call ohtbl_flush first to start from the beginning. Elements are
 *         linked in the order they were inserted.
 * @param: htbl: the hash table
 * @return: the next hash table element pointer, or NULL when htbl is NULL, the
 *          cursor is unset, or the cursor already is at the last element
 */
void *ohtbl_traverse(ohtbl_t *htbl)
{
    if (htbl == NULL || htbl->cur == NULL) {
        return NULL;
    }

    /* The cursor sits on the last element (or on the sentinel of an empty list) */
    if (htbl->cur == htbl->tail) {
        return NULL;
    }

    /* A cursor left on a slot whose links were zeroed has nowhere to go */
    if (htbl->cur->next == NULL) {
        return NULL;
    }

    htbl->cur = htbl->cur->next;
    return htbl->cur->data;
}
/**
 * @function: ohtbl_foreach
 * @brief: walk the hash table from its first element and hand every element to
 *         the callback; nothing happens when htbl or callback is NULL
 * @param: htbl: the hash table
 *         callback: invoked once per element with the element and data
 *         data: passed unchanged as the second argument of every callback call
 * @return: void
 */
void ohtbl_foreach(ohtbl_t *htbl, void (*callback)(void *ohtbl_elem, void *data), void *data)
{
    void *item;

    if (htbl != NULL && callback != NULL) {
        /* Rewind the cursor, then pull elements until the traversal runs dry */
        for (ohtbl_flush(htbl); (item = ohtbl_traverse(htbl)) != NULL; ) {
            callback(item, data);
        }
    }
}
/**
 * @function: __ohtbl_lookup
 * @brief: look up whether elem is in the hash table
 * @param: htbl: the hash table
 *         elem: the element to look for (compared with the match callback)
 *         ekey: optional (may be NULL). On a hit it receives the slot index of
 *               the matching element. On a miss it receives the index of the
 *               first vacated or empty slot met while probing, i.e. the slot an
 *               insertion can use; it is left untouched when no such slot was met.
 * @return: the stored hash table element pointer, or NULL if not found
 */
void *__ohtbl_lookup(ohtbl_t *htbl, const void *elem, unsigned int *ekey)
{
    unsigned int i;
    unsigned int key;
    unsigned int hash1;
    unsigned int hash2;
    int slot_recorded = 0;
    void *stored;

    /* Nothing can be stored in a missing or zero-slot table */
    if (htbl == NULL || htbl->size == 0) {
        return NULL;
    }

    /* Both hashes are the same for every probe, compute them only once */
    hash1 = htbl->h1(elem);
    hash2 = htbl->h2(elem);

    /* Use double hashing to hash the key */
    for (i = 0; i < htbl->size; i++) {
        key = (hash1 + i * hash2) % htbl->size;
        stored = htbl->table[key].data;

        if (stored == NULL) {
            /* A never-used slot ends the probe sequence: elem is not stored */
            if (ekey != NULL && !slot_recorded) {
                *ekey = key;
            }
            return NULL;
        }

        if (stored == htbl->vacated) {
            /* Remember only the first reusable slot, then keep searching */
            if (ekey != NULL && !slot_recorded) {
                *ekey = key;
                slot_recorded = 1;
            }
            continue;
        }

        if (htbl->match(stored, elem)) {
            /* Data was found: report where it lives */
            if (ekey != NULL) {
                *ekey = key;
            }
            return stored;
        }
    }

    /* Return that the elem was not found */
    return NULL;
}
所做的修改有:
1、添加hash表遍历接口和实现:
hash表的遍历有时还是挺有用的,比如一个容量为100000的hash表里面只有100个元素,想遍历这个hash表的所有元素,总不能循环100000次吧,所以在ohtbl_t中加了一个循环双向链表来解决这个问题。请参考ohtbl_flush、ohtbl_traverse、ohtbl_foreach、ohtbl_for_each_entry的实现代码。
2、添加内存分配函数参数,可以在共享内存中建立hash表:
在多进程模型中,难免会用到共享内存,如果想在共享内存中建立hash表,那就得把分配hash表内存的函数开放出来,使用者只需要实现一个在共享内存中分配内存的函数,然后传给ohtbl_init的alloc参数就可以了。请参考ohtbl_init的实现代码。
3、限制hash表大小容忍度:
一般hash表中插入的元素越多,其冲突概率就越大,所以hash表一般要有一个容忍度,也就是最多能往这个hash表插入多少个元素。容忍度越低则冲突率越低,但空间利用率也越低,所以我一般设置成40%,这个看经验了,如果hash函数写得好,容忍度可以设置得高一些。代码请参考ohtbl_init的tolerance参数和实现,注意这里的tolerance不是百分比,而是按容忍度百分比换算后的元素个数上限。
4、添加hash表重置接口和实现:
hash表有时也需要重置,代码请参考ohtbl_reset函数的实现。
5、优化hash表的插入和查询:
在《算法精解》里面ohtbl_insert函数是可以优化的:它先调用ohtbl_lookup查找元素是否已经存在,如果存在则插入失败直接返回;只有要插入的元素不在hash表里才能插入,但之后它又重新探测一遍来查找要插入的位置,而这个位置其实在上一步ohtbl_lookup中就可以顺便得到,所以就优化了一下,代码请参考__ohtbl_lookup和ohtbl_insert的实现。
(如有不解之处,欢迎留言或者站内信)