这个散列实现的性能应该是比较好的,因为某大小为30M的英文作品,用该算法统计里面每个单词出现的次数,所需时间2秒不到.同样的Python程序大概要7秒.
不过,Python的散列表算法确实很猛,真不愧是Python其他功能大量依赖的基础算法.Python3 散列表的探测算法(点击查看源码)性能很好.而且表大小可以简单地设定为2的幂,不像有些探测算法要求表大小为素数.这样就可以很方便地扩建表.
简单来说,Python的散列表探测分为2个步骤,第一步计算第一个索引,直接用散列值h和表尺寸减1的与运算作为结果:
mask = table->size-1
i = h & mask //第一个索引i
如果发生冲突,则进入第二步,开始探测循环:
perturb = h //初始化扰码
i = (i << 2) + i + perturb + 1;
i & mask //下一个索引
perturb >>= 5 //如果需要继续探测,则右移扰码,跳入下次循环
暂时未找到Python是如何hash字符串的,因此引用了这个算法.
Python散列表有个初始尺寸,好像是8.插入新成员时,如果使用槽的数目超过尺寸的三分之二,则扩充表(表小于50000时是4倍,否则是2倍).
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define ERROR -1
#define SUCCESS 0
#define PYHASH_NUMBER 5381
#define PYHASH_MINSIZE 8 //minimum table size; must be kept a power of two
#define PYHASH_DEFALUT_VALUE 0 //value given to freshly inserted entries (NOTE(review): "DEFALUT" typo kept; renaming would break existing users of the macro)
#define PERTURB_SHIFT 5
//Grow once used+deleted slots reach 2/3 of capacity (Python's load-factor rule);
//this guarantees the probe loop always finds a BLANK slot and cannot spin forever.
#define PYHASH_NEED_RESIZE(tb) (((tb)->used+(tb)->deleted)*3>=(((tb)->size)*2))
#define pyhash_keycmp strcmp
#define pyhash_keydup strdup
typedef char* pyhash_key;
typedef size_t pyhash_value;
//Slot states: BLANK = never occupied, USED = live entry, DELETED = tombstone
//(tombstones keep their key so the same key can be revived by a later insert).
typedef enum {
BLANK, USED, DELETED,
} pyhash_node_status;
typedef struct {
pyhash_key key;
pyhash_value value;
pyhash_node_status status;
} pyhash_node;
typedef struct {
size_t used; //number of USED slots
size_t deleted; //number of DELETED (tombstone) slots
size_t size; //total slots = used + deleted + blank
size_t default_value;
pyhash_node *nodes; //slot array of length size
} hashtable;
pyhash_node *pyhash_create_nodes(size_t size) ;
int pyhash_free_nodes(pyhash_node *nds, size_t size) ;
hashtable *pyhash_create_table(size_t size) ; //create a table (0 means default size)
int pyhash_free_table(hashtable *tb) ;//destroy the table and free every stored key
pyhash_node *pyhash_search(hashtable *tb, pyhash_key key) ;//plain lookup: returns the node only if key exists
int pyhash_update(hashtable *tb, pyhash_key key, pyhash_value value) ;//plain update: succeeds only if key exists
int pyhash_insert(hashtable *tb, pyhash_key key, pyhash_value value) ;//plain insert: succeeds only if key does not exist
pyhash_node *pyhash_force_search(hashtable *tb, pyhash_key key) ;//lookup-or-insert: a missing key is inserted (with default value) and its node returned
int pyhash_force_update(hashtable *tb, pyhash_key key, pyhash_value value) ;//update-or-insert: a missing key is inserted first, then updated
int pyhash_delete(hashtable *tb, pyhash_key key) ;
static size_t hash(pyhash_key s) ;//string hash function
static pyhash_node *hash_meta_search(hashtable *tb, pyhash_key key) ;
static void hash_table_status(hashtable *t) ;
static int hash_cmp(const void *a, const void *b) ;
static void hash_all_members(hashtable *t) ; //print all members sorted by value, descending
static void hash_probe_rate(hashtable *t) ;//look up every stored key once and report average probes per lookup
static size_t probe_number = 0; //accumulates collision (extra-probe) count across lookups
static int hash_resize(hashtable *tb) ;//grow and rehash the table
/*
 * Read whitespace-separated words from stdin, count occurrences in the
 * hash table, then print probe statistics and all words by frequency.
 * Returns SUCCESS (0) on normal completion, ERROR (-1) on allocation failure.
 */
int main() {
    hashtable *tb = pyhash_create_table(0);
    if (tb == NULL) /* fix: original dereferenced an unchecked allocation */
        return ERROR;
    char wd[100];
    /* %99s bounds the read to the buffer (99 chars + NUL terminator);
     * the original unbounded "%s" could overflow wd on long tokens. */
    while (scanf("%99s", wd) == 1) {
        pyhash_node *ep = pyhash_force_search(tb, wd);
        if (ep == NULL)
            return ERROR;
        ep->value++; /* one more occurrence of this word */
    }
    hash_probe_rate(tb);
    hash_all_members(tb);
    pyhash_free_table(tb);
    return SUCCESS;
}
/*
 * djb2-style string hash: h = h*33 + byte, seeded with 5381 (PYHASH_NUMBER).
 * Bytes are read as unsigned char so the result does not depend on the
 * implementation-defined signedness of plain char; with a signed char,
 * bytes >= 0x80 would subtract and produce different hashes per platform.
 * For 7-bit ASCII input the values are identical to the original.
 */
size_t hash(pyhash_key s) {
    size_t h = PYHASH_NUMBER;
    for (; *s; s++)
        h = ((h << 5) + h) + (unsigned char) *s; /* h*33 + c */
    return h;
}
pyhash_node *pyhash_create_nodes(size_t size) {
pyhash_node *nds = (pyhash_node*) malloc(sizeof(pyhash_node) * size);
if (nds == NULL)
return NULL;
size_t i = 0;
for (; i < size; i++) {
nds[i].status = BLANK;
nds[i].key = NULL;
nds[i].value = 0;
}
return nds;
}
/*
 * Release a slot array and every key it still owns. Both USED and DELETED
 * slots hold a strdup'd key (tombstones keep theirs), so anything that is
 * not BLANK must be freed. Always returns SUCCESS.
 */
int pyhash_free_nodes(pyhash_node *nds, size_t size) {
    for (size_t idx = 0; idx < size; idx++) {
        if (nds[idx].status != BLANK)
            free(nds[idx].key);
    }
    free(nds);
    return SUCCESS;
}
/*
 * Create a hash table with `size` slots (0 selects PYHASH_MINSIZE).
 * Returns NULL on allocation failure.
 * The size must be a power of two: the probe sequence masks indices with
 * size-1, and a non-power-of-two mask could make lookups loop forever.
 * NOTE(review): callers passing a non-zero size are trusted to pass a
 * power of two — this is not validated here; confirm all call sites.
 */
hashtable *pyhash_create_table(size_t size) {
    if (size == 0)
        size = PYHASH_MINSIZE;
    hashtable *tb = malloc(sizeof *tb);
    if (tb == NULL)
        return NULL;
    if ((tb->nodes = pyhash_create_nodes(size)) == NULL) {
        free(tb); /* fix: original leaked tb when node allocation failed */
        return NULL;
    }
    tb->size = size;
    tb->deleted = 0;
    tb->used = 0;
    tb->default_value = PYHASH_DEFALUT_VALUE;
    return tb;
}
/*
 * Destroy a table created by pyhash_create_table, freeing the slot array,
 * all stored keys, and the table struct itself. Always returns SUCCESS.
 */
int pyhash_free_table(hashtable *tb) {
    if (tb == NULL) /* fix: tolerate NULL like free() instead of crashing */
        return SUCCESS;
    pyhash_free_nodes(tb->nodes, tb->size);
    free(tb);
    return SUCCESS;
}
/*
 * Grow the table (4x when used <= 50000, else 2x, matching Python's
 * policy) and re-insert every live entry; tombstones are dropped.
 * Returns SUCCESS, or ERROR on allocation failure (on the early failure
 * the table is untouched; on a mid-rehash failure it is left partially
 * rebuilt but all memory is still released or owned by the table).
 */
int hash_resize(hashtable *tb) {
    size_t old_size = tb->size;
    size_t new_size = (tb->used > 50000 ? 2 : 4) * tb->size;
    pyhash_node *new_nodes = pyhash_create_nodes(new_size);
    if (new_nodes == NULL)
        return ERROR;
    /* Swap in the empty array, then replay the live entries into it.
     * pyhash_insert re-duplicates each key, so the old array (and its
     * keys) can be freed wholesale afterwards. */
    pyhash_node *old_nodes = tb->nodes;
    tb->nodes = new_nodes;
    tb->size = new_size;
    tb->deleted = 0;
    tb->used = 0;
    for (size_t i = 0; i < old_size; i++) {
        if (old_nodes[i].status == USED) {
            if (pyhash_insert(tb, old_nodes[i].key, old_nodes[i].value) == ERROR) {
                /* fix: original leaked the old array on this path */
                pyhash_free_nodes(old_nodes, old_size);
                return ERROR;
            }
        }
    }
    pyhash_free_nodes(old_nodes, old_size);
    return SUCCESS;
}
/*
 * Core probe routine (CPython-style open addressing). Returns the slot
 * where `key` lives or would live: a USED or DELETED slot whose key
 * matches, or the first BLANK slot reached. Never returns NULL here —
 * the 2/3 load-factor cap guarantees a BLANK slot always exists, so the
 * probe sequence terminates.
 * Probe sequence: i starts at h & mask, then i = i*5 + perturb + 1 with
 * perturb = h shifted right by PERTURB_SHIFT each step; only i & mask is
 * used as the index. probe_number counts every slot advanced past
 * (i.e. collisions), matching the original accounting exactly.
 */
pyhash_node *hash_meta_search(hashtable *tb, pyhash_key key) {
    size_t h = hash(key);
    size_t mask = tb->size - 1;
    size_t i = h & mask;
    size_t perturb = h;
    pyhash_node *slots = tb->nodes;
    for (;;) {
        pyhash_node *ep = &slots[i & mask];
        if (ep->status == BLANK)
            return ep; /* key absent; this is the insertion point */
        /* USED and DELETED slots both retain a non-NULL key. */
        if (pyhash_keycmp(ep->key, key) == 0)
            return ep;
        probe_number++; /* collision: advance to the next probe slot */
        i = (i << 2) + i + perturb + 1; /* i = i*5 + perturb + 1 */
        perturb >>= PERTURB_SHIFT;
    }
}
/*
 * Plain lookup: return the node for `key` only when it is a live (USED)
 * entry; BLANK insertion points and tombstones yield NULL.
 */
pyhash_node *pyhash_search(hashtable *tb, pyhash_key key) {
    pyhash_node *slot = hash_meta_search(tb, key);
    return (slot != NULL && slot->status == USED) ? slot : NULL;
}
/*
 * Plain update: overwrite the value of an existing (USED) key.
 * Returns SUCCESS on success, ERROR when the key is not present.
 */
int pyhash_update(hashtable *tb, pyhash_key key, pyhash_value value) {
    pyhash_node *slot = hash_meta_search(tb, key);
    if (slot == NULL || slot->status != USED)
        return ERROR;
    slot->value = value;
    return SUCCESS;
}
/*
 * Plain insert: add `key` with `value` only when the key is not already
 * live. A BLANK slot gets a fresh strdup'd key; a matching tombstone is
 * revived in place (its old key copy is reused). Returns ERROR when the
 * key already exists, key duplication fails, or a triggered resize fails.
 */
int pyhash_insert(hashtable *tb, pyhash_key key, pyhash_value value) {
    pyhash_node *slot = hash_meta_search(tb, key);
    if (slot == NULL)
        return ERROR;
    switch (slot->status) {
    case BLANK:
        if ((slot->key = pyhash_keydup(key)) == NULL)
            return ERROR;
        slot->status = USED;
        tb->used++;
        slot->value = value;
        /* Only a BLANK fill raises used+deleted, so only here can the
         * load factor cross the 2/3 threshold. */
        if (PYHASH_NEED_RESIZE(tb))
            return hash_resize(tb);
        return SUCCESS;
    case DELETED:
        slot->status = USED;
        tb->used++;
        tb->deleted--;
        slot->value = value;
        return SUCCESS;
    default: /* USED: key already present — plain insert refuses */
        return ERROR;
    }
}
/*
 * Lookup-or-insert: return the live node for `key`, inserting it with the
 * table's default value if absent. Returns NULL only on allocation or
 * resize failure. When an insert triggers a resize, the slot moves, so
 * the node is re-located with a fresh probe before returning.
 */
pyhash_node *pyhash_force_search(hashtable *tb, pyhash_key key) {
    pyhash_node *slot = hash_meta_search(tb, key);
    if (slot == NULL)
        return NULL;
    switch (slot->status) {
    case USED:
        return slot; /* already present */
    case BLANK:
        if ((slot->key = pyhash_keydup(key)) == NULL)
            return NULL;
        tb->used++;
        slot->status = USED;
        slot->value = tb->default_value;
        if (!PYHASH_NEED_RESIZE(tb))
            return slot;
        if (hash_resize(tb) == ERROR)
            return NULL;
        return hash_meta_search(tb, key); /* slot moved during rehash */
    case DELETED:
        /* Revive the tombstone; it still owns a copy of this key. */
        tb->deleted--;
        tb->used++;
        slot->status = USED;
        slot->value = tb->default_value;
        return slot;
    }
    return NULL; /* unreachable: status enum is exhaustive */
}
/*
 * Update-or-insert: set `key` to `value`, creating the entry first when it
 * does not exist. Returns ERROR only if the underlying insert fails.
 */
int pyhash_force_update(hashtable *tb, pyhash_key key, pyhash_value value) {
    pyhash_node *slot = pyhash_force_search(tb, key);
    if (slot == NULL)
        return ERROR;
    slot->value = value;
    return SUCCESS;
}
/*
 * Remove `key` from the table by turning its slot into a tombstone.
 * Returns SUCCESS when the key was present, ERROR otherwise.
 *
 * Rewritten to delegate to hash_meta_search. The original open-coded
 * probe loop had two defects:
 *   - it called pyhash_keycmp on the first slot even when that slot was
 *     BLANK, whose key is NULL — strcmp(NULL, ...) is undefined behavior;
 *   - the comparison result `r` was computed once before the loop and
 *     never recomputed, so every subsequent slot was judged against the
 *     first slot's comparison.
 */
int pyhash_delete(hashtable *tb, pyhash_key key) {
    pyhash_node *ep = hash_meta_search(tb, key);
    if (ep == NULL || ep->status != USED)
        return ERROR; /* absent, or already deleted */
    /* Keep ep->key: tombstones retain their key so a later insert of the
     * same key can revive the slot; the key is freed with the table. */
    ep->status = DELETED;
    tb->deleted++;
    tb->used--;
    return SUCCESS;
}
/* Debug helper: print the table's occupancy counters. */
void hash_table_status(hashtable *t) {
    /* fix: %zu is the conversion for size_t; the original %lu is undefined
     * behavior wherever size_t is not unsigned long (e.g. 64-bit Windows). */
    printf("hashtable size:%zu, used:%zu, deleted:%zu\n", t->size, t->used,
            t->deleted);
}
int hash_cmp(const void *a, const void *b) {
return (*(pyhash_node *) a).value > (*(pyhash_node *) b).value ? -1 : 1;
}
/*
 * Print every live entry as "key<TAB>count", sorted by count descending.
 * Silently returns on an empty table or if the scratch buffer cannot be
 * allocated (this is a best-effort report, not a data-path function).
 */
void hash_all_members(hashtable *t) {
    if (t->used == 0)
        return; /* fix: a zero-length VLA (original es[0]) is UB */
    /* fix: heap-allocate the scratch array — the original VLA sized by the
     * table population could overflow the stack on large inputs. */
    pyhash_node *es = malloc(sizeof *es * t->used);
    if (es == NULL)
        return;
    pyhash_node *nds = t->nodes;
    size_t j = 0;
    for (size_t i = 0; i < t->size; i++)
        if (nds[i].status == USED)
            es[j++] = nds[i];
    qsort(es, t->used, sizeof es[0], hash_cmp);
    for (size_t i = 0; i < t->used; i++)
        printf("%s\t%zu\n", es[i].key, es[i].value); /* %zu for size_t */
    free(es);
}
/*
 * Look up every stored key once and report the average number of slot
 * visits per lookup (1.0 means every key was found with no collisions).
 * probe_number is reset here and incremented inside hash_meta_search.
 */
void hash_probe_rate(hashtable *t) {
    /* Removed the redundant `extern size_t probe_number;` — it refers to
     * the file-scope static already visible here. */
    probe_number = 0;
    pyhash_node *nds = t->nodes;
    for (size_t i = 0; i < t->size; i++)
        if (nds[i].status == USED)
            pyhash_search(t, nds[i].key);
    /* fix: guard the division — an empty table made the original compute
     * (x)/0.0f and print inf/nan. */
    float rate = t->used ? (probe_number + t->used) / (float) t->used : 0.0f;
    printf("probe rate: %.2f (hashtable size: %zu, used: %zu, deleted: %zu)\n",
            rate, t->size, t->used, t->deleted); /* %zu for size_t */
}