这个散列实现的性能应该是比较好的,因为某大小为30M的英文作品,用该算法统计里面每个单词出现的次数,所需时间2秒不到.同样的Python程序大概要7秒.
不过,Python的散列表算法确实很猛,真不愧是Python其他功能大量依赖的基础算法.Python3 散列表的探测算法(点击查看源码)性能很好.而且表大小可以简单地设定为2的幂,不像有些探测算法要求表大小为素数.这样就可以很方便地扩建表.
简单来说,Python的散列表探测分为2个步骤,第一步计算第一个索引,直接用散列值h和表尺寸减1的与运算作为结果:
mask = table->size-1
i = h & mask //第一个索引i
如果发生冲突,则进入第二步,开始探测循环:
perturb = h //初始化扰码
i = (i << 2) + i + perturb + 1;
i & mask //下一个索引
perturb >>= 5 //如果需要继续探测,则右移扰码,跳入下次循环
暂时未找到Python是如何hash字符串的,因此引用了这个算法.
Python散列表有个初始尺寸,好像是8.插入新成员时,如果使用槽的数目超过尺寸的三分之二,则扩充表(表小于50000时是4倍,否则是2倍).
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define ERROR -1
#define SUCCESS 0
#define PYHASH_NUMBER 5381
#define PYHASH_MINSIZE 8 //minimum table size; must be kept a power of two
#define PYHASH_DEFALUT_VALUE 0 //value given to freshly inserted entries (NOTE(review): "DEFALUT" typo kept; renaming would break existing users of the macro)
#define PERTURB_SHIFT 5
//Grow once used+deleted slots reach 2/3 of capacity (Python's load-factor rule);
//this guarantees the probe loop always finds a BLANK slot and cannot spin forever.
#define PYHASH_NEED_RESIZE(tb) (((tb)->used+(tb)->deleted)*3>=(((tb)->size)*2))
#define pyhash_keycmp strcmp
#define pyhash_keydup strdup
typedef char* pyhash_key;
typedef size_t pyhash_value;
//Slot states: BLANK = never occupied, USED = live entry, DELETED = tombstone
//(tombstones keep their key so the same key can be revived by a later insert).
typedef enum {
BLANK, USED, DELETED,
} pyhash_node_status;
typedef struct {
pyhash_key key;
pyhash_value value;
pyhash_node_status status;
} pyhash_node;
typedef struct {
size_t used; //number of USED slots
size_t deleted; //number of DELETED (tombstone) slots
size_t size; //total slots = used + deleted + blank
size_t default_value;
pyhash_node *nodes; //slot array of length size
} hashtable;
pyhash_node *pyhash_create_nodes(size_t size) ;
int pyhash_free_nodes(pyhash_node *nds, size_t size) ;
hashtable *pyhash_create_table(size_t size) ; //create a table (0 means default size)
int pyhash_free_table(hashtable *tb) ;//destroy the table and free every stored key
pyhash_node *pyhash_search(hashtable *tb, pyhash_key key) ;//plain lookup: returns the node only if key exists
int pyhash_update(hashtable *tb, pyhash_key key, pyhash_value value) ;//plain update: succeeds only if key exists
int pyhash_insert(hashtable *tb, pyhash_key key, pyhash_value value) ;//plain insert: succeeds only if key does not exist
pyhash_node *pyhash_force_search(hashtable *tb, pyhash_key key) ;//lookup-or-insert: a missing key is inserted (with default value) and its node returned
int pyhash_force_update(hashtable *tb, pyhash_key key, pyhash_value value) ;//update-or-insert: a missing key is inserted first, then updated
int pyhash_delete(hashtable *tb, pyhash_key key) ;
static size_t hash(pyhash_key s) ;//string hash function
static pyhash_node *hash_meta_search(hashtable *tb, pyhash_key key) ;
static void hash_table_status(hashtable *t) ;
static int hash_cmp(const void *a, const void *b) ;
static void hash_all_members(hashtable *t) ; //print all members sorted by value, descending
static void hash_probe_rate(hashtable *t) ;//look up every stored key once and report average probes per lookup
static size_t probe_number = 0; //accumulates collision (extra-probe) count across lookups
static int hash_resize(hashtable *tb) ;//grow and rehash the table
/*
 * Read whitespace-separated words from stdin, count occurrences in the
 * hash table, then print probe statistics and all words by frequency.
 * Returns SUCCESS (0) on normal completion, ERROR (-1) on allocation failure.
 */
int main() {
    hashtable *tb = pyhash_create_table(0);
    if (tb == NULL) /* fix: original dereferenced an unchecked allocation */
        return ERROR;
    char wd[100];
    /* %99s bounds the read to the buffer (99 chars + NUL terminator);
     * the original unbounded "%s" could overflow wd on long tokens. */
    while (scanf("%99s", wd) == 1) {
        pyhash_node *ep = pyhash_force_search(tb, wd);
        if (ep == NULL)
            return ERROR;
        ep->value++; /* one more occurrence of this word */
    }
    hash_probe_rate(tb);
    hash_all_members(tb);
    pyhash_free_table(tb);
    return SUCCESS;
}
/*
 * djb2-style string hash: h = h*33 + byte, seeded with 5381 (PYHASH_NUMBER).
 * Bytes are read as unsigned char so the result does not depend on the
 * implementation-defined signedness of plain char; with a signed char,
 * bytes >= 0x80 would subtract and produce different hashes per platform.
 * For 7-bit ASCII input the values are identical to the original.
 */
size_t hash(pyhash_key s) {
    size_t h = PYHASH_NUMBER;
    for (; *s; s++)
        h = ((h << 5) + h) + (unsigned char) *s; /* h*33 + c */
    return h;
}
pyhash_node *pyhash_create_nodes(size_t size) {
pyhash_node *nds = (pyhash_node*) malloc(sizeof(pyhash_node) * size);
if (nds == NULL)
return NULL;
size_t i = 0;
for (; i < size; i++) {
nds[i].status = BLANK;
nds[i].key = NULL;
nds[i].value = 0;
}
return nds;
}
/*
 * Release a slot array and every key it still owns. Both USED and DELETED
 * slots hold a strdup'd key (tombstones keep theirs), so anything that is
 * not BLANK must be freed. Always returns SUCCESS.
 */
int pyhash_free_nodes(pyhash_node *nds, size_t size) {
    for (size_t idx = 0; idx < size; idx++) {
        if (nds[idx].status != BLANK)
            free(nds[idx].key);
    }
    free(nds);
    return SUCCESS;
}
/*
 * Create a hash table with `size` slots (0 selects PYHASH_MINSIZE).
 * Returns NULL on allocation failure.
 * The size must be a power of two: the probe sequence masks indices with
 * size-1, and a non-power-of-two mask could make lookups loop forever.
 * NOTE(review): callers passing a non-zero size are trusted to pass a
 * power of two — this is not validated here; confirm all call sites.
 */
hashtable *pyhash_create_table(size_t size) {
    if (size == 0)
        size = PYHASH_MINSIZE;
    hashtable *tb = malloc(sizeof *tb);
    if (tb == NULL)
        return NULL;
    if ((tb->nodes = pyhash_create_nodes(size)) == NULL) {
        free(tb); /* fix: original leaked tb when node allocation failed */
        return NULL;
    }
    tb->size = size;
    tb->deleted = 0;
    tb->used = 0;
    tb->default_value = PYHASH_DEFALUT_VALUE;
    return tb;
}
/*
 * Destroy a table created by pyhash_create_table, freeing the slot array,
 * all stored keys, and the table struct itself. Always returns SUCCESS.
 */
int pyhash_free_table(hashtable *tb) {
    if (tb == NULL) /* fix: tolerate NULL like free() instead of crashing */
        return SUCCESS;
    pyhash_free_nodes(tb->nodes, tb->size);
    free(tb);
    return SUCCESS;
}
/*
 * Grow the table (4x when used <= 50000, else 2x, matching Python's
 * policy) and re-insert every live entry; tombstones are dropped.
 * Returns SUCCESS, or ERROR on allocation failure (on the early failure
 * the table is untouched; on a mid-rehash failure it is left partially
 * rebuilt but all memory is still released or owned by the table).
 */
int hash_resize(hashtable *tb) {
    size_t old_size = tb->size;
    size_t new_size = (tb->used > 50000 ? 2 : 4) * tb->size;
    pyhash_node *new_nodes = pyhash_create_nodes(new_size);
    if (new_nodes == NULL)
        return ERROR;
    /* Swap in the empty array, then replay the live entries into it.
     * pyhash_insert re-duplicates each key, so the old array (and its
     * keys) can be freed wholesale afterwards. */
    pyhash_node *old_nodes = tb->nodes;
    tb->nodes = new_nodes;
    tb->size = new_size;
    tb->deleted = 0;
    tb->used = 0;
    for (size_t i = 0; i < old_size; i++) {
        if (old_nodes[i].status == USED) {
            if (pyhash_insert(tb, old_nodes[i].key, old_nodes[i].value) == ERROR) {
                /* fix: original leaked the old array on this path */
                pyhash_free_nodes(old_nodes, old_size);
                return ERROR;
            }
        }
    }
    pyhash_free_nodes(old_nodes, old_size);
    return SUCCESS;
}
/*
 * Core probe routine (CPython-style open addressing). Returns the slot
 * where `key` lives or would live: a USED or DELETED slot whose key
 * matches, or the first BLANK slot reached. Never returns NULL here —
 * the 2/3 load-factor cap guarantees a BLANK slot always exists, so the
 * probe sequence terminates.
 * Probe sequence: i starts at h & mask, then i = i*5 + perturb + 1 with
 * perturb = h shifted right by PERTURB_SHIFT each step; only i & mask is
 * used as the index. probe_number counts every slot advanced past
 * (i.e. collisions), matching the original accounting exactly.
 */
pyhash_node *hash_meta_search(hashtable *tb, pyhash_key key) {
    size_t h = hash(key);
    size_t mask = tb->size - 1;
    size_t i = h & mask;
    size_t perturb = h;
    pyhash_node *slots = tb->nodes;
    for (;;) {
        pyhash_node *ep = &slots[i & mask];
        if (ep->status == BLANK)
            return ep; /* key absent; this is the insertion point */
        /* USED and DELETED slots both retain a non-NULL key. */
        if (pyhash_keycmp(ep->key, key) == 0)
            return ep;
        probe_number++; /* collision: advance to the next probe slot */
        i = (i << 2) + i + perturb + 1; /* i = i*5 + perturb + 1 */
        perturb >>= PERTURB_SHIFT;
    }
}
/*
 * Plain lookup: return the node for `key` only when it is a live (USED)
 * entry; BLANK insertion points and tombstones yield NULL.
 */
pyhash_node *pyhash_search(hashtable *tb, pyhash_key key) {
    pyhash_node *slot = hash_meta_search(tb, key);
    return (slot != NULL && slot->status == USED) ? slot : NULL;
}
/*
 * Plain update: overwrite the value of an existing (USED) key.
 * Returns SUCCESS on success, ERROR when the key is not present.
 */
int pyhash_update(hashtable *tb, pyhash_key key, pyhash_value value) {
    pyhash_node *slot = hash_meta_search(tb, key);
    if (slot == NULL || slot->status != USED)
        return ERROR;
    slot->value = value;
    return SUCCESS;
}
/*
 * Plain insert: add `key` with `value` only when the key is not already
 * live. A BLANK slot gets a fresh strdup'd key; a matching tombstone is
 * revived in place (its old key copy is reused). Returns ERROR when the
 * key already exists, key duplication fails, or a triggered resize fails.
 */
int pyhash_insert(hashtable *tb, pyhash_key key, pyhash_value value) {
    pyhash_node *slot = hash_meta_search(tb, key);
    if (slot == NULL)
        return ERROR;
    switch (slot->status) {
    case BLANK:
        if ((slot->key = pyhash_keydup(key)) == NULL)
            return ERROR;
        slot->status = USED;
        tb->used++;
        slot->value = value;
        /* Only a BLANK fill raises used+deleted, so only here can the
         * load factor cross the 2/3 threshold. */
        if (PYHASH_NEED_RESIZE(tb))
            return hash_resize(tb);
        return SUCCESS;
    case DELETED:
        slot->status = USED;
        tb->used++;
        tb->deleted--;
        slot->value = value;
        return SUCCESS;
    default: /* USED: key already present — plain insert refuses */
        return ERROR;
    }
}
/*
 * Lookup-or-insert: return the live node for `key`, inserting it with the
 * table's default value if absent. Returns NULL only on allocation or
 * resize failure. When an insert triggers a resize, the slot moves, so
 * the node is re-located with a fresh probe before returning.
 */
pyhash_node *pyhash_force_search(hashtable *tb, pyhash_key key) {
    pyhash_node *slot = hash_meta_search(tb, key);
    if (slot == NULL)
        return NULL;
    switch (slot->status) {
    case USED:
        return slot; /* already present */
    case BLANK:
        if ((slot->key = pyhash_keydup(key)) == NULL)
            return NULL;
        tb->used++;
        slot->status = USED;
        slot->value = tb->default_value;
        if (!PYHASH_NEED_RESIZE(tb))
            return slot;
        if (hash_resize(tb) == ERROR)
            return NULL;
        return hash_meta_search(tb, key); /* slot moved during rehash */
    case DELETED:
        /* Revive the tombstone; it still owns a copy of this key. */
        tb->deleted--;
        tb->used++;
        slot->status = USED;
        slot->value = tb->default_value;
        return slot;
    }
    return NULL; /* unreachable: status enum is exhaustive */
}
/*
 * Update-or-insert: set `key` to `value`, creating the entry first when it
 * does not exist. Returns ERROR only if the underlying insert fails.
 */
int pyhash_force_update(hashtable *tb, pyhash_key key, pyhash_value value) {
    pyhash_node *slot = pyhash_force_search(tb, key);
    if (slot == NULL)
        return ERROR;
    slot->value = value;
    return SUCCESS;
}
/*
 * Remove `key` from the table by turning its slot into a tombstone.
 * Returns SUCCESS when the key was present, ERROR otherwise.
 *
 * Rewritten to delegate to hash_meta_search. The original open-coded
 * probe loop had two defects:
 *   - it called pyhash_keycmp on the first slot even when that slot was
 *     BLANK, whose key is NULL — strcmp(NULL, ...) is undefined behavior;
 *   - the comparison result `r` was computed once before the loop and
 *     never recomputed, so every subsequent slot was judged against the
 *     first slot's comparison.
 */
int pyhash_delete(hashtable *tb, pyhash_key key) {
    pyhash_node *ep = hash_meta_search(tb, key);
    if (ep == NULL || ep->status != USED)
        return ERROR; /* absent, or already deleted */
    /* Keep ep->key: tombstones retain their key so a later insert of the
     * same key can revive the slot; the key is freed with the table. */
    ep->status = DELETED;
    tb->deleted++;
    tb->used--;
    return SUCCESS;
}
/* Debug helper: print the table's occupancy counters. */
void hash_table_status(hashtable *t) {
    /* fix: %zu is the conversion for size_t; the original %lu is undefined
     * behavior wherever size_t is not unsigned long (e.g. 64-bit Windows). */
    printf("hashtable size:%zu, used:%zu, deleted:%zu\n", t->size, t->used,
            t->deleted);
}
int hash_cmp(const void *a, const void *b) {
return (*(pyhash_node *) a).value > (*(pyhash_node *) b).value ? -1 : 1;
}
/*
 * Print every live entry as "key<TAB>count", sorted by count descending.
 * Silently returns on an empty table or if the scratch buffer cannot be
 * allocated (this is a best-effort report, not a data-path function).
 */
void hash_all_members(hashtable *t) {
    if (t->used == 0)
        return; /* fix: a zero-length VLA (original es[0]) is UB */
    /* fix: heap-allocate the scratch array — the original VLA sized by the
     * table population could overflow the stack on large inputs. */
    pyhash_node *es = malloc(sizeof *es * t->used);
    if (es == NULL)
        return;
    pyhash_node *nds = t->nodes;
    size_t j = 0;
    for (size_t i = 0; i < t->size; i++)
        if (nds[i].status == USED)
            es[j++] = nds[i];
    qsort(es, t->used, sizeof es[0], hash_cmp);
    for (size_t i = 0; i < t->used; i++)
        printf("%s\t%zu\n", es[i].key, es[i].value); /* %zu for size_t */
    free(es);
}
/*
 * Look up every stored key once and report the average number of slot
 * visits per lookup (1.0 means every key was found with no collisions).
 * probe_number is reset here and incremented inside hash_meta_search.
 */
void hash_probe_rate(hashtable *t) {
    /* Removed the redundant `extern size_t probe_number;` — it refers to
     * the file-scope static already visible here. */
    probe_number = 0;
    pyhash_node *nds = t->nodes;
    for (size_t i = 0; i < t->size; i++)
        if (nds[i].status == USED)
            pyhash_search(t, nds[i].key);
    /* fix: guard the division — an empty table made the original compute
     * (x)/0.0f and print inf/nan. */
    float rate = t->used ? (probe_number + t->used) / (float) t->used : 0.0f;
    printf("probe rate: %.2f (hashtable size: %zu, used: %zu, deleted: %zu)\n",
            rate, t->size, t->used, t->deleted); /* %zu for size_t */
}