一种改进哈希冲突的方法

最新推荐文章于 2024-08-11 10:54:41 发布

xyz347

最新推荐文章于 2024-08-11 10:54:41 发布

阅读量459

点赞数

分类专栏：数据结构与算法文章标签：哈希冲突 C

本文链接：https://blog.csdn.net/xyz347/article/details/50557601

版权

数据结构与算法专栏收录该内容

3 篇文章 1 订阅

订阅专栏

通过双向链表来改善冲突的处理方式。具体看源文件注释，哈希计算方法一个抄Linux的，一个是网上找的，均有说明。

头文件

#ifndef __BFLS_HASH_H
#define __BFLS_HASH_H


#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int (*bfls_hash_hashf)(void *data);
typedef int (*bfls_hash_cmpf)(void*data, void *key);
typedef int (*bfls_hash_iterf)(void*data, void *arg);



unsigned int bfls_hash_char(void *data);
unsigned int bfls_hash_32(unsigned int val);
unsigned int bfls_hash_size(unsigned int elenum);
void* bfls_hash_create(void *addr, unsigned int elenum, int uselow, bfls_hash_hashf func, bfls_hash_cmpf cmp);
void bfls_hash_destroy(void *tbl);
void *bfls_hash_get(void *table, void *key);
int bfls_hash_add(void *table, void *data);
void *bfls_hash_del(void *table, void *key);
unsigned int bfls_hash_iter(void *table, bfls_hash_iterf iter, void *arg);
void bfls_hash_show_index(void *table, unsigned int index);
void bfls_hash_show_key(void *table, void *data);
void bfls_hash_show(void *table);

#ifdef __cplusplus
}
#endif


#endif

源文件

/*
  哈希表实现。
  里面使用了双向链表来维护哈希表里面的elements。有两个链表：
  1、空闲的elements的链表。
  2、有数据的elements的链表
  为了在64位系统中节省空间，不用void*，用int来指向，当数据小于2G的时候
  是没有问题的。
  里面的散列函数都是抄别人的。
  
  1, 刚开始所有的节点都挂载在空闲列表上
  free_list ---->NODE1--->NODE2--->NODE3--->NODE4--->NODE5
  2, 假如往哈希表添加元素在NODE2上
  free_list ---->NODE1------------>NODE3--->NODE4--->NODE5
                          NODE2
  3, 假如再添加一个元素，NODE2冲突，则从链表头摘取一个
  free_list ---------------------->NODE3--->NODE4--->NODE5
                          NODE2--->NODE1
  4, 假如再添加一个元素，哈希值落在NODE1，则NODE1归位，取第一个空余节点
  free_list ------------------------------->NODE4--->NODE5
                NODE1     NODE2--->NODE3
  另外为了避免冲突的时候，需要对节点数据进行比对，里面还引入了
  sub hash的方法，只有hash和sub hash都一样的时候才需要进行数据
  比较，之前尝试用4万多个字符串测试，hash和sub hash都相同的只有
  一对
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <bfls_hash.h>

#ifdef __cplusplus
extern "C" {
#endif

#define HASH_DEBUG

/* 根据哈希值计算索引和子哈希值，如果索引用低位，则子哈希用高位，否则反过来 */
#define BFLS_HASH_INDEX(tbl, hash)   (((tbl)->shift==-1)?((hash)%(tbl)->elenum):(((hash)>>((tbl)->shift))%(tbl)->elenum))
#define BFLS_HASH_SUBHS(tbl, hash)   (((tbl)->shift==-1)?((hash)>>16):((hash)&0xffff))

/* table flags */
#define BFLS_HASH_TFLAG_DONOT_FREE      (1<<0)

/* data flags */
#define BFLS_HASH_DFLAG_MASTER      (1<<31) //这个坑的数据是“正统”的（不是被借用的）
#define BFLS_HASH_DFLAG_SLAVE       (1<<30) //这个坑是被借用的

/* element 数据结构 */
typedef struct
{
    int next;               // 不用指针，64位系统浪费4个字节，用int便于用-1
    int prev;    
    unsigned int flags;     // 高16位是flags，低16位是sub hash，也即另一个哈希值
    void *data;             // 数据指针
#ifdef HASH_DEBUG
    unsigned int conflict;  // 这个坑的冲突数
#endif
}hash_data_t;

/* table数据结构 */
typedef struct
{
    bfls_hash_hashf   hash; // 散列函数
    bfls_hash_cmpf    cmp;  // 比较函数
    unsigned int elenum;    // element number
    int          shift;     // 哈希允许取高位或者低位，如果用高位，则通过移位实现，低位通过取余实现。低位时shift=-1
    unsigned int count;     // 当前elements数目
    unsigned int flags;     // flags
    int          next;      // free list
    hash_data_t* data;      // elements
#ifdef HASH_DEBUG
    unsigned int maxcfl;    // 整个表的最大冲突数
    unsigned int maxcflidx; // 最大冲突所在位置
    unsigned int mastercnt; // 散列数目
#endif
}hash_tbl_t;

/*
 从free list里面分配一个 
 i >= 0表示分配指定的
 i == -1表示分配第一个
*/
static int hdlist_alloc(hash_tbl_t *tbl, int i)
{
    if (i < 0) {
        i = tbl->next; 
        #ifdef HASH_DEBUG
        if (i == -1) { // 不应该出现
            printf("ERROR, i==-1\n");
            exit(-1);
        }
        #endif
    }

    if (tbl->data[i].next != -1) {
        tbl->data[tbl->data[i].next].prev = tbl->data[i].prev;
    }
    if (tbl->data[i].prev != -1) {
        tbl->data[tbl->data[i].prev].next = tbl->data[i].next;
    } else {
        tbl->next = tbl->data[i].next;
    }
    
    return i;
}

/*
 把hash_data_t放回free list里面
*/
static void hdlist_free(hash_tbl_t *tbl, int i)
{
    tbl->data[i].next = tbl->next;
    tbl->data[i].prev = -1;
    tbl->next = i;
}

/* 从elements list里面查找 */
static hash_data_t* hdlist_get(hash_tbl_t *tbl, unsigned int index, unsigned int subhs, void *key)
{
    hash_data_t *tbd = &tbl->data[index];
    while (1) {
        if (subhs==(tbd->flags&0xffff) && 0==tbl->cmp(tbd->data, key)) {
            break;
        } else if (tbd->next != -1) {
            tbd = &tbl->data[tbd->next];
        } else {
            tbd = NULL;
            break;
        }
    }
    
    return tbd;
}

/* 用来计算需要移位的数目 */
static unsigned int hash_power2(unsigned int size)
{
    // TODO >0x80000000 的情况
    unsigned int k = 0;
    unsigned int shift = 0x80000000;
    while (size <= shift) {
        shift >>= 1;
        ++k;
    }
    return k;
}

/*
 @Author Brian Kernighan & Dennis Ritchie 
 */
unsigned int bfls_hash_char(void *data)
{ 
    const char *str = (char*)data;
    register unsigned int hash = 0; 
    register unsigned int ch;
    while (ch = (unsigned int)*str++)  
    {         
        hash = hash * 131 + ch;
    }  
    return hash;  
}

/* From linux */
unsigned int bfls_hash_32(unsigned int val)
{
	return val * 0x9e370001UL;
}

/* 计算表头+elements的大小，bytes */
unsigned int bfls_hash_size(unsigned int elenum)
{
    return sizeof(hash_tbl_t)+sizeof(hash_data_t)*elenum;
}

/*
说明: 创建哈希表
入参: addr        内存地址，如果非空表示调用者负责内存的分配，如果为空则内部分配
      elenum      元素的个数
      uselow      1表示使用哈希值的低位做索引，0表示使用高位
      func        散列函数
      cmmp        比较函数
*/
void* bfls_hash_create(void *addr, unsigned int elenum, int uselow, bfls_hash_hashf func, bfls_hash_cmpf cmp)
{
    hash_tbl_t *tbl;
    int shift;
    unsigned int size;
    int i;
    
    shift = (1==uselow)?-1:hash_power2(elenum);
    size = bfls_hash_size(elenum);

    if (elenum < 2) {
        return NULL;
    } else if (NULL == addr) {
        tbl = (hash_tbl_t*)calloc(1, size);
        if (NULL == tbl)
            return NULL;
    } else {
        memset(addr, 0, size);
        tbl = (hash_tbl_t*)addr;
        tbl->flags = BFLS_HASH_TFLAG_DONOT_FREE;
    }

    tbl->data = (hash_data_t*)(tbl+1);
    tbl->hash = func;
    tbl->cmp = cmp;
    tbl->elenum = elenum;
    tbl->shift = shift;
    //tbl->subhs = 0x10000;
    /*TODO 暂时散列到0-0xffff之间，测试了一个60K的表项，增加了subhs之后
      只有12组字符串发生冲突，每组都只有两个字符串*/
    
    for (i=(int)elenum-1; i>=0; --i) {
        tbl->data[i].next = i+1;
        tbl->data[i].prev = i-1;
    }
    tbl->data[elenum-1].next = -1;
    
    return (void*)tbl;
}

/* 销毁哈希表 */
void bfls_hash_destroy(void *tbl)
{
    if (0 == (BFLS_HASH_TFLAG_DONOT_FREE & ((hash_tbl_t*)tbl)->flags))
        free(tbl);
}

/* 查找哈希表 */
void *bfls_hash_get(void *table, void *key)
{
    hash_tbl_t  *tbl = (hash_tbl_t*)table;
    hash_data_t *tbd;
    unsigned int hash = tbl->hash(key);
    unsigned int index = BFLS_HASH_INDEX(tbl, hash);//hash%tbl->size;
    unsigned int subhs = BFLS_HASH_SUBHS(tbl, hash);//hash%tbl->subhs;

    tbd = &tbl->data[index];
    if ((0 != (BFLS_HASH_DFLAG_MASTER&tbd->flags)) 
     && (NULL != (tbd=hdlist_get(tbl, index, subhs, key)))) {
         return tbd->data;
    } else {
        return NULL;
    }
}

// TODO BOOL?
int bfls_hash_add(void *table, void *data)
{
    hash_tbl_t  *tbl = (hash_tbl_t*)table;
    hash_data_t *tbd;
    unsigned int hash = tbl->hash(data);
    unsigned int index = BFLS_HASH_INDEX(tbl, hash);
    unsigned int subhs = BFLS_HASH_SUBHS(tbl, hash);
    int i;

    tbd = &tbl->data[index];

    if (tbl->count >= tbl->elenum) {
        goto ERRLABEL; 
    } else if (0 != (BFLS_HASH_DFLAG_MASTER&tbd->flags)) { //冲突，从空闲链表找一个
        if (NULL == hdlist_get(tbl, index, subhs, data)) {
            i = hdlist_alloc(tbl, -1);
            tbl->data[i].next = tbd->next;
            tbl->data[i].prev = index;
            tbl->data[i].flags = BFLS_HASH_DFLAG_SLAVE+subhs;
            tbl->data[i].data = data;

            if (tbd->next != -1) {
                tbl->data[tbd->next].prev = i;
            }
            tbd->next = i;

            #ifdef HASH_DEBUG
            ++tbd->conflict;
            if (tbd->conflict > tbl->maxcfl) {
                tbl->maxcfl = tbd->conflict;
                tbl->maxcflidx = index;
            }
            #endif
        } else {
            goto ERRLABEL;
        }
    } else if (0 != (BFLS_HASH_DFLAG_SLAVE&tbd->flags)) { //被别人占了
        i = hdlist_alloc(tbl, -1);
        memcpy(&tbl->data[i], tbd, sizeof(hash_data_t));
        tbl->data[tbd->prev].next = i;
        if (-1 != tbd->next) {
            tbl->data[tbd->next].prev = i;
        }
        
        tbd->next = -1;
        tbd->prev = -1;
        tbd->data = data;
        tbd->flags = BFLS_HASH_DFLAG_MASTER+subhs;
        #ifdef HASH_DEBUG
        tbd->conflict = 0;
        ++tbl->mastercnt;
        #endif
    } else {
        i = hdlist_alloc(tbl, index);

        tbd->next = -1;
        tbd->prev = -1;
        tbd->data = data;
        tbd->flags = BFLS_HASH_DFLAG_MASTER+subhs;
        #ifdef HASH_DEBUG
        tbd->conflict = 0;
        ++tbl->mastercnt;
        #endif
    }
    
    ++tbl->count;

    return 0;
ERRLABEL:
    return -1;// TODO
}


void *bfls_hash_del(void *table, void *key)
{
    hash_tbl_t  *tbl = (hash_tbl_t*)table;
    hash_data_t *tbd;
    unsigned int hash = tbl->hash(key);
    unsigned int index = BFLS_HASH_INDEX(tbl, hash);//hash%tbl->size;
    unsigned int subhs = BFLS_HASH_SUBHS(tbl, hash);//hash%tbl->subhs;
    unsigned int i;
    
    tbd = &tbl->data[index]; 
    if (0 != (BFLS_HASH_DFLAG_MASTER&tbd->flags)) {
        tbd = hdlist_get(tbl, index, subhs, key);
        if (NULL == tbd) {
            return NULL;
        } else if (0 != (BFLS_HASH_DFLAG_MASTER&tbd->flags)) { //首节点，那么我们删第二个节点
            --tbl->count;
            i = tbd->next;
            if (-1 != i) {
                void *deldata = tbd->data;
                tbd->data = tbl->data[i].data;
                tbd->flags = BFLS_HASH_DFLAG_MASTER+(tbl->data[i].flags & 0xffff);
                tbd->next = tbl->data[i].next;
                if (tbl->data[i].next != -1) {
                    tbl->data[tbl->data[i].next].prev = index;
                }
                tbl->data[i].flags = 0;
                hdlist_free(tbl, i);
                return deldata;
            } else {
                tbd->flags = 0;
                hdlist_free(tbl, index);
                return tbd->data;
            }
        } else { // slave
            --tbl->count;
            i = tbl->data[tbd->prev].next; // my index
            if (-1 != tbd->next) {
                tbl->data[tbd->next].prev = tbd->prev;
            }
            tbl->data[tbd->prev].next = tbd->next;
            tbd->flags = 0;
            hdlist_free(tbl, i);
            return tbd->data;
        }
    } else {
        return NULL;
    }
}

// return count
unsigned int bfls_hash_iter(void *table, bfls_hash_iterf iter, void *arg)
{
    hash_tbl_t  *tbl = (hash_tbl_t*)table;
    hash_data_t *tbd;
    unsigned int i;
    unsigned int cnt = 0;
    
    for (i=0; i<tbl->elenum; ++i) {
        tbd = &tbl->data[i];
        if (0 != ((BFLS_HASH_DFLAG_MASTER|BFLS_HASH_DFLAG_SLAVE)&tbd->flags)) {
            if (0 == iter(tbd->data, arg))
                ++cnt;
            else
                break;
        }
    }
    
    return cnt;
}

void bfls_hash_show_index(void *table, unsigned int index)
{
    hash_tbl_t  *tbl = (hash_tbl_t*)table;
    hash_data_t *tbd;
    int i = (int)index;

    tbd = &tbl->data[i];
    if (0 != (BFLS_HASH_DFLAG_MASTER&tbd->flags)) {
        while (-1 != i) {
            tbd = &tbl->data[i];
            printf("Index:%-8d subhs:%04x data:%s\n", i, tbd->flags&0xffff, tbd->data);
            i = tbd->next;
        }
    } else {
        printf("Not Master\n");
    }
}

void bfls_hash_show_key(void *table, void *data)
{
    hash_tbl_t  *tbl = (hash_tbl_t*)table;
    unsigned int hash = tbl->hash(data);
    unsigned int index = BFLS_HASH_INDEX(tbl, hash);//hash%tbl->size;
    
    bfls_hash_show_index(table, index);
}

void bfls_hash_show(void *table)
{
    hash_tbl_t *tbl = (hash_tbl_t*)table;
    printf("Hash address:%#x\nSize:%d\nCount:%d\nShift:%d\n", (int)table, tbl->elenum, tbl->count, tbl->shift);
#ifdef HASH_DEBUG
    printf("Max Conflict:%d\nMax conflict index:%d\nMaster count:%d\n", tbl->maxcfl, tbl->maxcflidx, tbl->mastercnt);
#endif
}

#ifdef __cplusplus
}
#endif