《redis设计与实现》-4 字典

最新推荐文章于 2019-12-03 15:59:24 发布

bohu83

最新推荐文章于 2019-12-03 15:59:24 发布

阅读量215

点赞数

分类专栏：数据库 redis从入门到放弃文章标签： hash 字典 hashmap redis

本文链接：https://blog.csdn.net/bohu83/article/details/84138138

版权

数据库同时被 2 个专栏收录

87 篇文章 13 订阅

订阅专栏

redis从入门到放弃

44 篇文章 12 订阅

订阅专栏

一序

上一篇整理了redis的hash，其实算法看不懂是个人能力有限，但是剩下的没有那么复杂了，在理解了字典的结构后，主要是hash的扩容rehash等。所以，本篇分为两个部分，上面介绍结构，下面介绍api的部分实现，如扩容、缩容等。

Redis中的字典采用哈希表作为底层实现，一个哈希表有多个节点，每个节点保存一个键值对。C语言没有内置这种数据结构，所以Redis构建了自己的实现。字典的实现代码在dict.c和dict.h文件中。
Redis的数据库就是使用字典作为底层实现的，通过key和value的键值对形式，代表了数据库中全部数据。而且，所有对数据库的增、删、查、改的命令，都是建立在对字典的操作上。

二字典结构

这部分代码在dict.h

2.1 表结构

哈希表

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
     dictEntry **table;      // bucket array; each slot holds the head dictEntry pointer of a chain
    unsigned long size;     // number of slots in table
    unsigned long sizemask; // mask ANDed with a hash value to get a table index; dictExpand sets it to size-1
    unsigned long used;     // number of entries (key/value pairs) currently stored in this table
} dictht;

table是一个数组，里面的节点是dictEntry.每个dictEntry都保存着一个键值对。

/*
 * Hash table node: one key/value pair plus the link to the next node
 * stored in the same bucket.
 */
typedef struct dictEntry {
    
    // Key
    void *key;

    // Value: either a pointer or a 64-bit unsigned/signed integer
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
    } v;

    // Next node in the same bucket, forming a singly linked list
    struct dictEntry *next;

} dictEntry;

next属性是指向另一个哈希节点的指针，这个指针可以将多个哈希值相同的键值对连在一起，解决键冲突的问题。书上有个图：

字典结构：

/*
 * Dictionary: a pair of hash tables plus the state needed to rehash
 * incrementally from the first table to the second.
 */
typedef struct dict {

    // Type-specific functions
    dictType *type;

    // Private data handed to the type-specific functions
    void *privdata;

    // Hash tables: ht[0] is the table normally in use, ht[1] is the
    // destination table while a rehash is in progress
    dictht ht[2];

    // Rehash index: the next ht[0] bucket to migrate.
    // Holds -1 when no rehash is in progress.
    int rehashidx; /* rehashing not in progress if rehashidx == -1 */

    // Number of safe iterators currently running
    int iterators; /* number of iterators currently running */

} dict;

type属性是一个指向dictType结构的指针，每个dictType保存了一组用于操作特定类型键值对的函数，而privdata属性则保存了需要传给这些类型特定函数的可选参数，从而实现了多态。

/*
 * Type-specific functions of a dictionary. Each callback except
 * hashFunction receives the dictionary's privdata as its first argument.
 */
typedef struct dictType {

    // Computes the hash value of a key
    unsigned int (*hashFunction)(const void *key);

    // Duplicates a key
    void *(*keyDup)(void *privdata, const void *key);

    // Duplicates a value
    void *(*valDup)(void *privdata, const void *obj);

    // Compares two keys
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);

    // Destroys a key
    void (*keyDestructor)(void *privdata, void *key);
    
    // Destroys a value
    void (*valDestructor)(void *privdata, void *obj);

} dictType;

ht属性是一个包含两个项的数组，就是两个哈希表。一般情况下只有ht[0]有值，ht[1]哈希表只有在rehash的情况下使用。

除了ht[1],跟rehash有关的是rehashidx属性，表示rehash目前的进度，如果没有进行rehash，它的值是-1.

hash 跟hash冲突的链表法解决上面提到过了，跟jdk7的hashmap一个原理。

idx = h & d->ht[table].sizemask;

2.2 扩容跟收缩

哈希表的负载因子（load factor）= ht[0].used（已保存节点数量）/ ht[0].size（哈希表大小）

为了让负载因子维持在一个合理的范围之内，当哈希表保存的键值对太多或太少时，就需要对哈希表进行扩展或收缩。这和hashmap的原理是一致的，只是阈值不是0.75而已。

扩容：在没有执行 bgsave 的情况下，当 hash 表中元素的个数大于等于第一维数组的长度时（就是书上说的负载因子 >= 1），就会开始扩容，扩容的新数组是原数组大小的 2 倍。不过如果 Redis 正在做 bgsave，为了减少内存页的过多分离 (Copy On Write)，Redis 尽量不去扩容 (dict_can_resize)，但是如果 hash 表已经非常满了，元素的个数已经超过了第一维数组长度的 5 倍 (dict_force_resize_ratio)，说明 hash 表已经过于拥挤了，这个时候就会强制扩容。

缩容：当 hash 表因为元素的逐渐删除变得越来越稀疏时，Redis 会对 hash 表进行缩容来减少 hash 表的第一维数组空间占用。缩容的条件是元素个数低于数组长度的 10%（负载因子 < 0.1）。缩容不会考虑 Redis 是否正在做 bgsave。

看下代码：

/* Expand the hash table if needed.
 *
 * Creates the table at its initial size when ht[0] has no buckets yet;
 * otherwise starts an expansion once ht[0] stores at least as many entries
 * as it has buckets, provided resizing is allowed or the entries/buckets
 * ratio is above dict_force_resize_ratio.
 *
 * Returns DICT_OK when nothing has to be done, or the result of
 * dictExpand() when an expansion is attempted. */
static int _dictExpandIfNeeded(dict *d)
{
    dictht *primary = &d->ht[0];

    /* An incremental rehash is already moving keys: nothing to do. */
    if (dictIsRehashing(d)) return DICT_OK;

    /* No buckets yet: allocate the table at its initial size. */
    if (primary->size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* Still below the 1:1 entries/buckets ratio. */
    if (primary->used < primary->size) return DICT_OK;

    /* Resizing is currently discouraged and the table is not yet past the
     * "safe" threshold, so keep the current table. */
    if (!dict_can_resize &&
        primary->used/primary->size <= dict_force_resize_ratio)
    {
        return DICT_OK;
    }

    /* Request room for twice the number of stored entries. */
    return dictExpand(d, primary->used*2);
}

/* Our hash table capability is a power of two.
 *
 * Starting from DICT_HT_INITIAL_SIZE and doubling, return the first value
 * that is >= size (a power of two as long as DICT_HT_INITIAL_SIZE is one).
 *
 * Requests at or above LONG_MAX are clamped to LONG_MAX + 1, the power of
 * two right above LONG_MAX. Returning LONG_MAX itself would hand callers a
 * size that is not a power of two, which breaks the `size - 1` index mask. */
static unsigned long _dictNextPower(unsigned long size)
{
    unsigned long i = DICT_HT_INITIAL_SIZE;

    if (size >= LONG_MAX) return LONG_MAX + 1LU;
    /* size < LONG_MAX here, so i reaches at most LONG_MAX + 1 and the
     * doubling cannot wrap around. */
    while (i < size) {
        i *= 2;
    }
    return i;
}

这里多贴了一个计算大于等于 size 的最小的 2 的 N 次方的方法，在dictExpand中会用到。再看看缩容的代码。

/* Resize the table to the smallest size able to hold all stored entries,
 * but never below DICT_HT_INITIAL_SIZE.
 *
 * Returns DICT_ERR when resizing is disabled (dict_can_resize is 0) or a
 * rehash is already in progress; otherwise returns the result of
 * dictExpand(). */
int dictResize(dict *d)
{
    /* Same type as ht[0].used, so large entry counts are not truncated
     * the way they would be through an int. */
    unsigned long minimal;

    if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR;

    minimal = d->ht[0].used;
    if (minimal < DICT_HT_INITIAL_SIZE)
        minimal = DICT_HT_INITIAL_SIZE;
    return dictExpand(d, minimal);
}

常量比如DICT_HT_INITIAL_SIZE在.c找不到就去dict.h去找。缩容跟扩容都调用了dictExpand方法，缩容后的长度是以used为基准，以此往上找最接近的2的N次方。书上这里没有贴代码，画图把过程展示出来了。

/* Expand or create the hash table.
 *
 * Allocates a new table whose size is _dictNextPower(size) and then:
 *
 * 1) if ht[0] has no table yet, installs the new table as ht[0];
 * 2) otherwise installs it as ht[1] and sets rehashidx to 0 so that
 *    incremental rehashing from ht[0] to ht[1] can start.
 *
 * Returns DICT_ERR, without allocating anything, when:
 *   - a rehash is already in progress,
 *   - size is smaller than the number of entries stored in ht[0],
 *   - the computed table size equals the current size of ht[0],
 *   - the computed table size is smaller than size, or its byte size
 *     does not fit in the type used for the allocation.
 * Returns DICT_OK otherwise. */
int dictExpand(dict *d, unsigned long size)
{
    dictht n; /* the new hash table */
    unsigned long realsize = _dictNextPower(size);

    /* the size is invalid if it is smaller than the number of
     * elements already inside the hash table */
    if (dictIsRehashing(d) || d->ht[0].used > size)
        return DICT_ERR;

    /* Rehashing to the same table size is not useful. */
    if (realsize == d->ht[0].size) return DICT_ERR;

    /* Refuse sizes that cannot be represented: either the rounded size
     * ended up below the request, or the multiplication for the byte count
     * would wrap around and under-allocate the bucket array. */
    if (realsize < size ||
        (realsize*sizeof(dictEntry*))/sizeof(dictEntry*) != realsize)
        return DICT_ERR;

    /* Allocate the new hash table and initialize all pointers to NULL */
    n.size = realsize;
    n.sizemask = realsize-1;
    n.table = zcalloc(realsize*sizeof(dictEntry*));
    n.used = 0;

    /* Is this the first initialization? If so it's not really a rehashing
     * we just set the first hash table so that it can accept keys. */
    if (d->ht[0].table == NULL) {
        d->ht[0] = n;
        return DICT_OK;
    }

    /* Prepare a second hash table for incremental rehashing */
    d->ht[1] = n;
    d->rehashidx = 0;
    return DICT_OK;
}

如果好理解一些，最后的代码可以加个else判断。就是这种：

if (d->ht[0].table == NULL) {

d->ht[0] = n;

}else{

d->ht[1] = n;

d->rehashidx = 0;

}

return DICT_OK;

是不是这种看起来更好理解一些。纯粹个人喜好哈。

2.3 渐进式扩容：

收缩或者扩展哈希表需要将ht[0]表中的所有键全部rehash到ht[1]中，但是rehash操作不是一次性、集中式完成的，而是分多次，渐进式，断续进行的，这样才不会对服务器性能造成影响。数据量少了很快就能处理完，多了就得考虑性能。道理是相通的。比如我们提工单删数据库记录。几条的情况可以立即处理，白天业务高峰期，删了5W条。那就锁表了，直接影响正常业务的update报异常。

给ht[1]申请空间，让字典同时拥有ht[0]跟ht[1]两个哈希表。
前面说过字典有个成员rehashidx，当rehashidx为-1时表示不进行rehash，当rehashidx值为0时，表示开始进行rehash。
在rehash期间，每次对字典的添加、删除、查找、或更新操作时，都会判断是否正在进行rehash操作，如果是，则顺带进行单步rehash，并将rehashidx+1。
当rehash全部完成时，将rehashidx置为-1，表示rehash已经结束。

渐进式的好处就是采用了分而治之的策略，把集中式的处理分摊到每一个添加、删除、更新等操作上。看个代码操作例子

/* Add a key/value pair to the dictionary.
 *
 * Returns DICT_OK when the key was added, or DICT_ERR when dictAddRaw()
 * returned no entry (the key is already present). */
int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *node = dictAddRaw(d,key);

    if (node != NULL) {
        /* Fresh entry: attach the value to it. */
        dictSetVal(d, node, val);
        return DICT_OK;
    }

    return DICT_ERR;
}
/* Low level add: insert a new entry for key without setting its value.
 *
 * Returns the newly created entry so the caller can fill in the value, or
 * NULL when _dictKeyIndex() reports -1 (the key already exists). */
dictEntry *dictAddRaw(dict *d, void *key)
{
    int slot;
    dictEntry *node;
    dictht *target;

    /* Piggyback one step of incremental rehashing on this operation. */
    if (dictIsRehashing(d)) _dictRehashStep(d);

    /* Bucket index for the new key, or -1 if the key is already there. */
    slot = _dictKeyIndex(d, key);
    if (slot == -1)
        return NULL;

    /* While rehashing, new keys go to ht[1]; otherwise to ht[0]. */
    if (dictIsRehashing(d)) {
        target = &d->ht[1];
    } else {
        target = &d->ht[0];
    }

    /* Allocate the entry and push it at the head of the bucket's chain. */
    node = zmalloc(sizeof(*node));
    node->next = target->table[slot];
    target->table[slot] = node;
    target->used++;

    /* Set the hash entry fields. */
    dictSetKey(d, node, key);

    return node;
}
/* Perform a single step of incremental rehashing, but only when no safe
 * iterators are bound to the dictionary. While such iterators are active
 * the two hash tables must not be touched, otherwise elements could be
 * missed or visited twice.
 *
 * Called from the common lookup and update operations so that the
 * dictionary migrates from the old to the new table while it is in use. */
static void _dictRehashStep(dict *d) {
    if (d->iterators != 0) return;
    dictRehash(d,1);
}
/* Performs N steps of incremental rehashing.
 *
 * Returns 1 if there are still keys to move from ht[0] to ht[1], and 0 if
 * no rehash is in progress or every key has been moved.
 *
 * One step moves one whole bucket (which may hold several keys, since
 * collisions are chained) from the old table to the new one. Empty buckets
 * are skipped; after n*10 empty buckets have been visited in one call the
 * function gives up and returns 1. */
int dictRehash(dict *d, int n) {
    int empty_budget = n*10; /* Max number of empty buckets to visit. */

    if (!dictIsRehashing(d)) return 0;

    while(n-- && d->ht[0].used != 0) {
        dictEntry *cur, *following;
        unsigned int slot;

        /* rehashidx cannot run past the table: ht[0].used != 0 guarantees
         * that a non-empty bucket still lies ahead. */
        assert(d->ht[0].size > (unsigned long)d->rehashidx);

        /* Advance to the next non-empty bucket, charging every empty one
         * against the budget. */
        while(d->ht[0].table[d->rehashidx] == NULL) {
            d->rehashidx++;
            empty_budget--;
            if (empty_budget == 0) return 1;
        }

        /* Move every key of this bucket from the old to the new table. */
        for (cur = d->ht[0].table[d->rehashidx]; cur != NULL; cur = following) {
            /* Remember the successor before relinking the node. */
            following = cur->next;

            /* Bucket index in the new hash table. */
            slot = dictHashKey(d, cur->key) & d->ht[1].sizemask;

            /* Push the node at the head of its new chain. */
            cur->next = d->ht[1].table[slot];
            d->ht[1].table[slot] = cur;

            d->ht[0].used--;
            d->ht[1].used++;
        }

        /* The old bucket is now empty; move on to the next index. */
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* More to rehash... */
    if (d->ht[0].used != 0) return 1;

    /* The whole table has been migrated: release the old bucket array,
     * promote ht[1] to ht[0], reset ht[1] and mark the rehash as finished. */
    zfree(d->ht[0].table);
    d->ht[0] = d->ht[1];
    _dictReset(&d->ht[1]);
    d->rehashidx = -1;
    return 0;
}

渐进式的hash操作过程：

因为在渐进式的hash操作过程中，字典会同时使用ht[0]、ht[1]两个哈希表。查找时如果在ht[0]中找不到，会再去ht[1]中找。毕竟是O(1)的复杂度，还是很快的。同时在这个过程中，新添加的数据都会增加到ht[1]，ht[0]的数据不再增加，随着操作的发生最终变成空表。

总结：

跟Java的hashmap有些类似，但是渐进式是它独特的方式。对于hashmap也是要尽量避免的，毕竟扩容是个O（N）。

bohu83

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
《redis设计与实现》-4 字典

一序上一篇整理了redis的hash，其实算法看不懂是个人能力有限，但是剩下的没有那么复杂了，在理解了字典的结构后，主要是hash的扩容rehash等。所以，本篇分为两个部分，上面介绍结构，下面介绍api的部分实现，如扩容、缩容等。 Redis中的字典采用哈希表作为底层实现，一个哈希表有多个节点，每个节点保存一个键值对。C语言没有这个结构，所以redis构件自己的实现。字...
复制链接

扫一扫