redis hashtable部分


/* Dictionary handle: holds two hash tables (ht[0] and ht[1]) plus the
 * state needed for incremental rehashing between them. */
typedef struct dict {
    dictType *type;
    void *privdata;  /* NOTE(review): purpose is not evident from this excerpt; presumably an opaque pointer handed to the dictType callbacks -- confirm */
    dictht ht[2];
    long rehashidx; /* while rehashing: index of the current bucket in ht[0]; -1 means no rehash is in progress */
    unsigned long iterators; /* number of iterators currently running */
} dict;


/* Performs up to 'n' steps of incremental rehashing.
 *
 * One step moves every entry chained in a single ht[0] bucket over to
 * ht[1]. Because ht[0] may contain long runs of empty buckets, the function
 * visits at most n*10 empty buckets in total before giving up, so the work
 * done by a single call stays bounded.
 *
 * Parameters:
 *   d - dictionary to rehash; nothing is done when it is not rehashing.
 *   n - maximum number of non-empty buckets to migrate in this call.
 *
 * Returns 1 if there are still entries left to move from ht[0] to ht[1],
 * 0 if the dictionary is not rehashing or the rehash has just completed. */
int dictRehash(dict *d, int n) {
    int empty_visits = n*10; /* Max number of empty buckets to visit. */
    if (!dictIsRehashing(d)) return 0;

    while(n-- && d->ht[0].used != 0) {
        dictEntry *de, *nextde;

        /* Note that rehashidx can't overflow as we are sure there are more
         * elements because ht[0].used != 0 */
        assert(d->ht[0].size > (unsigned long)d->rehashidx);
        while(d->ht[0].table[d->rehashidx] == NULL) {
            /* Skip the empty bucket; without this advance the loop would
             * test the same bucket forever. */
            d->rehashidx++;
            if (--empty_visits == 0) return 1;
        }
        de = d->ht[0].table[d->rehashidx];
        /* Move all the keys in this bucket from the old to the new hash HT */
        while(de) {
            uint64_t h;

            /* Save the successor first: de->next is overwritten below. */
            nextde = de->next;
            /* Get the index in the new hash table */
            h = dictHashKey(d, de->key) & d->ht[1].sizemask;
            /* Push the entry at the head of the destination chain. */
            de->next = d->ht[1].table[h];
            d->ht[1].table[h] = de;
            /* Keep both element counters in sync; the outer loop and the
             * completion check below depend on ht[0].used reaching zero. */
            d->ht[0].used--;
            d->ht[1].used++;
            de = nextde;
        }
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* Check if we already rehashed the whole table... */
    if (d->ht[0].used == 0) {
        /* Release the old bucket array, promote ht[1] to ht[0] and reset
         * ht[1] so the two slots no longer alias the same table. */
        zfree(d->ht[0].table);
        d->ht[0] = d->ht[1];
        _dictReset(&d->ht[1]);
        d->rehashidx = -1;
        return 0;
    }

    /* More to rehash... */
    return 1;
}


这个文件里面最难理解的可能就是 dictScan 这个函数,它用来遍历 dict。这个函数的设计意图并不是一次遍历所有的 entry,而是每次调用只遍历一个 slot(bucket)。所以如何确保每个元素都被遍历到,哪怕遍历过程中 table 发生了 resize,就十分重要。
注释把其原理和操作方式都解释得很明白了。redis 的 dict size 都是 2 的 N 次方,这就决定了 hash(key) 与 2^N - 1 做按位与(等价于对 2^N 取模)就能算出对应的篮子。遍历时游标从 0000 开始,每次从最高位加 1(而不是从最低位),顺序是 0000 -> 1000 -> 0100 -> 1100 -> ... -> 1111,游标回到 0000 时遍历结束。为什么要这样?之后会说。

例如游标是 1100:先反转得到 0011,加 1 得到 0100,再反转回来得到 0010,所以 1100 的下一个游标是 0010。



    /* Set unmasked bits so incrementing the reversed cursor
  * operates on the masked bits */
    v |= ~m0;

    /* Increment the reverse cursor */
    v = rev(v);
    v = rev(v);

为什么能做到 resize 过程不会影响遍历呢?因为 size 是 2 的次方,所以 key 所在的位置就是 hash(key) 的后 N 位。比如假设 hash(n) = n,size 为 16 的时候,30 的二进制为 0001 1110,即在 1110 位置上;resize 成 64 后,30 被 rehash 到 011110 位置上了。其实无所谓,分下面几种情况:
如果 30 还没有被遍历过:假设我们接下来要遍历 xxxx 这个槽,而在这之前 resize 发生了。这时游标的高位都是 0,所以先遍历新表里的 00xxxx;遍历完之后执行 v |= ~m0(此时 m0 是新的 mask 111111)再做反转、加 1、反转,进位先落在新增的高两位上,于是接着依次遍历 10xxxx、01xxxx、11xxxx。也就是说 xxxx 扩展出来的四个槽都会被遍历到。

那如果 xxxx 已经被遍历过了,那么 rehash 到 00xxxx、01xxxx、10xxxx、11xxxx 这些槽的内容 100% 已经被遍历过了,因为这些槽中的元素在 resize 之前一定是在 xxxx 这个槽里面的。

从高位开始加 1 也是这个原因:确保 resize 新加的几位永远在最前面,这样已经被遍历过的 xxxx 中的元素就不会因为扩容而再被遍历。以 30 为例,如果在已经遍历了 1110 的状况下 resize 到 64 个槽,若是从低位开始加 1,那么 01 1110(即 30 所在的槽)势必还会再被遍历一次。
同时假设 yyyy是扫描xxxx的前一次扫描,那么其中的元素就再也不会扫描了,因为11xxxx的四种情况都已经过去了


/* dictScan() is used to iterate over the elements of a dictionary.
 * Iterating works the following way:
 * 1) Initially you call the function using a cursor (v) value of 0.
 * 2) The function performs one step of the iteration, and returns the
 *    new cursor value you must use in the next call.
 * 3) When the returned cursor is 0, the iteration is complete.
 * The function guarantees all elements present in the
 * dictionary get returned between the start and end of the iteration.
 * However it is possible some elements get returned multiple times.
 * For every element returned, the callback argument 'fn' is
 * called with 'privdata' as first argument and the dictionary entry
 * 'de' as second argument.
 * The iteration algorithm was designed by Pieter Noordhuis.
 * The main idea is to increment a cursor starting from the higher order
 * bits. That is, instead of incrementing the cursor normally, the bits
 * of the cursor are reversed, then the cursor is incremented, and finally
 * the bits are reversed again.
 * This strategy is needed because the hash table may be resized between
 * iteration calls.
 * dict.c hash tables are always power of two in size, and they
 * use chaining, so the position of an element in a given table is given
 * by computing the bitwise AND between Hash(key) and SIZE-1
 * (where SIZE-1 is always the mask that is equivalent to taking the rest
 *  of the division between the Hash of the key and SIZE).
 * For example if the current hash table size is 16, the mask is
 * (in binary) 1111. The position of a key in the hash table will always be
 * the last four bits of the hash output, and so forth.
 * If the hash table grows, elements can go anywhere in one multiple of
 * the old bucket: for example let's say we already iterated with
 * a 4 bit cursor 1100 (the mask is 1111 because hash table size = 16).
 * If the hash table will be resized to 64 elements, then the new mask will
 * be 111111. The new buckets you obtain by substituting in ??1100
 * with either 0 or 1 can be targeted only by keys we already visited
 * when scanning the bucket 1100 in the smaller hash table.
 * By iterating the higher bits first, because of the inverted counter, the
 * cursor does not need to restart if the table size gets bigger. It will
 * continue iterating using cursors without '1100' at the end, and also
 * without any other combination of the final 4 bits already explored.
 * Similarly when the table size shrinks over time, for example going from
 * 16 to 8, if a combination of the lower three bits (the mask for size 8
 * is 111) were already completely explored, it would not be visited again
 * because we are sure we tried, for example, both 0111 and 1111 (all the
 * variations of the higher bit) so we don't need to test it again.
 * Yes, this is true, but we always iterate the smaller table first, then
 * we test all the expansions of the current cursor into the larger
 * table. For example if the current cursor is 101 and we also have a
 * larger table of size 16, we also test (0)101 and (1)101 inside the larger
 * table. This reduces the problem back to having only one table, where
 * the larger one, if it exists, is just an expansion of the smaller one.
 * This iterator is completely stateless, and this is a huge advantage,
 * including no additional memory used.
 * The disadvantages resulting from this design are:
 * 1) It is possible we return elements more than once. However this is usually
 *    easy to deal with in the application level.
 * 2) The iterator must return multiple elements per call, as it needs to always
 *    return all the keys chained in a given bucket, and all the expansions, so
 *    we are sure we don't miss keys moving during rehashing.
 * 3) The reverse cursor is somewhat hard to understand at first, but this
 *    comment is supposed to help.
unsigned long dictScan(dict *d,
                       unsigned long v,
                       dictScanFunction *fn,
                       dictScanBucketFunction* bucketfn,
                       void *privdata)
    dictht *t0, *t1;
    const dictEntry *de, *next;
    unsigned long m0, m1;

    if (dictSize(d) == 0) return 0;

    if (!dictIsRehashing(d)) {
        t0 = &(d->ht[0]);
        m0 = t0->sizemask;

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
        de = t0->table[v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;

        /* Set unmasked bits so incrementing the reversed cursor
         * operates on the masked bits */
        v |= ~m0;

        /* Increment the reverse cursor */
        v = rev(v);
        v = rev(v);

    } else {
        t0 = &d->ht[0];
        t1 = &d->ht[1];

        /* Make sure t0 is the smaller and t1 is the bigger table */
        if (t0->size > t1->size) {
            t0 = &d->ht[1];
            t1 = &d->ht[0];

        m0 = t0->sizemask;
        m1 = t1->sizemask;

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
        de = t0->table[v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;

        /* Iterate over indices in larger table that are the expansion
         * of the index pointed to by the cursor in the smaller table */
        do {
            /* Emit entries at cursor */
            if (bucketfn) bucketfn(privdata, &t1->table[v & m1]);
            de = t1->table[v & m1];
            while (de) {
                next = de->next;
                fn(privdata, de);
                de = next;

            /* Increment the reverse cursor not covered by the smaller mask.*/
            v |= ~m1;
            v = rev(v);
            v = rev(v);

            /* Continue while bits covered by mask difference is non-zero */
        } while (v & (m0 ^ m1));

    return v;




