I. Background
- AOF is Redis's incremental (append-only) persistence mechanism. As Redis keeps running, new writes are continuously appended to the AOF file, which gradually consumes a lot of disk space and also slows down replay when Redis starts up. Redis therefore has a rewrite mechanism that compacts the AOF history.
- Note: this article assumes some familiarity with Redis, e.g. the Redis configuration file, persistence concepts, and Redis events.
- This article is built around an AOF rewrite log that a friend sent me; the source code analysis below follows that log, walking through the AOF rewrite process with logs plus source code. The log screenshot is shown below:
To make the code easier to follow, the log lines have been numbered; these numbered log lines are explained later in the text.
II. AOF rewrite steps in brief
- The main process forks a child process; once created, the child holds a copy of the main process's in-memory data;
- The child writes the historical in-memory data to a temporary AOF file in the form of insert commands;
- The parent appends the accumulated diff data to the rewrite buffer (server.aof_rewrite_buf_blocks);
- During the rewrite, the parent sends part of the diff data to the child through a pipe;
- When the child finishes step 2, it reads the diff data sent by the parent, writes it to the temporary file, and exits;
- The parent catches the child's exit code; if it is OK, the parent writes the remaining accumulated diff data to the temporary file and renames the temporary file over the old AOF file, completing the AOF rewrite.
III. Buffers
Before diving into the code, two important buffers need to be understood: the AOF buffer (aof_buf) and the AOF rewrite buffer (aof_rewrite_buf_blocks). aof_buf is the buffer the parent process fills before writing data to the AOF file; aof_rewrite_buf_blocks is the buffer the parent process uses to hold diff data during an AOF rewrite.
Why is the aof_rewrite_buf_blocks buffer needed during a rewrite? Because fork() uses copy-on-write: the child only shares the memory data as it was at fork time. For data written after the fork, the main process keeps a separate buffer, namely aof_rewrite_buf_blocks.
The two buffers' data structures are as follows:
1. aof_buf
struct redisServer {
    // AOF buffer, flushed to the AOF file before re-entering the event loop
    sds aof_buf; /* AOF buffer, written before entering the event loop */
};
2. aof_rewrite_buf_blocks is a linked list whose nodes are aofrwblock structs.
2.1 aof_rewrite_buf_blocks
struct redisServer {
    ...
    // AOF rewrite buffer list, made up of multiple buffers (each aofrwblock is one buffer)
    list *aof_rewrite_buf_blocks; /* Hold changes during an AOF rewrite. */
    ...
}
2.2 aofrwblock
/* Each buffer block is 10 MB */
#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10) /* 10 MB per block */
typedef struct aofrwblock {
    unsigned long used, free; // bytes used / still free in this block
    char buf[AOF_RW_BUF_BLOCK_SIZE];
} aofrwblock;
IV. Related source code
- Key configuration options:
auto-aof-rewrite-percentage 100 // growth percentage that triggers a rewrite; 0 disables automatic rewrites
auto-aof-rewrite-min-size 64mb  // minimum AOF size before a rewrite can start
- Trigger conditions: broadly, an AOF rewrite is triggered either manually or automatically; automatic triggering happens in two cases:
(1) In the periodically executed serverCron() function: if no RDB child and no AOF child is running and a rewrite was scheduled, rewriteAppendOnlyFileBackground() is called to perform the rewrite (redis.c/serverCron() runs periodically for the whole lifetime of the Redis server, until shutdown).
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
...
/* Start a scheduled AOF rewrite if this was requested by the user while
* a BGSAVE was in progress. */
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 &&
server.aof_rewrite_scheduled)
{
rewriteAppendOnlyFileBackground();
}
...
}
(2) When the current AOF file's growth rate exceeds the configured percentage (growth is measured against the size after the last rewrite) and the file is larger than the minimum rewrite size (auto-aof-rewrite-min-size), rewriteAppendOnlyFileBackground() is also called to perform the rewrite.
Note: log marker 1 appears in the code below, which means the log screenshot above came from an automatic rewrite triggered by the AOF file reaching the size threshold; log 1 prints the current growth rate of the AOF file and announces that the rewrite is starting.
/* Trigger an AOF rewrite if needed. */
if (server.aof_state == AOF_ON && // AOF is enabled
    server.rdb_child_pid == -1 && // no RDB child running
    server.aof_child_pid == -1 && // no AOF child running
    server.aof_rewrite_perc &&    // a rewrite percentage is configured
    server.aof_current_size > server.aof_rewrite_min_size) // minimum rewrite size reached
{
    long long base = server.aof_rewrite_base_size ?
        server.aof_rewrite_base_size : 1;
    long long growth = (server.aof_current_size*100/base) - 100;
    if (growth >= server.aof_rewrite_perc) { // if growth exceeds auto-aof-rewrite-percentage, rewrite
        // log marker 1
        serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
        rewriteAppendOnlyFileBackground(); // perform the rewrite
    }
}
}
3. rewriteAppendOnlyFileBackground() calls the key function rewriteAppendOnlyFile(), which contains the detailed implementation of the rewrite.
Note: log markers 8 and 2 appear in the code below. Log 8 is printed by the child after it finishes the rewrite, reporting how much memory the child used; log 2 is printed by the parent, reporting the pid of the newly created child (once you understand fork(), it is clear why both log 2 and log 8 are emitted).
int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;
    long long start;
    // If a rewrite or an RDB save is already in progress, return C_ERR
    if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
    // Create the pipes for parent-child communication
    if (aofCreatePipes() != C_OK) return C_ERR;
    // Record the fork() start time
    start = ustime();
    // fork() returns 0 in the child process
    if ((childpid = fork()) == 0) {
        char tmpfile[256];
        /* Child */
        // Close the listening sockets
        closeListeningSockets(0);
        // Set the process title
        redisSetProcTitle("redis-aof-rewrite");
        // Build the temporary file name
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        // Perform the AOF rewrite into the temporary file
        if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
            // Get the amount of memory used by the child (copy-on-write)
            size_t private_dirty = zmalloc_get_private_dirty();
            if (private_dirty) {
                // log marker 8
                serverLog(LL_NOTICE,
                    "AOF rewrite: %zu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }
            // Child exits successfully
            exitFromChild(0);
        } else {
            // Child exits with an error
            exitFromChild(1);
        }
    // Parent process
    } else {
        /* Parent */
        // Record how long fork() took
        server.stat_fork_time = ustime()-start;
        // Compute the fork rate in GB per second
        server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
        // Record a "fork" latency sample in the latency diagnostics dictionary
        latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
        if (childpid == -1) {
            serverLog(LL_WARNING,
                "Can't rewrite append only file in background: fork: %s",
                strerror(errno));
            return C_ERR;
        }
        // log marker 2
        serverLog(LL_NOTICE,
            "Background append only file rewriting started by pid %d",childpid);
        // Clear the scheduled-rewrite flag
        server.aof_rewrite_scheduled = 0;
        // Record the rewrite start time
        server.aof_rewrite_time_start = time(NULL);
        // Record the pid of the AOF rewrite child
        server.aof_child_pid = childpid;
        // While an AOF rewrite or RDB save is running, hash tables must not be resized
        updateDictResizePolicy();
        // Set aof_selected_db to -1 to force feedAppendOnlyFile to emit a SELECT command
        server.aof_selected_db = -1;
        // Flush the replication script cache
        replicationScriptCacheFlush();
        return C_OK;
    }
    return C_OK; /* unreached */
}
4. rewriteAppendOnlyFile() implements the rewrite itself (this function runs in the child). It does two things: it rewrites the historical data into the temporary file, and it appends the diff data received from the main process to that file (how the main process sends the diffs is covered later).
Note 1: before writing the diff data to the temporary AOF file, the child first tells the main process to stop sending diffs. Only after the main process agrees (which the child knows once it receives the stop marker '!' from the main process) does the child write the received diff data to the temporary AOF file.
Note 2: log markers 5, 6 and 7 appear in this function. Log 5 shows the child has confirmed the main process will send no more diffs; log 6 reports how much diff data the child received; log 7 shows the child has finished rewriting the AOF contents into the temporary file.
// Write a sequence of commands able to fully rebuild the dataset into the file named filename. Called by REWRITEAOF and BGREWRITEAOF.
// To minimize the number of commands needed to rebuild the dataset, Redis uses variadic commands such as RPUSH, SADD and ZADD where possible.
// However, a single command never carries more than AOF_REWRITE_ITEMS_PER_CMD elements.
int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    rio aof;
    FILE *fp;
    char tmpfile[256];
    int j;
    long long now = mstime();
    char byte;
    size_t processed = 0;
    // Build the temporary file name into tmpfile (before this function returns, the file is renamed to the temp name created in rewriteAppendOnlyFileBackground(), i.e. the filename argument of this function)
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    // Open the file
    fp = fopen(tmpfile,"w");
    if (!fp) {
        serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
        return C_ERR;
    }
    // Initialize the sds that accumulates the diff data the child receives during the rewrite
    server.aof_child_diff = sdsempty();
    // Initialize the rio as a file I/O object
    rioInitWithFile(&aof,fp);
    // If incremental fsync is enabled (it prevents too many commands from piling up in the rio buffer, which would cause a long I/O stall when flushing to the AOF file)
    if (server.aof_rewrite_incremental_fsync)
        // Set the auto-sync threshold to AOF_AUTOSYNC_BYTES = 32MB
        rioSetAutoSync(&aof,AOF_AUTOSYNC_BYTES);
    // Iterate over all databases
    for (j = 0; j < server.dbnum; j++) {
        // SELECT command in protocol format
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        // Pointer to the current database
        redisDb *db = server.db+j;
        // The database's key-value dictionary
        dict *d = db->dict;
        // Skip this database if it holds no keys
        if (dictSize(d) == 0) continue;
        // Create a safe dictionary iterator
        di = dictGetSafeIterator(d);
        if (!di) {
            // On failure return C_ERR
            fclose(fp);
            return C_ERR;
        }
        // Write the SELECT command to the AOF file, so the following commands load into the right database
        if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
        // Write the database ID to the AOF file
        if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;
        // Iterate over the dictionary holding the current database's key-value pairs
        while((de = dictNext(di)) != NULL) {
            sds keystr;
            robj key, *o;
            long long expiretime;
            // The key stored in the current node
            keystr = dictGetKey(de);
            // The value object stored in the current node
            o = dictGetVal(de);
            // Initialize a stack-allocated key object
            initStaticStringObject(key,keystr);
            // Get the key's expire time
            expiretime = getExpire(db,&key);
            // Skip the key if it has already expired
            if (expiretime != -1 && expiretime < now) continue;
            // Write the key-value pair to the AOF file according to the value's object type
            // String object
            if (o->type == OBJ_STRING) {
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                // Write a SET command in protocol format
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                /* Key and value */
                // Write the key and value objects in protocol format
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkObject(&aof,o) == 0) goto werr;
            }
            // (list, sorted set, hash, etc. objects are handled similarly; omitted here)
            // If the key has an expire time and has not expired, the expire time is also written
            // Every 10KB written into the rio buffer, read the accumulated diff from the parent into the child's aof_child_diff
            if (aof.processed_bytes > processed+1024*10) {
                // Update the number of bytes processed
                processed = aof.processed_bytes;
                // Read the accumulated diff from the parent; it is appended to the end of the file when the rewrite finishes
                aofReadDiffFromParent();
            }
        }
        dictReleaseIterator(di); // release the dictionary iterator
        di = NULL;
    }
    // While the parent is still sending data, do a first slow fsync so the final fsync will be faster
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;
    // Read a few more times from the parent to get as much data as possible. We cannot read forever, because the server receives data from clients faster than it can forward it to the child; so we try reading repeatedly in a loop,
    // and stop once no new data has arrived for 20 consecutive attempts
    int nodata = 0;
    mstime_t start = mstime(); // start time of this read phase
    // Keep looping while less than 1 second has elapsed and fewer than 20 consecutive timeouts have occurred
    while(mstime()-start < 1000 && nodata < 20) {
        // Wait up to 1ms for the fd that reads data from the parent to become readable; aeWait() returns 0 if it is not readable
        if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)
        {
            nodata++; // count consecutive timeouts; 20 in a row exits the while loop
            continue;
        }
        // The pipe's read end is readable: reset nodata
        nodata = 0; /* Start counting from zero, we stop on N *contiguous* timeouts. */
        // Read the accumulated diff from the parent; it is appended to the end of the file when the rewrite finishes
        aofReadDiffFromParent();
    }
    // Ask the parent to stop sending accumulated diff data
    if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;
    // Make the fd used to read the parent's ack non-blocking
    if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)
        goto werr;
    // Within 5000ms, read 1 byte from the fd into byte and check whether it is '!'
    if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
        byte != '!') goto werr;
    // log marker 5
    // The parent replied with '!': log it
    serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");
    // One final read of the accumulated diff from the parent
    aofReadDiffFromParent();
    // log marker 6
    serverLog(LL_NOTICE,
        "Concatenating %.2f MB of AOF diff received from parent.",
        (double) sdslen(server.aof_child_diff) / (1024*1024));
    // Write the diff data accumulated in the child's aof_child_diff to the AOF file
    if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)
        goto werr;
    // Flush the file buffers again and fsync
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;
    if (fclose(fp) == EOF) goto werr; // close the file
    // Rename the current temporary file to temp-rewriteaof-bg-%d.aof (%d = pid), i.e. the filename argument
    if (rename(tmpfile,filename) == -1) {
        serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return C_ERR;
    }
    // log marker 7
    serverLog(LL_NOTICE,"SYNC append only file rewrite performed");
    return C_OK;
// Write error handling
werr:
    serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    fclose(fp);
    unlink(tmpfile);
    if (di) dictReleaseIterator(di);
    return C_ERR;
}
5. When does the parent send the diff data to the child? First look at an important function: feedAppendOnlyFile(). Its main job is to write data into the AOF buffer (aof_buf); afterwards, if a background rewrite is in progress, it calls aofRewriteBufferAppend(), which may install the 'aof_pipe_write_data_to_child' event. The diff data is actually sent to the child by that event's callback (introduced later).
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds buf = sdsempty(); // start with an empty sds
    robj *tmpargv[3];
    // Use a SELECT command to explicitly set the current database
    if (dictid != server.aof_selected_db) {
        char seldb[64];
        snprintf(seldb,sizeof(seldb),"%d",dictid);
        // Build the SELECT command in protocol format
        buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(seldb),seldb);
        // The database ID currently selected for the AOF
        server.aof_selected_db = dictid;
    }
    // EXPIRE/PEXPIRE/EXPIREAT commands are translated into PEXPIREAT
    if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
        cmd->proc == expireatCommand) {
        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
    // SETEX/PSETEX commands are translated into SET plus PEXPIREAT
    } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
        /* Translate SETEX/PSETEX to SET and PEXPIREAT */
        // SETEX key seconds value
        // Build the SET command object
        tmpargv[0] = createStringObject("SET",3);
        tmpargv[1] = argv[1];
        tmpargv[2] = argv[3];
        // Append the SET command to buf in protocol format
        buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
        decrRefCount(tmpargv[0]);
        // Append the PEXPIREAT translation of the SETEX/PSETEX command and its key to buf in protocol format
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
    // All other commands are converted to protocol format directly and appended to buf
    } else {
        buf = catAppendOnlyGenericCommand(buf,argc,argv);
    }
    // If AOF is on, append the command to the AOF buffer; before re-entering the event loop these commands are flushed to disk and replies are sent to clients
    if (server.aof_state == AOF_ON)
        server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
    // If a background rewrite is in progress, also append the command to the rewrite buffer, so we record the diff between the temporary AOF file being rewritten and the current database
    if (server.aof_child_pid != -1)
        aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
    sdsfree(buf);
}
6. aofRewriteBufferAppend(), executed by the parent: writes data into the rewrite buffer (aof_rewrite_buf_blocks).
Note: at the end of the function, the 'aof_pipe_write_data_to_child' write event may be created (only when it does not already exist). The event's callback is aofChildWriteDiffData, in which the parent sends part of the rewrite buffer (aof_rewrite_buf_blocks) to the child through the pipe.
Note: log marker 3 is printed in this function; it reports the current size of the rewrite buffer.
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
    // Called by the main process: appends len bytes of data to aof_rewrite_buf_blocks whenever a command is produced.
    listNode *ln = listLast(server.aof_rewrite_buf_blocks);
    aofrwblock *block = ln ? ln->value : NULL;
    while(len) {
        /* If we already got at least an allocated block, try appending
         * at least some piece into it. */
        if (block) {
            // There is a last block; compare its free space with len to decide whether a new aofrwblock will be needed
            unsigned long thislen = (block->free < len) ? block->free : len;
            if (thislen) { /* The current block is not already full. */
                // The block is not full yet: memcpy thislen bytes of s into it
                memcpy(block->buf+block->used, s, thislen);
                block->used += thislen;
                block->free -= thislen;
                s += thislen;
                len -= thislen;
            }
        }
        // If this condition holds, s still has unwritten data but the current aofrwblock is full
        if (len) { /* First block to allocate, or need another block. */
            // Allocate a new aofrwblock and append it to the tail of the list
            int numblocks;
            block = zmalloc(sizeof(*block));
            block->free = AOF_RW_BUF_BLOCK_SIZE;
            block->used = 0;
            listAddNodeTail(server.aof_rewrite_buf_blocks,block);
            /* Log every time we cross more 10 or 100 blocks, respectively
             * as a notice or warning. */
            // Log once every 10 blocks allocated, escalating every 100 blocks
            numblocks = listLength(server.aof_rewrite_buf_blocks);
            if (((numblocks+1) % 10) == 0) {
                // At multiples of 100 the level is raised to WARNING, meaning a lot of data has not yet been sent (the child-write callback removes blocks once they are fully written)
                int level = ((numblocks+1) % 100) == 0 ? LL_WARNING :
                    LL_NOTICE;
                // log marker 3
                serverLog(level,"Background AOF buffer size: %lu MB",
                    aofRewriteBufferSize()/(1024*1024));
            }
        }
    }
    /* Install a file event to send data to the rewrite child if there is
     * not one already. */
    // If no aof_pipe_write_data_to_child file event is registered with server.el, create one as AE_WRITABLE
    if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
        aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
            AE_WRITABLE, aofChildWriteDiffData, NULL);
    }
}
7. aofRewriteBufferSize(): returns the number of bytes currently used in aof_rewrite_buf_blocks. It is called when the parent writes diff data to the buffer, and in a few other places.
unsigned long aofRewriteBufferSize(void) {
    // Sum the used field over all nodes of server.aof_rewrite_buf_blocks
    listNode *ln;
    listIter li;
    unsigned long size = 0;
    // Create a forward iterator
    listRewind(server.aof_rewrite_buf_blocks,&li);
    // Iterate over server.aof_rewrite_buf_blocks, fetching each node's aofrwblock and adding its used count
    while((ln = listNext(&li))) {
        aofrwblock *block = listNodeValue(ln);
        size += block->used;
    }
    return size;
}
8. aofChildWriteDiffData(): this function is the callback of the 'aof_pipe_write_data_to_child' event, which is created at the end of aofRewriteBufferAppend().
Main job: the parent loops over the rewrite buffer (aof_rewrite_buf_blocks) and writes its data to the pipe connected to the child (parent and child communicate over pipes). Diff data that has been written to the pipe is removed from the rewrite buffer, which is why the rewrite buffer can shrink.
Notes:
(1) The 'aof_pipe_write_data_to_child' event is also deleted inside this function (not on every call; there are conditions, see the code);
(2) The child reads the diff data from the pipe in rewriteAppendOnlyFile() and writes it to the temporary AOF file.
void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) {
    listNode *ln;
    aofrwblock *block;
    ssize_t nwritten;
    UNUSED(el);
    UNUSED(fd);
    UNUSED(privdata);
    UNUSED(mask);
    while(1) {
        ln = listFirst(server.aof_rewrite_buf_blocks);
        block = ln ? ln->value : NULL;
        // Exit when the aof_stop_sending_diff flag is set (stop sending diffs to the child) or when block is NULL (no diff data left to write)
        if (server.aof_stop_sending_diff || !block) {
            // Remove this event from the event loop; its fd is server.aof_pipe_write_data_to_child (the pipe used to write data to the child)
            aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,
                              AE_WRITABLE);
            return;
        }
        if (block->used > 0) {
            // Write used bytes of buf into the pipe
            nwritten = write(server.aof_pipe_write_data_to_child,
                             block->buf,block->used);
            // <= 0 means the write failed, since block->used > 0 means there was at least one byte to write
            if (nwritten <= 0) return;
            // The first nwritten bytes have been sent: memmove the remaining data to the front of buf
            memmove(block->buf,block->buf+nwritten,block->used-nwritten);
            // Update the block's used and free counts
            block->used -= nwritten;
            block->free += nwritten;
        }
        // If the block is now empty, free its memory promptly
        if (block->used == 0) listDelNode(server.aof_rewrite_buf_blocks,ln);
    }
}
9. aofReadDiffFromParent(): reads the diff data sent by the parent from the pipe and appends it to aof_child_diff. It is called from rewriteAppendOnlyFile().
ssize_t aofReadDiffFromParent(void) {
    char buf[65536]; /* Default pipe buffer size on most Linux systems. */
    ssize_t nread, total = 0;
    // While the aof_pipe_read_data_from_parent pipe has data available, read it and append it to aof_child_diff
    while ((nread =
            read(server.aof_pipe_read_data_from_parent,buf,sizeof(buf))) > 0) {
        // sdscatlen safely appends nread bytes of buf to the given sds
        server.aof_child_diff = sdscatlen(server.aof_child_diff,buf,nread);
        total += nread;
    }
    return total;
}
10. aofChildPipeReadable(): the parent's callback for the child's ack pipe (aof_pipe_read_ack_from_child). Here the parent first receives the stop marker '!' from the child (the child asking the parent to stop sending diff data), then sets the stop flag (aof_stop_sending_diff = 1), and finally replies to the child with its own '!', meaning: request received, I will send no more.
Note: log marker 4 appears here; it means the parent received the child's request to stop sending diff data.
void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) {
    // This function is registered as a callback by aeCreateFileEvent in aofCreatePipes, so it runs when the event loop polls the fd as readable
    char byte;
    UNUSED(el);
    UNUSED(privdata);
    UNUSED(mask);
    // Read 1 byte from the fd; a '!' means the child is asking the parent to stop sending diff data
    if (read(fd,&byte,1) == 1 && byte == '!') {
        // log marker 4
        serverLog(LL_NOTICE,"AOF rewrite child asks to stop sending diffs.");
        // '!' is the stop marker: set aof_stop_sending_diff to 1
        server.aof_stop_sending_diff = 1;
        // Write '!' into the child's ack pipe to acknowledge the stop request
        if (write(server.aof_pipe_write_ack_to_child,"!",1) != 1) {
            /* If we can't send the ack, inform the user, but don't try again
             * since in the other side the children will use a timeout if the
             * kernel can't buffer our write, or, the children was
             * terminated. */
            // If the write fails, log an error
            serverLog(LL_WARNING,"Can't send ACK to AOF child: %s",
                strerror(errno));
        }
    }
    /* Remove the handler since this can be called only one time during a
     * rewrite. */
    // This follows from how aofChildPipeReadable is used: it is a one-shot callback, so once the event has fired the file event is deleted
    aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
}
11. After the child exits: the parent learns the child's exit status through serverCron() (the child exits once the rewrite is done). When the child has exited, the parent calls backgroundRewriteDoneHandler() to append the data in the AOF rewrite buffer aof_rewrite_buf_blocks to the new AOF file and to swap the new AOF file in for the old one.
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    ...
    /* Check if a background saving or AOF rewrite in progress terminated. */
    if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
        ldbPendingChildren())
    {
        int statloc;
        pid_t pid;
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { // wait for a child to exit, without blocking
            int exitcode = WEXITSTATUS(statloc);
            int bysignal = 0;
            if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc); // determine whether the child was killed by a signal
            if (pid == -1) { // wait3 failed
                serverLog(LL_WARNING,"wait3() returned an error: %s. "
                    "rdb_child_pid = %d, aof_child_pid = %d",
                    strerror(errno),
                    (int) server.rdb_child_pid,
                    (int) server.aof_child_pid);
            } else if (pid == server.rdb_child_pid) { // it was an RDB child
                backgroundSaveDoneHandler(exitcode,bysignal);
                if (!bysignal && exitcode == 0) receiveChildInfo();
            } else if (pid == server.aof_child_pid) { // it was an AOF child
                backgroundRewriteDoneHandler(exitcode,bysignal);
                if (!bysignal && exitcode == 0) receiveChildInfo();
            } else {
                if (!ldbRemoveChild(pid)) {
                    serverLog(LL_WARNING,
                        "Warning, detected child with unmatched pid: %ld",
                        (long)pid);
                }
            }
            updateDictResizePolicy();
            closeChildInfoPipe();
        }
    }
}
12. backgroundRewriteDoneHandler() is called by the parent: it writes the diff data (from the rewrite buffer aof_rewrite_buf_blocks) to the temporary AOF file, then renames the temporary AOF file over the current AOF file, completing the AOF rewrite.
Note: log markers 9, 10 and 11 are printed in this function. Log 9 means the child completed the rewrite successfully; log 10 reports the size of the rewrite buffer, whose contents have by that point been appended to the new AOF file; log 11 is printed after the new AOF file has replaced the old one and marks the completion of the AOF rewrite.
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) {
        int newfd, oldfd;
        char tmpfile[256];
        long long now = ustime();
        mstime_t latency;
        // log marker 9
        redisLog(REDIS_NOTICE,
            "Background AOF rewrite terminated with success");
        /* Flush the differences accumulated by the parent to the rewritten AOF. */
        // Append the data recorded in the parent's rewrite buffer to the AOF file
        latencyStartMonitor(latency);
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        // Open the temporary file
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        if (newfd == -1) {
            redisLog(REDIS_WARNING,
                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
            goto cleanup;
        }
        // Append the rewrite buffer
        if (aofRewriteBufferWrite(newfd) == -1) { // -1 means the append failed
            redisLog(REDIS_WARNING,
                "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
            close(newfd);
            goto cleanup;
        }
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);
        // log marker 10
        redisLog(REDIS_NOTICE,
            "Residual parent diff successfully flushed to the rewritten AOF (%.2f MB)", (double) aofRewriteBufferSize() / (1024*1024));
        /* The only remaining thing to do is to rename the temporary file to
         * the configured file and switch the file descriptor used to do AOF
         * writes. We don't want close(2) or rename(2) calls to block the
         * server on old file deletion.
         *
         * There are two possible scenarios:
         *
         * 1) AOF is DISABLED and this was a one time rewrite. The temporary
         * file will be renamed to the configured file. When this file already
         * exists, it will be unlinked, which may block the server.
         *
         * 2) AOF is ENABLED and the rewritten AOF will immediately start
         * receiving writes. After the temporary file is renamed to the
         * configured file, the original AOF file descriptor will be closed.
         * Since this will be the last reference to that file, closing it
         * causes the underlying file to be unlinked, which may block the
         * server.
         *
         * To mitigate the blocking effect of the unlink operation (either
         * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we
         * use a background thread to take care of this. First, we
         * make scenario 1 identical to scenario 2 by opening the target file
         * when it exists. The unlink operation after the rename(2) will then
         * be executed upon calling close(2) for its descriptor. Everything to
         * guarantee atomicity for this switch has already happened by then, so
         * we don't care what the outcome or duration of that close operation
         * is, as long as the file descriptor is released again.
         */
        if (server.aof_fd == -1) {
            /* AOF disabled */
            /* Don't care if this fails: oldfd will be -1 and we handle that.
             * One notable case of -1 return is if the old file does
             * not exist. */
            // Open the existing AOF file, if any
            oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
        } else {
            /* AOF enabled */
            oldfd = -1; /* We'll set this to the current AOF filedes later. */
        }
        /* Rename the temporary file. This will not unlink the target file if
         * it exists, because we reference it with "oldfd". */
        latencyStartMonitor(latency);
        // Rename the temporary file. Even if an old AOF file exists, it is not unlinked here, because oldfd still references it
        if (rename(tmpfile,server.aof_filename) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to rename the temporary AOF file: %s", strerror(errno));
            close(newfd);
            if (oldfd != -1) close(oldfd);
            goto cleanup;
        }
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-rename",latency);
        if (server.aof_fd == -1) {
            /* AOF disabled, we don't need to set the AOF file descriptor
             * to this new file, so we can close it. */
            // AOF is disabled: just close the new AOF file
            close(newfd);
        } else {
            /* AOF enabled, replace the old fd with the new one. */
            // AOF is enabled: replace the old AOF file's fd with the new one
            oldfd = server.aof_fd;
            server.aof_fd = newfd;
            // Sync the data to the new AOF file according to the configured fsync policy
            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
                aof_fsync(newfd);
            else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
                aof_background_fsync(newfd);
            // Force a SELECT to be re-issued
            server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
            aofUpdateCurrentSize();
            server.aof_rewrite_base_size = server.aof_current_size;
            /* Clear regular AOF buffer since its contents was just written to
             * the new AOF from the background rewrite buffer. */
            // Clear the AOF buffer, since its contents have already been written to the AOF file
            sdsfree(server.aof_buf);
            server.aof_buf = sdsempty();
        }
        server.aof_lastbgrewrite_status = REDIS_OK;
        // log marker 11
        redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");
        /* Change state from WAIT_REWRITE to ON if needed */
        if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
            server.aof_state = REDIS_AOF_ON;
        /* Asynchronously close the overwritten AOF. */
        // Close the old AOF file asynchronously
        if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);
        redisLog(REDIS_VERBOSE,
            "Background AOF rewrite signal handler took %lldus", ustime()-now);
    }
    // The AOF rewrite failed
    else if (!bysignal && exitcode != 0) {
        server.aof_lastbgrewrite_status = REDIS_ERR;
        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated with error");
    } else {
        server.aof_lastbgrewrite_status = REDIS_ERR;
        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated by signal %d", bysignal);
    }
cleanup:
    // Close the communication pipes
    aofClosePipes();
    // Reset the AOF rewrite buffer
    aofRewriteBufferReset();
    // Remove the temporary file
    aofRemoveTempFile(server.aof_child_pid);
    // Reset the related state
    server.aof_child_pid = -1;
    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
    server.aof_rewrite_time_start = -1;
    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
    if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
        server.aof_rewrite_scheduled = 1;
}
13. aofCreatePipes() creates the pipes used for parent-child communication; it is called from rewriteAppendOnlyFileBackground().
int aofCreatePipes(void) {
    // The key to this function is the meaning of each element of fds
    int fds[6] = {-1, -1, -1, -1, -1, -1};
    int j;
    // pipe() is a unix system call: given a two-element array it creates a pipe; in the call below, data written to fds[1] can be read from fds[0]
    if (pipe(fds) == -1) goto error; /* parent -> children data. */
    if (pipe(fds+2) == -1) goto error; /* children -> parent ack. */
    if (pipe(fds+4) == -1) goto error; /* parent -> children ack. */
    /* Parent -> children data is non blocking. */
    // Make the parent -> child data pipe non-blocking
    if (anetNonBlock(NULL,fds[0]) != ANET_OK) goto error;
    if (anetNonBlock(NULL,fds[1]) != ANET_OK) goto error;
    // fds[2] is aof_pipe_read_ack_from_child, which is why the file event is deleted after aofChildPipeReadable has run its callback
    if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;
    // Pipe end the parent uses to write data to the child
    server.aof_pipe_write_data_to_child = fds[1];
    // Pipe end the child uses to read data from the parent
    server.aof_pipe_read_data_from_parent = fds[0];
    // Child -> parent ack
    server.aof_pipe_write_ack_to_parent = fds[3];
    server.aof_pipe_read_ack_from_child = fds[2];
    // Parent -> child ack
    server.aof_pipe_write_ack_to_child = fds[5];
    server.aof_pipe_read_ack_from_parent = fds[4];
    server.aof_stop_sending_diff = 0;
    return C_OK;
error:
    serverLog(LL_WARNING,"Error opening /setting AOF rewrite IPC pipes: %s",
        strerror(errno));
    for (j = 0; j < 6; j++) if(fds[j] != -1) close(fds[j]);
    return C_ERR;
}
14. aofClosePipes() closes the communication pipes; it is called from backgroundRewriteDoneHandler().
void aofClosePipes(void) {
aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,AE_WRITABLE);
close(server.aof_pipe_write_data_to_child);
close(server.aof_pipe_read_data_from_parent);
close(server.aof_pipe_write_ack_to_parent);
close(server.aof_pipe_read_ack_from_child);
close(server.aof_pipe_write_ack_to_child);
close(server.aof_pipe_read_ack_from_parent);
}
This article is a repost; I saved it here so I would not lose it. Walking through the AOF source code taught me the concrete implementation details, which felt great, so kudos to the author of the original article!
Still, there are a few questions I do not understand. They come from other readers' comments (I could not answer them either, so I quietly noted them down; if some passing expert can answer them, many thanks in advance):
- In the log at the top of this article, why do the lines marked 3 all have 7 in the tens digit?
- Why does the AOF buffer keep growing? Where is the bottleneck?
- In the 'Concatenating' line, why did the child receive so little data from the parent? Where is the bottleneck? What mechanism do parent and child use to transfer it, and how frequently is it invoked?
- For each log line, what state is the process in at that moment, how many threads are running, and what is the disk I/O doing?
- When do the parent and child call write() and fsync()? Can they block? How can disk I/O contention be avoided?
- How does copy-on-write work, and what are its pros and cons? Can the cons be quantified (which resources, how much loss), and how can we avoid them?
- When this log appears, what user behavior could have caused it? Combined with the server's situation, can that behavior be analyzed quantitatively?
- In the last two log lines, why is there a gap of as much as 9 seconds? What is the process doing, and will clients be blocked without responses?
Original link: https://blog.csdn.net/qq_32791231/article/details/117623408