redis aof持久化的源码分析

       除了rdb持久化功能之外,redis还提供了aof(append only file)持久化功能。与rdb不同,aof持久化

是通过保存redis服务器所执行的写命令来记录数据库的状态。


AOF持久化的实现

        AOF持久化的实现可以分为命令追加、文件写入和文件同步三个步骤。

命令追加

       当AOF持久化功能处于打开状态时,服务器在执行完一个写命令之后,会以协议格式将被执行的写命

令追加到服务器状态的aof_buf缓冲区的末尾:

/* Excerpt of the server state: only the AOF-buffer member is shown here. */
struct redisServer {
    sds aof_buf;/* AOF buffer, written before entering the event loop */
}
       服务器执行完写命令,调用propagate进行命令追加。

/* Hand an executed command over to the AOF layer.
 *
 * The command is passed to feedAppendOnlyFile() only when AOF is not in the
 * AOF_OFF state and the caller set PROPAGATE_AOF in flags; otherwise nothing
 * happens. */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
{
    int aof_enabled = (server.aof_state != AOF_OFF);
    int wants_aof = (flags & PROPAGATE_AOF);

    if (!aof_enabled || !wants_aof) return;
    feedAppendOnlyFile(cmd,dbid,argv,argc);
}// The next function performs the actual command append
/* Serialise one executed write command into protocol format and append it
 * to the AOF buffer (and to the rewrite buffer while a rewrite child exists).
 *
 * cmd    - the command that was executed; its proc pointer selects the
 *          translation applied below.
 * dictid - id of the database the command ran against.
 * argv   - command arguments, argc entries.
 *
 * NOTE(review): buf, seldb and tmpargv are used without visible declarations;
 * presumably they are locals omitted from this excerpt -- confirm against the
 * full source. */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    if (dictid != server.aof_selected_db) {
        /* The target database changed: emit a SELECT command first and
         * remember the newly selected database. */
        snprintf(seldb,sizeof(seldb),"%d",dictid);
        buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(seldb),seldb);
        server.aof_selected_db = dictid;
    }
    if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
        cmd->proc == expireatCommand) {
        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT and append it in protocol format */
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
    } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
        /* Translate SETEX/PSETEX into SET followed by PEXPIREAT, both in protocol format */
        tmpargv[0] = createStringObject("SET",3);
        tmpargv[1] = argv[1];
        tmpargv[2] = argv[3];
        buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
        /* Only the temporary "SET" object was created here, so only it is released. */
        decrRefCount(tmpargv[0]);
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
    } else {
        /* Any other write command is serialised as-is in protocol format */
        buf = catAppendOnlyGenericCommand(buf,argc,argv);
    }
    /* Append the protocol-format string to aof_buf when AOF is on */
    if (server.aof_state == AOF_ON)
        server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
    /* While an AOF rewrite child exists, also pass the string to the rewrite buffer */
    if (server.aof_child_pid != -1)
        aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
    sdsfree(buf);
}/* The next function:
    1. translates EXPIRE/PEXPIRE/EXPIREAT into a PEXPIREAT protocol-format string;
    2. translates the expire part of SETEX/PSETEX into a PEXPIREAT protocol-format string. */
/* Append an expire-style command to buf and return the resulting sds.
 * The construction of the 3-element argv is elided in this excerpt. */
sds catAppendOnlyExpireAtCommand(sds buf, struct redisCommand *cmd, robj *key, robj *seconds) {
    ……
    /* Serialise the three prepared arguments as one protocol-format command. */
    buf = catAppendOnlyGenericCommand(buf, 3, argv);
    return buf;
}// The next function builds the protocol-format string of a command
/* Append "<marker><value>\r\n" to dst and return the sds handed back by
 * sdscatlen(). Used for both the argument-count line ('*') and the
 * per-argument length line ('$'). */
static sds aofCatCountLine(sds dst, char marker, long long value) {
    char line[32];
    int used;

    line[0] = marker;
    used = 1+ll2string(line+1,sizeof(line)-1,value);
    line[used++] = '\r';
    line[used++] = '\n';
    return sdscatlen(dst,line,used);
}

/* Serialise a command in protocol format and append it to dst.
 *
 * Output layout: "*<argc>\r\n" followed, for every argument, by
 * "$<length>\r\n<bytes>\r\n".
 *
 * dst  - sds string to append to.
 * argc - number of entries in argv.
 * argv - command arguments; each one is passed through getDecodedObject()
 *        before its bytes are copied, and the decoded object is released
 *        afterwards.
 *
 * Returns the resulting sds. */
sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) {
    int idx = 0;

    /* Header line: number of arguments. */
    dst = aofCatCountLine(dst,'*',argc);

    while (idx < argc) {
        robj *decoded = getDecodedObject(argv[idx++]);

        /* Length line, then the payload, then the terminating CRLF. */
        dst = aofCatCountLine(dst,'$',sdslen(decoded->ptr));
        dst = sdscatlen(dst,decoded->ptr,sdslen(decoded->ptr));
        dst = sdscatlen(dst,"\r\n",2);
        decrRefCount(decoded);
    }
    return dst;
}
文件写入和同步

       redis的服务器进程是一个事件循环,文件事件负责处理客户端的命令请求,而时间事件负责执行serverCron

函数这样的定时运行的函数。在处理文件事件执行写命令,使得命令被追加到aof_buf中,然后在处理时间事件执

行serverCron函数会调用flushAppendOnlyFile函数进行文件的写入和同步。

      flushAppendOnlyFile函数的行为由服务器配置的appendfsync选项的值决定。

always:将aof_buf中的所有内容写入并同步到aof文件。

everysec:将aof_buf中的所有内容写入到aof文件,如果上次同步的时间距离现在超过1s,那么对aof文件进行同

                  步,同步操作由一个线程专门负责执行。

no:将aof_buf中的所有内容写入到aof文件,但不对aof文件同步,同步由操作系统执行。

/* Write the contents of server.aof_buf to the AOF file and, depending on
 * server.aof_fsync (the appendfsync setting), sync the file.
 *
 * force - when non-zero, the "everysec" postponement below is skipped and
 *         the write is attempted even if a background fsync is pending.
 *
 * NOTE(review): this is an abridged excerpt -- the declarations of
 * sync_in_progress and nwritten and the handling of a short or failed
 * write (the elided section) are not shown here. */
void flushAppendOnlyFile(int force) {
    /* Nothing buffered, nothing to do. */
    if (sdslen(server.aof_buf) == 0) return;

    /* With everysec, find out whether a background fsync job is still queued. */
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* First postponement: remember when it started and retry later. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* Postponed for less than two seconds: keep waiting. */
                return;
            }
            /* Waited long enough: count the delayed fsync and go on with the write. */
            server.aof_delayed_fsync++;
        }
    }
    /* Write the contents of aof_buf to the AOF file */
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    server.aof_flush_postponed_start = 0;
    ……
    server.aof_current_size += nwritten;

    /* Reuse the buffer when its total allocation (used + free) is below
     * 4000 bytes; otherwise release it and start from an empty string. */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }
    /* Skip the sync when aof_no_fsync_on_rewrite is set and an AOF or RDB
     * child process is currently running. */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;
    /* appendfsync is "always" */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        aof_fsync(server.aof_fd); /* sync the AOF file */
        server.aof_last_fsync = server.unixtime; /* record the sync time */
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        /* appendfsync is "everysec": queue a background sync unless one is
         * already pending. */
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}
/* Queue a BIO_AOF_FSYNC background job for the given file descriptor.
 * The descriptor travels as the job's first argument, packed into a
 * pointer-sized value; the remaining two job arguments are NULL. */
void aof_background_fsync(int fd) {
    void *job_arg = (void*)(long)fd;

    bioCreateBackgroundJob(BIO_AOF_FSYNC,job_arg,NULL,NULL);
}

AOF文件的载入和数据还原

        服务器读入并重新执行一遍aof文件里面保存的写命令,就可以还原服务器关闭之前的数据库状态。

服务器读取aof文件并还原数据库状态的流程:


/* Replay the commands stored in the AOF file through a fake client.
 *
 * Each command in the file is "*<argc>\r\n" followed, per argument, by
 * "$<len>\r\n<bytes>\r\n"; the loop below parses that layout and invokes
 * the matching command implementation.
 *
 * NOTE(review): abridged excerpt -- opening the file, the local
 * declarations, the fmterr/readerr labels, loop termination and the return
 * value are not shown here. */
int loadAppendOnlyFile(char *filename) {
    ……
    /* NOTE(review): presumably switched off so replayed commands are not
     * appended to the AOF again -- confirm against the full source. */
    server.aof_state = AOF_OFF;
    /* Create the fake client used to execute the commands */
    fakeClient = createFakeClient();
    startLoading(fp);
    /* Parse the AOF file */
    while(1) {
        /* Serve the clients from time to time */
        if (!(loops++ % 1000)) {
            loadingProgress(ftello(fp));
            processEventsWhileBlocked();
        }

        if (fgets(buf,sizeof(buf),fp) == NULL) {
            /* NOTE(review): branch body is empty in this excerpt; the
             * end-of-file / read-error handling appears to be elided. */
        }
        if (buf[0] != '*') goto fmterr;
        if (buf[1] == '\0') goto readerr;
        argc = atoi(buf+1);/* number of arguments of the command */
        argv = zmalloc(sizeof(robj*)*argc);
        fakeClient->argc = argc;
        fakeClient->argv = argv;
        /* Read the arguments of the command */
        for (j = 0; j < argc; j++) {
            if (fgets(buf,sizeof(buf),fp) == NULL) {
                fakeClient->argc = j; /* Free up to j-1. */
                freeFakeClientArgv(fakeClient);
                goto readerr;
            }
            if (buf[0] != '$') goto fmterr;
            /* Length of this argument, then its raw bytes. */
            len = strtol(buf+1,NULL,10);
            argsds = sdsnewlen(NULL,len);
            if (len && fread(argsds,len,1,fp) == 0) {
                sdsfree(argsds);
                fakeClient->argc = j; /* Free up to j-1. */
                freeFakeClientArgv(fakeClient);
                goto readerr;
            }
            argv[j] = createObject(OBJ_STRING,argsds);
            if (fread(buf,2,1,fp) == 0) {
                fakeClient->argc = j+1; /* Free up to j. */
                freeFakeClientArgv(fakeClient);
                goto readerr; /* discard CRLF */
            }
        }/* Execute the write command */
        cmd = lookupCommand(argv[0]->ptr);
        fakeClient->cmd = cmd;
        cmd->proc(fakeClient);
    }
}

AOF重写

       由于aof是通过不断追加写命令来记录数据库状态,所以服务器执行比较久之后,aof文件中的内容会越来越

多,磁盘占用量越来越大,同时也使得通过aof文件还原数据库所需要的时间变得很长。所以就需要通过读

取服务器当前的数据库状态来重写新的aof文件。

AOF的重写实现

      由于AOF重写会进行大量的写入操作,势必会长时间阻塞主进程,因此redis把重写程序放到子进程执行。

这样做有两点好处:

      1)子进程重写期间,主进程可以继续处理命令。

      2)子进程带有主进程的数据副本,这样就可以避免与主进程竞争db->dict,这是线程实现不了的。

      重写期间,主进程继续处理命令,对数据库状态进行修改,这样使得当前的数据库状态与重写的AOF文件

所保存的数据库状态不一致。因此,redis设置了AOF重写缓冲区,在创建子进程后,主进程每执行一个写命令

都会写到重写缓冲区。在子进程完成重写后,主进程会将AOF重写缓冲区的数据写入到重写的AOF文件,保证

数据状态的一致。

重写aof文件的命令

/* Handler of the command that requests a background AOF rewrite.
 * The client replies of every branch are elided in this excerpt. */
void bgrewriteaofCommand(client *c) {
    if (server.aof_child_pid != -1) {
        /* An AOF child pid is already set (handling elided). */
    } else if (server.rdb_child_pid != -1) {
        /* An RDB child pid is set: only mark the rewrite as scheduled. */
        server.aof_rewrite_scheduled = 1;
    } else if (rewriteAppendOnlyFileBackground() == C_OK) {
        /* The background rewrite was started (handling elided). */
    } else {
        /* Starting the background rewrite failed (handling elided). */
    }
}
serverCron定时程序,触发AOF重写

/* Excerpt of the periodic cron handler: the part that decides whether an
 * automatic AOF rewrite has to be started. The return value is not shown
 * in this excerpt. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
        ldbPendingChildren())
    {
        /* An RDB/AOF child pid is set or ldbPendingChildren() is non-zero;
         * the handling is elided here. */
        ……
    } else {
        ……// Check whether an AOF rewrite must be triggered
         if (server.rdb_child_pid == -1 &&server.aof_child_pid == -1 &&
             server.aof_rewrite_perc &&server.aof_current_size > server.aof_rewrite_min_size)
         {
            /* Fall back to 1 when no base size is recorded, so the division
             * below never divides by zero. */
            long long base = server.aof_rewrite_base_size ?server.aof_rewrite_base_size : 1;
            /* Growth of the current size over the base size, in percent. */
            long long growth = (server.aof_current_size*100/base) - 100;
            if (growth >= server.aof_rewrite_perc) {
                rewriteAppendOnlyFileBackground();
            }
         }
    }

}
后台重写的实现

// Rewrite the AOF file in the background
/* Returns C_ERR when an AOF or RDB child pid is already set or when the
 * pipes cannot be created. The child and parent branches after fork() are
 * largely elided in this excerpt, as are the local declarations. */
int rewriteAppendOnlyFileBackground(void) {
    if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
    if (aofCreatePipes() != C_OK) return C_ERR;// create the pipes between parent and child
    openChildInfoPipe();
    start = ustime();
    if ((childpid = fork()) == 0) {
        /* Child: rewrite into a temp file named after the child's own pid. */
        char tmpfile[256];
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
            ……
        } 
    } else {
        /* Parent */ ……
    }
    return C_OK; /* unreached */
}// The next function is the routine that rewrites the AOF file
/* Write a rewritten AOF into a temporary file and rename it to filename.
 *
 * NOTE(review): abridged excerpt -- opening fp, the local declarations, the
 * werr label and the handling of a failed rename are not shown here. */
int rewriteAppendOnlyFile(char *filename) {
    /* Temporary file name, built from the pid of the calling process. */
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    server.aof_child_diff = sdsempty();
    rioInitWithFile(&aof,fp);
    if (server.aof_rewrite_incremental_fsync)
        rioSetAutoSync(&aof,AOF_AUTOSYNC_BYTES);
    ……// Perform the rewrite
    if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
    /* Flush the stdio buffer, then sync the file, before going on. */
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;
    // During the rewrite, fetch part of the write commands from the parent's rewrite buffer
    ……
    if (rename(tmpfile,filename) == -1) {
    }
    return C_OK;
}// The next function performs the actual rewrite
/* Dump the current content of every database to aof as a sequence of
 * commands that would recreate it.
 *
 * Returns C_OK after all databases have been written; write failures jump
 * to the werr label, which is not shown in this excerpt (neither are the
 * local declarations or the per-entry key/value/expire lookup). */
int rewriteAppendOnlyFileRio(rio *aof) {
    ……// Iterate over all databases
    for (j = 0; j < server.dbnum; j++) {
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+j;
        dict *d = db->dict;
        /* Empty databases produce no output at all. */
        if (dictSize(d) == 0) continue;
        di = dictGetSafeIterator(d);
        /* Emit "SELECT <j>" before the entries of this database. */
        if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
        if (rioWriteBulkLongLong(aof,j) == 0) goto werr;
        /* Iterate over the dict */
        while((de = dictNext(di)) != NULL) {
            ……// Skip entries that are already expired: they need not be rewritten
            if (expiretime != -1 && expiretime < now) continue;
            /* Pick the rewrite logic matching the value type */
            if (o->type == OBJ_STRING) {
                /* Strings are written directly as "SET key value". */
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                if (rioWriteBulkObject(aof,&key) == 0) goto werr;
                if (rioWriteBulkObject(aof,o) == 0) goto werr;
            } else if (o->type == OBJ_LIST) {
                if (rewriteListObject(aof,&key,o) == 0) goto werr;
            } else if (o->type == OBJ_SET) {
                if (rewriteSetObject(aof,&key,o) == 0) goto werr;
            } else if (o->type == OBJ_ZSET) {
                if (rewriteSortedSetObject(aof,&key,o) == 0) goto werr;
            } else if (o->type == OBJ_HASH) {
                if (rewriteHashObject(aof,&key,o) == 0) goto werr;
            } else if (o->type == OBJ_MODULE) {
                if (rewriteModuleObject(aof,&key,o) == 0) goto werr;
            }/* Write the expire time of the entry, if it has one */
            if (expiretime != -1) {
                char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
                if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                if (rioWriteBulkObject(aof,&key) == 0) goto werr;
                if (rioWriteBulkLongLong(aof,expiretime) == 0) goto werr;
            }
            ……
        }
        dictReleaseIterator(di);
        di = NULL;
    }
    return C_OK;
}
子进程重写完成后,父进程进行处理

/* Excerpt of the periodic cron handler: the part that notices a finished
 * AOF rewrite child. Local declarations, the derivation of exitcode and
 * bysignal, and the return value are not shown in this excerpt. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
        ldbPendingChildren())
    {
        /* WNOHANG: poll for a terminated child without blocking. */
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if(pid == server.aof_child_pid) {
            /* The rewrite child finished: let the parent complete the AOF rewrite */
                backgroundRewriteDoneHandler(exitcode,bysignal);
            } 
        }
    } 
}
/* Parent-side completion of a background AOF rewrite.
 *
 * exitcode - exit code of the rewrite child.
 * bysignal - non-zero when the child was terminated by a signal.
 *
 * Only the success path (no signal, exit code 0) is shown; error handling
 * and local declarations are elided in this excerpt. */
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) {
        /* Same temp-file name the child used, rebuilt from the child's pid. */
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        /* Write the data of the rewrite buffer to the rewritten AOF file */
        if (aofRewriteBufferWrite(newfd) == -1) {
            ……// handling of the -1 return is elided
        }
        /* Replace the old AOF file with the rewritten one */
        if (rename(tmpfile,server.aof_filename) == -1) {
            ……// handling of a failed rename is elided
        }
        ……
    } 
}




展开阅读全文

没有更多推荐了,返回首页