除了rdb持久化功能之外,redis还提供了aof(append only file)持久化功能。与rdb不同,aof持久化是通过保存redis服务器所执行的写命令来记录数据库的状态。
AOF持久化的实现
AOF持久化的实现可以分为命令追加、文件写入和文件同步三个步骤。
命令追加
当AOF持久化功能处于打开状态时,服务器在执行完一个写命令之后,会以协议格式将被执行的写命令追加到服务器状态的aof_buf缓冲区的末尾:
struct redisServer {
sds aof_buf;/* AOF buffer, written before entering the event loop */
}
服务器执行完写命令,调用propagate进行命令追加。
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
int flags)
{
if (server.aof_state != AOF_OFF && flags & PROPAGATE_AOF)
feedAppendOnlyFile(cmd,dbid,argv,argc);
}//进行命令追加
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
if (dictid != server.aof_selected_db) {
//切换dbid,追加select命令
snprintf(seldb,sizeof(seldb),"%d",dictid);
buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
(unsigned long)strlen(seldb),seldb);
server.aof_selected_db = dictid;
}
if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
cmd->proc == expireatCommand) {
/* 将EXPIRE/PEXPIRE/EXPIREAT转化成PEXPIREAT生成命令协议格式的字符串 */
buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
} else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
/* 将SETEX/PSETEX转换成SET和PEXPIREAT生成命令协议格式的字符串 */
tmpargv[0] = createStringObject("SET",3);
tmpargv[1] = argv[1];
tmpargv[2] = argv[3];
buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
decrRefCount(tmpargv[0]);
buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
} else {
//将写命令生成命令协议格式的字符串
buf = catAppendOnlyGenericCommand(buf,argc,argv);
}
//将命令的协议格式的字符串追加到aof_buf
if (server.aof_state == AOF_ON)
server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
if (server.aof_child_pid != -1)
aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
sdsfree(buf);
}/*1、将EXPIRE/PEXPIRE/EXPIREAT转化成PEXPIREAT生成命令协议格式的字符串
2、SETEX/PSETEX的设置过期时间部分转化成PEXPIREAT生成命令协议格式的字符串*/
sds catAppendOnlyExpireAtCommand(sds buf, struct redisCommand *cmd, robj *key, robj *seconds) {
……
buf = catAppendOnlyGenericCommand(buf, 3, argv);
return buf;
}//生成命令的协议格式的字符串
sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) {
char buf[32];
int len, j;
robj *o;
buf[0] = '*'; //参数个数
len = 1+ll2string(buf+1,sizeof(buf)-1,argc);
buf[len++] = '\r';
buf[len++] = '\n';
dst = sdscatlen(dst,buf,len);
for (j = 0; j < argc; j++) {
o = getDecodedObject(argv[j]);
buf[0] = '$';//参数长度
len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr));
buf[len++] = '\r';
buf[len++] = '\n';//参数
dst = sdscatlen(dst,buf,len);
dst = sdscatlen(dst,o->ptr,sdslen(o->ptr));
dst = sdscatlen(dst,"\r\n",2);
decrRefCount(o);
}
return dst;
}
文件写入和同步
redis的服务器进程是一个事件循环,文件事件负责处理客户端的命令请求,而时间事件负责执行serverCron函数这样的定时运行的函数。在处理文件事件执行写命令,使得命令被追加到aof_buf中,然后在处理时间事件执行serverCron函数会调用flushAppendOnlyFile函数进行文件的写入和同步。
flushAppendOnlyFile函数的行为由服务器配置的appendfsync选项的值决定。
always:将aof_buf中的所有内容写入并同步到aof文件。
everysec:将aof_buf中的所有内容写入到aof文件,如果上次同步的时间距离现在超过1s,那么对aof文件进行同步,同步操作由一个线程专门负责执行。
no:将aof_buf中的所有内容写入到aof文件,但不对aof文件同步,同步有操作系统执行。
void flushAppendOnlyFile(int force) {
if (sdslen(server.aof_buf) == 0) return;
if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;
if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
if (sync_in_progress) {
if (server.aof_flush_postponed_start == 0) {
server.aof_flush_postponed_start = server.unixtime;
return;
} else if (server.unixtime - server.aof_flush_postponed_start < 2) {
return;
}
server.aof_delayed_fsync++;
}
}
//将aof_buf中的内容写入到aof文件
nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
server.aof_flush_postponed_start = 0;
……
server.aof_current_size += nwritten;
if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
sdsclear(server.aof_buf);
} else {
sdsfree(server.aof_buf);
server.aof_buf = sdsempty();
}
//appendfsync为no或者有后台进程在进行aof或rdb,不进行文件同步
if (server.aof_no_fsync_on_rewrite &&
(server.aof_child_pid != -1 || server.rdb_child_pid != -1))
return;
/* appendfsync为always */
if (server.aof_fsync == AOF_FSYNC_ALWAYS) {/
aof_fsync(server.aof_fd); //同步aof文件
server.aof_last_fsync = server.unixtime;//记录同步时间
} else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) {
/* appendfsync为EVERYSEC*/
if (!sync_in_progress) aof_background_fsync(server.aof_fd);
server.aof_last_fsync = server.unixtime;
}
}
void aof_background_fsync(int fd) {
bioCreateBackgroundJob(BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL);
}
AOF文件的载入和数据还原
服务器读入并重新执行一遍aof文件里面保存的写命令,就可以还原服务器关闭之前的数据库状态。
服务器读取aof文件并还原数据库状态的流程:
int loadAppendOnlyFile(char *filename) {
……
server.aof_state = AOF_OFF;
//创建伪客户端
fakeClient = createFakeClient();
startLoading(fp);
//解析aof文件
while(1) {
/* Serve the clients from time to time */
if (!(loops++ % 1000)) {
loadingProgress(ftello(fp));
processEventsWhileBlocked();
}
if (fgets(buf,sizeof(buf),fp) == NULL) {
}
if (buf[0] != '*') goto fmterr;
if (buf[1] == '\0') goto readerr;
argc = atoi(buf+1);//命令的参数个数
argv = zmalloc(sizeof(robj*)*argc);
fakeClient->argc = argc;
fakeClient->argv = argv;
//读取命令的参数
for (j = 0; j < argc; j++) {
if (fgets(buf,sizeof(buf),fp) == NULL) {
fakeClient->argc = j; /* Free up to j-1. */
freeFakeClientArgv(fakeClient);
goto readerr;
}
if (buf[0] != '$') goto fmterr;
len = strtol(buf+1,NULL,10);
argsds = sdsnewlen(NULL,len);
if (len && fread(argsds,len,1,fp) == 0) {
sdsfree(argsds);
fakeClient->argc = j; /* Free up to j-1. */
freeFakeClientArgv(fakeClient);
goto readerr;
}
argv[j] = createObject(OBJ_STRING,argsds);
if (fread(buf,2,1,fp) == 0) {
fakeClient->argc = j+1; /* Free up to j. */
freeFakeClientArgv(fakeClient);
goto readerr; /* discard CRLF */
}
}//执行写命令
cmd = lookupCommand(argv[0]->ptr);
fakeClient->cmd = cmd;
cmd->proc(fakeClient);
}
}
AOF重写
由于aof是通过不断追加写命令来记录数据库状态,所以服务器执行比较久之后,aof文件中的内容会越来越多,磁盘占有量越来越大,同时也是使通过过aof文件还原数据库的需要的时间也变得很久。所以就需要通过读取服务器当前的数据库状态来重写新的aof文件。
AOF的重写实现
由于AOF重写是会进行大量写写入操作,势必为长时间阻塞主进程,因此redis把重写程序放到子进程执行。
这样做有两点好处:
1)子进程重写期间,主进程可以继续处理命令。
2)子进程带有主进程的数据副本,这样就可以避免与主进程竞争db->dict,这是线程实现不了的。
重写期间,主进程继续处理命令,对数据库状态进行修改,这样使得当前的数据库状态与重写的AOF文件所保存的数据库状态不一致。因此,redis设置了AOF重写缓冲区,在创建子进程后,主进程每执行一个写命令都会写到重写缓冲区。在子进程完成重写后,主进程会将AOF重写缓冲区的数据写入到重写的AOF文件,保证数据状态的一致。
重写aof文件的命令
void bgrewriteaofCommand(client *c) {
if (server.aof_child_pid != -1) {
} else if (server.rdb_child_pid != -1) {
server.aof_rewrite_scheduled = 1;
} else if (rewriteAppendOnlyFileBackground() == C_OK) {
} else {
}
}
serverCron定时程序,触发AOF重写
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
ldbPendingChildren())
{
……
} else {
……//检查是否触发AOF重写
if (server.rdb_child_pid == -1 &&server.aof_child_pid == -1 &&
server.aof_rewrite_perc &&server.aof_current_size > server.aof_rewrite_min_size)
{
long long base = server.aof_rewrite_base_size ?server.aof_rewrite_base_size : 1;
long long growth = (server.aof_current_size*100/base) - 100;
if (growth >= server.aof_rewrite_perc) {
rewriteAppendOnlyFileBackground();
}
}
}
}
后台重写的实现
//后台重写AOF文件
int rewriteAppendOnlyFileBackground(void) {
if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
if (aofCreatePipes() != C_OK) return C_ERR;//创建父进程与子进程的管道
openChildInfoPipe();
start = ustime();
if ((childpid = fork()) == 0) {
char tmpfile[256];
snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
……
}
} else {
/* Parent */ ……
}
return C_OK; /* unreached */
}//重写AOF文件的程序
int rewriteAppendOnlyFile(char *filename) {
snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
server.aof_child_diff = sdsempty();
rioInitWithFile(&aof,fp);
if (server.aof_rewrite_incremental_fsync)
rioSetAutoSync(&aof,AOF_AUTOSYNC_BYTES);
……//进行重写操作
if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;
//重写期间,从父进程的重写缓冲区获取部分写命令
……
if (rename(tmpfile,filename) == -1) {
}
return C_OK;
}//重写操作
int rewriteAppendOnlyFileRio(rio *aof) {
……// 遍历所有的数据库
for (j = 0; j < server.dbnum; j++) {
char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
redisDb *db = server.db+j;
dict *d = db->dict;
if (dictSize(d) == 0) continue;
di = dictGetSafeIterator(d);
if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
if (rioWriteBulkLongLong(aof,j) == 0) goto werr;
//遍历dict
while((de = dictNext(di)) != NULL) {
……//检查key-value是否过期,过期就不需要重写到AOF文件
if (expiretime != -1 && expiretime < now) continue;
// 根据value类型,进行对应的重写逻辑
if (o->type == OBJ_STRING) {
char cmd[]="*3\r\n$3\r\nSET\r\n";
if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;
if (rioWriteBulkObject(aof,&key) == 0) goto werr;
if (rioWriteBulkObject(aof,o) == 0) goto werr;
} else if (o->type == OBJ_LIST) {
if (rewriteListObject(aof,&key,o) == 0) goto werr;
} else if (o->type == OBJ_SET) {
if (rewriteSetObject(aof,&key,o) == 0) goto werr;
} else if (o->type == OBJ_ZSET) {
if (rewriteSortedSetObject(aof,&key,o) == 0) goto werr;
} else if (o->type == OBJ_HASH) {
if (rewriteHashObject(aof,&key,o) == 0) goto werr;
} else if (o->type == OBJ_MODULE) {
if (rewriteModuleObject(aof,&key,o) == 0) goto werr;
}//写入key-value的过期时间
if (expiretime != -1) {
char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;
if (rioWriteBulkObject(aof,&key) == 0) goto werr;
if (rioWriteBulkLongLong(aof,expiretime) == 0) goto werr;
}
……
}
dictReleaseIterator(di);
di = NULL;
}
return C_OK;
}
子进程重写完成后,父进程进行处理
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
ldbPendingChildren())
{
if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
if(pid == server.aof_child_pid) {
//子进程完成重写,父进程进行重写AOF文件的处理
backgroundRewriteDoneHandler(exitcode,bysignal);
}
}
}
}
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
if (!bysignal && exitcode == 0) {
snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
(int)server.aof_child_pid);
newfd = open(tmpfile,O_WRONLY|O_APPEND);
if (aofRewriteBufferWrite(newfd) == -1) {
……//将重写缓冲区的数据写入到重写AOF文件
}
if (rename(tmpfile,server.aof_filename) == -1) {
……//覆盖旧的AOF文件
}
……
}
}