源码版本:
redis 2.4.4
AOF(append-only file)是redis持久化利器之一。通过追加写log的方式记录写操作,以满足在需要的时候重建数据的需求。
The AOF persistence logs every write operation received by the server, that will be played again at server startup, reconstructing the original dataset. Commands are logged using the same format as the Redis protocol itself, in an append-only fashion. Redis is able to rewrite the log on background when it gets too big.
Redis写日志包括:
1. 根据用户的配置,以不同的粒度写AOF log
2. 在日志数据量达到一定量的时候,根据配置或外部发送的命令,重建AOF log
AOF(append-only file)是redis持久化利器之一。通过追加写log的方式记录写操作,以满足在需要的时候重建数据的需求。
The AOF persistence logs every write operation received by the server, that will be played again at server startup, reconstructing the original dataset. Commands are logged using the same format as the Redis protocol itself, in an append-only fashion. Redis is able to rewrite the log on background when it gets too big.
Redis写日志包括:
1. 根据用户的配置,以不同的粒度写AOF log
2. 在日志数据量达到一定量的时候,根据配置或外部发送的命令,重建AOF log
Aof相关配置(redis.conf):
appendonly yes
是否开启AOF持久化,yes开启,no不开启
appendfilename appendonly.aof
指定AOF日志文件名称,默认名称:appendonly.aof
appendfsync everysec
什么时候将数据写入disk,redis提供三种模式:
no : 不进行fsync,由OS决定数据刷盘的时间粒度, 性能高
always : 每次写操作都做fsync , 安全
everysec : 距上一次fsync至少1s后才再次fsync, 折中
no-appendfsync-on-rewrite no
当后台子进程正在重写Aof log(或做bgsave)时,主进程写日志后是否跳过fsync:设为yes表示此期间不做fsync,设为no表示照常fsync。如果系统遇到latency问题,建议设为yes(rewrite时不强制fsync)
auto-aof-rewrite-percentage 100
当Aof log增长超过指定比例时,重写log file, 设置为0表示不自动重写Aof log
auto-aof-rewrite-min-size 64mb
触发自动重写Aof log所要求的最小Aof log大小(当前log不超过该值时不会自动重写)
更新数据时写Aof log:
执行一个客户端命令过程
/* Call() is the core of Redis execution of a command.
 *
 * Runs the command's implementation for client `c`, records its duration in
 * the slow log, and then propagates the command to the AOF, to slaves and to
 * monitors as applicable. */
void call(redisClient *c) {
    long long dirty, start = ustime(), duration;

    dirty = server.dirty;
    c->cmd->proc(c);                /* execute the command */
    dirty = server.dirty - dirty;   /* change in server.dirty caused by this command;
                                     * update operations change this counter */
    duration = ustime() - start;
    slowlogPushEntryIfNeeded(c->argv, c->argc, duration);

    /* AOF is enabled and the command changed data: feed it to the AOF. */
    if (server.appendonly && dirty > 0)
        feedAppendOnlyFile(c->cmd, c->db->id, c->argv, c->argc);

    /* Feed slaves when data changed or the command carries the
     * REDIS_CMD_FORCE_REPLICATION flag, provided there is at least one slave. */
    if ((dirty > 0 || c->cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
        listLength(server.slaves))
        replicationFeedSlaves(server.slaves, c->db->id, c->argv, c->argc);

    /* Monitors receive every command, whether or not it changed data. */
    if (listLength(server.monitors))
        replicationFeedMonitors(server.monitors, c->db->id, c->argv, c->argc);

    server.stat_numcommands++;
}
feedAppendOnlyFile并不真正写Aof log,写log操作发生在返回给用户请求之前的flushAppendOnlyFile函数void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { if (dictid != server.appendseldb) { //当前操作的数据库和之前的数据库不一致,则写一条改变数据库的命令 char seldb[64]; snprintf(seldb,sizeof(seldb),"%d",dictid); buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", (unsigned long)strlen(seldb),seldb); server.appendseldb = dictid; } ..... server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf)); //数据放入atobuf中 if (server.bgrewritechildpid != -1) //如果子进程在进行Aof log rewrite,则同时将数据放入缓冲区bgrewritebuf server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf)); sdsfree(buf); }
void flushAppendOnlyFile(int force) { ssize_t nwritten; int sync_in_progress = 0; if (sdslen(server.aofbuf) == 0) return; if (server.appendfsync == APPENDFSYNC_EVERYSEC) //当appendfsync设置为everysec,check是否有数据等待fsync sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0; //如果appendfsync设置为everysec,阻塞时如果主线程等待fsync时间不超过2s,则返回(数据缓存在aofbuf中) if (server.appendfsync == APPENDFSYNC_EVERYSEC && !force) { if (sync_in_progress) { if (server.aof_flush_postponed_start == 0) { server.aof_flush_postponed_start = server.unixtime; return; } else if (server.unixtime - server.aof_flush_postponed_start < 2) { return; } redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis."); } } server.aof_flush_postponed_start = 0; //写Aof log nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); if (nwritten != (signed)sdslen(server.aofbuf)) { if (nwritten == -1) { redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); } else { redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); } exit(1); } server.appendonly_current_size += nwritten; //记录log文件大小 //清空aofbuf,如果aofbuf较小时,复用之 if ((sdslen(server.aofbuf)+sdsavail(server.aofbuf)) < 4000) { sdsclear(server.aofbuf); } else { sdsfree(server.aofbuf); server.aofbuf = sdsempty(); } //如果设置no-appendfsync-on-rewrite 为yes,且当前有子进程进行rewrite,则直接返回 if (server.no_appendfsync_on_rewrite && (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) return; /* Perform the fsync if needed. */ if (server.appendfsync == APPENDFSYNC_ALWAYS) { /* aof_fsync is defined as fdatasync() for Linux in order to avoid * flushing metadata. 
*/ aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ server.lastfsync = server.unixtime; } else if ((server.appendfsync == APPENDFSYNC_EVERYSEC && server.unixtime > server.lastfsync)) { if (!sync_in_progress) aof_background_fsync(server.appendfd); server.lastfsync = server.unixtime; } }
/* Queue a REDIS_BIO_AOF_FSYNC background job for file descriptor `fd`.
 * The descriptor is passed as the job's first argument, packed into a
 * pointer-sized value; the other two job arguments are unused (NULL). */
void aof_background_fsync(int fd) {
    void *fd_arg = (void*)(long)fd;

    bioCreateBackgroundJob(REDIS_BIO_AOF_FSYNC, fd_arg, NULL, NULL);
}
对于appendfsync设置为everysec的情况,fsync是由独立线程完成rewrite Aof log:在两种情况下,redis会对aof log做rewrite1. 配置自动rewrite的阈值出现2. 客户端发送bgrewriteaof命令void bioCreateBackgroundJob(int type, void *arg1, void *arg2, void *arg3) { struct bio_job *job = zmalloc(sizeof(*job)); job->time = time(NULL); job->arg1 = arg1; job->arg2 = arg2; job->arg3 = arg3; pthread_mutex_lock(&bio_mutex[type]); listAddNodeTail(bio_jobs[type],job); bio_pending[type]++; pthread_cond_signal(&bio_condvar[type]); pthread_mutex_unlock(&bio_mutex[type]); }
接收到bgrewriteaof命令执行函数:
/* Handler for the BGREWRITEAOF command.
 *
 * - A rewrite child already exists: reply with an error.
 * - A save child exists: set server.aofrewrite_scheduled and reply that the
 *   rewrite was scheduled.
 * - Otherwise try to start the background rewrite now and reply with a status
 *   on success or with the shared error reply on failure. */
void bgrewriteaofCommand(redisClient *c) {
    if (server.bgrewritechildpid != -1) {
        addReplyError(c,"Background append only file rewriting already in progress");
        return;
    }

    if (server.bgsavechildpid != -1) {
        server.aofrewrite_scheduled = 1;
        addReplyStatus(c,"Background append only file rewriting scheduled");
        return;
    }

    if (rewriteAppendOnlyFileBackground() != REDIS_OK) {
        addReply(c,shared.err);
        return;
    }

    addReplyStatus(c,"Background append only file rewriting started");
}
执行重写:
/* Excerpt of serverCron showing only the AOF-rewrite related parts; the
 * "......" markers stand for code elided from this excerpt (including the
 * function's return statement). */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    /* ...... */

    /* Start a scheduled AOF rewrite if this was requested by the user while
     * a BGSAVE was in progress. */
    if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
        server.aofrewrite_scheduled)
    {
        rewriteAppendOnlyFileBackground();
    }

    /* Wait (without blocking) for a background save/rewrite child to finish
     * and run the matching post-processing handler. */
    if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
        int statloc;
        pid_t pid;

        /* NOTE(review): a wait3() return of -1 also satisfies `!= 0` and
         * would take the rewrite-handler branch -- confirm this is intended. */
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if (pid == server.bgsavechildpid) {
                backgroundSaveDoneHandler(statloc);
            } else {
                backgroundRewriteDoneHandler(statloc);
            }
            updateDictResizePolicy();
        }
    } else {
        time_t now = time(NULL);

        /* ...... */

        /* Check whether the configured condition for an automatic AOF
         * rewrite is met: no child exists, a rewrite percentage is
         * configured, and the current AOF size exceeds the minimum size. */
        if (server.bgsavechildpid == -1 &&
            server.bgrewritechildpid == -1 &&
            server.auto_aofrewrite_perc &&
            server.appendonly_current_size > server.auto_aofrewrite_min_size)
        {
            /* Use 1 as the base when no base size is recorded, which also
             * avoids a division by zero below. */
            long long base = server.auto_aofrewrite_base_size ?
                             server.auto_aofrewrite_base_size : 1;
            /* Growth of the current size over the base, in percent. */
            long long growth = (server.appendonly_current_size*100/base) - 100;

            if (growth >= server.auto_aofrewrite_perc) {
                redisLog(REDIS_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
                rewriteAppendOnlyFileBackground();
            }
        }
    }

    /* ...... */
}
子进程重写采用copy on write,将当前子进程看到的数据状态写入日志:
/* Excerpt of rewriteAppendOnlyFileBackground: fork a child that rewrites the
 * AOF into a temporary file while the parent records the child's pid.
 * The "......" markers stand for code elided from this excerpt (including the
 * declaration of `childpid`).
 *
 * Returns REDIS_OK in the parent on the path shown here; the child never
 * returns and terminates with _exit(0) on success or _exit(1) on failure. */
int rewriteAppendOnlyFileBackground(void) {
    /* ...... */

    if ((childpid = fork()) == 0) {
        /* Child process: it performs the rewrite. */
        char tmpfile[256];

        if (server.vm_enabled) vmReopenSwapFile();
        /* Close the descriptors held in server.ipfd / server.sofd in the child. */
        if (server.ipfd > 0) close(server.ipfd);
        if (server.sofd > 0) close(server.sofd);

        snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
            _exit(0);
        } else {
            _exit(1);
        }
    } else {
        /* Parent process. */

        /* ...... */

        server.bgrewritechildpid = childpid;

        /* ...... */

        return REDIS_OK;
    }
    return REDIS_OK; /* unreached */
}
int rewriteAppendOnlyFile(char *filename) { ...... //创建临时文件 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); fp = fopen(tmpfile,"w"); if (!fp) { redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno)); return REDIS_ERR; } //遍历所有数据库 for (j = 0; j < server.dbnum; j++) { char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; redisDb *db = server.db+j; dict *d = db->dict; if (dictSize(d) == 0) continue; di = dictGetSafeIterator(d); if (!di) { fclose(fp); return REDIS_ERR; } //写选择数据库命令 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr; if (fwriteBulkLongLong(fp,j) == 0) goto werr; //写数据库所有元素 while((de = dictNext(di)) != NULL) { ..... } } //数据写入disk fflush(fp); aof_fsync(fileno(fp)); fclose(fp); ......