redis3.0源码学习之可持久化化操作
前言
这一章主要介绍redis的三种可持久化操作,分别是rdb,aof,以及aof重写机制!
一、简单介绍RDB,AOF,REWRITEAOF
RDB:类似于快照存储某一时刻的数据,对数据进行压缩,体积小。
AOF:存储的是到目前为止的数据,记录的是原始数据,丢失数据概率少,体积大。
AOF重写:是为了防止AOF文件过大的一种机制
二、源码分析
1.RDB源码分析
1.1、RDB触发:Rdb思想是把当前进程数据生成快照保存到硬盘的过程,保存数据库的键值对。触发RDB持久化过程分为手动触发和自动触发。
由于配置文件自动触发(子进程异步)
struct saveparam {
time_t seconds;
int changes;
};
struct redisServer server; /* Server global state */
/* RDB persistence RDB持久化相关的对象*/
//表示最后一次save之后数据变化
long long dirty; /* Changes to DB from the last save */
long long dirty_before_bgsave; /* Used to restore dirty on failed BGSAVE */
pid_t rdb_child_pid; /* PID of RDB saving child */
struct saveparam *saveparams; /* Save points array for RDB */
int saveparamslen; /* Number of saving points */
char *rdb_filename; /* Name of RDB file */
int rdb_compression; /* Use compression in RDB? */
int rdb_checksum; /* Use RDB checksum? */
//表示上一次save时间
time_t lastsave; /* Unix time of last successful save */
time_t lastbgsave_try; /* Unix time of last attempted bgsave */
time_t rdb_save_time_last; /* Time used by last RDB save run. */
time_t rdb_save_time_start; /* Current RDB save start time. */
int rdb_bgsave_scheduled; /* BGSAVE when possible if true. */
int rdb_child_type; /* Type of save by active child. */
int lastbgsave_status; /* C_OK or C_ERR */
int stop_writes_on_bgsave_err; /* Don't allow writes if can't BGSAVE */
int rdb_pipe_write_result_to_parent; /* RDB pipes used to return the state */
int rdb_pipe_read_result_from_child; /* of each slave in diskless SYNC. */
/* Pipe and data structures for child -> parent info sharing. */
for (j = 0; j < server.saveparamslen; j++) {
struct saveparam *sp = server.saveparams+j;
/* Save if we reached the given amount of changes,
* the given amount of seconds, and if the latest bgsave was
* successful or if, in case of an error, at least
* CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */
if (server.dirty >= sp->changes &&
server.unixtime-server.lastsave > sp->seconds &&
(server.unixtime-server.lastbgsave_try >
CONFIG_BGSAVE_RETRY_DELAY ||
server.lastbgsave_status == C_OK))
{
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
sp->changes, (int)sp->seconds);
rdbSaveInfo rsi, *rsiptr;
rsiptr = rdbPopulateSaveInfo(&rsi);
rdbSaveBackground(server.rdb_filename,rsiptr);
break;
}
}
由于主从复制的自动触发(子进程异步):
void syncCommand(redisClient *c) {
...........
/* Ok we don't have a BGSAVE in progress, let's start one */
// 没有 BGSAVE 在进行,开始一个新的 BGSAVE
redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
if (rdbSaveBackground(server.rdb_filename) != REDIS_OK) {
redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
addReplyError(c,"Unable to perform background save");
return;
}
// 设置状态
c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
/* Flush the script cache for the new slave. */
.............
}
debug reload触发(同步):
void debugCommand(redisClient *c) {
..........
else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
if (rdbSave(server.rdb_filename) != REDIS_OK) {
addReply(c,shared.err);
return;
}
..............
shutdown触发(同步):
ServerCron:
if (server.shutdown_asap) {
// 尝试关闭服务器
if (prepareForShutdown(0) == REDIS_OK) exit(0);
// 如果关闭失败,那么打印 LOG ,并移除关闭标识
redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
server.shutdown_asap = 0;
}
prepareForShutDown:
int prepareForShutdown(int flags) {
........
// 如果客户端执行的是 SHUTDOWN save ,或者 SAVE 功能被打开
// 那么执行 SAVE 操作
if ((server.saveparamslen > 0 && !nosave) || save) {
redisLog(REDIS_NOTICE,"Saving the final RDB snapshot before exiting.");
/* Snapshotting. Perform a SYNC SAVE and exit */
if (rdbSave(server.rdb_filename) != REDIS_OK) {
/* Ooops.. error saving! The best we can do is to continue
* operating. Note that if there was a background saving process,
* in the next cron() Redis will be notified that the background
* saving aborted, handling special stuff like slaves pending for
* synchronization... */
redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit.");
return REDIS_ERR;
}
}
...........
return REDIS_OK;
}
save(同步):
void saveCommand(client *c) {
if (server.rdb_child_pid != -1) {
addReplyError(c,"Background save already in progress");
return;
}
rdbSaveInfo rsi, *rsiptr;
rsiptr = rdbPopulateSaveInfo(&rsi);
if (rdbSave(server.rdb_filename,rsiptr) == C_OK) {
addReply(c,shared.ok);
} else {
addReply(c,shared.err);
}
}
bgsave(子进程异步):
1. 当已经在进行bgsave时,不允许在进行
2. 当正在进行aof rewrite时,如果schedule设置为1,这进行延迟执行;否则直接回复错误。
if (server.rdb_child_pid != -1) {
addReplyError(c,"Background save already in progress");
} else if (server.aof_child_pid != -1) {
if (schedule) {
server.rdb_bgsave_scheduled = 1;
addReplyStatus(c,"Background saving scheduled");
} else {
addReplyError(c,
"An AOF log rewriting in progress: can't BGSAVE right now. "
"Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
"possible.");
}
} else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) {
addReplyStatus(c,"Background saving started");
} else {
addReply(c,shared.err);
}
2.1、bgsave执行流程:
1:判断当前是否有rdb子线程,如果有则返回错误
2:判断当前是否有bgrewrite,如果有记录进行延时
3:创建子进程进行数据压缩写入文件持久化
4:完成数据快照后向父进程发送子进程结束信号
5:父进程在ServerCorn定时器中wait3进程回收子进程避免僵尸进程的出现
if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
int exitcode = WEXITSTATUS(statloc);
int bysignal = 0;
if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
if (pid == -1) {
serverLog(LL_WARNING,"wait3() returned an error: %s. "
"rdb_child_pid = %d, aof_child_pid = %d",
strerror(errno),
(int) server.rdb_child_pid,
(int) server.aof_child_pid);
} else if (pid == server.rdb_child_pid) {
backgroundSaveDoneHandler(exitcode,bysignal);
if (!bysignal && exitcode == 0) receiveChildInfo();
} else if (pid == server.aof_child_pid) {
backgroundRewriteDoneHandler(exitcode,bysignal);
if (!bysignal && exitcode == 0) receiveChildInfo();
} else {
if (!ldbRemoveChild(pid)) {
serverLog(LL_WARNING,
"Warning, detected child with unmatched pid: %ld",
(long)pid);
}
}
updateDictResizePolicy();
closeChildInfoPipe();
}
}
2.3、RDB结构体(5.0版本)
针对于数据压缩部分:因为我们知道整型要比字符串更省内存,因此在保存对象时,都是优先尝试是否可以INT编码
2.4、对于过期键值的处理
- 创建rdb时候会过滤掉过期的键
- 读取时:
- 主:只会载入未过期的;
- 从:全部载入,因为后期主从会同步
2.AOF源码分析
AOF(append only file):以独立日志的方式记录每次写命令,重启时再重新执行AOF文件中的命令达到恢复数据的目的。AOF的主要作用是解决了数据持久化的实时性,目前已经是Redis持久化的主流方式。
数据结构:
redis中aof,是先保存在aof缓冲区中的,数据结构见redisServer:
// aof 缓冲区
struct redisServer {
...
/* AOF persistence */
int aof_state; /* AOF_(ON|OFF|WAIT_REWRITE)
AOF 功能从 off switch 到 on 后,aof_state 会从 AOF_OFF 变为 AOF_WAIT_REWRITE,startAppendOnly 函数完成该逻辑。在 aofrewrite 一次之后,该变量才会从 AOF_WAIT_REWRITE 变为 AOF_ON。*/
int aof_fsync; /* Kind of fsync() policy
同步策略
*/
char *aof_filename; /* Name of the AOF file
名字*/
int aof_no_fsync_on_rewrite; /* Don't fsync if a rewrite is in prog. */
int aof_rewrite_perc; /* Rewrite AOF if % growth is > M and... */
off_t aof_rewrite_min_size; /* the AOF file is at least N bytes. */
off_t aof_rewrite_base_size; /* AOF size on latest startup or rewrite. */
off_t aof_current_size; /* AOF current size. */
int aof_rewrite_scheduled; /* Rewrite once BGSAVE terminates. */
pid_t aof_child_pid; /* PID if rewriting process */
list *aof_rewrite_buf_blocks; /* Hold changes during an AOF rewrite. */
sds aof_buf; /* AOF buffer, written before entering the event loop */
int aof_fd; /* File descriptor of currently selected AOF file */
int aof_selected_db; /* Currently selected DB in AOF */
time_t aof_flush_postponed_start; /* UNIX time of postponed AOF flush */
time_t aof_last_fsync; /* UNIX time of last fsync() */
time_t aof_rewrite_time_last; /* Time used by last AOF rewrite run. */
time_t aof_rewrite_time_start; /* Current AOF rewrite start time. */
int aof_lastbgrewrite_status; /* C_OK or C_ERR */
unsigned long aof_delayed_fsync; /* delayed AOF fsync() counter */
int aof_rewrite_incremental_fsync;/* fsync incrementally while aof rewriting? */
int rdb_save_incremental_fsync; /* fsync incrementally while rdb saving? */
int aof_last_write_status; /* C_OK or C_ERR */
int aof_last_write_errno; /* Valid if aof_last_write_status is ERR */
int aof_load_truncated; /* Don't stop on unexpected AOF EOF. */
int aof_use_rdb_preamble; /* Use RDB preamble on AOF rewrites. */
/* AOF pipes used to communicate between parent and child during rewrite. */
int aof_pipe_write_data_to_child;
int aof_pipe_read_data_from_parent;
int aof_pipe_write_ack_to_parent;
int aof_pipe_read_ack_from_child;
int aof_pipe_write_ack_to_child;
int aof_pipe_read_ack_from_parent;
int aof_stop_sending_diff; /* If true stop sending accumulated diffs
to child process. */
sds aof_child_diff; /* AOF diff accumulator child side. */
...
}
2.1、持久化参数解析
redis中aof刷盘有三种策略,由参数appendfsync控制:
- always: 每次写入都要同步AOF文件(每次在servercron中同步),在一般的SATA硬盘上,Redis只能支持大约几百TPS写入,显然跟Redis高性能特性背道而驰,不建议配置。
- no:由于操作系统每次同步AOF文件的周期不可控(其实差不多也就1s),而且会加大每次同步硬盘的数据量,虽然提升了性能,但数据安全性无法保证。
- everysec,是建议的同步策略,也是默认配置,做到兼顾性能和数据安全性。理论上只有在系统突然宕机的情况下丢失1秒的数据。需要注意的是使用everysec选项时,是利用异步线程来处理的(还记得我们之前讨论的三类子线程吗)。
void flushAppendOnlyFile(int force) {
...
//这个是条件
else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) {
// 每秒刷新缓存到磁盘一次。
if (!sync_in_progress) {
// 添加任务到后台线程。
aof_background_fsync(server.aof_fd);
server.aof_fsync_offset = server.aof_current_size;
}
server.aof_last_fsync = server.unixtime;
}
...
}
// 添加异步任务
void aof_background_fsync(int fd) {
bioCreateBackgroundJob(BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL);
}
2.2 具体流程
1:feedAppendOnlyFile 开启aof持久化时,会将命令保存到aof_buf缓冲区中,然后再定时任务中定期的将数据持久化到磁盘。
2:flushAppendOnlyFile根据策略进行刷盘
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
sds buf = sdsempty();
robj *tmpargv[3];
/* The DB this command was targeting is not the same as the last command
* we appended. To issue a SELECT command is needed. */
if (dictid != server.aof_selected_db) {
char seldb[64];
snprintf(seldb,sizeof(seldb),"%d",dictid);
buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
(unsigned long)strlen(seldb),seldb);
server.aof_selected_db = dictid;
}
if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
cmd->proc == expireatCommand) {
/* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
} else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
/* Translate SETEX/PSETEX to SET and PEXPIREAT */
tmpargv[0] = createStringObject("SET",3);
tmpargv[1] = argv[1];
tmpargv[2] = argv[3];
buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
decrRefCount(tmpargv[0]);
buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
} else if (cmd->proc == setCommand && argc > 3) {
int i;
robj *exarg = NULL, *pxarg = NULL;
/* Translate SET [EX seconds][PX milliseconds] to SET and PEXPIREAT */
buf = catAppendOnlyGenericCommand(buf,3,argv);
for (i = 3; i < argc; i ++) {
if (!strcasecmp(argv[i]->ptr, "ex")) exarg = argv[i+1];
if (!strcasecmp(argv[i]->ptr, "px")) pxarg = argv[i+1];
}
serverAssert(!(exarg && pxarg));
if (exarg)
buf = catAppendOnlyExpireAtCommand(buf,server.expireCommand,argv[1],
exarg);
if (pxarg)
buf = catAppendOnlyExpireAtCommand(buf,server.pexpireCommand,argv[1],
pxarg);
} else {
/* All the other commands don't need translation or need the
* same translation already operated in the command vector
* for the replication itself. */
buf = catAppendOnlyGenericCommand(buf,argc,argv);
}
/* 重点在这里 */
if (server.aof_state == AOF_ON)
server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
/*如果正在rewrite重写,此时为了保证数据的一致性,需要将这个增量数据写入增量数据链表后期进行同步 */
if (server.aof_child_pid != -1)
aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
sdsfree(buf);
}
- flushAppendOnlyFile(在servercron中调用)函数有两种模式,即参数force=0|1
- force=0:表示可以启用一些优化操作
- 在AOF_FSYNC_EVERYSEC模式下,为了在减少fsync同步操作,如果正在同步,那么将会再延时2s同步的优化。
#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
void flushAppendOnlyFile(int force) {
ssize_t nwritten;
int sync_in_progress = 0;
mstime_t latency;
if (sdslen(server.aof_buf) == 0) return;
//如果是AOF_FSYNC_EVERYSEC模式,因为aof写入操作是在单独线程完成的,所以要看是否有处于正在
//同步中
if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;
if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
if (sync_in_progress) {
//我们可以推迟1,2s,但是也不能一直推迟
// 前面没有推迟过 write 操作,这里将推迟写操作的起始时间记录下来
// 然后就返回,不执行 write 或者 fsync
if (server.aof_flush_postponed_start == 0) {
/* No previous write postponing, remember that we are
* postponing the flush and return. */
server.aof_flush_postponed_start = server.unixtime;
return;
} else if (server.unixtime - server.aof_flush_postponed_start < 2) {
/* We were already waiting for fsync to finish, but for less
* than two seconds this is still ok. Postpone again. */
return;
}
/* Otherwise fall trough, and go write since we can't wait
* over two seconds.
* 如果超过2s了,那么write将会阻塞
*
* */
server.aof_delayed_fsync++;
serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
}
}
//计算延时,开启monitor下,才会启用
latencyStartMonitor(latency);
//具体刷盘操作
nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
latencyEndMonitor(latency);
//不同情况输出不同的内容引起的超时
if (sync_in_progress) {
latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);
} else if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) {
latencyAddSampleIfNeeded("aof-write-active-child",latency);
} else {
latencyAddSampleIfNeeded("aof-write-alone",latency);
}
latencyAddSampleIfNeeded("aof-write",latency);
//清零延迟 write 的时间记录
server.aof_flush_postponed_start = 0;
//写入的文件出错
if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
static time_t last_write_error_log = 0;
int can_log = 0;
/* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
can_log = 1;
last_write_error_log = server.unixtime;
}
/* Log the AOF write error and record the error code. */
if (nwritten == -1) {
if (can_log) {
serverLog(LL_WARNING,"Error writing to the AOF file: %s",
strerror(errno));
server.aof_last_write_errno = errno;
}
} else {
if (can_log) {
serverLog(LL_WARNING,"Short write while writing to "
"the AOF file: (nwritten=%lld, "
"expected=%lld)",
(long long)nwritten,
(long long)sdslen(server.aof_buf));
}
//如果写入的文件问题,我们将其移除,有点类似回滚机制
if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
if (can_log) {
serverLog(LL_WARNING, "Could not remove short write "
"from the append-only file. Redis may refuse "
"to load the AOF the next time it starts. "
"ftruncate: %s", strerror(errno));
}
} else {
/* If the ftruncate() succeeded we can set nwritten to
* -1 since there is no longer partial data into the AOF. */
nwritten = -1;
}
server.aof_last_write_errno = ENOSPC;
}
//如果出问题了,是alway那么就直接退出,因为他无法回滚,已经写入到磁盘了
/* Handle the AOF write error. */
if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
/* We can't recover when the fsync policy is ALWAYS since the
* reply for the client is already in the output buffers, and we
* have the contract with the user that on acknowledged write data
* is synced on disk. */
serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
exit(1);
} else {
/* Recover from failed write leaving data into the buffer. However
* set an error to stop accepting writes as long as the error
* condition is not cleared. */
server.aof_last_write_status = C_ERR;
/* Trim the sds buffer if there was a partial write, and there
* was no way to undo it with ftruncate(2). */
if (nwritten > 0) {
server.aof_current_size += nwritten;
sdsrange(server.aof_buf,nwritten,-1);
}
return; /* We'll try again on the next call... */
}
} else {
/* Successful write(2). If AOF was in error state, restore the
* OK state and log the event. */
if (server.aof_last_write_status == C_ERR) {
serverLog(LL_WARNING,
"AOF write error looks solved, Redis can write again.");
server.aof_last_write_status = C_OK;
}
}
server.aof_current_size += nwritten;
//重复使用aof_buf,小于4K的话只是清空,如果大于4K直接释放再进行分配
if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
sdsclear(server.aof_buf);
} else {
sdsfree(server.aof_buf);
server.aof_buf = sdsempty();
}
// no-appendfsync-on-rewrite参数设置了,表示在rewrite截断不能进行fsync
/* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
* children doing I/O in the background. */
if (server.aof_no_fsync_on_rewrite &&
(server.aof_child_pid != -1 || server.rdb_child_pid != -1))
return;
//如果是always,那么执行redis_fsync,linux下是fdatasync
/* Perform the fsync if needed. */
if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
/* redis_fsync is defined as fdatasync() for Linux in order to avoid
* flushing metadata. */
latencyStartMonitor(latency);
redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("aof-fsync-always",latency);
server.aof_last_fsync = server.unixtime;
} else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) {
if (!sync_in_progress) aof_background_fsync(server.aof_fd);
server.aof_last_fsync = server.unixtime;
}
}
2.3 AOF读取
1:Redis启动时优先加载aof文件
2:debug loadaof加载aof
int main(int argc, char **argv) {
...
loadDataFromDisk();
...
}
void loadDataFromDisk(void) {
...
long long start = ustime();
if (server.aof_state == AOF_ON) {
if (loadAppendOnlyFile(server.aof_filename) == C_OK)
serverLog(LL_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000);
}
...
}
int loadAppendOnlyFile(char *filename) {
...
// 程序模拟一个客户端执行从 aof 文件读出的命令。
fakeClient = createAOFClient();
...
...
}
else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
if (server.aof_state != AOF_OFF) flushAppendOnlyFile(1);
emptyDb(-1,EMPTYDB_NO_FLAGS,NULL);
protectClient(c);
int ret = loadAppendOnlyFile(server.aof_filename);
unprotectClient(c);
if (ret != C_OK) {
addReply(c,shared.err);
return;
}
server.dirty = 0; /* Prevent AOF / replication */
serverLog(LL_WARNING,"Append Only File loaded by DEBUG LOADAOF");
addReply(c,shared.ok);
}
2.4、AOF文件结构
2.5 AOF过期键的处理:
- 被惰性或者定期删除后,会追加一条del指令至aof文件
- 因为AOF重写:会去掉过期键
3.AOF重写源码分析
AOF持久化是通过保存被执行的写命令来记录数据库状态的,所以随着服务器运行时间的增长,AOF文件会越来越大,这样导致使用大文件还原所需的时间也就越多。重写并不是一条条分析aof文件中的日志,而是从数据库中读取现在的值,然后用一条命令来记录键值对,代替之前记录这个键值对的多条命令。
rewriteAppendOnlyFile—–>rewriteAppendOnlyFileBackground
3.1 、重写后的aof具备如下特点:
1. 过期的键不会写入(AOF持久化中被惰性或者定期删除后的键,会追加一条del指令至aof文件,并向客户端返回空;具体清除是在AOF重写时期。具体见函数rewriteAppendOnlyFileRio)
2. 重写使用进程内数据直接生成,这样新的AOF文件只保留最终数据的写入命令。
3. 多条写命令可以合并为一个
4. 单独开辟一个子进程执行rewrite
3.2 、触发条件
1.手动触发:(可以被延时)
void bgrewriteaofCommand(redisClient *c) {
// 不能重复运行 BGREWRITEAOF
if (server.aof_child_pid != -1) {
addReplyError(c,"Background append only file rewriting already in progress");
// 如果正在执行 BGSAVE ,那么预定 BGREWRITEAOF
// 等 BGSAVE 完成之后, BGREWRITEAOF 就会开始执行
} else if (server.rdb_child_pid != -1) {
server.aof_rewrite_scheduled = 1;//delay
addReplyStatus(c,"Background append only file rewriting scheduled");
// 执行 BGREWRITEAOF
} else if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
addReplyStatus(c,"Background append only file rewriting started");
} else {
addReply(c,shared.err);
}
}
2.主动触发:
根据配置文件auto-aof-rewrite-min-size和auto-aof-rewrite-percentage参数确定自动触发时机。即aof_current_size>auto-aof-rewrite-minsize和 (aof_current_size-aof_base_size)/aof_base_size>=auto-aof-rewritepercentage
检测时间是在serverCron函数中
/* Trigger an AOF rewrite if needed. */
if (server.aof_state == AOF_ON &&
server.rdb_child_pid == -1 &&
server.aof_child_pid == -1 &&
server.aof_rewrite_perc &&
server.aof_current_size > server.aof_rewrite_min_size)
{
long long base = server.aof_rewrite_base_size ?
server.aof_rewrite_base_size : 1;
long long growth = (server.aof_current_size*100/base) - 100;
if (growth >= server.aof_rewrite_perc) {
serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
rewriteAppendOnlyFileBackground();
}
}
3.3 、执行流程
1:bgrewrite/达到主动触发条件
2:开启子进程进行重写
3:若有rdb子进程执行则延时
4:若有重写子进程返回错误
5:开启子进程
6:备份完成结束子进程,父进程wait3收到子进程结束信号,将重写缓冲区同步到重写AOF文件中
7:在父进程回收子进程中,会将剩余的差异数据全部写入到临时文件中,最后调用rename原子性的替换文件
注意:AOF重写时以及RDB,也会尽可能避免rehash操作
3.4、重写缓冲区
list *aof_rewrite_buf_blocks; /* Hold changes during an AOF rewrite. */
#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10) /* 10 MB per block */
typedef struct aofrwblock {
unsigned long used, free;
char buf[AOF_RW_BUF_BLOCK_SIZE];
} aofrwblock;
//返回当前aof重写缓冲区的大小
/* Return the current size of the AOF rewrite buffer. */
unsigned long aofRewriteBufferSize(void) {
listNode *ln;
listIter li;
unsigned long size = 0;
listRewind(server.aof_rewrite_buf_blocks,&li);
while((ln = listNext(&li))) {
aofrwblock *block = listNodeValue(ln);
size += block->used;
}
return size;
}
总结
一些需要注意的点:RDB子进程以及AOF子进程互斥,不能同时存在,通过delay来避免。AOF可持久化,只有在每次定时函数时候根据指定的策略进行刷盘。