aof
配置文件
appendonly yes
appendfsync always
#appendfsync everysec
# appendfsync no
/*
 * Serialize one command into AOF protocol text and append it to the AOF
 * buffer. If an AOF rewrite child is running, the same bytes are also
 * appended to the AOF rewrite buffer.
 *
 * cmd    - command descriptor; cmd->proc selects the translation applied
 *          and cmd is forwarded to catAppendOnlyExpireAtCommand().
 * dictid - id of the database the command targeted.
 * argv   - argument vector of the command.
 * argc   - number of entries in argv.
 */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds out = sdsempty();
    size_t outlen;
    int expire_family;
    int setex_family;

    /* The command targets a different DB than the last one recorded in the
     * AOF: emit an explicit SELECT first so that the commands that follow
     * are applied to the right database. */
    if (server.aof_selected_db != dictid) {
        char dbnum[64];

        snprintf(dbnum, sizeof(dbnum), "%d", dictid);
        out = sdscatprintf(out, "*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
                           (unsigned long)strlen(dbnum), dbnum);
        server.aof_selected_db = dictid;
    }

    /* Classify the command by its implementation function. */
    expire_family = cmd->proc == expireCommand ||
                    cmd->proc == pexpireCommand ||
                    cmd->proc == expireatCommand;
    setex_family = cmd->proc == setexCommand ||
                   cmd->proc == psetexCommand;

    if (expire_family) {
        /* EXPIRE / PEXPIRE / EXPIREAT are all translated into PEXPIREAT. */
        out = catAppendOnlyExpireAtCommand(out, cmd, argv[1], argv[2]);
    } else if (setex_family) {
        /* SETEX / PSETEX are translated into SET followed by PEXPIREAT. */
        robj *set_args[3];

        set_args[0] = createStringObject("SET", 3);
        set_args[1] = argv[1];   /* key */
        set_args[2] = argv[3];   /* value */
        out = catAppendOnlyGenericCommand(out, 3, set_args);
        /* Only the temporary "SET" object is owned here. */
        decrRefCount(set_args[0]);
        out = catAppendOnlyExpireAtCommand(out, cmd, argv[1], argv[2]);
    } else {
        /* Every other command needs no translation, or was already
         * translated in the command vector for replication. */
        out = catAppendOnlyGenericCommand(out, argc, argv);
    }

    outlen = sdslen(out);

    /* Append to the AOF buffer. It is flushed to disk just before
     * re-entering the event loop, i.e. before the client gets a positive
     * reply about the operation performed. */
    if (server.aof_state == REDIS_AOF_ON) {
        server.aof_buf = sdscatlen(server.aof_buf, out, outlen);
    }

    /* While a background AOF rewrite is in progress, also accumulate the
     * differences between the child's DB snapshot and the current one, so
     * they can be appended to the new AOF when the child is done. */
    if (server.aof_child_pid != -1) {
        aofRewriteBufferAppend((unsigned char *)out, outlen);
    }

    /* Release the temporary serialization buffer. */
    sdsfree(out);
}
void flushAppendOnlyFile(int force) {
int nwritten;
int sync_in_progress = 0;
// 缓冲区中没有任何内容,直接返回
if (sdslen(server.aof_buf) == 0) return;
// 策略为每秒 FSYNC
if (server.iaof_fsync == AOF_FSYNC_EVERYSEC)
// 是否有 SYNC 正在后台进行?
sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;
// 每秒 fsync ,并且强制写入为假
if (server.iaof_fsync == AOF_FSYNC_EVERYSEC && !force)
{
/* With this append fsync policy we do background fsyncing.
* 当 fsync 策略为每秒钟一次时, fsync 在后台执行。
* If the fsync is still in progress we can try to delay
* the write for a couple of seconds.
* 如果后台仍在执行 FSYNC ,那么我们可以延迟写操作一两秒
* (如果强制执行 write 的话,服务器主线程将阻塞在 write 上面)
*/
if (sync_in_progress)
{
// 有 fsync 正在后台进行 。。。
if (server.aof_flush_postponed_start == 0)
{
/* No previous write postponinig, remember that we are postponing the flush and return.
*
* 前面没有推迟过 write 操作,这里将推迟写操作的时间记录下来 然后就返回,不执行 write 或者 fsync
*/
server.aof_flush_postponed_start = server.unixtime;
return;
}
else if (server.unixtime - server.aof_flush_postponed_start < 2)
{
/* We were already waiting for fsync to finish, but for less than two seconds this is still ok. Postpone again.
* 如果之前已经因为 fsync 而推迟了 write 操作 但是推迟的时间不超过 2 秒,那么直接返回 不执行 write 或者 fsync
*/
return;
}
/* Otherwise fall trough, and go write since we can't wait over two seconds.
* 如果后台还有 fsync 在执行,并且 write 已经推迟 >= 2 秒 那么执行写操作(write 将被阻塞)
*/
server.aof_delayed_fsync++;
redisLog(REDIS_NOTICE, "Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
}
}
/* If you are following this code path, then we are going to write so set reset the postponed flush sentinel to zero.
* 执行到这里,程序会对 AOF 文件进行写入。 清零延迟 write 的时间记录
*/
server.aof_flush_postponed_start = 0;
/* We want to perform a single write. This should be guaranteed atomic at least if the filesystem we are writing is a real physical one.
*
* 执行单个 write 操作,如果写入设备是物理的话,那么这个操作应该是原子的
*
* While this will save us against the server being killed I don't think
* there is much to do about the whole server stopping for power problems
* or alike
*
* 当然,如果出现像电源中断这样的不可抗现象,那么 AOF 文件也是可能会出现问题的
* 这时就要用 redis-check-aof 程序来进行修复。
*/
nwritten = fwrite(server.aof_buf,1, sdslen(server.aof_buf),server.aof_fd );
if (nwritten != (signed)sdslen(server.aof_buf))
{
static time_t last_write_error_log = 0;
int can_log = 0;
/* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
// 将日志的记录频率限制在每行 AOF_WRITE_LOG_ERROR_RATE 秒
if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE)
{
can_log = 1;
last_write_error_log = server.unixtime;
}
/* Lof the AOF write error and record the error code. */
// 如果写入出错,那么尝试将该情况写入到日志里面
if (nwritten == -1)
{
if (can_log) {
redisLog(REDIS_WARNING, "Error writing to the AOF file: %s",strerror(errno));
server.aof_last_write_errno = errno;
}
}
else
{
if (can_log)
{
redisLog(REDIS_WARNING, "Short write while writing to the AOF file: (nwritten=%lld,expected=%lld)",(long long)nwritten,(long long)sdslen(server.aof_buf));
}
// 尝试移除新追加的不完整内容
//if (truncate(server.aof_fd, server.aof_current_size) == -1)
{
if (can_log)
{
redisLog(REDIS_WARNING, "Could not remove short write from the append-only file. Redis may refuse "
"to load the AOF the next time it starts. ftruncate: %s", strerror(errno));
}
}
//else
{
/* If the ftrunacate() succeeded we can set nwritten to
* -1 since there is no longer partial data into the AOF. */
nwritten = -1;
}
server.aof_last_write_errno = ENOSPC;
}
/* Handle the AOF write error. */
// 处理写入 AOF 文件时出现的错误
if (server.iaof_fsync == AOF_FSYNC_ALWAYS) {
/* We can't recover when the fsync policy is ALWAYS since the reply for the client is already in the output buffers, and we
* have the contract with the user that on acknowledged write data is synched on disk. */
redisLog(REDIS_WARNING, "Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
exit(1);
}
else {
/* Recover from failed write leaving data into the buffer. However
* set an error to stop accepting writes as long as the error
* condition is not cleared. */
server.aof_last_write_status = REDIS_ERR;
/* Trim the sds buffer if there was a partial write, and there
* was no way to undo it with ftruncate(2). */
if (nwritten > 0)
{
server.aof_current_size += nwritten;
sdsrange(server.aof_buf, nwritten, -1);
}
return; /* We'll try again on the next call... */
}
}
else {
/* Successful write(2). If AOF was in error state, restore the
* OK state and log the event. */
// 写入成功,更新最后写入状态
if (server.aof_last_write_status == REDIS_ERR) {
redisLog(REDIS_WARNING,"AOF write error looks solved, Redis can write again.");
server.aof_last_write_status = REDIS_OK;
}
}
// 更新写入后的 AOF 文件大小
server.aof_current_size += nwritten;
/* Re-use AOF buffer when it is small enough. The maximum comes from the
* arena size of 4k minus some overhead (but is otherwise arbitrary).
*
* 如果 AOF 缓存的大小足够小的话,那么重用这个缓存,否则的话,释放 AOF 缓存。
*/
if ((sdslen(server.aof_buf) + sdsavail(server.aof_buf)) < 4000) {
// 清空缓存中的内容,等待重用
sdsclear(server.aof_buf);
}
else {
// 释放缓存
sdsfree(server.aof_buf);
server.aof_buf = sdsempty();
}
/* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are children doing I/O in the background.
*
* 如果 no-appendfsync-on-rewrite 选项为开启状态,
* 并且有 BGSAVE 或者 BGREWRITEAOF 正在进行的话,
* 那么不执行 fsync
*/
if (server.aof_no_fsync_on_rewrite &&(server.aof_child_pid != -1 || server.rdb_child_pid != -1))
return;
/* Perform the fsync if needed. */
// 总是执行 fsnyc
if (server.iaof_fsync == AOF_FSYNC_ALWAYS)
{
/* aof_fsync is defined as fdatasync() for Linux in order to avoid
* flushing metadata. */
aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
// 更新最后一次执行 fsnyc 的时间
server.aof_last_fsync = server.unixtime;
// 策略为每秒 fsnyc ,并且距离上次 fsync 已经超过 1 秒
}
else if ((server.iaof_fsync == AOF_FSYNC_EVERYSEC && server.unixtime > server.aof_last_fsync))
{
// 放到后台执行
if (!sync_in_progress) aof_background_fsync(server.aof_fd);
// 更新最后一次执行 fsync 的时间
server.aof_last_fsync = server.unixtime;
}
// 其实上面无论执行 if 部分还是 else 部分都要更新 fsync 的时间
// 可以将代码挪到下面来
server.aof_last_fsync = server.unixtime;
}
int rewriteAppendOnlyFile(char *filename) {
rio aof;
FILE *fp;
char tmpfile[256];
char byte;
/* Note that we have to use a different temp name here compared to the
* one used by rewriteAppendOnlyFileBackground() function. */
snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
fp = fopen(tmpfile,"w");
if (!fp) {
serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
return C_ERR;
}
server.aof_child_diff = sdsempty();
rioInitWithFile(&aof,fp);
if (server.aof_rewrite_incremental_fsync)
rioSetAutoSync(&aof,REDIS_AUTOSYNC_BYTES);
if (server.aof_use_rdb_preamble) {
int error;
if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {
errno = error;
goto werr;
}
} else {
if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
}
/* Do an initial slow fsync here while the parent is still sending
* data, in order to make the next final fsync faster. */
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;
/* Read again a few times to get more data from the parent.
* We can't read forever (the server may receive data from clients
* faster than it is able to send data to the child), so we try to read
* some more data in a loop as soon as there is a good chance more data
* will come. If it looks like we are wasting time, we abort (this
* happens after 20 ms without new data). */
int nodata = 0;
mstime_t start = mstime();
while(mstime()-start < 1000 && nodata < 20) {
if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)
{
nodata++;
continue;
}
nodata = 0; /* Start counting from zero, we stop on N *contiguous*
timeouts. */
aofReadDiffFromParent();
}
/* Ask the master to stop sending diffs. */
if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;
if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)
goto werr;
/* We read the ACK from the server using a 10 seconds timeout. Normally
* it should reply ASAP, but just in case we lose its reply, we are sure
* the child will eventually get terminated. */
if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
byte != '!') goto werr;
serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");
/* Read the final diff if any. */
aofReadDiffFromParent();
/* Write the received diff to the file. */
serverLog(LL_NOTICE,
"Concatenating %.2f MB of AOF diff received from parent.",
(double) sdslen(server.aof_child_diff) / (1024*1024));
if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)
goto werr;
/* Make sure data will not remain on the OS's output buffers */
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;
if (fclose(fp) == EOF) goto werr;
/* Use RENAME to make sure the DB file is changed atomically only
* if the generate DB file is ok. */
if (rename(tmpfile,filename) == -1) {
serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
unlink(tmpfile);
return C_ERR;
}
serverLog(LL_NOTICE,"SYNC append only file rewrite performed");
return C_OK;
werr:
serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
fclose(fp);
unlink(tmpfile);
return C_ERR;
}