Redis_aof

42 篇文章 0 订阅
15 篇文章 0 订阅

aof

 

配置文件

appendonly yes

appendfsync always

# appendfsync everysec

# appendfsync no

 

/*
 * Append a command to the AOF buffer; if an AOF rewrite is in progress,
 * also append it to the AOF rewrite buffer.
 *
 * cmd:    the command being propagated (its proc pointer selects the
 *         translation applied below).
 * dictid: id of the database the command targets.
 * argv:   command argument vector.
 * argc:   number of entries in argv.
 */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds payload = sdsempty();
    const int is_expire_family = cmd->proc == expireCommand ||
                                 cmd->proc == pexpireCommand ||
                                 cmd->proc == expireatCommand;
    const int is_setex_family = cmd->proc == setexCommand ||
                                cmd->proc == psetexCommand;

    /* The DB this command targets differs from the one of the last command
     * we appended: emit an explicit SELECT first so that the following
     * commands are applied to the right database. */
    if (server.aof_selected_db != dictid) {
        char dbnum[64];

        snprintf(dbnum, sizeof(dbnum), "%d", dictid);
        payload = sdscatprintf(payload, "*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
                               (unsigned long)strlen(dbnum), dbnum);
        server.aof_selected_db = dictid;
    }

    if (is_expire_family) {
        /* EXPIRE, PEXPIRE and EXPIREAT are all translated into PEXPIREAT. */
        payload = catAppendOnlyExpireAtCommand(payload, cmd, argv[1], argv[2]);
    } else if (is_setex_family) {
        /* SETEX and PSETEX are translated into SET followed by PEXPIREAT. */
        robj *set_name = createStringObject("SET", 3);
        robj *set_argv[3];

        /* SET key value */
        set_argv[0] = set_name;
        set_argv[1] = argv[1];
        set_argv[2] = argv[3];
        payload = catAppendOnlyGenericCommand(payload, 3, set_argv);
        decrRefCount(set_name);

        /* PEXPIREAT */
        payload = catAppendOnlyExpireAtCommand(payload, cmd, argv[1], argv[2]);
    } else {
        /* All the other commands don't need translation, or need the same
         * translation already applied to the command vector for the
         * replication itself. */
        payload = catAppendOnlyGenericCommand(payload, argc, argv);
    }

    /* Append to the AOF buffer. It is flushed to disk just before
     * re-entering the event loop, so before the client gets a positive
     * reply about the operation performed. */
    if (server.aof_state == REDIS_AOF_ON) {
        server.aof_buf = sdscatlen(server.aof_buf, payload, sdslen(payload));
    }

    /* If a background AOF rewrite is in progress, also accumulate the
     * command in the rewrite buffer, so that the differences between the
     * child's view of the DB and the current one can be appended to the new
     * append only file once the child is done. */
    if (server.aof_child_pid != -1) {
        aofRewriteBufferAppend((unsigned char*)payload, sdslen(payload));
    }

    /* The serialized form is no longer needed. */
    sdsfree(payload);
}

 

void flushAppendOnlyFile(int force) {

int nwritten;

int sync_in_progress = 0;

 

// 缓冲区中没有任何内容,直接返回

if (sdslen(server.aof_buf) == 0) return;

 

// 策略为每秒 FSYNC

if (server.iaof_fsync == AOF_FSYNC_EVERYSEC)

// 是否有 SYNC 正在后台进行?

sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;

 

// 每秒 fsync ,并且强制写入为假

if (server.iaof_fsync == AOF_FSYNC_EVERYSEC && !force)

{

/* With this append fsync policy we do background fsyncing.

* 当 fsync 策略为每秒钟一次时, fsync 在后台执行。

* If the fsync is still in progress we can try to delay

* the write for a couple of seconds.

* 如果后台仍在执行 FSYNC ,那么我们可以延迟写操作一两秒

* (如果强制执行 write 的话,服务器主线程将阻塞在 write 上面)

*/

if (sync_in_progress)

{

// 有 fsync 正在后台进行 。。。

if (server.aof_flush_postponed_start == 0)

{

/* No previous write postponinig, remember that we are postponing the flush and return.

*

* 前面没有推迟过 write 操作,这里将推迟写操作的时间记录下来 然后就返回,不执行 write 或者 fsync

*/

server.aof_flush_postponed_start = server.unixtime;

return;

}

else if (server.unixtime - server.aof_flush_postponed_start < 2)

{

/* We were already waiting for fsync to finish, but for less than two seconds this is still ok. Postpone again.

* 如果之前已经因为 fsync 而推迟了 write 操作 但是推迟的时间不超过 2 秒,那么直接返回 不执行 write 或者 fsync

*/

return;

}

/* Otherwise fall trough, and go write since we can't wait over two seconds.

* 如果后台还有 fsync 在执行,并且 write 已经推迟 >= 2 秒 那么执行写操作(write 将被阻塞)

*/

server.aof_delayed_fsync++;

redisLog(REDIS_NOTICE, "Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");

}

}

 

/* If you are following this code path, then we are going to write so set reset the postponed flush sentinel to zero.

* 执行到这里,程序会对 AOF 文件进行写入。 清零延迟 write 的时间记录

*/

server.aof_flush_postponed_start = 0;

 

/* We want to perform a single write. This should be guaranteed atomic at least if the filesystem we are writing is a real physical one.

*

* 执行单个 write 操作,如果写入设备是物理的话,那么这个操作应该是原子的

*

* While this will save us against the server being killed I don't think

* there is much to do about the whole server stopping for power problems

* or alike

*

* 当然,如果出现像电源中断这样的不可抗现象,那么 AOF 文件也是可能会出现问题的

* 这时就要用 redis-check-aof 程序来进行修复。

*/

nwritten = fwrite(server.aof_buf,1, sdslen(server.aof_buf),server.aof_fd );

if (nwritten != (signed)sdslen(server.aof_buf))

{

static time_t last_write_error_log = 0;

int can_log = 0;

/* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */

// 将日志的记录频率限制在每行 AOF_WRITE_LOG_ERROR_RATE 秒

if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE)

{

can_log = 1;

last_write_error_log = server.unixtime;

}

/* Lof the AOF write error and record the error code. */

// 如果写入出错,那么尝试将该情况写入到日志里面

if (nwritten == -1)

{

if (can_log) {

redisLog(REDIS_WARNING, "Error writing to the AOF file: %s",strerror(errno));

server.aof_last_write_errno = errno;

}

}

else

{

if (can_log)

{

redisLog(REDIS_WARNING, "Short write while writing to the AOF file: (nwritten=%lld,expected=%lld)",(long long)nwritten,(long long)sdslen(server.aof_buf));

}

 

// 尝试移除新追加的不完整内容

//if (truncate(server.aof_fd, server.aof_current_size) == -1)

{

if (can_log)

{

redisLog(REDIS_WARNING, "Could not remove short write from the append-only file. Redis may refuse "

"to load the AOF the next time it starts. ftruncate: %s", strerror(errno));

}

}

//else

{

/* If the ftrunacate() succeeded we can set nwritten to

* -1 since there is no longer partial data into the AOF. */

nwritten = -1;

}

server.aof_last_write_errno = ENOSPC;

}

 

/* Handle the AOF write error. */

// 处理写入 AOF 文件时出现的错误

if (server.iaof_fsync == AOF_FSYNC_ALWAYS) {

/* We can't recover when the fsync policy is ALWAYS since the reply for the client is already in the output buffers, and we

* have the contract with the user that on acknowledged write data is synched on disk. */

redisLog(REDIS_WARNING, "Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");

exit(1);

}

else {

/* Recover from failed write leaving data into the buffer. However

* set an error to stop accepting writes as long as the error

* condition is not cleared. */

server.aof_last_write_status = REDIS_ERR;

 

/* Trim the sds buffer if there was a partial write, and there

* was no way to undo it with ftruncate(2). */

if (nwritten > 0)

{

server.aof_current_size += nwritten;

sdsrange(server.aof_buf, nwritten, -1);

}

return; /* We'll try again on the next call... */

}

}

else {

/* Successful write(2). If AOF was in error state, restore the

* OK state and log the event. */

// 写入成功,更新最后写入状态

if (server.aof_last_write_status == REDIS_ERR) {

redisLog(REDIS_WARNING,"AOF write error looks solved, Redis can write again.");

server.aof_last_write_status = REDIS_OK;

}

}

 

// 更新写入后的 AOF 文件大小

server.aof_current_size += nwritten;

 

/* Re-use AOF buffer when it is small enough. The maximum comes from the

* arena size of 4k minus some overhead (but is otherwise arbitrary).

*

* 如果 AOF 缓存的大小足够小的话,那么重用这个缓存,否则的话,释放 AOF 缓存。

*/

if ((sdslen(server.aof_buf) + sdsavail(server.aof_buf)) < 4000) {

// 清空缓存中的内容,等待重用

sdsclear(server.aof_buf);

}

else {

// 释放缓存

sdsfree(server.aof_buf);

server.aof_buf = sdsempty();

}

 

/* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are children doing I/O in the background.

*

* 如果 no-appendfsync-on-rewrite 选项为开启状态,

* 并且有 BGSAVE 或者 BGREWRITEAOF 正在进行的话,

* 那么不执行 fsync

*/

if (server.aof_no_fsync_on_rewrite &&(server.aof_child_pid != -1 || server.rdb_child_pid != -1))

return;

 

/* Perform the fsync if needed. */

// 总是执行 fsnyc

if (server.iaof_fsync == AOF_FSYNC_ALWAYS)

{

/* aof_fsync is defined as fdatasync() for Linux in order to avoid

* flushing metadata. */

aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */

// 更新最后一次执行 fsnyc 的时间

server.aof_last_fsync = server.unixtime;

// 策略为每秒 fsnyc ,并且距离上次 fsync 已经超过 1 秒

}

else if ((server.iaof_fsync == AOF_FSYNC_EVERYSEC && server.unixtime > server.aof_last_fsync))

{

// 放到后台执行

if (!sync_in_progress) aof_background_fsync(server.aof_fd);

// 更新最后一次执行 fsync 的时间

server.aof_last_fsync = server.unixtime;

}

 

// 其实上面无论执行 if 部分还是 else 部分都要更新 fsync 的时间

// 可以将代码挪到下面来

server.aof_last_fsync = server.unixtime;

}

 

/*
 * Write a fresh append only file to 'filename'.
 *
 * The data is first written to a temporary file named after the current
 * process id, then the diff accumulated in server.aof_child_diff (read from
 * the parent through the AOF pipes) is appended, and finally the temporary
 * file is renamed to 'filename'.
 *
 * Returns C_OK on success. On any failure the temporary file is removed and
 * C_ERR is returned.
 */
int rewriteAppendOnlyFile(char *filename) {
    rio aof;
    FILE *fp;
    char tmpfile[256];
    char byte;

    /* Note that we have to use a different temp name here compared to the
     * one used by rewriteAppendOnlyFileBackground() function. */
    snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-%d.aof", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
        return C_ERR;
    }

    server.aof_child_diff = sdsempty();
    rioInitWithFile(&aof,fp);

    if (server.aof_rewrite_incremental_fsync)
        rioSetAutoSync(&aof,REDIS_AUTOSYNC_BYTES);

    /* Either emit an RDB preamble or the plain AOF rewrite, depending on
     * the configuration. */
    if (server.aof_use_rdb_preamble) {
        int error;
        if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {
            /* Make the error reported by rdbSaveRio() visible to the
             * strerror(errno) call in the error path. */
            errno = error;
            goto werr;
        }
    } else {
        if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
    }

    /* Do an initial slow fsync here while the parent is still sending
     * data, in order to make the next final fsync faster. */
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;

    /* Read again a few times to get more data from the parent.
     * We can't read forever (the server may receive data from clients
     * faster than it is able to send data to the child), so we try to read
     * some more data in a loop as soon as there is a good chance more data
     * will come. The loop stops after 1000 ms overall, or after 20
     * contiguous waits that returned no data. */
    int nodata = 0;
    mstime_t start = mstime();
    while(mstime()-start < 1000 && nodata < 20) {
        if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)
        {
            nodata++;
            continue;
        }
        nodata = 0; /* Start counting from zero, we stop on N *contiguous*
                       timeouts. */
        aofReadDiffFromParent();
    }

    /* Ask the master to stop sending diffs. */
    if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;
    if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)
        goto werr;
    /* We read the ACK from the server with the timeout passed to syncRead()
     * below (5000; presumably milliseconds, i.e. 5 seconds). Normally
     * it should reply ASAP, but just in case we lose its reply, we are sure
     * the child will eventually get terminated. */
    if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
        byte != '!') goto werr;
    serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");

    /* Read the final diff if any. */
    aofReadDiffFromParent();

    /* Write the received diff to the file. */
    serverLog(LL_NOTICE,
        "Concatenating %.2f MB of AOF diff received from parent.",
        (double) sdslen(server.aof_child_diff) / (1024*1024));
    if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)
        goto werr;

    /* Make sure data will not remain on the OS's output buffers */
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;
    if (fclose(fp) == EOF) {
        /* The stream is no longer usable even when fclose() fails, so the
         * error path must not close it a second time. */
        fp = NULL;
        goto werr;
    }
    fp = NULL;

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
        serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return C_ERR;
    }
    serverLog(LL_NOTICE,"SYNC append only file rewrite performed");
    return C_OK;

werr:
    serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    if (fp) fclose(fp);
    unlink(tmpfile);
    return C_ERR;
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值