server.h
/* Abridged view of the global server state: only the AOF-related fields
 * discussed in these notes are shown ("......" marks elided members). */
struct redisServer{
int aof_fd; /* File descriptor of currently selected AOF file */
sds aof_buf; /* AOF buffer, written before entering the event loop */
......
}
server.c
// Called by the main thread after it has finished processing client requests.
// TODO: which happens first, sending the reply to the client or beforeSleep?
// NOTE(review): not answerable from this excerpt; check where pending replies
// are written relative to the flushAppendOnlyFile() call in the full source.
void beforeSleep(struct aeEventLoop *eventLoop) {
......
// When AOF is on, try to write aof_buf to the AOF file. force is 0, so under
// the everysec policy the write may be postponed (see flushAppendOnlyFile).
if (server.aof_state == AOF_ON)
flushAppendOnlyFile(0);
......
}
// Called periodically by the main thread.
// In this excerpt it retries the AOF flush in two situations: a previously
// postponed write, and a previous write error.
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
......
/* AOF postponed flush: Try at every cron cycle if the slow fsync
* completed. */
// aof_flush_postponed_start is non-zero only while a write is being postponed.
if (server.aof_state == AOF_ON && server.aof_flush_postponed_start)
flushAppendOnlyFile(0);
/* AOF write errors: in this case we have a buffer to flush as well and
* clear the AOF error in case of success to make the DB writable again,
* however to try every second is enough in case of 'hz' is set to
* a higher frequency. */
run_with_period(1000) {
// Retry the write that previously failed; a fully successful write resets
// aof_last_write_status to C_OK inside flushAppendOnlyFile.
if (server.aof_state == AOF_ON && server.aof_last_write_status == C_ERR)
flushAppendOnlyFile(0);
}
......
}
// Called when Redis is shutting down.
// The AOF-related part shown here kills any AOF rewrite child, force-flushes
// aof_buf, and fsyncs the AOF file. It returns C_ERR (refusing to shut down)
// while the initial AOF is still being written.
int prepareForShutdown(int flags) {
......
if (server.aof_state != AOF_OFF) {
/* Kill the AOF saving child as the AOF we already have may be longer
* but contains the full dataset anyway. */
if (server.child_type == CHILD_TYPE_AOF) {
/* If we have AOF enabled but haven't written the AOF yet, don't
* shutdown or else the dataset will be lost. */
if (server.aof_state == AOF_WAIT_REWRITE) {
serverLog(LL_WARNING, "Writing initial AOF, can't exit.");
return C_ERR;
}
serverLog(LL_WARNING,
"There is a child rewriting the AOF. Killing it!");
killAppendOnlyChild();
}
/* Append only file: flush buffers and fsync() the AOF at exit */
serverLog(LL_NOTICE,"Calling fsync() on the AOF file.");
// force = 1: bypass the everysec postponement so the buffer is written now.
flushAppendOnlyFile(1);
// An fsync failure here is only logged; the shutdown continues.
if (redis_fsync(server.aof_fd) == -1) {
serverLog(LL_WARNING,"Fail to fsync the AOF file: %s.",
strerror(errno));
}
}
......
}
aof.c
/* Write the pending AOF buffer (server.aof_buf) to the AOF file, then fsync
 * it according to the configured policy (server.aof_fsync).
 *
 * force: when non-zero, the everysec postponement logic is skipped and the
 *        write is attempted immediately. In this file, beforeSleep and
 *        serverCron pass 0 and prepareForShutdown passes 1.
 *
 * This is an abridged excerpt: "......" marks elided statements. The write
 * call that assigns nwritten is not shown and is presumably among them. */
void flushAppendOnlyFile(int force) {
ssize_t nwritten;
int sync_in_progress = 0;
mstime_t latency;
if (sdslen(server.aof_buf) == 0) { // nothing new to append to the AOF
if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.aof_fsync_offset != server.aof_current_size &&
server.unixtime > server.aof_last_fsync &&
!(sync_in_progress = aofFsyncInProgress())) {
goto try_fsync; // everysec: even with nothing to write, still fsync data that was written but not yet synced
} else {
return;
}
}
if (server.aof_fsync == AOF_FSYNC_EVERYSEC) // query the state of the background fsync
sync_in_progress = aofFsyncInProgress();
if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) { // policy is everysec and the caller did not force (force depends on the call site)
if (sync_in_progress) { // a background fsync is still running: decide from the time already postponed whether the write can be postponed again
if (server.aof_flush_postponed_start == 0) { // first postponement: record its timestamp and postpone
server.aof_flush_postponed_start = server.unixtime;
return;
} else if (server.unixtime - server.aof_flush_postponed_start < 2) { // less than 2 seconds since the first postponement: keep postponing
return;
}
// Cannot postpone any longer; the write must happen now.
// Only the write has a postponement mechanism; fsync has none.
// The write is postponed depending on the progress of the fsync.
// Whether fsync runs follows a fixed rule: the background fsync is idle
// and the last fsync was started at least one second ago.
server.aof_delayed_fsync++;
serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
}
}
......
// Reaching this point means the buffer is being written: either there was
// no postponement, or the postponement has reached 2 seconds and the write
// is forced.
server.aof_flush_postponed_start = 0; // reset the postponement timestamp
// Handle the result of the write.
if (nwritten != (ssize_t)sdslen(server.aof_buf)) { // the buffer was not fully written
static time_t last_write_error_log = 0;
int can_log = 0;
if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) { // rate-limit the error log
can_log = 1;
last_write_error_log = server.unixtime;
}
if (nwritten == -1) { // TODO: the write failed outright
if (can_log) {
serverLog(LL_WARNING,"Error writing to the AOF file: %s",
strerror(errno));
// NOTE(review): as written, the errno is recorded only when the log
// line is emitted, and it is read after serverLog() has run.
server.aof_last_write_errno = errno;
}
} else { // short (partial) write
if (can_log) {
serverLog(LL_WARNING,"Short write while writing to "
"the AOF file: (nwritten=%lld, "
"expected=%lld)",
(long long)nwritten,
(long long)sdslen(server.aof_buf));
}
if (ftruncate(server.aof_fd, server.aof_current_size) == -1) { // roll back: truncate the file to drop the partially written data
if (can_log) {
serverLog(LL_WARNING, "Could not remove short write "
"from the append-only file. Redis may refuse "
"to load the AOF the next time it starts. "
"ftruncate: %s", strerror(errno));
}
} else {
/* If the ftruncate() succeeded we can set nwritten to
* -1 since there is no longer partial data into the AOF. */
nwritten = -1; // rollback succeeded
}
server.aof_last_write_errno = ENOSPC;
}
if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
exit(1); // TODO: under 'always', any incomplete write terminates the process; is a short write not retried either?
} else { // record the error state
server.aof_last_write_status = C_ERR;
if (nwritten > 0) { // the partial write could not be rolled back, so it has to be accepted as part of the file
server.aof_current_size += nwritten;
sdsrange(server.aof_buf,nwritten,-1); // TODO: is trimming the written part just to save aof_buf work? NOTE(review): presumably it keeps only the unwritten tail so the retry does not write those bytes twice; confirm sdsrange semantics
}
// Wait for the next call to retry the write.
return;
}
} else { // fully written: restore the OK state if needed
/* Successful write(2). If AOF was in error state, restore the
* OK state and log the event. */
if (server.aof_last_write_status == C_ERR) {
serverLog(LL_WARNING,
"AOF write error looks solved, Redis can write again.");
server.aof_last_write_status = C_OK;
}
}
server.aof_current_size += nwritten; // advance the written offset
......
// Handle fsync.
try_fsync:
if (server.aof_no_fsync_on_rewrite && hasActiveChildProcess()) // configured not to fsync while a child process (e.g. an AOF rewrite) is active: skip the fsync
return;
if (server.aof_fsync == AOF_FSYNC_ALWAYS) { // 'always' policy: fsync is done here, by the caller's thread
......
// Perform the fsync.
if (redis_fsync(server.aof_fd) == -1) {
serverLog(LL_WARNING,"Can't persist AOF for fsync error when the "
"AOF fsync policy is 'always': %s. Exiting...", strerror(errno));
exit(1); // under 'always', an fsync failure terminates the process
}
......
// Record how far the file has been synced, and when.
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_fsync = server.unixtime;
} else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) { // at most one fsync per second (unixtime has one-second resolution)
if (!sync_in_progress) { // the background fsync is idle: hand the fsync over to it
aof_background_fsync(server.aof_fd);
server.aof_fsync_offset = server.aof_current_size; // advance the synced offset ---- TODO: what if the background fsync fails? The offset is updated at scheduling time, before the result is known.
}
// Updated even when the fsync was not scheduled because one is in progress.
server.aof_last_fsync = server.unixtime;
}
}
流程:
- aeEventLoop主线程处理网络请求,生成的aof日志会放到aof_buf中,处理完后在进入epoll_wait之前,会调用beforeSleep
- 在beforeSleep中调用flushAppendOnlyFile方法,来将aof_buf持久化到aof日志中
- write阶段
- 没有新增aof日志,中止操作 ---- 如果是everysec策略,还是会跳到fsync阶段,保证每秒执行fsync
- 配置everysec:判定fsync线程状态,如果fsync线程在执行中,则推迟本次write。交由定时任务serverCron去推动;或者下次新aof日志生成的时候才推动
- write异常处理:
- 配置always: 退出redis进程,报错
- 配置everysec / no: 交由serverCron后续定时重试,打印日志
- fsync阶段
- 如果配置了aof_no_fsync_on_rewrite,则当aof重写进程在运行中,则跳过fsync,此时掉电的话会丢失数据
- always策略,主线程来执行fsync,失败则退出redis进程报错
- everysec策略,判定距离上一次fsync是否已过了1s,满1s以上才交付给fsync线程去执行fsync
- 进度管理
- redis会维护文件已write,已fsync的文件偏移,每次操作成功就会去修改偏移
- 如果write是部分写入,redis会尝试回滚,在下次写入的时候再统一写 ---- TODO: 不太理解,为了避免aof_buf字符串操作开销?
- write阶段
- serverCron定时被主线程执行,主线程会检测aof_buf的写入状态,进行重试。重试的逻辑跟上面是一样的
总结:
- 主线程执行write,一般比较快不会阻塞
- fsync操作会阻塞线程,因此fsync操作的具体执行线程会根据配置而变化:
- always:主线程执行fsync,保证不丢日志
- everysec:交付给fsync线程去执行,掉电的话有可能会丢1秒钟内的日志。注:不掉电的话是不会丢数据的,因为主线程会阻塞write,无法响应后续的请求,也就没有新日志产生了。
- no:没有fsync阶段,交付给OS去调度刷盘,掉电的话丢失的日志量是不可预估的
展开说说
https://time.geekbang.org/column/article/287819 里面写到
AOF 重写会对磁盘进行大量 IO 操作,同时,fsync 又需要等到数据写到磁盘后才能返回,所以,当 AOF 重写的压力比较大时,就会导致 fsync 被阻塞。虽然 fsync 是由后台子线程负责执行的,但是,主线程会监控 fsync 的执行进度。
当主线程使用后台子线程执行了一次 fsync,需要再次把新接收的操作记录写回磁盘时,如果主线程发现上一次的 fsync 还没有执行完,那么它就会阻塞。所以,如果后台子线程执行的 fsync 频繁阻塞的话(比如 AOF 重写占用了大量的磁盘 IO 带宽),主线程也会阻塞,导致 Redis 性能变慢。
这里说得不严谨。事实上,这里主线程会阻塞,主要原因还是I/O压力大,fsync线程还未执行完毕只是表征。
- 源码中,如果fsync线程在运行中,主线程会推迟write操作,但推迟最终还是要执行的:最多推迟2s(见flushAppendOnlyFile中 `server.unixtime - server.aof_flush_postponed_start < 2` 的判断),之后就必定要执行write来避免丢数据了。
- 注意到fsync线程还未执行完,导致它未执行完的原因就是I/O压力大。
- 此时主线程执行write又增加了I/O压力 ---- write写入到pageCache,pageCache空间达到上限值没法容纳新内容,此时必须要将pageCache的内容刷到磁盘来腾出空间,进而才能继续执行write
- I/O压力大,无法快速响应write和fsync请求,此时就体现为主线程阻塞,进而redis无响应