Redis源码解析：不同策略的aof日志写入流程

stevenGsocute

已于 2023-12-02 14:55:53 修改

阅读量886

点赞数 26

文章标签： redis 数据库

于 2023-12-02 14:52:25 首次发布

本文链接：https://blog.csdn.net/stevenGsocute/article/details/134614907

版权

本文详细解析了Redis中AOF日志的写入策略，包括主进程如何处理AOF缓冲、fsync操作的时机选择以及在高I/O压力下的阻塞机制。讨论了always、everysec和nofsync策略对数据完整性和性能的影响。

摘要由CSDN通过智能技术生成

server.h

/* Excerpt of the global server state: only the two AOF-related fields
 * discussed in this article are shown, every other field is elided. */
struct redisServer{
	int aof_fd;       /* File descriptor of currently selected AOF file */
	sds aof_buf;      /* AOF buffer, written before entering the event loop */
	......
}

server.c

// Called by the main thread once it has finished processing client requests
// ---- TODO: which comes first, sending the reply to the client or beforeSleep?
// When AOF is enabled it flushes the AOF buffer without forcing (force == 0).
void beforeSleep(struct aeEventLoop *eventLoop) {
	......
	
    if (server.aof_state == AOF_ON)
       flushAppendOnlyFile(0);
       
	......
}

// Called periodically by the main thread.
// The excerpt shows the two places where it retries the AOF flush: when an
// earlier flush was postponed, and (once per 1000 ms period) when the last
// AOF write ended in error. Both retries pass force == 0.
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
	......
	
	/* AOF postponed flush: Try at every cron cycle if the slow fsync
     * completed. */
    if (server.aof_state == AOF_ON && server.aof_flush_postponed_start)
        flushAppendOnlyFile(0);

    /* AOF write errors: in this case we have a buffer to flush as well and
     * clear the AOF error in case of success to make the DB writable again,
     * however to try every second is enough in case of 'hz' is set to
     * a higher frequency. */
    run_with_period(1000) {
        if (server.aof_state == AOF_ON && server.aof_last_write_status == C_ERR)
            flushAppendOnlyFile(0);
    }
    
    ......
}

// Called when Redis is shutting down.
// The excerpt shows the AOF part: it refuses to exit (returns C_ERR) while the
// initial AOF is still being written, kills a running AOF rewrite child, then
// flushes the AOF buffer with force == 1 and fsyncs the AOF file descriptor.
int prepareForShutdown(int flags) {
	......
	
	if (server.aof_state != AOF_OFF) {
        /* Kill the AOF saving child as the AOF we already have may be longer
         * but contains the full dataset anyway. */
        if (server.child_type == CHILD_TYPE_AOF) {
            /* If we have AOF enabled but haven't written the AOF yet, don't
             * shutdown or else the dataset will be lost. */
            if (server.aof_state == AOF_WAIT_REWRITE) {
                serverLog(LL_WARNING, "Writing initial AOF, can't exit.");
                return C_ERR;
            }
            serverLog(LL_WARNING,
                "There is a child rewriting the AOF. Killing it!");
            killAppendOnlyChild();
        }
        /* Append only file: flush buffers and fsync() the AOF at exit */
        serverLog(LL_NOTICE,"Calling fsync() on the AOF file.");
        flushAppendOnlyFile(1);
        if (redis_fsync(server.aof_fd) == -1) {
            serverLog(LL_WARNING,"Fail to fsync the AOF file: %s.",
                                 strerror(errno));
        }
    }
    
    ......
}

aof.c

/* Write the pending AOF buffer (server.aof_buf) to the AOF file and then
 * fsync it according to the configured policy (server.aof_fsync).
 *
 * force: when zero and the policy is everysec, the write may be postponed
 *        (for up to 2 seconds) while a background fsync is still in progress;
 *        when non-zero the postponement logic is skipped.
 *
 * NOTE: this is an excerpt; the parts marked "......" (including the actual
 * write call that sets nwritten) are elided. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    mstime_t latency;

    if (sdslen(server.aof_buf) == 0) { // nothing new to append to the AOF
        if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
            server.aof_fsync_offset != server.aof_current_size &&
            server.unixtime > server.aof_last_fsync &&
            !(sync_in_progress = aofFsyncInProgress())) {
            goto try_fsync; // everysec policy: still attempt the fsync even though nothing was written
        } else {
            return;
        }
    }

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC) // query the state of the fsync thread
        sync_in_progress = aofFsyncInProgress();

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) { // the AOF flush policy is everysec (note: force differs per call site)
        if (sync_in_progress) { // an fsync is in progress ---- decide from the time already postponed whether the write can be postponed again
            if (server.aof_flush_postponed_start == 0) { // first postponement: record its timestamp and postpone
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) { // postponed for less than 2 seconds so far: keep postponing
                return;
            }
			// Cannot postpone any longer, the write must happen now.
			// Only the write has a postponement mechanism; the fsync has none.
			// The write is postponed depending on the progress of the fsync.
			// Whether an fsync is issued follows a fixed rule ---- the fsync thread is idle and the last fsync was at least 1 second ago.
            server.aof_delayed_fsync++; 
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }
    
    ......

    // Reaching this point means the AOF is being written: either nothing was postponed and we write now, or the postponement reached 2 seconds and the write is forced
    server.aof_flush_postponed_start = 0; // reset the postponement timestamp
	
	// handle the result of the write
    if (nwritten != (ssize_t)sdslen(server.aof_buf)) { // the buffer was not fully written
        static time_t last_write_error_log = 0;
        int can_log = 0;

        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) { // rate-limit the error log
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        if (nwritten == -1) { // TODO: writing to the file failed
            if (can_log) {
                serverLog(LL_WARNING,"Error writing to the AOF file: %s",
                          strerror(errno));
                server.aof_last_write_errno = errno;
            }
        } else { // partial write
            if (can_log) {
                serverLog(LL_WARNING,"Short write while writing to "
                                     "the AOF file: (nwritten=%lld, "
                                     "expected=%lld)",
                          (long long)nwritten,
                          (long long)sdslen(server.aof_buf));
            }

            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) { // roll back by truncating away the partially written data
                if (can_log) {
                    serverLog(LL_WARNING, "Could not remove short write "
                                          "from the append-only file.  Redis may refuse "
                                          "to load the AOF the next time it starts.  "
                                          "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftruncate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1; // rollback succeeded
            }
            server.aof_last_write_errno = ENOSPC;
        }

        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1); // TODO: with the always policy an incomplete write terminates the process; is a partial write not retried either?
        } else { // record the error state
            server.aof_last_write_status = C_ERR;

            if (nwritten > 0) { // the partial write could not be rolled back, so the written part has to be accepted
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1); // TODO: is dropping the already written part done to save aof_buf manipulation cost?
            }
            // wait for the next attempt to write
            return; 
        }
    } else { // fully written: switch back to the OK state
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        if (server.aof_last_write_status == C_ERR) {
            serverLog(LL_WARNING,
                      "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = C_OK;
        }
    }
    server.aof_current_size += nwritten; // advance the write progress

    ......
    
	// handle the fsync
    try_fsync:
    if (server.aof_no_fsync_on_rewrite && hasActiveChildProcess()) // configured not to fsync the AOF during a rewrite and a child process is active: give up the fsync
        return;

    if (server.aof_fsync == AOF_FSYNC_ALWAYS) { // always policy: the main thread performs the fsync
		......
		// perform the fsync
        if (redis_fsync(server.aof_fd) == -1) {
            serverLog(LL_WARNING,"Can't persist AOF for fsync error when the "
                                 "AOF fsync policy is 'always': %s. Exiting...", strerror(errno));
            exit(1); // with the always policy a failed fsync terminates the process
        }
		......
        // record the fsync progress
        server.aof_fsync_offset = server.aof_current_size;
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) { // at most one fsync per second
        if (!sync_in_progress) { // the fsync thread is idle: hand the fsync over to it
            aof_background_fsync(server.aof_fd);
            server.aof_fsync_offset = server.aof_current_size; // update the fsync progress ---- TODO: what if the fsync thread fails? Is it simply treated as a successful update?
        }
        server.aof_last_fsync = server.unixtime;
    }
}

流程：

aeEventLoop主线程处理网络请求，生成的aof日志会放到aof_buf中，处理完后在进入epoll_wait之前，会调用beforeSleep
在beforeSleep中调用flushAppendOnlyFile方法，来将aof_buf持久化到aof日志中
- write阶段
  - 没有新增aof日志，中止操作 ---- 如果是everysec策略，还是会跳到fsync阶段，保证每秒执行fsync
  - 配置everysec：判定fsync线程状态，如果fsync线程在执行中，则推迟本次write（最多推迟2秒）。交由定时任务serverCron去推动；或者下次新aof日志生成的时候才推动
  - write异常处理：
    - 配置always: 退出redis进程，报错
    - 配置everysec / no: 交由serverCron后续定时重试，打印日志
- fsync阶段
  - 如果配置了aof_no_fsync_on_rewrite，则当aof重写进程在运行中，则跳过fsync，此时掉电的话会丢失数据
  - always策略，主线程来执行fsync，失败则退出redis进程报错
  - everysec策略，判定距离上一次fsync是否已过了1s，满1s以上才交付给fsync线程去执行fsync
- 进度管理
  - redis会维护文件已write，已fsync的文件偏移，每次操作成功就会去修改偏移
  - 如果write是部分写入，redis会尝试回滚，在下次写入的时候再统一写 ---- TODO: 不太理解，为了避免aof_buf字符串操作开销？
serverCron定时被主线程执行，主线程会检测aof_buf的写入状态，进行重试。重试的逻辑跟上面是一样的

总结：

主线程执行write，一般比较快不会阻塞
fsync操作会阻塞线程，因此fsync操作的具体执行线程会根据配置而变化：
- always：主线程执行fsync，保证不丢日志
- everysec：交付给fsync线程去执行，掉电的话有可能会丢1秒钟内的日志（若fsync线程繁忙导致write被推迟，按上面源码最多推迟2秒，最坏情况下可能丢失约2秒的日志）。注：不掉电的话，已经write成功的数据是不会丢的，因为主线程会阻塞在write上，无法响应后续的请求，也就没有新日志产生了。
- no：没有fsync阶段，交付给OS去调度刷盘，掉电的话丢失的日志量是不可预估的

展开说说

https://time.geekbang.org/column/article/287819 里面写到

AOF 重写会对磁盘进行大量 IO 操作，同时，fsync 又需要等到数据写到磁盘后才能返回，所以，当 AOF 重写的压力比较大时，就会导致 fsync 被阻塞。虽然 fsync 是由后台子线程负责执行的，但是，主线程会监控 fsync 的执行进度。
当主线程使用后台子线程执行了一次 fsync，需要再次把新接收的操作记录写回磁盘时，如果主线程发现上一次的 fsync 还没有执行完，那么它就会阻塞。所以，如果后台子线程执行的 fsync 频繁阻塞的话（比如 AOF 重写占用了大量的磁盘 IO 带宽），主线程也会阻塞，导致 Redis 性能变慢。

这里说得不严谨。事实上，这里主线程会阻塞，主要原因还是I/O压力大，fsync线程还未执行完毕只是表征。

源码中，如果fsync线程在运行中，主线程会推迟write操作，但推迟最终还是要执行的，最多推迟2s（见上面 aof_flush_postponed_start 的判断）后就必定要执行write来避免丢数据了。
注意到fsync线程还未执行完，导致它未执行完的原因就是I/O压力大。
此时主线程执行write又增加了I/O压力 ---- write写入到pageCache，pageCache空间达到上限值没法容纳新内容，此时必须要将pageCache的内容刷到磁盘来腾出空间，进而才能继续执行write
I/O压力大，无法快速响应write和fsync请求，此时就体现为主线程阻塞，进而redis无响应