redis stream持久化_[灌水] Redis 的持久化

关于 Redis 的 Persistence, 最好的入门材料应该是:

它有两种形式:AOF 和 RDB。RDB 相对来说是一个数据库的“snapshot”,通过 SAVE 或者 BGSAVE 生成。在这个帖子里指出:redis.conf 中的 dbfilename 选项指定了快照文件名,它的默认值是 dump.rdb。具体来说,BGSAVE 时 Redis 先 fork,然后利用 fork 的 COW(copy-on-write),在子进程里写一个临时 rdb 文件,写入完成后再用它 replace(rename)原来的 rdb 文件。

AOF 又叫 append-only file,需要配置 appendonly yes 才会启用。它相当于一个顺序追加写入的日志文件:Redis 每次收到 SET 等更改数据的命令时,都会把该命令追加到 AOF 中;至于多久刷一次盘,则由下面的 appendfsync 粒度决定。

restart redis的时候,它会重放日志

append 本身写盘是肯定比 random access 快的,但是每次 fsync 仍然要付出很高代价,所以 Redis 允许指定不同的 fsync 策略,通过 appendfsync 参数设置:always 每次写入后都刷盘

everysec 一秒一次

appendfsync no: Never fsync. 这不代表不刷新,只是交给操作系统处理

Normally Linux will flush data every 30 seconds with this configuration, but it's up to the kernel exact tuning.

(我以为几秒刷一次,竟然这么慢...)

redis 官网建议一秒刷新一次,反正我也觉得如果做缓存的话是不是可以容忍这种不刷新...AOF 暗示它会无限增长,那么肯定需要某种策略处理:像 Raft 一样的系统支持将一批 Log 构建成一个 snap, 这个策略暗示 AOF 可能整个再度成为一个 rdb

BGREWRITEAOF 会在后台重建(重写)AOF,使其变小。Redis 还提供了 redis-check-aof 工具,用来检查(并在需要时修复)AOF 数据的完整性问题。

AOF 文件的格式和 Redis 的通信协议(RESP)是一样的

AOF 的实现是通过:会有一个 appendonly.aof 文件,这个在 redis.conf的 dir 和 appendfilename 选项中被定义

重写(rewrite)时,会 fork 出一个子进程,由它负责把新的 AOF 写入一个临时文件;与此同时,父进程一边继续把新的写命令追加到旧的 AOF 文件,一边把这些增量累积在内存 buffer 中,等子进程写完后再把 buffer 追加到新文件,并用新文件替换旧文件。

所以:AOF 和 RDB 是两种不同的策略(这和我之前了解到的不一样),AOF 逻辑类似 Raft Snapshot 那一套;RDB 是对整个系统备份,相对来说 RDB 结构肯定会小一些。

RDB 在同步数据方面有很大的价值,通常 RDB 的备份会从 master 被发送给 slave,作为一种初始化

RDB 和 AOF 实际 BGSAVE 或者 AOFREWRITE 的时候互相不会干扰

下面是几张和上述流程有关的图,从书上抄下来的。

RDB

rdb 的定义部分在 rdb.c 和 rdb.h 两个文件中,rdb.h 定义了很多 msb 及其他相关的宏。同时 server.h 本身也承担了一部分 RDB 的逻辑。

在这个版本的 1196行之后,标注了对应的逻辑:

/* RDB persistence */

long long dirty; /* Changes to DB from the last save */

long long dirty_before_bgsave; /* Used to restore dirty on failed BGSAVE */

pid_t rdb_child_pid; /* PID of RDB saving child */

struct saveparam *saveparams; /* Save points array for RDB */

int saveparamslen; /* Number of saving points */

char *rdb_filename; /* Name of RDB file */

int rdb_compression; /* Use compression in RDB? */

int rdb_checksum; /* Use RDB checksum? */

time_t lastsave; /* Unix time of last successful save */

time_t lastbgsave_try; /* Unix time of last attempted bgsave */

time_t rdb_save_time_last; /* Time used by last RDB save run. */

time_t rdb_save_time_start; /* Current RDB save start time. */

int rdb_bgsave_scheduled; /* BGSAVE when possible if true. */

int rdb_child_type; /* Type of save by active child. */

int lastbgsave_status; /* C_OK or C_ERR */

int stop_writes_on_bgsave_err; /* Don't allow writes if can't BGSAVE */

int rdb_pipe_write; /* RDB pipes used to transfer the rdb */

int rdb_pipe_read; /* data to the parent process in diskless repl. */

connection **rdb_pipe_conns; /* Connections which are currently the */

int rdb_pipe_numconns; /* target of diskless rdb fork child. */

int rdb_pipe_numconns_writing; /* Number of rdb conns with pending writes. */

char *rdb_pipe_buff; /* In diskless replication, this buffer holds data */

int rdb_pipe_bufflen; /* that was read from the the rdb pipe. */

int rdb_key_save_delay; /* Delay in microseconds between keys while

* writing the RDB. (for testings) */

int key_load_delay; /* Delay in microseconds between keys while

* loading aof or rdb. (for testings) */

dirty 和 dirty_before_bgsave 对应两套逻辑,但都是对写入(变更)次数的计数:dirty 记录自上次 save 以来 DB 的变更次数,dirty_before_bgsave 用于在 BGSAVE 失败时恢复 dirty。

由于之前说的 fork, 所以需要 child_pid

saveparams 是 save 相关的参数

同时,server.h 定义了 save 相关的 info, 表示 rdb 的 metadata.

/* This structure can be optionally passed to RDB save/load functions in

* order to implement additional functionalities, by storing and loading

* metadata to the RDB file.

*

* Currently the only use is to select a DB at load time, useful in

* replication in order to make sure that chained slaves (slaves of slaves)

* select the correct DB and are able to accept the stream coming from the

* top-level master. */

typedef struct rdbSaveInfo {

/* Used both when saving and when loading. */

int repl_stream_db; /* DB to select in server.master client. */

/* Used only when loading. */

int repl_id_is_set; /* True if the repl_id field below is set. */

char repl_id[CONFIG_RUN_ID_SIZE+1]; /* Replication ID; the +1 is presumably
                                     * room for a NUL terminator (confirm). */

long long repl_offset; /* Replication offset. */

} rdbSaveInfo;

rdbSave 这个函数具体流程不长,可以看看:

/* Save the DB on disk. Return C_ERR on error, C_OK on success.
 *
 * The dump is first written to "temp-<pid>.rdb" in the current working
 * directory and only renamed onto 'filename' once it has been fully written,
 * flushed, fsync()ed and closed, so an existing 'filename' is never replaced
 * by a partially written file.
 *
 * 'rsi' is passed through unchanged to rdbSaveRio().
 *
 * On success server.dirty is reset to zero and server.lastsave /
 * server.lastbgsave_status are updated. On a write error the temp file is
 * removed. */
int rdbSave(char *filename, rdbSaveInfo *rsi) {
    char tmpfile[256];
    char cwd[MAXPATHLEN]; /* Current working dir path for error messages. */
    FILE *fp = NULL;
    rio rdb;
    int error = 0;

    snprintf(tmpfile,sizeof(tmpfile),"temp-%d.rdb", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        char *cwdp = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Failed opening the RDB file %s (in server root dir %s) "
            "for saving: %s",
            filename,
            cwdp ? cwdp : "unknown",
            strerror(errno));
        return C_ERR;
    }

    rioInitWithFile(&rdb,fp);
    startSaving(RDBFLAGS_NONE);

    if (server.rdb_save_incremental_fsync)
        rioSetAutoSync(&rdb,REDIS_AUTOSYNC_BYTES);

    if (rdbSaveRio(&rdb,&error,RDBFLAGS_NONE,rsi) == C_ERR) {
        /* rdbSaveRio() reported the errno of the failed I/O via 'error';
         * restore it so the log line under 'werr' prints the right cause. */
        errno = error;
        goto werr;
    }

    /* Make sure data will not remain on the OS's output buffers */
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;

    /* After fclose() the stream must not be touched again, whether or not
     * the call succeeded. Forget the pointer before jumping to the error
     * path so that the cleanup code does not close it a second time. */
    if (fclose(fp) == EOF) {
        fp = NULL;
        goto werr;
    }
    fp = NULL;

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
        char *cwdp = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Error moving temp DB file %s on the final "
            "destination %s (in server root dir %s): %s",
            tmpfile,
            filename,
            cwdp ? cwdp : "unknown",
            strerror(errno));
        unlink(tmpfile);
        stopSaving(0);
        return C_ERR;
    }

    serverLog(LL_NOTICE,"DB saved on disk");
    server.dirty = 0;
    server.lastsave = time(NULL);
    server.lastbgsave_status = C_OK;
    stopSaving(1);
    return C_OK;

werr:
    serverLog(LL_WARNING,"Write error saving DB on disk: %s", strerror(errno));
    if (fp) fclose(fp); /* Only close a stream that is still open. */
    unlink(tmpfile);
    stopSaving(0);
    return C_ERR;
}

其中,通过对 全局 server 的读写,完成操作。

/* Global vars: file-scope objects defined here without 'static'. */

struct redisServer server; /* Server global state */

volatile unsigned long lru_clock; /* Server global current LRU time.
                                   * NOTE(review): 'volatile' alone does not
                                   * synchronize access between threads --
                                   * confirm how this variable is shared. */

同时,这里有 rdbSaveRio, 这是最主要的代码段:

/* Serialize the dataset in RDB format onto the Redis I/O channel 'rdb'.
 *
 * Returns C_OK when every piece was written. Returns C_ERR as soon as a
 * write fails; in that case the output may be partially or entirely missing
 * and, if 'error' is not NULL, *error is set to the value of errno observed
 * right after the failure.
 *
 * Output order: magic + version, info aux fields, "before" module aux data,
 * every non-empty DB, Lua scripts (only when 'rsi' is given), "after" module
 * aux data, EOF opcode, 8-byte checksum. */
int rdbSaveRio(rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi) {
    dictIterator *iter = NULL;
    dictEntry *entry;
    char magic[10];
    uint64_t cksum;
    size_t last_diff_bytes = 0;
    int dbid;

    if (server.rdb_checksum) {
        rdb->update_cksum = rioGenericUpdateChecksum;
    }

    /* Header: "REDIS" followed by the zero-padded 4-digit version, 9 bytes. */
    snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION);
    if (rdbWriteRaw(rdb,magic,9) == -1) goto werr;
    if (rdbSaveInfoAuxFields(rdb,rdbflags,rsi) == -1) goto werr;
    if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_BEFORE_RDB) == -1) goto werr;

    for (dbid = 0; dbid < server.dbnum; dbid++) {
        redisDb *db = &server.db[dbid];
        dict *keyspace = db->dict;
        uint64_t db_size, expires_size;

        /* Empty databases leave no trace in the file. */
        if (dictSize(keyspace) == 0) continue;

        iter = dictGetSafeIterator(keyspace);

        /* SELECT DB opcode followed by the DB index. */
        if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr;
        if (rdbSaveLen(rdb,dbid) == -1) goto werr;

        /* RESIZE DB opcode followed by the key count and the expires count.
         * These sizes are just hints used to resize the hash tables, they do
         * not limit the actual size of the DB to load. */
        db_size = dictSize(db->dict);
        expires_size = dictSize(db->expires);
        if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr;
        if (rdbSaveLen(rdb,db_size) == -1) goto werr;
        if (rdbSaveLen(rdb,expires_size) == -1) goto werr;

        /* Emit every key/value pair of this DB. */
        for (entry = dictNext(iter); entry != NULL; entry = dictNext(iter)) {
            sds keystr = dictGetKey(entry);
            robj *val = dictGetVal(entry);
            robj key;
            long long expire;

            initStaticStringObject(key,keystr);
            expire = getExpire(db,&key);
            if (rdbSaveKeyValuePair(rdb,&key,val,expire) == -1) goto werr;

            /* When this RDB is the preamble of an AOF rewrite, periodically
             * pull the diff accumulated by the parent so that the final
             * write is smaller. */
            if ((rdbflags & RDBFLAGS_AOF_PREAMBLE) &&
                rdb->processed_bytes >
                    last_diff_bytes+AOF_READ_DIFF_INTERVAL_BYTES)
            {
                last_diff_bytes = rdb->processed_bytes;
                aofReadDiffFromParent();
            }
        }

        dictReleaseIterator(iter);
        iter = NULL; /* Prevents a second release on the error path. */
    }

    /* When replication information is being stored, persist the script
     * cache too: after a restart followed by a successful PSYNC we must be
     * able to process any EVALSHA found in the replication backlog sent by
     * the master. */
    if (rsi && dictSize(server.lua_scripts)) {
        iter = dictGetIterator(server.lua_scripts);
        for (entry = dictNext(iter); entry != NULL; entry = dictNext(iter)) {
            robj *body = dictGetVal(entry);

            if (rdbSaveAuxField(rdb,"lua",3,body->ptr,sdslen(body->ptr)) == -1)
                goto werr;
        }
        dictReleaseIterator(iter);
        iter = NULL; /* Prevents a second release on the error path. */
    }

    if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_AFTER_RDB) == -1) goto werr;

    /* EOF opcode */
    if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr;

    /* Trailing CRC64 checksum. It is zero when checksum computation is
     * disabled; the loading code skips the check in that case. */
    cksum = rdb->cksum;
    memrev64ifbe(&cksum);
    if (rioWrite(rdb,&cksum,8) == 0) goto werr;
    return C_OK;

werr:
    if (error) *error = errno;
    if (iter) dictReleaseIterator(iter);
    return C_ERR;
}

抛开 rio 相关的逻辑,这一段的流程大致是:

先用 snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION) 生成魔数,再通过 rdbWriteRaw 写入 "REDIS" 和 4 位的 version(共 9 字节)

写入 redis 对应的 flag

针对每个 DB 写入

AOF

在beforeSleep 中,有:

flushAppendOnlyFile(0);

其中 0 表示 not force.

server.aof_buf 存储着尚未写入文件的 aof 数据:执行 SET 等写命令的时候,对应的命令会先被追加到 aof_buf,再由 flushAppendOnlyFile 写入磁盘。

AOF 具体逻辑如下:

/* Write the append only file buffer on disk.
 *
 * Since we are required to write the AOF before replying to the client,
 * and the only way the client socket can get a write is when entering the
 * event loop, we accumulate all the AOF writes in a memory buffer
 * (server.aof_buf) and write it on disk using this function just before
 * entering the event loop again.
 *
 * About the 'force' argument:
 *
 * When the fsync policy is set to 'everysec' we may delay the flush if there
 * is still an fsync() going on in the background thread, since for instance
 * on Linux write(2) will be blocked by the background fsync anyway.
 * When this happens we remember that there is some aof buffer to be
 * flushed ASAP, and will try to do that in the serverCron() function.
 *
 * However if force is set to 1 we'll write regardless of the background
 * fsync.
 *
 * On a failed or short write the error code is always recorded in
 * server.aof_last_write_errno, even when the log line itself is suppressed
 * by the logging rate limit. With the 'always' policy a write error is
 * fatal and the process exits; otherwise server.aof_last_write_status is
 * set to C_ERR and the remaining data is kept in the buffer for a retry. */
#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    int write_errno = 0; /* errno captured right after aofWrite(). */
    mstime_t latency;

    if (sdslen(server.aof_buf) == 0) {
        /* Check if we need to do fsync even the aof buffer is empty,
         * because previously in AOF_FSYNC_EVERYSEC mode, fsync is
         * called only when aof buffer is not empty, so if users
         * stop write commands before fsync called in one second,
         * the data in page cache cannot be flushed in time. */
        if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
            server.aof_fsync_offset != server.aof_current_size &&
            server.unixtime > server.aof_last_fsync &&
            !(sync_in_progress = aofFsyncInProgress())) {
            goto try_fsync;
        } else {
            return;
        }
    }

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = aofFsyncInProgress();

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall through, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */

    if (server.aof_flush_sleep && sdslen(server.aof_buf)) {
        usleep(server.aof_flush_sleep);
    }

    latencyStartMonitor(latency);
    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    /* Capture errno immediately: the calls below may overwrite it before
     * the error handling code gets to look at it. */
    write_errno = errno;
    latencyEndMonitor(latency);

    /* We want to capture different events for delayed writes:
     * when the delay happens with a pending fsync, or with a saving child
     * active, and when the above two conditions are missing.
     * We also use an additional event name to save all samples which is
     * useful for graphing / monitoring purposes. */
    if (sync_in_progress) {
        latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);
    } else if (hasActiveChildProcess()) {
        latencyAddSampleIfNeeded("aof-write-active-child",latency);
    } else {
        latencyAddSampleIfNeeded("aof-write-alone",latency);
    }
    latencyAddSampleIfNeeded("aof-write",latency);

    /* We performed the write so reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
        static time_t last_write_error_log = 0;
        int can_log = 0;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Log the AOF write error and record the error code. */
        if (nwritten == -1) {
            if (can_log) {
                serverLog(LL_WARNING,"Error writing to the AOF file: %s",
                    strerror(write_errno));
            }
            /* Record the error code unconditionally: rate limiting must
             * only silence the log line, not leave a stale errno behind. */
            server.aof_last_write_errno = write_errno;
        } else {
            if (can_log) {
                serverLog(LL_WARNING,"Short write while writing to "
                                     "the AOF file: (nwritten=%lld, "
                                     "expected=%lld)",
                                     (long long)nwritten,
                                     (long long)sdslen(server.aof_buf));
            }

            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    serverLog(LL_WARNING, "Could not remove short write "
                             "from the append-only file. Redis may refuse "
                             "to load the AOF the next time it starts. "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftruncate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synced on disk. */
            serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = C_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        if (server.aof_last_write_status == C_ERR) {
            serverLog(LL_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = C_OK;
        }
    }
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

try_fsync:
    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    if (server.aof_no_fsync_on_rewrite && hasActiveChildProcess())
        return;

    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* redis_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        latencyStartMonitor(latency);
        redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_fsync_offset = server.aof_current_size;
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        if (!sync_in_progress) {
            aof_background_fsync(server.aof_fd);
            server.aof_fsync_offset = server.aof_current_size;
        }
        server.aof_last_fsync = server.unixtime;
    }
}

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值