redis stream持久化_[灌水] Redis 的持久化

最新推荐文章于 2023-07-12 08:24:28 发布

希夷文化苑

最新推荐文章于 2023-07-12 08:24:28 发布

阅读量2.8k

点赞数 1

文章标签： redis stream持久化

本文链接：https://blog.csdn.net/weixin_42395219/article/details/112932719

版权

关于 Redis 的 Persistence, 最好的入门材料应该是：

它有两种形式，AOF 和 RDB：RDB 相对来说是一个数据库的“snapshot”，通过 SAVE 或者 BGSAVE 存储，在这个贴指出 Redis 有一个 redis.conf，存储了具体的dbfilename , 它的默认值是dump.rdb。具体来说，Redis 是靠 fork, 然后利用 fork 的 COW, 在子进程写一个 tmp rdb 文件，然后写入完成后 replace 原来的 rdb 文件

AOF 又叫 append-only file，它以一定的粒度写入，需要配置 appendonly yes 才会启用。它相当于一个顺序追加写入的文件：Redis 每次收到 SET 等更改数据的命令时，都会把该命令追加（apply）到 AOF。

restart redis的时候，它会重放日志

append 本身写盘肯定比 random access 快，但是每次 fsync 仍然要付出很高的代价，所以 Redis 允许通过 appendfsync 参数指定不同的 fsync 策略：always 每次写入都刷新

everysec 一秒一次

appendfsync no: Never fsync. 这不代表不刷新，只是交给操作系统处理

Normally Linux will flush data every 30 seconds with this configuration, but it's up to the kernel exact tuning.

(我以为几秒刷一次，竟然这么慢...)

redis 官网建议一秒刷新一次，反正我也觉得如果做缓存的话是不是可以容忍这种不刷新...AOF 暗示它会无限增长，那么肯定需要某种策略处理：像 Raft 一样的系统支持将一批 Log 构建成一个 snap, 这个策略暗示 AOF 可能整个再度成为一个 rdb

BGREWRITEAOF 会重建 AOF。Redis 能够 check aof, 检查 aof 数据的完整性是否有问题

AOF 的文件格式和 Redis 的通信协议（RESP）是一样的

AOF 的实现是通过：会有一个 appendonly.aof 文件，这个在 redis.conf的 dir 和 appendfilename 选项中被定义

重写 AOF 时会 fork 出一个子进程，由它负责把新的 AOF 写入文件；与此同时，父进程一边继续向原来的 AOF 写入，一边把新增的修改记录在内存 buffer 中

所以:AOF 和 RDB 是两种不同的策略(这和我之前了解到的不一样)，AOF 逻辑类似 Raft Snapshot 那一套；RDB 是对整个系统备份，相对来说 RDB 结构肯定会小一些。

RDB 在同步数据方面有很大的价值，通常 RDB 的备份会从 master 被发送给 slave，作为一种初始化

RDB 和 AOF 实际 BGSAVE 或者 AOFREWRITE 的时候互相不会干扰

下面是几张和上述流程有关的图，从书上抄下来的。

RDB

rdb 的定义部分在 rdb.c 和 rdb.h 两个文件中，rdb.h 定义了很多 msb 及其他相关的宏。同时 server.h 本身也承担了一部分 RDB 的逻辑。

在这个版本的 1196行之后，标注了对应的逻辑：

/* RDB persistence */

long long dirty; /* Changes to DB from the last save */

long long dirty_before_bgsave; /* Used to restore dirty on failed BGSAVE */

pid_t rdb_child_pid; /* PID of RDB saving child */

struct saveparam *saveparams; /* Save points array for RDB */

int saveparamslen; /* Number of saving points */

char *rdb_filename; /* Name of RDB file */

int rdb_compression; /* Use compression in RDB? */

int rdb_checksum; /* Use RDB checksum? */

time_t lastsave; /* Unix time of last successful save */

time_t lastbgsave_try; /* Unix time of last attempted bgsave */

time_t rdb_save_time_last; /* Time used by last RDB save run. */

time_t rdb_save_time_start; /* Current RDB save start time. */

int rdb_bgsave_scheduled; /* BGSAVE when possible if true. */

int rdb_child_type; /* Type of save by active child. */

int lastbgsave_status; /* C_OK or C_ERR */

int stop_writes_on_bgsave_err; /* Don't allow writes if can't BGSAVE */

int rdb_pipe_write; /* RDB pipes used to transfer the rdb */

int rdb_pipe_read; /* data to the parent process in diskless repl. */

connection **rdb_pipe_conns; /* Connections which are currently the */

int rdb_pipe_numconns; /* target of diskless rdb fork child. */

int rdb_pipe_numconns_writing; /* Number of rdb conns with pending writes. */

char *rdb_pipe_buff; /* In diskless replication, this buffer holds data */

int rdb_pipe_bufflen; /* that was read from the the rdb pipe. */

int rdb_key_save_delay; /* Delay in microseconds between keys while

* writing the RDB. (for testings) */

int key_load_delay; /* Delay in microseconds between keys while

* loading aof or rdb. (for testings) */

dirty 和 dirty_before_bgsave 对应两套逻辑，但都表示与 save 相关的写入（变更）次数：前者是自上次 save 以来的变更数，后者用于在 BGSAVE 失败时恢复 dirty。

由于之前说的 fork, 所以需要 child_pid

saveparams 是 save 相关的参数

同时，server.h 定义了 save 相关的 info, 表示 rdb 的 metadata.

/* This structure can be optionally passed to RDB save/load functions in
 * order to implement additional functionalities, by storing and loading
 * metadata to the RDB file.
 *
 * Currently the only use is to select a DB at load time, useful in
 * replication in order to make sure that chained slaves (slaves of slaves)
 * select the correct DB and are able to accept the stream coming from the
 * top-level master.
 *
 * The fields are split in two groups: one used both when saving and when
 * loading, and one that is only meaningful when loading. */
typedef struct rdbSaveInfo {
    /* Used saving and loading. */
    int repl_stream_db;  /* DB to select in server.master client. */

    /* Used only loading. */
    int repl_id_is_set;  /* True if repl_id field is set. */
    char repl_id[CONFIG_RUN_ID_SIZE+1];     /* Replication ID; the extra
                                             * byte leaves room for a
                                             * terminator. */
    long long repl_offset;                  /* Replication offset. */
} rdbSaveInfo;

rdbSave 这个函数具体流程不长，可以看看：

/* Save the DB on disk. Return C_ERR on error, C_OK on success.
 *
 * The dump is first written to a temporary file named "temp-<pid>.rdb" in
 * the current working directory, flushed and fsync-ed, and only then
 * renamed over 'filename', so the destination is replaced atomically and
 * only when the new dump was generated correctly.
 *
 * 'filename' is the final path of the RDB file.
 * 'rsi' is forwarded untouched to rdbSaveRio() and may carry replication
 * metadata to store in the file.
 *
 * On success server.dirty is reset, server.lastsave is updated and
 * server.lastbgsave_status is set to C_OK. On failure the temporary file is
 * removed and stopSaving(0) is called (unless the temporary file could not
 * even be opened, in which case saving was never started). */
int rdbSave(char *filename, rdbSaveInfo *rsi) {
    char tmpfile[256];
    char cwd[MAXPATHLEN]; /* Current working dir path for error messages. */
    FILE *fp = NULL;
    rio rdb;
    int error = 0;

    snprintf(tmpfile,sizeof(tmpfile),"temp-%d.rdb", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        char *cwdp = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Failed opening the RDB file %s (in server root dir %s) "
            "for saving: %s",
            filename,
            cwdp ? cwdp : "unknown",
            strerror(errno));
        return C_ERR;
    }

    rioInitWithFile(&rdb,fp);
    startSaving(RDBFLAGS_NONE);

    if (server.rdb_save_incremental_fsync)
        rioSetAutoSync(&rdb,REDIS_AUTOSYNC_BYTES);

    if (rdbSaveRio(&rdb,&error,RDBFLAGS_NONE,rsi) == C_ERR) {
        /* rdbSaveRio() reports the errno of the failed I/O via 'error':
         * restore it so the log line in 'werr' shows the real cause. */
        errno = error;
        goto werr;
    }

    /* Make sure data will not remain on the OS's output buffers */
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;
    /* After fclose() the stream is no longer usable, even when the call
     * fails: clear 'fp' so the error path never closes it a second time. */
    if (fclose(fp) == EOF) { fp = NULL; goto werr; }
    fp = NULL;

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
        char *cwdp = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Error moving temp DB file %s on the final "
            "destination %s (in server root dir %s): %s",
            tmpfile,
            filename,
            cwdp ? cwdp : "unknown",
            strerror(errno));
        unlink(tmpfile);
        stopSaving(0);
        return C_ERR;
    }

    serverLog(LL_NOTICE,"DB saved on disk");
    server.dirty = 0;
    server.lastsave = time(NULL);
    server.lastbgsave_status = C_OK;
    stopSaving(1);
    return C_OK;

werr:
    serverLog(LL_WARNING,"Write error saving DB on disk: %s", strerror(errno));
    if (fp) fclose(fp);
    unlink(tmpfile);
    stopSaving(0);
    return C_ERR;
}

其中，通过对全局 server 的读写，完成操作。

/* Global vars */

struct redisServer server; /* Server global state */

volatile unsigned long lru_clock; /* Server global current LRU time. */

同时，这里有 rdbSaveRio, 这是最主要的代码段：

/* Produces a dump of the database in RDB format sending it to the specified
 * Redis I/O channel. On success C_OK is returned, otherwise C_ERR
 * is returned and part of the output, or all the output, can be
 * missing because of I/O errors.
 *
 * The stream is written in this order:
 *   1. the 9 byte magic "REDIS" + 4 digit RDB_VERSION;
 *   2. the info aux fields and the modules "before RDB" aux data;
 *   3. for every non empty DB: a SELECTDB opcode with the DB index, a
 *      RESIZEDB opcode with the sizes of the main and expires dictionaries,
 *      then every key/value pair together with its expire;
 *   4. when 'rsi' is not NULL, one "lua" aux field per cached script;
 *   5. the modules "after RDB" aux data, the EOF opcode and the 8 byte
 *      checksum.
 *
 * 'rdbflags' is forwarded to the aux fields writer; when it contains
 * RDBFLAGS_AOF_PREAMBLE the accumulated diff is also read from the parent
 * every AOF_READ_DIFF_INTERVAL_BYTES of produced output.
 *
 * When the function returns C_ERR and if 'error' is not NULL, the
 * integer pointed by 'error' is set to the value of errno just after the I/O
 * error. */
int rdbSaveRio(rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi) {
    dictIterator *di = NULL;
    dictEntry *de;
    char magic[10];
    int j;
    uint64_t cksum;
    size_t processed = 0;

    if (server.rdb_checksum)
        rdb->update_cksum = rioGenericUpdateChecksum;
    snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION);
    if (rdbWriteRaw(rdb,magic,9) == -1) goto werr;
    if (rdbSaveInfoAuxFields(rdb,rdbflags,rsi) == -1) goto werr;
    if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_BEFORE_RDB) == -1) goto werr;

    for (j = 0; j < server.dbnum; j++) {
        redisDb *db = server.db+j;
        dict *d = db->dict;
        if (dictSize(d) == 0) continue;
        di = dictGetSafeIterator(d);

        /* Write the SELECT DB opcode */
        if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr;
        if (rdbSaveLen(rdb,j) == -1) goto werr;

        /* Write the RESIZE DB opcode followed by the number of entries in
         * the main dictionary and in the expires dictionary. The sizes are
         * written as they are (no trimming is performed here); presumably
         * the loader only uses them as hints to resize the hash tables. */
        uint64_t db_size, expires_size;
        db_size = dictSize(db->dict);
        expires_size = dictSize(db->expires);
        if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr;
        if (rdbSaveLen(rdb,db_size) == -1) goto werr;
        if (rdbSaveLen(rdb,expires_size) == -1) goto werr;

        /* Iterate this DB writing every entry */
        while((de = dictNext(di)) != NULL) {
            sds keystr = dictGetKey(de);
            robj key, *o = dictGetVal(de);
            long long expire;

            initStaticStringObject(key,keystr);
            expire = getExpire(db,&key);
            if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr;

            /* When this RDB is produced as part of an AOF rewrite, move
             * accumulated diff from parent to child while rewriting in
             * order to have a smaller final write. */
            if (rdbflags & RDBFLAGS_AOF_PREAMBLE &&
                rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES)
            {
                processed = rdb->processed_bytes;
                aofReadDiffFromParent();
            }
        }
        /* The iterator must be released only once the whole DB has been
         * walked: releasing it inside the loop would hand a NULL iterator
         * to the next dictNext() call. */
        dictReleaseIterator(di);
        di = NULL; /* So that we don't release it again on error. */
    }

    /* If we are storing the replication information on disk, persist
     * the script cache as well: on successful PSYNC after a restart, we need
     * to be able to process any EVALSHA inside the replication backlog the
     * master will send us. */
    if (rsi && dictSize(server.lua_scripts)) {
        di = dictGetIterator(server.lua_scripts);
        while((de = dictNext(di)) != NULL) {
            robj *body = dictGetVal(de);
            if (rdbSaveAuxField(rdb,"lua",3,body->ptr,sdslen(body->ptr)) == -1)
                goto werr;
        }
        dictReleaseIterator(di);
        di = NULL; /* So that we don't release it again on error. */
    }

    if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_AFTER_RDB) == -1) goto werr;

    /* EOF opcode */
    if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr;

    /* CRC64 checksum. It will be zero if checksum computation is disabled, the
     * loading code skips the check in this case. */
    cksum = rdb->cksum;
    memrev64ifbe(&cksum);
    if (rioWrite(rdb,&cksum,8) == 0) goto werr;
    return C_OK;

werr:
    /* Capture errno first: releasing the iterator could alter it. */
    if (error) *error = errno;
    if (di) dictReleaseIterator(di);
    return C_ERR;
}

屏蔽 rio 相关的逻辑，然后这一段可以参考：snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION); 在 rdbWriteRaw 写入 REDIS 和 version

写入 redis 对应的 flag

针对每个 DB 写入

AOF

在beforeSleep 中，有：

flushAppendOnlyFile(0);

其中 0 表示 not force.

server.aof_buf 存储着对应的 aof 数据，出现 set 指令的时候，aof_buf 会被记录。

AOF 具体逻辑如下：

/* Write the append only file buffer on disk.
 *
 * Since we are required to write the AOF before replying to the client,
 * and the only way the client socket can get a write is when entering
 * the event loop, we accumulate all the AOF writes in a memory
 * buffer and write it on disk using this function just before entering
 * the event loop again.
 *
 * About the 'force' argument:
 *
 * When the fsync policy is set to 'everysec' we may delay the flush if there
 * is still an fsync() going on in the background thread, since for instance
 * on Linux write(2) will be blocked by the background fsync anyway.
 * When this happens we remember that there is some aof buffer to be
 * flushed ASAP, and will try to do that in the serverCron() function.
 *
 * However if force is set to 1 we'll write regardless of the background
 * fsync.
 *
 * On a failed or short write the function records the error in
 * server.aof_last_write_errno and server.aof_last_write_status, then either
 * exits the process (fsync policy 'always') or returns leaving the unwritten
 * data in server.aof_buf so that the next call retries. */
#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    mstime_t latency;

    if (sdslen(server.aof_buf) == 0) {
        /* Check if we need to do fsync even the aof buffer is empty,
         * because previously in AOF_FSYNC_EVERYSEC mode, fsync is
         * called only when aof buffer is not empty, so if users
         * stop write commands before fsync called in one second,
         * the data in page cache cannot be flushed in time. */
        if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
            server.aof_fsync_offset != server.aof_current_size &&
            server.unixtime > server.aof_last_fsync &&
            !(sync_in_progress = aofFsyncInProgress())) {
            goto try_fsync;
        } else {
            return;
        }
    }

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = aofFsyncInProgress();

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall through, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */

    if (server.aof_flush_sleep && sdslen(server.aof_buf)) {
        usleep(server.aof_flush_sleep);
    }

    latencyStartMonitor(latency);
    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    latencyEndMonitor(latency);

    /* We want to capture different events for delayed writes:
     * when the delay happens with a pending fsync, or with a saving child
     * active, and when the above two conditions are missing.
     * We also use an additional event name to save all samples which is
     * useful for graphing / monitoring purposes. */
    if (sync_in_progress) {
        latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);
    } else if (hasActiveChildProcess()) {
        latencyAddSampleIfNeeded("aof-write-active-child",latency);
    } else {
        latencyAddSampleIfNeeded("aof-write-alone",latency);
    }
    latencyAddSampleIfNeeded("aof-write",latency);

    /* We performed the write so reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
        static time_t last_write_error_log = 0;
        int can_log = 0;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Log the AOF write error and record the error code. */
        if (nwritten == -1) {
            /* Save errno before logging (the logger may change it) and
             * record it even when the log line is rate limited, so that
             * the reported error never refers to an older failure. */
            int write_errno = errno;

            if (can_log) {
                serverLog(LL_WARNING,"Error writing to the AOF file: %s",
                    strerror(write_errno));
            }
            server.aof_last_write_errno = write_errno;
        } else {
            if (can_log) {
                serverLog(LL_WARNING,"Short write while writing to "
                                       "the AOF file: (nwritten=%lld, "
                                       "expected=%lld)",
                                       (long long)nwritten,
                                       (long long)sdslen(server.aof_buf));
            }

            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    serverLog(LL_WARNING, "Could not remove short write "
                             "from the append-only file.  Redis may refuse "
                             "to load the AOF the next time it starts.  "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftruncate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synced on disk. */
            serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = C_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        if (server.aof_last_write_status == C_ERR) {
            serverLog(LL_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = C_OK;
        }
    }
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

try_fsync:
    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    if (server.aof_no_fsync_on_rewrite && hasActiveChildProcess())
        return;

    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* redis_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        latencyStartMonitor(latency);
        redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_fsync_offset = server.aof_current_size;
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        if (!sync_in_progress) {
            aof_background_fsync(server.aof_fd);
            server.aof_fsync_offset = server.aof_current_size;
        }
        server.aof_last_fsync = server.unixtime;
    }
}

希夷文化苑

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
redis stream持久化_[灌水] Redis 的持久化

关于 Redis 的 Persistence, 最好的入门材料应该是：它有两种形式，AOF 和 RDB：RDB 相对来说是一个数据库的“snapshot”，通过 SAVE 或者 BGSAVE 存储，在这个贴指出 Redis 有一个 redis.conf，存储了具体的dbfilename , 它的默认值是dump.rdb。具体来说，Redis 是靠 fork, 然后利用 fork 的 COW, 在...
复制链接

扫一扫