[redis] rdb持久化
rdb应用场景
redis有两种持久化方式:rdb和aof,其中rdb默认开启。
bgsave触发的rdb持久化是异步执行的:主进程fork出一个子进程,由子进程完成写盘,主进程在fork之后立即返回。
bgsave命令 => rdbSaveBackground(breakpoint)
/* Start an RDB save in a forked child process.
 *
 * Returns C_ERR immediately when another child process is already active or
 * when the fork fails. Otherwise the parent logs the child's pid, records the
 * start time and child type, and returns C_OK. The child runs
 * rdbSave(req, filename, rsi) and leaves through exitFromChild() with exit
 * status 0 on success and 1 on failure. */
int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi) {
    pid_t pid;

    if (hasActiveChildProcess()) {
        return C_ERR;
    }

    /* Bookkeeping recorded before the fork is attempted. */
    server.stat_rdb_saves++;
    server.dirty_before_bgsave = server.dirty;
    server.lastbgsave_try = time(NULL);

    pid = redisFork(CHILD_TYPE_RDB);
    if (pid != 0) {
        /* Parent side; -1 means the fork itself failed. */
        if (pid == -1) {
            server.lastbgsave_status = C_ERR;
            serverLog(LL_WARNING,"Can't save in background: fork: %s",
                      strerror(errno));
            return C_ERR;
        }

        serverLog(LL_NOTICE,"Background saving started by pid %ld",(long) pid);
        server.rdb_save_time_start = time(NULL);
        server.rdb_child_type = RDB_CHILD_TYPE_DISK;
        return C_OK;
    }

    /* Child side. */
    redisSetProcTitle("redis-rdb-bgsave");
    redisSetCpuAffinity(server.bgsave_cpulist);

    int saved = (rdbSave(req, filename, rsi) == C_OK);
    if (saved) {
        /* Copy-on-write info is only reported after a successful save. */
        sendChildCowInfo(CHILD_INFO_TYPE_RDB_COW_SIZE, "RDB");
    }
    exitFromChild(saved ? 0 : 1);

    return C_OK; /* unreached */
}
rdb是如何将数据保存到磁盘的
/* Write the dataset to disk as an RDB file.
 *
 * The data is first written to a temporary file named "temp-<pid>.rdb",
 * then flushed, fsync'ed and closed, and only afterwards renamed onto
 * `filename`, so the destination is replaced only by a completely written
 * file. Returns C_OK on success and C_ERR on error. */
int rdbSave(int req, char *filename, rdbSaveInfo *rsi) {
    char tmpfile[256];
    char cwd[MAXPATHLEN]; /* Working directory, reported in error logs only. */
    char *failed_op;      /* Name of the step that failed, for the log. */
    int error = 0;
    FILE *fp = NULL;
    rio rdb;

    snprintf(tmpfile,sizeof(tmpfile),"temp-%d.rdb", (int) getpid());

    fp = fopen(tmpfile,"w");
    if (fp == NULL) {
        /* Grab the error text before making any further call. */
        char *reason = strerror(errno);
        char *dir = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Failed opening the temp RDB file %s (in server root dir %s) "
            "for saving: %s",
            tmpfile,
            dir ? dir : "unknown",
            reason);
        return C_ERR;
    }

    rioInitWithFile(&rdb,fp);
    startSaving(RDBFLAGS_NONE);

    if (server.rdb_save_incremental_fsync) {
        rioSetAutoSync(&rdb,REDIS_AUTOSYNC_BYTES);
    }

    if (rdbSaveRio(req,&rdb,&error,RDBFLAGS_NONE,rsi) == C_ERR) {
        /* Expose the error reported by rdbSaveRio through errno so the
         * shared error path below can log it. */
        errno = error;
        failed_op = "rdbSaveRio";
        goto werr;
    }

    /* Push everything out of the stdio and OS buffers before renaming. */
    if (fflush(fp) != 0) {
        failed_op = "fflush";
        goto werr;
    }
    if (fsync(fileno(fp)) != 0) {
        failed_op = "fsync";
        goto werr;
    }
    if (fclose(fp) != 0) {
        fp = NULL; /* Already closed: the error path must not close it again. */
        failed_op = "fclose";
        goto werr;
    }
    fp = NULL;

    /* Rename the temp file onto the destination so that the DB file is
     * changed atomically, and only when the generated file is ok. */
    if (rename(tmpfile,filename) == -1) {
        char *reason = strerror(errno);
        char *dir = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Error moving temp DB file %s on the final "
            "destination %s (in server root dir %s): %s",
            tmpfile,
            filename,
            dir ? dir : "unknown",
            reason);
        unlink(tmpfile);
        stopSaving(0);
        return C_ERR;
    }

    if (fsyncFileDir(filename) == -1) {
        failed_op = "fsyncFileDir";
        goto werr;
    }

    serverLog(LL_NOTICE,"DB saved on disk");
    server.dirty = 0;
    server.lastsave = time(NULL);
    server.lastbgsave_status = C_OK;
    stopSaving(1);
    return C_OK;

werr:
    /* Shared failure path: log the failing step, then clean up. */
    serverLog(LL_WARNING,"Write error saving DB on disk(%s): %s", failed_op, strerror(errno));
    if (fp) fclose(fp);
    unlink(tmpfile);
    stopSaving(0);
    return C_ERR;
}
实际将dict数据写入到磁盘的调用顺序为:
rdbSaveRio
=> rdbSaveDb
rdbSaveDb
/* Write one database (selected by `dbid`) to the `rdb` stream.
 *
 * Emits a SELECTDB opcode with the db id, a RESIZEDB opcode with the sizes of
 * the main and expires dictionaries, and then every key/value pair together
 * with its expire. `*key_counter` is incremented once per saved key.
 *
 * Returns the sum of the values returned by the individual write calls,
 * 0 when the database is empty (nothing is written in that case), or -1 as
 * soon as any write call fails. */
ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) {
    static long long info_updated_time = 0;
    redisDb *db = &server.db[dbid];
    dictIterator *di;
    dictEntry *de;
    ssize_t written = 0;
    ssize_t res;
    ssize_t result = -1; /* Stays -1 unless every write succeeds. */
    char *pname;

    if (rdbflags & RDBFLAGS_AOF_PREAMBLE) {
        pname = "AOF rewrite";
    } else {
        pname = "RDB";
    }

    if (dictSize(db->dict) == 0) return 0;

    di = dictGetSafeIterator(db->dict);

    /* SELECT DB opcode followed by the db id. */
    res = rdbSaveType(rdb,RDB_OPCODE_SELECTDB);
    if (res < 0) goto out;
    written += res;

    res = rdbSaveLen(rdb, dbid);
    if (res < 0) goto out;
    written += res;

    /* RESIZE DB opcode followed by the two dictionary sizes. */
    uint64_t db_size = dictSize(db->dict);
    uint64_t expires_size = dictSize(db->expires);

    res = rdbSaveType(rdb,RDB_OPCODE_RESIZEDB);
    if (res < 0) goto out;
    written += res;

    res = rdbSaveLen(rdb,db_size);
    if (res < 0) goto out;
    written += res;

    res = rdbSaveLen(rdb,expires_size);
    if (res < 0) goto out;
    written += res;

    /* Walk the whole dictionary, saving each entry. */
    for (de = dictNext(di); de != NULL; de = dictNext(di)) {
        sds keystr = dictGetKey(de);
        robj key, *o = dictGetVal(de);
        size_t bytes_before = rdb->processed_bytes;
        long long expire;

        initStaticStringObject(key,keystr);
        expire = getExpire(db,&key);

        res = rdbSaveKeyValuePair(rdb, &key, o, expire, dbid);
        if (res < 0) goto out;
        written += res;

        /* In a fork child we can try to hand memory back to the OS and
         * possibly avoid or reduce COW. The number of bytes just dumped is
         * passed to the dismiss mechanism as a size estimate of the object. */
        size_t dump_size = rdb->processed_bytes - bytes_before;
        if (server.in_fork_child) dismissObject(o, dump_size);

        /* Report progress roughly once per second. To avoid calling
         * mstime() for every key, the clock is only checked every 1024 keys. */
        long seen_before = (*key_counter)++;
        if ((seen_before & 1023) == 0) {
            long long now = mstime();
            if (now - info_updated_time >= 1000) {
                sendChildInfo(CHILD_INFO_TYPE_CURRENT_INFO, *key_counter, pname);
                info_updated_time = now;
            }
        }
    }

    result = written;

out:
    /* Single exit: the iterator is released on success and on error. */
    dictReleaseIterator(di);
    return result;
}
遍历dict中的所有kv对,然后依次写入临时文件。
最后利用rename的原子性,将写好的临时文件(temp-<pid>.rdb)重命名为正式的rdb文件,这样只有在新文件完整生成之后,原有的rdb文件才会被替换。
redis启动时加载rdb
server.c loadDataFromDisk
/* Called at startup to bring the dataset into memory, from the AOF files
 * when AOF is on and from the RDB file otherwise.
 *
 * Exits the process with status 1 when loading the AOF fails or cannot be
 * opened, or when loading the RDB fails with an errno other than ENOENT. */
void loadDataFromDisk(void) {
    long long start_us = ustime();

    /* AOF path: handled first, then we are done. */
    if (server.aof_state == AOF_ON) {
        int aof_ret = loadAppendOnlyFiles(server.aof_manifest);
        if (aof_ret == AOF_FAILED || aof_ret == AOF_OPEN_ERR) {
            exit(1);
        }
        if (aof_ret != AOF_NOT_EXIST) {
            serverLog(LL_NOTICE, "DB loaded from append only file: %.3f seconds", (float)(ustime()-start_us)/1000000);
        }
        return;
    }

    /* RDB path. */
    rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
    errno = 0; /* A stale value must not influence the error check below. */
    int rdb_flags = RDBFLAGS_NONE;

    if (iAmMaster()) {
        /* A master may delete expired keys while loading, and those expires
         * must be propagated to the replication backlog. */
        createReplicationBacklog();
        rdb_flags |= RDBFLAGS_FEED_REPL;
    }

    if (rdbLoad(server.rdb_filename,&rsi,rdb_flags) != C_OK) {
        /* ENOENT is not treated as fatal; anything else is. */
        if (errno != ENOENT) {
            serverLog(LL_WARNING,"Fatal error loading the DB: %s. Exiting.",strerror(errno));
            exit(1);
        }
    } else {
        serverLog(LL_NOTICE,"DB loaded from disk: %.3f seconds",
            (float)(ustime()-start_us)/1000000);

        /* Restore the replication ID / offset from the RDB file.
         * Older implementations may have stored a repl_stream_db of -1 in
         * the RDB file in a wrong way (see rdbPopulateSaveInfo), hence the
         * last check. */
        int has_repl_info = rsi.repl_id_is_set &&
                            rsi.repl_offset != -1 &&
                            rsi.repl_stream_db != -1;

        if (has_repl_info) {
            if (iAmMaster()) {
                /* On a master the loaded replication info is kept as the
                 * secondary ID and offset, so that replicas can partially
                 * resynchronize with this master. */
                memcpy(server.replid2,rsi.repl_id,sizeof(server.replid));
                server.second_replid_offset = rsi.repl_offset+1;
                /* Rebase master_repl_offset from rsi.repl_offset. */
                server.master_repl_offset += rsi.repl_offset;
                serverAssert(server.repl_backlog);
                server.repl_backlog->offset = server.master_repl_offset -
                          server.repl_backlog->histlen + 1;
                rebaseReplicationBuffer(rsi.repl_offset);
                server.repl_no_slaves_since = time(NULL);
            } else {
                memcpy(server.replid,rsi.repl_id,sizeof(server.replid));
                server.master_repl_offset = rsi.repl_offset;
                /* On a replica, build a cached master from this information
                 * to allow partial resynchronizations with masters. */
                replicationCacheMasterUsingMyself();
                selectDb(server.cached_master,rsi.repl_stream_db);
            }
        }
    }

    /* A master always creates the replication backlog above, because DELs
     * for expired keys found in the RDB are put in it. When the RDB carries
     * no replication info, or there is no RDB at all, partial
     * resynchronization is not possible, so the backlog is dropped to avoid
     * the extra memory. */
    if (server.master_repl_offset == 0 && server.repl_backlog) {
        freeReplicationBacklog();
    }
}