本人 github 地址
github 地址 里面有注释好的代码,下载下来可以方便阅读。
本篇文章看点
- rdb什么时候执行,条件是如何触发。
- rdb为什么用子进程而不是线程。
- 什么是copy on write
- 为什么是fork而不是其它
rdb的触发条件
rdb 有两种触发方式一种是主动触发一种是被动触发
- 主动触发其实就是两个命令,一个是 save,另一个是 bgsave。不同的是 save 会阻塞主线程:在 save 没有执行完的情况下,其它所有命令只能等待 save 执行完才能被执行。redis 执行命令的线程是单线程,所以一般情况下请用 bgsave 代替 save。bgsave 和被动触发的逻辑相似,都会 fork 一个子进程来执行;bgsave 还带有一个 schedule 参数,当有其它子进程在执行的时候会滞后执行。
- 被动触发的入口跟之前键过期的入口一样,都是从serverCron这里来触发
save 入口:
/* SAVE command: dump the dataset to server.rdb_filename in the calling
 * (main) thread and reply with OK or ERR depending on the rdbSave() result.
 * The command is rejected while an RDB child process is already running. */
void saveCommand(client *c) {
    /* Guard: a background RDB child is still alive. */
    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
        return;
    }

    /* Save-info handed through to rdbSave(); the original author notes it
     * carries replication-related state. */
    rdbSaveInfo info;
    rdbSaveInfo *info_ptr = rdbPopulateSaveInfo(&info);

    /* Run the actual dump and translate its status into the reply. */
    int saved = (rdbSave(server.rdb_filename,info_ptr) == C_OK);
    addReply(c, saved ? shared.ok : shared.err);
}
bgsave 入口:
/* BGSAVE [SCHEDULE]
 *
 * Starts a background save through rdbSaveBackground(). With SCHEDULE, when
 * another child process is active the save is flagged as pending
 * (server.rdb_bgsave_scheduled) instead of failing with an error. */
void bgsaveCommand(client *c) {
    int schedule = 0;

    /* The only accepted form with arguments is exactly "BGSAVE SCHEDULE"
     * (case-insensitive); anything else is a syntax error. */
    if (c->argc > 1) {
        if (c->argc != 2 || strcasecmp(c->argv[1]->ptr,"schedule") != 0) {
            addReply(c,shared.syntaxerr);
            return;
        }
        schedule = 1;
    }

    /* Save-info handed through to rdbSaveBackground(). */
    rdbSaveInfo info;
    rdbSaveInfo *info_ptr = rdbPopulateSaveInfo(&info);

    /* An RDB child is already running: nothing to start. */
    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
        return;
    }

    /* Some other child process is active: schedule or refuse. */
    if (hasActiveChildProcess()) {
        if (!schedule) {
            addReplyError(c,
                "Another child process is active (AOF?): can't BGSAVE right now. "
                "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
                "possible.");
        } else {
            server.rdb_bgsave_scheduled = 1;
            addReplyStatus(c,"Background saving scheduled");
        }
        return;
    }

    /* No child around: start the background save right away. */
    if (rdbSaveBackground(server.rdb_filename,info_ptr) != C_OK) {
        addReply(c,shared.err);
        return;
    }
    addReplyStatus(c,"Background saving started");
}
被动触发入口:
/* Excerpt of serverCron(): only the part that decides whether an automatic
 * (config driven) background save has to be started is shown. The dotted
 * line marks code elided by the article, and the end of the function is not
 * part of this excerpt. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
..........
/* A child process is active (or ldbPendingChildren() reports pending
 * children): only check whether a child has finished. */
if (hasActiveChildProcess() || ldbPendingChildren())
{
checkChildrenDone();
} else {
/* If there is not a background saving/rewrite in progress check if
* we have to save/rewrite now. */
/* Walk every configured save point. */
for (j = 0; j < server.saveparamslen; j++) {
/* Each saveparam holds a (changes, seconds) pair; per the article these
 * come from the config file. */
struct saveparam *sp = server.saveparams+j;
/* Save if we reached the given amount of changes,
* the given amount of seconds, and if the latest bgsave was
* successful or if, in case of an error, at least
* CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */
/* Three conditions must all hold:
 * 1. server.dirty reached sp->changes;
 * 2. more than sp->seconds elapsed since server.lastsave;
 * 3. the last bgsave succeeded, or the last attempt is older than
 *    CONFIG_BGSAVE_RETRY_DELAY. */
if (server.dirty >= sp->changes &&
server.unixtime-server.lastsave > sp->seconds &&
(server.unixtime-server.lastbgsave_try >
CONFIG_BGSAVE_RETRY_DELAY ||
server.lastbgsave_status == C_OK))
{
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
sp->changes, (int)sp->seconds);
rdbSaveInfo rsi, *rsiptr;
/* Save-info for the dump; the article describes it as high-availability
 * (replication) data and does not go into it here. */
rsiptr = rdbPopulateSaveInfo(&rsi);
rdbSaveBackground(server.rdb_filename,rsiptr);
/* One matching save point is enough: stop scanning. */
break;
}
}
}
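被动触发主要有以下几个判断:
- 当前没有其它子进程在运行(否则只会调用 checkChildrenDone 检查子进程是否已经结束)。
- 修改次数 server.dirty 达到了配置的 changes。
- 距离上次保存(server.lastsave)的时间超过了配置的 seconds。
- 上一次 bgsave 是成功的,或者距离上一次尝试已经超过了 CONFIG_BGSAVE_RETRY_DELAY 秒。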
bgsave的主流程
上面介绍了rdb的触发主流程,下面就是正式进入bgsave的过程
/* Start a background save of the dataset into 'filename'.
 *
 * A child process is created with redisFork(); the child runs rdbSave() and
 * exits, while the parent records the child pid and returns immediately.
 *
 * Returns C_OK when the child was started, C_ERR when another child process
 * is already active or when the fork failed. */
int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
    pid_t childpid;

    /* Only one child process at a time. */
    if (hasActiveChildProcess()) return C_ERR;

    /* Remember the change counter at fork time, so that the completion
     * handler can subtract what this snapshot covers. */
    server.dirty_before_bgsave = server.dirty;
    /* Timestamp of this attempt, used by the retry-delay check. */
    server.lastbgsave_try = time(NULL);
    /* Pipe used by the child to send information back to the parent. */
    openChildInfoPipe();

    if ((childpid = redisFork()) == 0) {
        int retval;

        /* Child */
        redisSetProcTitle("redis-rdb-bgsave");
        redisSetCpuAffinity(server.bgsave_cpulist);
        retval = rdbSave(filename,rsi);
        if (retval == C_OK) {
            /* Report the copy-on-write memory figure to the parent. */
            sendChildCOWInfo(CHILD_INFO_TYPE_RDB, "RDB");
        }
        exitFromChild((retval == C_OK) ? 0 : 1);
    } else {
        /* Parent */
        if (childpid == -1) {
            /* Capture errno before any other call: closing the pipe below
             * may overwrite it, and the log line must describe the fork
             * failure, not a later one. */
            int fork_errno = errno;

            closeChildInfoPipe();
            server.lastbgsave_status = C_ERR;
            serverLog(LL_WARNING,"Can't save in background: fork: %s",
                strerror(fork_errno));
            return C_ERR;
        }
        serverLog(LL_NOTICE,"Background saving started by pid %d",childpid);
        server.rdb_save_time_start = time(NULL);
        server.rdb_child_pid = childpid;
        server.rdb_child_type = RDB_CHILD_TYPE_DISK;
        return C_OK;
    }
    return C_OK; /* unreached */
}
/* fork() wrapper shared by the background jobs.
 *
 * In the child it adjusts the OOM score, installs the child signal handlers
 * and releases resources the child does not need. In the parent it records
 * fork timing statistics and refreshes the dict resize policy.
 *
 * Returns 0 in the child, the child pid in the parent, or -1 when fork()
 * failed. On failure errno still holds the value set by fork(), so callers
 * can report it with strerror(errno). */
int redisFork() {
    long long start = ustime();
    int childpid = fork();

    if (childpid == 0) {
        /* Child */
        setOOMScoreAdj(CONFIG_OOM_BGCHILD);
        setupChildSignalHandlers();
        /* Release what the child does not use; the article describes this
         * as closing the listening sockets. */
        closeClildUnusedResourceAfterFork();
        return childpid;
    }

    /* Parent */
    /* Save errno right after fork(): the bookkeeping calls below may
     * overwrite it before the caller gets a chance to read it. */
    int fork_errno = errno;

    server.stat_fork_time = ustime()-start;
    server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
    /* Record the fork duration (in milliseconds) as a latency sample. */
    latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
    if (childpid == -1) {
        errno = fork_errno;
        return -1;
    }
    /* A child now exists: re-evaluate whether dict resizing is allowed. */
    updateDictResizePolicy();
    return childpid;
}
/* Called in the child: measure the private dirty memory reported by
 * zmalloc_get_private_dirty(-1), log it (when non-zero) under the label
 * 'pname', store it in server.child_info_data.cow_size and forward it to the
 * parent with sendChildInfo(ptype). */
void sendChildCOWInfo(int ptype, char *pname) {
    size_t cow_bytes = zmalloc_get_private_dirty(-1);

    /* Only log when there is something to report. */
    if (cow_bytes != 0) {
        size_t cow_mb = cow_bytes/(1024*1024);
        serverLog(LL_NOTICE,
            "%s: %zu MB of memory used by copy-on-write",
            pname, cow_mb);
    }

    /* The raw byte count (not the MB figure) is what gets sent. */
    server.child_info_data.cow_size = cow_bytes;
    sendChildInfo(ptype);
}
/* Send server.child_info_data to the parent through the child info pipe.
 * The child calls this after filling in the fields relevant to its process
 * type. It is a no-op when the write end of the pipe is not open (-1). */
void sendChildInfo(int ptype) {
    int write_fd = server.child_info_pipe[1];
    if (write_fd == -1) return;

    /* Stamp the record with its type and the magic marker. */
    server.child_info_data.process_type = ptype;
    server.child_info_data.magic = CHILD_INFO_MAGIC;

    ssize_t expected = sizeof(server.child_info_data);
    ssize_t written = write(write_fd,&server.child_info_data,expected);
    if (written != expected) {
        /* Nothing to do on error, this will be detected by the other side. */
    }
}
以上代码主要做了以下几件事
- fork了一个子进程
- 创建了一个通道用于子进程和父进程的通信
- 父进程调整了字典的 resize 策略(updateDictResizePolicy),在子进程运行期间尽量避免 rehash
- 子进程关闭socket的监听通道。
为什么要fork子进程而不是用子线程去做
如果用子线程去做备份,就没法保证数据的一致性:因为线程之间共享内存,备份过程中主线程的写入会直接改动正在被备份的数据;如果想要一份独立的内存,就需要把内存完整地 copy 一份,这对性能而言是非常不合适的。
而用 fork 的话,子进程虽然会和父进程共享内存空间的页帧,但是 fork 的时候会把共享的页帧设置为只读状态,当任意一方尝试去写这块内存的时候,就会产生一个异常,这时内核会把这个页复制到一个新的页帧中并标记为可写。这就是我们所说的 copy on write。另外子进程退出的时候也会释放掉只属于子进程的内存空间,这样就创建了一个完美的 snapshot。
这样也能解释为什么在 fork 完子进程之后,要把 redis 的键值空间设置为禁止 rehash:因为 redis 采用的是渐进式 rehash 的方式,如果处于 rehash 状态,无论 set 还是 get 都会对旧的空间进行更改,这样就会不断地触发页帧写异常,需要分配更多的内存空间,对性能的影响比较大。但是 rdb 触发的时候并没有判断当前是否处于 rehash 状态,所以 rdb 的方案是:在必须执行的条件下,在 rdb 过程中尽量减少对页帧的修改,而不是完全禁止。
rdb保存数据到file的流程
rdb save file的主流程
/* Save the DB on disk. Return C_ERR on error, C_OK on success.
 *
 * The dump is first written to a temporary file named after the current pid
 * and only renamed onto 'filename' once it has been completely written and
 * synced, so the final file is replaced atomically.
 *
 * On success server.dirty, server.lastsave and server.lastbgsave_status are
 * updated in the calling process. */
int rdbSave(char *filename, rdbSaveInfo *rsi) {
    char tmpfile[256];
    char cwd[MAXPATHLEN]; /* Current working dir path for error messages. */
    FILE *fp = NULL;
    rio rdb;
    int error = 0;

    /* Build the temporary file name and open it for writing. */
    snprintf(tmpfile,sizeof(tmpfile),"temp-%d.rdb", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        /* Keep the fopen() error: getcwd() below may overwrite errno. */
        int saved_errno = errno;
        char *cwdp = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Failed opening the RDB file %s (in server root dir %s) "
            "for saving: %s",
            filename,
            cwdp ? cwdp : "unknown",
            strerror(saved_errno));
        return C_ERR;
    }

    /* Wrap the FILE in a rio object and signal that saving has started. */
    rioInitWithFile(&rdb,fp);
    startSaving(RDBFLAGS_NONE);

    /* Optionally sync to disk every REDIS_AUTOSYNC_BYTES while writing. */
    if (server.rdb_save_incremental_fsync)
        rioSetAutoSync(&rdb,REDIS_AUTOSYNC_BYTES);

    /* Write the whole dataset. */
    if (rdbSaveRio(&rdb,&error,RDBFLAGS_NONE,rsi) == C_ERR) {
        errno = error;
        goto werr;
    }

    /* Make sure data will not remain on the OS's output buffers */
    if (fflush(fp) == EOF) goto werr;
    if (fsync(fileno(fp)) == -1) goto werr;
    /* fclose() releases the stream even when it reports an error, so the
     * pointer must never be closed a second time in the error path. */
    if (fclose(fp) == EOF) {
        fp = NULL;
        goto werr;
    }
    fp = NULL;

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
        /* Keep the rename() error: getcwd() below may overwrite errno. */
        int saved_errno = errno;
        char *cwdp = getcwd(cwd,MAXPATHLEN);
        serverLog(LL_WARNING,
            "Error moving temp DB file %s on the final "
            "destination %s (in server root dir %s): %s",
            tmpfile,
            filename,
            cwdp ? cwdp : "unknown",
            strerror(saved_errno));
        unlink(tmpfile);
        stopSaving(0);
        return C_ERR;
    }

    serverLog(LL_NOTICE,"DB saved on disk");
    /* Everything is on disk: reset the change counter and the save state. */
    server.dirty = 0;
    server.lastsave = time(NULL);
    server.lastbgsave_status = C_OK;
    stopSaving(1);
    return C_OK;

werr:
    serverLog(LL_WARNING,"Write error saving DB on disk: %s", strerror(errno));
    if (fp) fclose(fp);
    unlink(tmpfile);
    stopSaving(0);
    return C_ERR;
}
上面要关注的几点,
- rdb 可以设置 autosync 的条件:写磁盘的时候数据首先还是写入 buffer 里面,为了防止在写入过程中因为没有及时 sync 而导致数据全部丢失,需要每写入一定量的数据就 sync 一次到磁盘。
- 要注意上面的方法同样用于非 bgsave 的情况(save 命令),所以你可以看到其中有对 dirty、lastsave 等的更改。但是在 bgsave 的情况下,这些更改发生在子进程自己的内存里,并不会反映到父进程;父进程会在子进程结束之后,在 backgroundSaveDoneHandlerDisk 里自己更新这些值。
下面这段笔者没有特别的注释
下面这段代码是解释,父进程如何发现子进程已经结束,以及结束后做的一些动作。
/* Reap a terminated child process, if any, without blocking, and dispatch to
 * the completion handler matching its pid (RDB, AOF, module fork, or a child
 * tracked by ldbRemoveChild()). Whenever wait3() returns non-zero, the dict
 * resize policy is refreshed and the child info pipe is closed. */
void checkChildrenDone(void) {
    /* Initialized so that the status macros below never read an
     * indeterminate value when wait3() fails and stores nothing. */
    int statloc = 0;
    pid_t pid;

    /* If we have a diskless rdb child (note that we support only one concurrent
     * child), we want to avoid collecting it's exit status and acting on it
     * as long as we didn't finish to drain the pipe, since then we're at risk
     * of starting a new fork and a new pipe before we're done with the previous
     * one. */
    if (server.rdb_child_pid != -1 && server.rdb_pipe_conns)
        return;

    /* WNOHANG: returns 0 right away when no child has changed state. */
    if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
        int exitcode = WEXITSTATUS(statloc);
        int bysignal = 0;

        if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);

        /* sigKillChildHandler catches the signal and calls exit(), but we
         * must make sure not to flag lastbgsave_status, etc incorrectly.
         * We could directly terminate the child process via SIGUSR1
         * without handling it, but in this case Valgrind will log an
         * annoying error. */
        if (exitcode == SERVER_CHILD_NOERROR_RETVAL) {
            bysignal = SIGUSR1;
            exitcode = 1;
        }

        if (pid == -1) {
            serverLog(LL_WARNING,"wait3() returned an error: %s. "
                "rdb_child_pid = %d, aof_child_pid = %d, module_child_pid = %d",
                strerror(errno),
                (int) server.rdb_child_pid,
                (int) server.aof_child_pid,
                (int) server.module_child_pid);
        } else if (pid == server.rdb_child_pid) {
            /* RDB child finished: run the post-save logic, then read the
             * info the child sent if it exited cleanly. */
            backgroundSaveDoneHandler(exitcode,bysignal);
            if (!bysignal && exitcode == 0) receiveChildInfo();
        } else if (pid == server.aof_child_pid) {
            backgroundRewriteDoneHandler(exitcode,bysignal);
            if (!bysignal && exitcode == 0) receiveChildInfo();
        } else if (pid == server.module_child_pid) {
            ModuleForkDoneHandler(exitcode,bysignal);
            if (!bysignal && exitcode == 0) receiveChildInfo();
        } else {
            if (!ldbRemoveChild(pid)) {
                serverLog(LL_WARNING,
                    "Warning, detected child with unmatched pid: %ld",
                    (long)pid);
            }
        }
        /* Re-evaluate the dict resize policy and close the info pipe. */
        updateDictResizePolicy();
        closeChildInfoPipe();
    }
}
/* A background saving child (BGSAVE) terminated its work. Handle this.
 * This function covers the case of actual BGSAVEs.
 *
 * 'exitcode' is the child's exit status and 'bysignal' the terminating
 * signal number (0 when the child was not killed by a signal). */
void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
    const int succeeded = (bysignal == 0 && exitcode == 0);

    if (succeeded) {
        /* Clean exit: the snapshot covers every change made before the
         * fork, so only the changes made since then remain pending. */
        serverLog(LL_NOTICE,
            "Background saving terminated with success");
        server.dirty = server.dirty - server.dirty_before_bgsave;
        server.lastsave = time(NULL);
        server.lastbgsave_status = C_OK;
    } else if (bysignal == 0) {
        /* The child exited on its own with a non-zero status. */
        serverLog(LL_WARNING, "Background saving error");
        server.lastbgsave_status = C_ERR;
    } else {
        /* The child was terminated by a signal: remove its temp file and
         * sample how long the removal took. */
        mstime_t latency;

        serverLog(LL_WARNING,
            "Background saving terminated by signal %d", bysignal);
        latencyStartMonitor(latency);
        rdbRemoveTempFile(server.rdb_child_pid);
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency);
        /* SIGUSR1 is whitelisted, so we have a way to kill a child without
         * triggering an error condition. */
        if (bysignal != SIGUSR1)
            server.lastbgsave_status = C_ERR;
    }

    /* Reset the child bookkeeping. The duration must be computed before the
     * start time is cleared. */
    server.rdb_child_pid = -1;
    server.rdb_child_type = RDB_CHILD_TYPE_NONE;
    server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start;
    server.rdb_save_time_start = -1;

    /* Possibly there are slaves waiting for a BGSAVE in order to be served
     * (the first stage of SYNC is a bulk transfer of dump.rdb) */
    updateSlavesWaitingBgsave(succeeded ? C_OK : C_ERR, RDB_CHILD_TYPE_DISK);
}
下面是rdb save写文件的一些流程判断,这个地方同样会在aof rewrite的时候被用到
/* Produces a dump of the database in RDB format sending it to the specified
* Redis I/O channel. On success C_OK is returned, otherwise C_ERR
* is returned and part of the output, or all the output, can be
* missing because of I/O errors.
*
* When the function returns C_ERR and if 'error' is not NULL, the
* integer pointed by 'error' is set to the value of errno just after the I/O
* error. */
int rdbSaveRio(rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi) {
dictIterator *di = NULL;
dictEntry *de;
char magic[10];
int j;
uint64_t cksum;
size_t processed = 0;
/* Install the checksum updater when checksums are enabled. */
if (server.rdb_checksum)
rdb->update_cksum = rioGenericUpdateChecksum;
/* Header: the string "REDIS" followed by the 4-digit RDB_VERSION. */
snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION);
/* Write the 9-byte magic/version header. */
if (rdbWriteRaw(rdb,magic,9) == -1) goto werr;
/* Write the info auxiliary fields that precede the data. */
if (rdbSaveInfoAuxFields(rdb,rdbflags,rsi) == -1) goto werr;
/* Write the module auxiliary data that goes before the keyspace. */
if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_BEFORE_RDB) == -1) goto werr;
for (j = 0; j < server.dbnum; j++) {
/* Visit each database, skipping the empty ones. */
redisDb *db = server.db+j;
dict *d = db->dict;
if (dictSize(d) == 0) continue;
di = dictGetSafeIterator(d);
/* Write the SELECT DB opcode */
if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr;
/* Database index. */
if (rdbSaveLen(rdb,j) == -1) goto werr;
/* Write the RESIZE DB opcode. */
uint64_t db_size, expires_size;
db_size = dictSize(db->dict);
expires_size = dictSize(db->expires);
/* Opcode announcing the two dictionary sizes. */
if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr;
/* First the size of the main key dictionary... */
if (rdbSaveLen(rdb,db_size) == -1) goto werr;
/* ...then the size of the expires dictionary. */
if (rdbSaveLen(rdb,expires_size) == -1) goto werr;
/* Iterate this DB writing every entry */
while((de = dictNext(di)) != NULL) {
/* Key name stored in the dictionary entry. */
sds keystr = dictGetKey(de);
/* Value object stored in the dictionary entry. */
robj key, *o = dictGetVal(de);
long long expire;
/* Wrap the key name in a stack-allocated string object. */
initStaticStringObject(key,keystr);
/* Look up the expire time of this key. */
expire = getExpire(db,&key);
/* Serialize the key, its value and its expire time. */
if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr;
/* When this RDB is produced as part of an AOF rewrite, move
* accumulated diff from parent to child while rewriting in
* order to have a smaller final write. */
/* Only taken with RDBFLAGS_AOF_PREAMBLE, once more than
 * AOF_READ_DIFF_INTERVAL_BYTES were written since the last read. */
if (rdbflags & RDBFLAGS_AOF_PREAMBLE &&
rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES)
{
processed = rdb->processed_bytes;
aofReadDiffFromParent();
}
}
dictReleaseIterator(di);
di = NULL; /* So that we don't release it again on error. */
}
/* If we are storing the replication information on disk, persist
* the script cache as well: on successful PSYNC after a restart, we need
* to be able to process any EVALSHA inside the replication backlog the
* master will send us. */
/* Save every cached Lua script body as a "lua" auxiliary field. */
if (rsi && dictSize(server.lua_scripts)) {
di = dictGetIterator(server.lua_scripts);
while((de = dictNext(di)) != NULL) {
robj *body = dictGetVal(de);
if (rdbSaveAuxField(rdb,"lua",3,body->ptr,sdslen(body->ptr)) == -1)
goto werr;
}
dictReleaseIterator(di);
di = NULL; /* So that we don't release it again on error. */
}
/* Write the module auxiliary data that goes after the keyspace. */
if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_AFTER_RDB) == -1) goto werr;
/* EOF opcode */
if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr;
/* CRC64 checksum. It will be zero if checksum computation is disabled, the
* loading code skips the check in this case. */
cksum = rdb->cksum;
memrev64ifbe(&cksum);
/* Append the 8-byte checksum as the last thing in the stream. */
if (rioWrite(rdb,&cksum,8) == 0) goto werr;
return C_OK;
werr:
/* Report errno to the caller and release any iterator still open. */
if (error) *error = errno;
if (di) dictReleaseIterator(di);
return C_ERR;
}
/* Save a key-value pair, with expire time, type, key, value.
 * On error -1 is returned.
 * On success if the key was actually saved 1 is returned, otherwise 0
 * is returned (the key was already expired).
 *
 * Optional records (expire time, LRU idle time, LFU frequency) are written
 * before the mandatory type / key / value triple. */
int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) {
    /* Which eviction metadata to persist depends on the maxmemory policy. */
    const int with_lru = server.maxmemory_policy & MAXMEMORY_FLAG_LRU;
    const int with_lfu = server.maxmemory_policy & MAXMEMORY_FLAG_LFU;

    /* Expire time in milliseconds; -1 means the key has no expire. */
    if (expiretime != -1) {
        if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1 ||
            rdbSaveMillisecondTime(rdb,expiretime) == -1)
        {
            return -1;
        }
    }

    /* LRU info: idle time of the object. */
    if (with_lru) {
        uint64_t idle = estimateObjectIdleTime(val);
        idle = idle / 1000; /* Using seconds is enough and requires less space.*/
        if (rdbSaveType(rdb,RDB_OPCODE_IDLE) == -1 ||
            rdbSaveLen(rdb,idle) == -1)
        {
            return -1;
        }
    }

    /* LFU info. We can encode this in exactly two bytes: the opcode and an
     * 8 bit counter, since the frequency is logarithmic with a 0-255 range.
     * Note that we do not store the halving time because to reset it
     * a single time when loading does not affect the frequency much. */
    if (with_lfu) {
        uint8_t freq = LFUDecrAndReturn(val);
        if (rdbSaveType(rdb,RDB_OPCODE_FREQ) == -1 ||
            rdbWriteRaw(rdb,&freq,1) == -1)
        {
            return -1;
        }
    }

    /* Mandatory part, in this order: value type, key, value payload. */
    if (rdbSaveObjectType(rdb,val) == -1 ||
        rdbSaveStringObject(rdb,key) == -1 ||
        rdbSaveObject(rdb,val,key) == -1)
    {
        return -1;
    }

    /* Delay return if required (for testing) */
    if (server.rdb_key_save_delay)
        usleep(server.rdb_key_save_delay);

    return 1;
}
可以看到 rdb 写的部分首先要对键值空间做遍历,并且会在文件开头写入自己的魔数和版本号,用来确认 rdb 文件的版本。
内存数据转换到文件,基本写法都是先写 type,然后再写 length,最后再写内容。
当然里面还有一些内容(比如 rdb 的压缩算法)并没有详细去讲,感兴趣的朋友可以自行下载源码来阅读,因为这里的重点是为什么要用 background 的方式去做,以及父子进程是如何通信的。
总结
本章主要讲了redis,rdb的一个过程,着重讲了什么是rdb的copy on write 部分,和父子进程交互的部分,代码里面还有一些没有注释得特别详细,因为里面涉及到一些对于父子进程操作的逻辑,这些需要各位看官自行去搜索这些信息,没有在文章中详细的展开。