redis的主从复制,主要过程如下:
(1)从服务器的时间事件函数serverCron运行replicationCron函数来发起同步请求.
replicationCron()函数位于replication.c文件:
/* Periodic replication housekeeping.
 *
 * In order, this function:
 *   1. aborts a connection attempt to the master that exceeded
 *      server.repl_timeout;
 *   2. aborts a bulk transfer from the master that exceeded
 *      server.repl_timeout;
 *   3. drops the master client when nothing was received from it for more
 *      than server.repl_timeout seconds;
 *   4. starts connecting to the master when the state is REDIS_REPL_CONNECT;
 *   5. every server.repl_ping_slave_period * REDIS_HZ cron loops, pings the
 *      attached slaves. */
void replicationCron(void) {
    /* Non blocking connection timeout? */
    if (server.masterhost != NULL) {
        int connecting = server.repl_state == REDIS_REPL_CONNECTING ||
                         server.repl_state == REDIS_REPL_RECEIVE_PONG;

        if (connecting &&
            (time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
        {
            redisLog(REDIS_WARNING,"Timeout connecting to the MASTER...");
            undoConnectWithMaster();
        }
    }

    /* Bulk transfer I/O timeout? The state is read again on purpose: the
     * step above may have changed it. */
    if (server.masterhost != NULL &&
        server.repl_state == REDIS_REPL_TRANSFER &&
        (time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
    {
        redisLog(REDIS_WARNING,"Timeout receiving bulk data from MASTER... If the problem persists try to set the 'repl-timeout' parameter in redis.conf to a larger value.");
        replicationAbortSyncTransfer();
    }

    /* Timed out master when we are an already connected slave? */
    if (server.masterhost != NULL &&
        server.repl_state == REDIS_REPL_CONNECTED &&
        (time(NULL)-server.master->lastinteraction) > server.repl_timeout)
    {
        redisLog(REDIS_WARNING,"MASTER time out: no data nor PING received...");
        freeClient(server.master);
    }

    /* Check if we should connect to a MASTER: in the REDIS_REPL_CONNECT
     * state we get ready to establish the connection with the master. */
    if (server.repl_state == REDIS_REPL_CONNECT) {
        redisLog(REDIS_NOTICE,"Connecting to MASTER...");
        if (connectWithMaster() == REDIS_OK) {
            redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync started");
        }
    }

    /* If we have attached slaves, PING them from time to time.
     * So slaves can implement an explicit timeout to masters, and will
     * be able to detect a link disconnection even if the TCP connection
     * will not actually go down.
     *
     * Nothing else follows the ping step, so simply leave when this cron
     * loop is not a ping one. */
    if ((server.cronloops % (server.repl_ping_slave_period * REDIS_HZ)) != 0) {
        return;
    }

    listIter iter;
    listNode *node;

    listRewind(server.slaves,&iter);
    while ((node = listNext(&iter)) != NULL) {
        redisClient *slave = node->value;

        /* Don't ping slaves that are in the middle of a bulk transfer
         * with the master for first synchronization. */
        if (slave->replstate == REDIS_REPL_SEND_BULK) {
            continue;
        }

        if (slave->replstate != REDIS_REPL_ONLINE) {
            /* Pre-synchronization stage. Just a newline will do the work
             * of refreshing the connection last interaction time, and at
             * the same time we'll be sure that being a single char there
             * are no short-write problems. */
            if (write(slave->fd, "\n", 1) == -1) {
                /* Don't worry, it's just a ping. */
            }
            continue;
        }

        /* The slave is online: send a normal ping. */
        addReplySds(slave,sdsnew("*1\r\n$4\r\nPING\r\n"));
    }
}
/* Start a non blocking connection to the configured master.
 *
 * On success the socket is stored in server.repl_transfer_s, the last I/O
 * time is set to server.unixtime, the replication state becomes
 * REDIS_REPL_CONNECTING and REDIS_OK is returned.
 *
 * Returns REDIS_ERR when the connection cannot be started or the file event
 * cannot be registered; in the latter case the socket is closed. */
int connectWithMaster(void) {
    int fd = anetTcpNonBlockConnect(NULL,server.masterhost,server.masterport);

    if (fd == -1) {
        redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
            strerror(errno));
        return REDIS_ERR;
    }

    /* Bind syncWithMaster to both the readable and writable events of the
     * socket. Per the original note, that handler sends the sync request
     * once the connection is established. */
    if (aeCreateFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE,
                          syncWithMaster,NULL) != AE_ERR)
    {
        server.repl_transfer_lastio = server.unixtime;
        server.repl_transfer_s = fd;
        server.repl_state = REDIS_REPL_CONNECTING;
        return REDIS_OK;
    }

    close(fd);
    redisLog(REDIS_WARNING,"Can't create readable event for SYNC");
    return REDIS_ERR;
}
(2)主服务器在收到同步请求后执行syncCommand函数,产生子进程来生成rdb快照文件。syncCommand函数位于replication.c文件:
/* SYNC command handler, executed by the master for a client that asks to
 * become a slave.
 *
 * The request is ignored when the client is already flagged as a slave, and
 * refused with an error when this server is itself a slave that is not
 * connected to its master, or when the client still has pending replies.
 *
 * Otherwise the client is put in one of the "waiting for BGSAVE" states
 * (starting a BGSAVE when none is in progress), flagged as REDIS_SLAVE and
 * appended to server.slaves. */
void syncCommand(redisClient *c) {
    /* Ignore SYNC if the client is already a slave. */
    if (c->flags & REDIS_SLAVE) return;

    /* Refuse SYNC requests if we are a slave but the link with our master
     * is not ok... */
    if (server.masterhost && server.repl_state != REDIS_REPL_CONNECTED) {
        addReplyError(c,"Can't SYNC while not connected with my master");
        return;
    }

    /* SYNC can't be issued when the server has pending data to send to
     * the client about already issued commands. We need a fresh reply
     * buffer registering the differences between the BGSAVE and the current
     * dataset, so that we can copy to other slaves if needed. */
    if (listLength(c->reply) != 0) {
        addReplyError(c,"SYNC is invalid with pending input");
        return;
    }

    redisLog(REDIS_NOTICE,"Slave ask for synchronization");

    /* Here we need to check if there is a background saving operation
     * in progress, or if it is required to start one. */
    if (server.rdb_child_pid == -1) {
        /* Ok we don't have a BGSAVE in progress, let's start one. */
        redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
        if (rdbSaveBackground(server.rdb_filename) != REDIS_OK) {
            redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
            addReplyError(c,"Unable to perform background save");
            return;
        }
        /* Wait for the BGSAVE to finish. */
        c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
    } else {
        /* Ok a background save is in progress. Let's check if it is a good
         * one for replication, i.e. if there is another slave that is
         * registering differences since the server forked to save. */
        redisClient *waiting = NULL;
        listIter iter;
        listNode *node;

        listRewind(server.slaves,&iter);
        while ((node = listNext(&iter)) != NULL) {
            redisClient *candidate = node->value;

            if (candidate->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
                waiting = candidate;
                break;
            }
        }

        if (waiting != NULL) {
            /* Perfect, the server is already registering differences for
             * another slave. Set the right state, and copy the buffer. */
            copyClientOutputBuffer(c,waiting);
            c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
            redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
        } else {
            /* No way, we need to wait for the next BGSAVE in order to
             * register differences. */
            c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
            redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
        }
    }

    /* Register the client as a slave. */
    c->repldbfd = -1;
    c->flags |= REDIS_SLAVE;
    c->slaveseldb = 0;
    listAddNodeTail(server.slaves,c);
}
(3)在以后的时间事件函数中会检查快照生成子进程是否结束,若结束更新服务器的状态,向所有slave服务器发送快照文件。/* Check if a background saving or AOF rewrite in progress terminated. */ // 如果 BGSAVE 或者 BGREWRITEAOF 正在进行 // 那么检查它们是否已经执行完毕 if (server.rdb_child_pid != -1 || server.aof_child_pid != -1) { int statloc; pid_t pid; if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { int exitcode = WEXITSTATUS(statloc); int bysignal = 0; if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc); if (pid == server.rdb_child_pid) { //如果bgsave操作结束更改服务器状态,根据子进程的响应值进行下一步操作 backgroundSaveDoneHandler(exitcode,bysignal); } else if (pid == server.aof_child_pid) { backgroundRewriteDoneHandler(exitcode,bysignal); } else { redisLog(REDIS_WARNING, "Warning, detected child with unmatched pid: %ld", (long)pid); } // 如果 BGSAVE 和 BGREWRITEAOF 都已经完成,那么重新开始 REHASH updateDictResizePolicy(); } } /* A background saving child (BGSAVE) terminated its work. Handle this. */ /* * 根据 BGSAVE 子进程的返回值,对服务器状态进行更新 */ void backgroundSaveDoneHandler(int exitcode, int bysignal) { // 保存成功 if (!bysignal && exitcode == 0) { redisLog(REDIS_NOTICE, "Background saving terminated with success"); server.dirty = server.dirty - server.dirty_before_bgsave; server.lastsave = time(NULL); server.lastbgsave_status = REDIS_OK; // 保存失败 } else if (!bysignal && exitcode != 0) { redisLog(REDIS_WARNING, "Background saving error"); server.lastbgsave_status = REDIS_ERR; // 子进程被终结 } else { redisLog(REDIS_WARNING, "Background saving terminated by signal %d", bysignal); rdbRemoveTempFile(server.rdb_child_pid); server.lastbgsave_status = REDIS_ERR; } // 更新服务器状态 server.rdb_child_pid = -1; server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start; server.rdb_save_time_start = -1; /* Possibly there are slaves waiting for a BGSAVE in order to be served * (the first stage of SYNC is a bulk transfer of dump.rdb) */ // 将 rdb 文件保存完毕的消息报告可能正在等待复制的附属节点 updateSlavesWaitingBgsave(exitcode == 0 ? 
REDIS_OK : REDIS_ERR); } void updateSlavesWaitingBgsave(int bgsaveerr) { listNode *ln; int startbgsave = 0; listIter li; // 遍历所有附属节点 listRewind(server.slaves,&li); while((ln = listNext(&li))) { redisClient *slave = ln->value; if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) { // 告诉那些这次不能同步的客户端,可以等待下次 BGSAVE 了。 startbgsave = 1; slave->replstate = REDIS_REPL_WAIT_BGSAVE_END; } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) { // 这些是本次可以同步的客户端 struct redis_stat buf; // 如果 BGSAVE 失败,释放 slave 节点 if (bgsaveerr != REDIS_OK) { freeClient(slave); redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error"); continue; } // 打开 .rdb 文件 if ((slave->repldbfd = open(server.rdb_filename,O_RDONLY)) == -1 || // 如果打开失败,释放并清除 redis_fstat(slave->repldbfd,&buf) == -1) { freeClient(slave); redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno)); continue; } // 偏移量 slave->repldboff = 0; // 数据库大小(.rdb 文件的大小) slave->repldbsize = buf.st_size; // 状态 slave->replstate = REDIS_REPL_SEND_BULK; // 清除 slave->fd 的写事件 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); // 创建一个将 .rdb 文件内容发送到附属节点的写事件 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) { freeClient(slave); continue; } } } // 有客户端没有在这次 BGSAVE 中成功同步 if (startbgsave) { // 再启动一次 BGSAVE if (rdbSaveBackground(server.rdb_filename) != REDIS_OK) { // 如果 BGSAVE 失败,清空附属节点 listIter li; listRewind(server.slaves,&li); redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed"); while((ln = listNext(&li))) { redisClient *slave = ln->value; if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) freeClient(slave); } } } } /* * 将主节点的 .rdb 文件内容发送到附属节点 * * 每次最大发送的字节数量有 REDIS_IOBUF_LEN 决定, * 视乎文件的大小和服务器的状态,整个发送过程可能会执行多次 */ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { redisClient *slave = privdata; REDIS_NOTUSED(el); REDIS_NOTUSED(mask); char buf[REDIS_IOBUF_LEN]; ssize_t nwritten, buflen; // 刚开始执行 .rdb 文件的发送? 
if (slave->repldboff == 0) { /* Write the bulk write count before to transfer the DB. In theory here * we don't know how much room there is in the output buffer of the * socket, but in pratice SO_SNDLOWAT (the minimum count for output * operations) will never be smaller than the few bytes we need. */ sds bulkcount; // 首先将主节点 .rdb 文件的大小发送到附属节点 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long) slave->repldbsize); if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount)) { sdsfree(bulkcount); freeClient(slave); return; } sdsfree(bulkcount); } // 设置主节点 .rdb 文件的偏移量 lseek(slave->repldbfd,slave->repldboff,SEEK_SET); // 读取主节点 .rdb 文件的数据到 buf buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN); if (buflen <= 0) { // 主节点 .rdb 文件读取错误,返回 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s", (buflen == 0) ? "premature EOF" : strerror(errno)); freeClient(slave); return; } // 将 buf 发送给附属节点 if ((nwritten = write(fd,buf,buflen)) == -1) { // 附属节点写入出错,返回 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s", strerror(errno)); freeClient(slave); return; } // 更新偏移量 slave->repldboff += nwritten; // .rdb 文件全部发送完毕 if (slave->repldboff == slave->repldbsize) { // 关闭 .rdb 文件 close(slave->repldbfd); // 重置 slave->repldbfd = -1; // 删除发送事件 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); // 更新附属节点状态 slave->replstate = REDIS_REPL_ONLINE; // TODO: if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendReplyToClient, slave) == AE_ERR) { freeClient(slave); return; } redisLog(REDIS_NOTICE,"Synchronization with slave succeeded"); } }
(4)执行命令的过程中如果有写操作,也都会往slave服务器发送一份。
processCommand函数用于处理服务器收到的所有命令,其核心会调用call函数执行命令。
/* Call() is the core of Redis execution of a command.
 *
 * Executes the command of client c. The flags argument selects the optional
 * steps: REDIS_CALL_SLOWLOG (feed the slow log), REDIS_CALL_STATS (update
 * the per-command statistics) and REDIS_CALL_PROPAGATE (propagate the
 * command to the AOF and to the slaves). */
void call(redisClient *c, int flags) {
    long long dirty, start = ustime(), duration;

    /* Sent the command to clients in MONITOR mode, only if the commands are
     * not geneated from reading an AOF. */
    if (listLength(server.monitors) &&
        !server.loading &&
        !(c->cmd->flags & REDIS_CMD_SKIP_MONITOR))
    {
        replicationFeedMonitors(c,server.monitors,c->db->id,c->argv,c->argc);
    }

    /* Call the command, measuring both the change of the dirty counter
     * and the time spent. */
    redisOpArrayInit(&server.also_propagate);
    dirty = server.dirty;
    c->cmd->proc(c);
    dirty = server.dirty-dirty;
    duration = ustime()-start;

    /* When EVAL is called loading the AOF we don't want commands called
     * from Lua to go into the slowlog or to populate statistics. */
    if (server.loading && (c->flags & REDIS_LUA_CLIENT)) {
        flags &= ~(REDIS_CALL_SLOWLOG | REDIS_CALL_STATS);
    }

    /* Log the command into the Slow log if needed, and populate the
     * per-command statistics that we show in INFO commandstats. */
    if (flags & REDIS_CALL_SLOWLOG) {
        slowlogPushEntryIfNeeded(c->argv,c->argc,duration);
    }
    if (flags & REDIS_CALL_STATS) {
        c->cmd->microseconds += duration;
        c->cmd->calls++;
    }

    /* Propagate the command into the AOF and replication link */
    if (flags & REDIS_CALL_PROPAGATE) {
        int target = REDIS_PROPAGATE_NONE;

        if (c->cmd->flags & REDIS_CMD_FORCE_REPLICATION) {
            target |= REDIS_PROPAGATE_REPL;
        }
        if (dirty) {
            target |= (REDIS_PROPAGATE_REPL | REDIS_PROPAGATE_AOF);
        }
        if (target != REDIS_PROPAGATE_NONE) {
            propagate(c->cmd,c->db->id,c->argv,c->argc,target);
        }
    }

    /* Commands such as LPUSH or BRPOPLPUSH may propagate an additional
     * PUSH command. */
    if (server.also_propagate.numops) {
        int i;

        for (i = 0; i < server.also_propagate.numops; i++) {
            redisOp *op = &server.also_propagate.ops[i];

            propagate(op->cmd, op->dbid, op->argv, op->argc, op->target);
        }
        redisOpArrayFree(&server.also_propagate);
    }
    server.stat_numcommands++;
}

/* Propagate the given command to the AOF and / or to the slaves.
 *
 * flags is a combination of REDIS_PROPAGATE_AOF and REDIS_PROPAGATE_REPL.
 * The AOF is fed only when it is not off, the slaves only when at least one
 * slave is attached. */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
{
    if (server.aof_state != REDIS_AOF_OFF && (flags & REDIS_PROPAGATE_AOF)) {
        feedAppendOnlyFile(cmd,dbid,argv,argc);
    }
    if ((flags & REDIS_PROPAGATE_REPL) && listLength(server.slaves)) {
        replicationFeedSlaves(server.slaves,dbid,argv,argc);
    }
}