redis replication
slave /
1. Slave 的初始化当前有两种方式:启动一个server后,通过slaveof mip mport命令将该server转换为一个slave节点;或者在配置文件里直接配置该命令。这两种方式都是先把replstate设置为 REDIS_REPL_CONNECT,然后在serverCron的time event里调用replicationCron来与master建立连接(进入 REDIS_REPL_CONNECTING),连接建立后发送sync命令(进入 REDIS_REPL_TRANSFER),等待master发送rdb数据进行同步。
A. Client command slaveof mip mport
当一个客户端向一个server发送slaveof mip mport命令时,server调用下面的回调函数来启动slave服务(或者说让自己进入slave状态)
/* SLAVEOF <host> <port> | SLAVEOF NO ONE
 *
 * Puts this server into slave mode (replicating from <host>:<port>), or turns
 * it back into a master when invoked as "SLAVEOF NO ONE".
 *
 * c: the client that issued the command; argv[1] is the host (or "no") and
 *    argv[2] is the port (or "one").
 *
 * Replies with shared.ok on success, or with an error when the port is not a
 * valid TCP port number (in which case no replication state is modified). */
void slaveofCommand(redisClient *c) {
    if (!strcasecmp(c->argv[1]->ptr,"no") &&
        !strcasecmp(c->argv[2]->ptr,"one")) {
        /* SLAVEOF NO ONE: this slave becomes a master. */
        if (server.masterhost) {
            sdsfree(server.masterhost);
            server.masterhost = NULL;
            if (server.master) freeClient(server.master);
            if (server.replstate == REDIS_REPL_TRANSFER)
                replicationAbortSyncTransfer();
            server.replstate = REDIS_REPL_NONE;
            redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
        }
    } else {
        char *end;
        long port = strtol(c->argv[2]->ptr,&end,10);

        /* Validate the port before touching any replication state: atoi()
         * would silently turn garbage into port 0. */
        if (end == (char*)c->argv[2]->ptr || *end != '\0' ||
            port <= 0 || port > 65535) {
            addReplyError(c,"Invalid master port");
            return;
        }

        sdsfree(server.masterhost);
        server.masterhost = sdsdup(c->argv[1]->ptr);
        server.masterport = (int)port;
        if (server.master) freeClient(server.master);
        if (server.replstate == REDIS_REPL_TRANSFER)
            replicationAbortSyncTransfer();
        /* A non-blocking connect to the previous master may still be pending.
         * Drop its event and socket, otherwise syncWithMaster would later
         * fire on the old socket and start a SYNC with the wrong master. */
        if (server.replstate == REDIS_REPL_CONNECTING) {
            aeDeleteFileEvent(server.el,server.repl_transfer_s,
                              AE_READABLE|AE_WRITABLE);
            close(server.repl_transfer_s);
        }
        /* replicationCron will pick this state up and connect to the
         * new master. */
        server.replstate = REDIS_REPL_CONNECT;
        redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
            server.masterhost, server.masterport);
    }
    addReply(c,shared.ok);
}
注:该命令还可将一个slave变成一个master节点(slaveof no one)。
B. 读配置文件
第二种方式就是通过直接在redis.conf文件里配置,它所要连接的master,这样该节点就自然成为一个slave 节点。
loadServerConfig
{
/* Excerpt of the config parser: a "slaveof <host> <port>" directive stores
 * the master address in server.masterhost/server.masterport and puts the
 * server in REDIS_REPL_CONNECT state, exactly like the SLAVEOF command. */
else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
server.masterhost = sdsnew(argv[1]);
server.masterport = atoi(argv[2]);
server.replstate = REDIS_REPL_CONNECT;
}
2. Slave的状态转移
上面我们看到了slave init的两种方式,它们都把slave初始化为 REDIS_REPL_CONNECT状态。下面我们来看一下,后面的slave状态是如何转移的。首先slave节点启动后,在time event的处理函数serverCron里,调用replicationCron函数来与master建立连接:
serverCron{
/* Replication cron function -- used to reconnect to master and
 * to detect transfer failures.
 * In this excerpt replicationCron() is invoked whenever loops is a
 * multiple of 10. */
if (!(loops % 10)) replicationCron();
server.cronloops++;
}
/* Excerpt of replicationCron(): while the server is in REDIS_REPL_CONNECT
 * state, try to open the connection to the master via connectWithMaster().
 * The parts marked "..." are elided in this excerpt. */
void replicationCron(void) {
...
/* Check if we should connect to a MASTER */
if (server.replstate == REDIS_REPL_CONNECT) {
redisLog(REDIS_NOTICE,"Connecting to MASTER...");
if (connectWithMaster() == REDIS_OK) {
redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync started");
}
}
...
}
下面的函数完成与master的连接,并且注册file event事件及回调函数syncWithMaster,并且slave进入
REDIS_REPL_CONNECTING状态。
/* Starts a non-blocking TCP connect to server.masterhost:server.masterport,
 * registers syncWithMaster as the handler for readable/writable events on the
 * new socket, remembers the socket in server.repl_transfer_s and moves the
 * server to REDIS_REPL_CONNECTING.
 * Returns REDIS_OK on the path shown; the failure paths are elided ("..."). */
int connectWithMaster(void) {
int fd;
fd = anetTcpNonBlockConnect(NULL,server.masterhost,server.masterport);
...
if (aeCreateFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE,syncWithMaster,NULL) ==
AE_ERR)
...
server.repl_transfer_s = fd;
server.replstate = REDIS_REPL_CONNECTING;
return REDIS_OK;
}
连接成功之后,看是否需要密码验证,然后向master发送sync命令,并注册新的file event事件来处理master发送过来的rdb数据(readSyncBulkPayload),然后进入
REDIS_REPL_TRANSFER状态,等待数据的传输。
/* File event handler registered by connectWithMaster() for the non-blocking
 * connect(2) to the master.
 *
 * Once the socket is ready it optionally authenticates, sends SYNC, creates a
 * temp file for the incoming rdb payload, registers readSyncBulkPayload as
 * the readable handler and moves the server to REDIS_REPL_TRANSFER.
 *
 * el, privdata and mask are not used in the part shown; fd is the socket
 * connected to the master. Local declarations and the error-handling bodies
 * marked with "…" are elided in this excerpt. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
    /* This event should only be triggered once since it is used to have a
     * non-blocking connect(2) to the master. It has been triggered when this
     * function is called, so we can delete it. */
    aeDeleteFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE);

    /* A client sent SLAVEOF NO ONE while the connect was pending: this server
     * is now a master, so do not sync any data. */
    if (server.replstate == REDIS_REPL_NONE) {
        close(fd);
        return;
    }

    /* AUTH with the master if required. */
    if (server.masterauth) {
        authlen = snprintf(authcmd,sizeof(authcmd),"AUTH %s\r\n",server.masterauth);
        /* One of the few blocking I/O calls in redis (implemented on top of
         * select): a slave only starts serving requests after the sync has
         * completed, so blocking here is acceptable. */
        if (syncWrite(fd,authcmd,authlen,server.repl_syncio_timeout) == -1) {
            …
        }
        /* Read the AUTH result. */
        if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout) == -1)
            …
    }

    /* Send the SYNC command. */
    if (syncWrite(fd,"SYNC \r\n",7,server.repl_syncio_timeout) == -1) {
        …
    }

    /* Create a temp file that will hold the rdb data sent by the master. */
    while(maxtries--) {
        snprintf(tmpfile,256,
            "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
        if (dfd != -1) break;
        sleep(1);
    }

    /* Register the file event that reads the master's response. */
    if (aeCreateFileEvent(server.el,fd,AE_READABLE,readSyncBulkPayload,NULL)
        == AE_ERR) {
        …
    }

    server.replstate = REDIS_REPL_TRANSFER;
    server.repl_transfer_left = -1; /* rdb size not received yet */
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = time(NULL);
    server.repl_transfer_tmpfile = zstrdup(tmpfile);
    return;
}
到此请先看下面的master处理流程,等看完sendBulkToSlave的处理之后,再回来看接下来的函数readSyncBulkPayload,这个函数就是用来接收master发送给slave的rdb数据的。
/* Readable-event handler on the master socket while the server is in
 * REDIS_REPL_TRANSFER: receives the rdb payload that the master sends via
 * sendBulkToSlave.
 *
 * Three phases: read the first "$<size>" line, read the bulk data into the
 * temp file, and on completion load the rdb into memory, turn the socket into
 * the master client and move to REDIS_REPL_CONNECTED.
 *
 * fd is the socket connected to the master; el, privdata and mask are not
 * used in the part shown. Local declarations and the bodies marked with "…"
 * are elided in this excerpt. */
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
    /* The first message from the master (the rdb file size) has not been
     * received yet. */
    if (server.repl_transfer_left == -1) {
        if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout) == -1) {
            …
        }
        if (buf[0] == '-') {
            /* The master replied with an error. */
            …
        } else if (buf[0] == '\0') {
            /* Empty line: a ping that only keeps the connection alive. */
            server.repl_transfer_lastio = time(NULL);
            return;
        } else if (buf[0] != '$') {
            /* Any other message; see sendBulkToSlave on the master side. */
            …
        }
        /* Number of payload bytes still to be received. */
        server.repl_transfer_left = strtol(buf+1,NULL,10);
        return;
    }

    /* Read bulk data: the actual rdb payload. */
    readlen = (server.repl_transfer_left < (signed)sizeof(buf)) ?
        server.repl_transfer_left : (signed)sizeof(buf);
    nread = read(fd,buf,readlen);
    if (nread <= 0) {
        /* Read error or connection closed by the master: never pass a
         * non-positive length on to write(), give up on this transfer. */
        replicationAbortSyncTransfer();
        return;
    }
    server.repl_transfer_lastio = time(NULL);
    /* Append to the temp file created by syncWithMaster. */
    if (write(server.repl_transfer_fd,buf,nread) != nread) {
        …
    }
    server.repl_transfer_left -= nread;

    /* Check if the transfer is now complete */
    if (server.repl_transfer_left == 0) {
        if (rename(server.repl_transfer_tmpfile,server.dbfilename) == -1) {
            …
        }
        redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory");
        emptyDb();
        /* Remove this handler from the master socket. */
        aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);
        /* Load the received rdb file into memory. */
        if (rdbLoad(server.dbfilename) != REDIS_OK) {
            …
        }
        zfree(server.repl_transfer_tmpfile);
        close(server.repl_transfer_fd);
        /* Wrap the master socket in a regular client; per the article,
         * createClient registers readQueryFromClient as its readable
         * handler. */
        server.master = createClient(server.repl_transfer_s);
        server.master->flags |= REDIS_MASTER;
        server.master->authenticated = 1;
        server.replstate = REDIS_REPL_CONNECTED;
        redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Finished with success");
        /* Rewrite the AOF file now that the dataset changed. */
        if (server.appendonly) rewriteAppendOnlyFileBackground();
    }
}
该函数主要分三个过程:读取第一个长度报文;读取数据报文;接收结束时把rdb加载到内存,创建新的file event可读事件(readQueryFromClient),并更新slave server状态到
REDIS_REPL_CONNECTED。到此master-slave进入增量的命令同步阶段:slave把来自master的更新命令当做一般的client命令来处理,slave也可以对外提供服务。
// master /
3. Master slaveclient的状态转移Master在收到client发送过来的sync命令后,调用该回调函数:void syncCommand(redisClient *c) {
/* Handler for the SYNC command on the master side: registers client c as a
 * slave and makes sure a BGSAVE produces the rdb file to send to it.
 * Local declarations are elided in this excerpt. */
/* Nothing to do if this client is already flagged as a slave. */
if (c->flags & REDIS_SLAVE) return;
/* server.masterhost being set means this server is itself a slave of another
 * master. If its own link is not in REDIS_REPL_CONNECTED state it is not
 * normally synchronized yet, so it should not be synced from by others
 * (the handling is elided here). */
if (server.masterhost && server.replstate != REDIS_REPL_CONNECTED) …
/* SYNC can't be issued when the server has pending data to send to
 * the client about already issued commands. We need a fresh reply
 * buffer registering the differences between the BGSAVE and the current
 * dataset, so that we can copy to other slaves if needed. */
if (listLength(c->reply) != 0) {
addReplyError(c,"SYNC is invalid with pending input");
return;
}
redisLog(REDIS_NOTICE,"Slave ask for synchronization");
/* Check whether a background save is currently in progress. */
if (server.bgsavechildpid != -1) {
// Look for another slave that is already waiting for this BGSAVE to finish.
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
slave = ln->value;
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
}
if (ln) {
/* Found one: replace this client's reply list with a copy of that slave's
 * reply list and wait for the same BGSAVE. */
listRelease(c->reply);
c->reply = listDup(slave->reply);
c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
} else {
/* None found: the running BGSAVE was started by the master itself rather
 * than for a slave, so this client has to wait for it to end and then for
 * a new BGSAVE. */
c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
}
} else {
/* No BGSAVE in progress: start a new background save. */
redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
addReplyError(c,"Unable to perform background save");
return;
}
c->replstate = REDIS_REPL_WAIT_BGSAVE_END; // state of this slave client
}
c->repldbfd = -1; // will later hold the descriptor of the rdb file (server.dbfilename) being transferred
c->flags |= REDIS_SLAVE; // mark this client as a slave
c->slaveseldb = 0;
listAddNodeTail(server.slaves,c);
return;
}
接下来master会在它的serverCron的时候等待该bgsave子进程的结束(该过程我们已经在上一个章节里讲过),这里我们直接跳到我们当时跳过的replication的处理过程。在wait3的处理函数backgroundSaveDoneHandler的最后一步:updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
/* Called when a BGSAVE child has terminated; bgsaveerr is REDIS_OK when the
 * save succeeded. Walks server.slaves: slaves in WAIT_BGSAVE_END get the rdb
 * file opened and sendBulkToSlave installed as their writable handler, while
 * slaves in WAIT_BGSAVE_START trigger a new BGSAVE.
 * Local declarations and the bodies marked with "…" are elided. */
void updateSlavesWaitingBgsave(int bgsaveerr) {
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
redisClient *slave = ln->value;
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) { // this client arrived slightly too late, so it has to wait for the next BGSAVE
startbgsave = 1; // start another BGSAVE right after the WAIT_BGSAVE_END slaves have been handled
slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
} else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) { // this client was waiting for exactly the BGSAVE that just ended
struct redis_stat buf;
if (bgsaveerr != REDIS_OK) …
if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
redis_fstat(slave->repldbfd,&buf) == -1) …
slave->repldboff = 0;
slave->repldbsize = buf.st_size;
slave->replstate = REDIS_REPL_SEND_BULK; // new state
/* Remove the slave's previous writable handler: from now on its writable
 * callback is sendBulkToSlave. */
aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) …
}
}
if (startbgsave) { // a new BGSAVE serves the slaves that were in REDIS_REPL_WAIT_BGSAVE_START
if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
listIter li;
listRewind(server.slaves,&li);
redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
while((ln = listNext(&li))) {
redisClient *slave = ln->value;
// BGSAVE failed: drop every slave whose request has not started yet.
// NOTE(review): the loop above already switched WAIT_BGSAVE_START slaves to
// WAIT_BGSAVE_END, so this check may not match them -- confirm.
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
freeClient(slave);
}
}
}
}
从该函数我们可以看到,对于所有
REDIS_REPL_WAIT_BGSAVE_END状态的slave client,master打开rdb文件,并且注册file event事件,它的回调函数是sendBulkToSlave,同时slave client进入
REDIS_REPL_SEND_BULK状态。另外,对于状态为
REDIS_REPL_WAIT_BGSAVE_START的slave client,则再次调用bgsave重写rdb。下面我们看一下sendBulkToSlave回调函数(这个函数是与slave节点的readSyncBulkPayload函数相对应的):
/* Writable-event handler installed by updateSlavesWaitingBgsave: streams the
 * rdb file to a slave, one buffer per invocation, and switches the slave to
 * REDIS_REPL_ONLINE once the whole file has been written.
 * NOTE(review): slave is presumably obtained from privdata; its declaration
 * and the bodies marked with "…" are elided in this excerpt. */
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
if (slave->repldboff == 0) { // first send the file size to the slave: the first message, prefixed with '$'
bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
slave->repldbsize);
if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
…
}
lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN); // read the next chunk of the rdb file
if ((nwritten = write(fd,buf,buflen)) == -1) {…} // write it to the slave client's fd
slave->repldboff += nwritten; // advance the offset by what was actually written
if (slave->repldboff == slave->repldbsize) { // the whole file has been sent
close(slave->repldbfd);
slave->repldbfd = -1;
aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); // stop this event
slave->replstate = REDIS_REPL_ONLINE; // new slave client state
if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
sendReplyToClient, slave) == AE_ERR) { // register the new writable event
freeClient(slave);
return;
}…
}
}
该函数就是用于向slave 节点发送rdb文件,直到结束时添加新的file event(AE_WRITABLE, sendReplyToClient)事件,以便来同步save rdb文件之后的更新操作,我们可以看到这个回调函数就是一般的响应客户请求的回调函数,同时slave client进入
REDIS_REPL_ONLINE状态。下面我们将看到master是在什么时候向slave发送后面的更新操作:
/* Excerpt of call(): the part that propagates a command to the slaves.
 * Everything marked with "…" is elided. */
void call(redisClient *c) {
…
/* Feed the slaves when dirty > 0 or the command carries
 * REDIS_CMD_FORCE_REPLICATION, provided server.slaves is not empty. */
if ((dirty > 0 || c->cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
listLength(server.slaves))
replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
…
}
是的,在每次执行完客户端请求之后,master会判断是否有更新内容(dirty > 0,或者该命令带有强制复制标志),以及master的slave队列是否有成员;如果满足,就调用replicationFeedSlaves来向slave client的reply buf里追加数据。
/* Appends a command to the reply buffer of every slave in `slaves`.
 *
 * slaves: list of slave clients to feed.
 * dictid: id of the database the command was executed against.
 * argv/argc: the command and its arguments.
 *
 * For each slave a SELECT is emitted first when the slave's currently
 * selected db differs from dictid, then the command itself is appended as a
 * multi bulk. Local declarations and the part marked with "…" are elided in
 * this excerpt. */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
    listRewind(slaves,&li); /* iterate over all slave clients */
    while((ln = listNext(&li))) {
        redisClient *slave = ln->value;

        /* Slaves whose BGSAVE has not started yet need nothing: the rdb
         * produced by the later BGSAVE will already contain these updates. */
        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;

        /* The slave is on a different db: switch it first. */
        if (slave->slaveseldb != dictid) {
            robj *selectcmd;
            …
            /* Build the SELECT command for the target db. */
            selectcmd = createObject(REDIS_STRING,
                sdscatprintf(sdsempty(),"select %d\r\n",dictid));
            addReply(slave,selectcmd);
            slave->slaveseldb = dictid;
        }

        /* Feed the command itself. This must happen inside the loop, once
         * per slave. */
        addReplyMultiBulkLen(slave,argc);
        for (j = 0; j < argc; j++) addReplyBulk(slave,argv[j]);
    }
}
4. 状态转移图
上面我们详细的介绍了slave与master进行同步的一个过程。下面我们通过一张图来总结该过程:
图1 slave-master sync状态转移图
参考文献: http://www.w3ccollege.org/redis/redis-copy-and-build-scalable-cluster.html