前面的文章《redis replication主从复制的源码分析(1)》分析了建立主从复制连接的流程,现在我们来分析redis复制(同步与命令传播)的实现。
同步主要是从服务器进行主从连接时,将从服务器的数据库状态更新至主服务器的数据库状态。
1、 同步类型
1)完整重同步:通过主服务器创建rdb文件发送给从服务器,同时将从开始创建rdb文件后执行的写命令记录在一个缓冲区,然后在传输完rdb文件后将发送缓冲区中的记录发送给从服务器。
2)部分重同步:主服务器将主从连接断开期间执行的写命令,发送给从服务器,从服务器执行这些命令就能保持与主服务器状态一致。
2、同步的应用场景
1)初次同步使用完整重同步。
2)断线后重同步:由于在断开连接后,从服务器会保留master的cache,用于之后断线重连时避免再次进行完整重同步。这样就需要一个变量来记录主从服务器之间的偏差——复制偏移量,通过比对主从服务器的复制偏移量来确定主从服务器状态的一致性。此外,主服务器还维护一个复制积压缓冲区来记录最近传播的命令,并且复制积压缓冲区的每个字节都对应相应的偏移量,这样的设计主要是为了支持部分重同步:根据从服务器的偏移量是否在积压缓冲区范围内,决定进行部分重同步还是完整重同步。
3、同步的实现
slave部分的实现
1)发送psync给master,psync runid offset;
2)根据master的reply,进行部分重同步或者完整重同步;
与master进行同步
/* Slave side: non-blocking handler that drives the handshake with the master.
 * Excerpt: only the REPL_STATE_SEND_PSYNC path is shown; "……" marks code
 * elided from the original source. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
……
if (server.repl_state == REPL_STATE_SEND_PSYNC) {
//send the PSYNC command to the master
if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
…… }
……//read the master's reply to PSYNC
psync_result = slaveTryPartialResynchronization(fd,1);
if (psync_result == PSYNC_CONTINUE) {
//partial resynchronization: nothing else to set up here
return;
}
……//full resynchronization: create a temp rdb file to receive the rdb the master sends
while(maxtries--) {
snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)server.unixtime,(long int)getpid());
dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
}
//install readSyncBulkPayload as the read handler that receives the master's rdb
if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)== AE_ERR)
{……}
server.repl_state = REPL_STATE_TRANSFER;
……
return;
……
}
slave发送psync尝试部分重同步
/* Slave side: with read_reply == 0, send "PSYNC <replid> <offset>" to the
 * master; with read_reply == 1, read and interpret the master's reply
 * (+FULLRESYNC / +CONTINUE). "……" marks code elided from the original. */
int slaveTryPartialResynchronization(int fd, int read_reply) {
……/* Send the PSYNC command to the master */
if (!read_reply) {
server.master_initial_offset = -1;
if (server.cached_master) {
//a cached master exists (reconnect after disconnection): reuse its replid and offset
psync_replid = server.cached_master->replid;
snprintf(psync_offset,sizeof(psync_offset),"%lld", server.cached_master->reploff+1);
} else {//first synchronization: no known master, ask with "?" and offset -1
psync_replid = "?";
memcpy(psync_offset,"-1",3);
}//send "PSYNC <replid> <offset>" to the master
reply = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PSYNC",psync_replid,psync_offset,NULL);
/* for SYNC_CMD_WRITE a non-NULL reply is an error string: the write failed */
if (reply != NULL) {
return PSYNC_WRITE_ERROR;
}
return PSYNC_WAIT_REPLY;
}
/* Read the master's reply to PSYNC: full resynchronization or partial resynchronization */
reply = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (sdslen(reply) == 0) {
return PSYNC_WAIT_REPLY;
}
if (!strncmp(reply,"+FULLRESYNC",11)) {
……//the slave will perform a full resynchronization
replicationDiscardCachedMaster();//the cached master is useless now, discard it
return PSYNC_FULLRESYNC;
}
if (!strncmp(reply,"+CONTINUE",9)) {
……//perform a partial resynchronization
replicationResurrectCachedMaster(fd);//turn the cached master back into the live master
return PSYNC_CONTINUE;
}
……
}
slave复活master,接收master的backlog,进行部分重同步
/* Slave side, partial resync: promote the cached master back to being the
 * current master on the new connection, so the replication stream resumes
 * from the recorded offset (the master will send its backlog). */
void replicationResurrectCachedMaster(int newfd) {
server.master = server.cached_master;
server.cached_master = NULL;
server.master->fd = newfd;
/* clear the close flags left over from the disconnection */
server.master->flags &= ~(CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP);
server.master->authenticated = 1;
server.master->lastinteraction = server.unixtime;
server.repl_state = REPL_STATE_CONNECTED;
listAddNodeTail(server.clients,server.master);
//install the read handler: wait for the write commands the master sends
if (aeCreateFileEvent(server.el, newfd, AE_READABLE,
readQueryFromClient, server.master)) {
freeClientAsync(server.master); /* Close ASAP. */
}
/* if there are pending replies for the master, install the write handler too */
if (clientHasPendingReplies(server.master)) {
if (aeCreateFileEvent(server.el, newfd, AE_WRITABLE,
sendReplyToClient, server.master)) {
freeClientAsync(server.master); /* Close ASAP. */
}
}
}
slave接收rdb,载入rdb进行完整重同步,创建master
/* Slave side, full resync: read chunks of the rdb payload sent by the master,
 * append them to a local temp file, and once the transfer completes load the
 * rdb and create the master client. "……" marks elided code. */
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
……
//receive (part of) the master's rdb payload
nread = read(fd,buf,readlen);
……
server.repl_transfer_lastio = server.unixtime;
//append what was read to the local sync rdb file
if (write(server.repl_transfer_fd,buf,nread) != nread) {
goto error;
}
……
/* Check if the transfer is now complete */
if (!usemark) {
/* size was announced up front: compare bytes read against expected size */
if (server.repl_transfer_read == server.repl_transfer_size)
eof_reached = 1;
}
if (eof_reached) {
……//load the local sync rdb into memory
if (rdbLoad(server.rdb_filename,&rsi) != C_OK) {
return;
}
replicationCreateMasterClient(server.repl_transfer_s,rsi.repl_stream_db);
server.repl_state = REPL_STATE_CONNECTED;
……
}
return;
}
master部分的实现
1)根据runid和offset决定使用完整重同步还是部分重同步,返回+CONTINUE或者+FULLRESYNC;
2)如果部分重同步,master发送backlog中offset后面的写命令给slave进行同步;如果是完整重同步,master就进行bgsave创建rdb文件然后发送给slave进行完整重同步,完整重同步期间master执行的写命令会通过命令传播写到slave的output buffer,完整重同步完成后,将output buffer的数据发送给slave实现主从一致;
3)replstate为SLAVE_STATE_ONLINE表示同步完成,之后进入命令传播阶段,通过命令传播来保持主从服务器的数据库状态一致性;
/* SYNC / PSYNC command handler (master side). First tries a partial
 * resynchronization; if that is not possible, sets the client up for a full
 * resynchronization, reusing a BGSAVE already in progress when its output and
 * capabilities are compatible. "……" marks code elided from the original.
 * FIX: removed the stray ";;" that appeared between the parameter list and
 * the opening brace in the original excerpt (invalid C, transcription error). */
void syncCommand(client *c) {
……
if (!strcasecmp(c->argv[0]->ptr,"psync")) {
if (masterTryPartialResynchronization(c) == C_OK) {
return; /* No full resync needed, return. */
} else {
char *master_replid = c->argv[1]->ptr;
/* "?" means the slave explicitly asked for a full resync, so this is
 * not counted as a failed partial resync attempt */
if (master_replid[0] != '?') server.stat_sync_partial_err++;
}
}
c->replstate = SLAVE_STATE_WAIT_BGSAVE_START;
……
c->flags |= CLIENT_SLAVE;//mark the client as a slave
listAddNodeTail(server.slaves,c);
//arrange for the rdb file needed by the full resync
/* CASE 1: BGSAVE is in progress, with disk target. */
if (server.rdb_child_pid != -1 &&server.rdb_child_type == RDB_CHILD_TYPE_DISK)
{
……//look for a slave already waiting on this bgsave
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
slave = ln->value;
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) break;
}//check that c has (at least) the capabilities of that slave
if (ln && ((c->slave_capa & slave->slave_capa) == slave->slave_capa)) {
//copy that slave's output buffer into c's output buffer
copyClientOutputBuffer(c,slave);
//set c up for a full resynchronization at the same initial offset
replicationSetupSlaveForFullResync(c,slave->psync_initial_offset);
} else {
//cannot piggyback on the bgsave in progress, wait for the next one
}
/* CASE 2: BGSAVE is in progress, with socket target. */
} else if (server.rdb_child_pid != -1 &&server.rdb_child_type == RDB_CHILD_TYPE_SOCKET)
{
serverLog(LL_NOTICE,"Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC");
/* CASE 3: There is no BGSAVE is progress. */
} else {
if (server.repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) {
/* wait for more slaves to arrive; replicationCron() will call
 * startBgsaveForReplication() and stream the rdb straight to the
 * slaves' sockets */
if (server.repl_diskless_sync_delay)
serverLog(LL_NOTICE,"Delay next BGSAVE for diskless SYNC");
} else {
if (server.aof_child_pid == -1) {
//start a bgsave that writes the rdb to disk
startBgsaveForReplication(c->slave_capa);
}
}
}
return;
}
master处理psync runid offset,进行同步
//decide the sync type from the runid (replid) and offset sent by the slave
/* Master side: returns C_OK when a partial resynchronization was set up and
 * served from the backlog, C_ERR when a full resync is needed.
 * "……" marks code elided from the original. */
int masterTryPartialResynchronization(client *c) {
……
char *master_replid = c->argv[1]->ptr;
//full resync if the psync_offset argument cannot be parsed
if (getLongLongFromObjectOrReply(c,c->argv[2],&psync_offset,NULL) !=
C_OK) goto need_full_resync;
//compare the replid sent by the slave against the master's current (and previous) replid
if (strcasecmp(master_replid, server.replid) &&
(strcasecmp(master_replid, server.replid2) ||
psync_offset > server.second_replid_offset))
{
……//replid mismatch: full resynchronization
goto need_full_resync;
}
/* We still have the data our slave is asking for? */
if (!server.repl_backlog ||
psync_offset < server.repl_backlog_off ||
psync_offset > (server.repl_backlog_off + server.repl_backlog_histlen))
{
……/*full resync when 1) the master has no repl_backlog (replication backlog)
or 2) the slave's offset is outside the range covered by the repl_backlog*/
goto need_full_resync;
}
//perform a partial resynchronization
c->flags |= CLIENT_SLAVE;
c->replstate = SLAVE_STATE_ONLINE;//the slave goes online immediately
c->repl_ack_time = server.unixtime;
c->repl_put_online_on_ack = 0;
listAddNodeTail(server.slaves,c);
//send the +CONTINUE reply to the slave
if (c->slave_capa & SLAVE_CAPA_PSYNC2) {
buflen = snprintf(buf,sizeof(buf),"+CONTINUE %s\r\n", server.replid);
} else {
buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n");
}
if (write(c->fd,buf,buflen) != buflen) {
freeClientAsync(c);
return C_OK;
}
//copy the backlog from psync_offset to its end into the client's output
//buffer; sending it to the client completes the partial resynchronization
psync_len = addReplyReplicationBacklog(c,psync_offset);
refreshGoodSlavesCount();//refresh the master's count of good slaves
return C_OK; //no full resynchronization needed
need_full_resync:
return C_ERR;//a full resynchronization is needed
}
master进行部分重同步发送积压缓冲区的命令记录给slave
//send the commands in the backlog starting at offset to the slave
/* Master side, partial resync payload: feed client 'c' with the replication
 * backlog (a circular buffer) from the requested offset to the end; returns
 * the number of bytes queued. Variable declarations are elided in this excerpt. */
long long addReplyReplicationBacklog(client *c, long long offset) {
skip = offset - server.repl_backlog_off;
/* j = physical index of the oldest byte of the backlog in the circular buffer */
j = (server.repl_backlog_idx +
(server.repl_backlog_size-server.repl_backlog_histlen)) %
server.repl_backlog_size;
j = (j + skip) % server.repl_backlog_size;
len = server.repl_backlog_histlen - skip;
while(len) {
/* copy up to the physical end of the buffer, then wrap around to index 0 */
long long thislen =
((server.repl_backlog_size - j) < len) ?
(server.repl_backlog_size - j) : len;
addReplySds(c,sdsnewlen(server.repl_backlog + j, thislen));
len -= thislen;
j = 0;
}
return server.repl_backlog_histlen - skip;
}
master进行完整重同步的bgsave,然后发送rdb文件给slave
//start a bgsave for replication purposes
/* Master side, full resync: start a BGSAVE whose target is either the
 * slaves' sockets (diskless) or an rdb file on disk, depending on the
 * slaves' capabilities. "……" marks elided code. */
int startBgsaveForReplication(int mincapa) {
int socket_target = server.repl_diskless_sync && (mincapa & SLAVE_CAPA_EOF);
…… //two kinds of bgsave: 1) stream the rdb straight to the slaves' sockets; 2) persist it to disk
if (socket_target)
retval = rdbSaveToSlavesSockets(&rsi);
else
retval = rdbSaveBackground(server.rdb_filename,&rsi);
if (retval == C_ERR) {
serverLog(LL_WARNING,"BGSAVE for replication failed");
……
return retval;
}//set the waiting slaves up for a full resynchronization
if (!socket_target) {
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
replicationSetupSlaveForFullResync(slave,
getPsyncInitialOffset());
}
}
}
……
return retval;
}
/* Periodic server task (excerpt): detect that the rdb child process has
 * terminated and invoke the bgsave completion handler. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
……//check whether the bgsave child has finished
if (pid == server.rdb_child_pid) {
backgroundSaveDoneHandler(exitcode,bysignal);
}
……
}
/* Dispatch bgsave completion to the disk or socket handler according to the
 * type of rdb child that was running. */
void backgroundSaveDoneHandler(int exitcode, int bysignal) {
switch(server.rdb_child_type) {
case RDB_CHILD_TYPE_DISK:
backgroundSaveDoneHandlerDisk(exitcode,bysignal);
break;
case RDB_CHILD_TYPE_SOCKET:
backgroundSaveDoneHandlerSocket(exitcode,bysignal);
break;
}
}
/* Completion handler for a bgsave that wrote the rdb to disk: notify the
 * slaves waiting on it, reporting success or failure. */
void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
……
updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_DISK);
}
void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
……
updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_DISK);
}
//update the slaves waiting on a bgsave once it has completed
/* Master side: for each waiting slave, either put it online (socket target,
 * rdb already streamed) or start sending the rdb file (disk target); slaves
 * still waiting for a bgsave to start trigger a new one. "……" marks elided
 * code (including the error-handling and the stat() that fills 'buf'). */
void updateSlavesWaitingBgsave(int bgsaveerr, int type) {
……
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
/* this slave missed the finished bgsave: remember to start a new one
 * with the intersection of all such slaves' capabilities */
startbgsave = 1;
mincapa = (mincapa == -1) ? slave->slave_capa :
(mincapa & slave->slave_capa);
} else if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) {
struct redis_stat buf;
if (type == RDB_CHILD_TYPE_SOCKET) {
//the rdb has already been fully written to the socket buffer
slave->replstate = SLAVE_STATE_ONLINE;//the slave goes online
slave->repl_put_online_on_ack = 1;
slave->repl_ack_time = server.unixtime; /* Timeout otherwise. */
} else {
……//the rdb file is ready: start sending it to the slave
slave->repldboff = 0;
slave->repldbsize = buf.st_size;
slave->replstate = SLAVE_STATE_SEND_BULK;
/* bulk preamble: announce the payload length in protocol form */
slave->replpreamble = sdscatprintf(sdsempty(),"$%lld\r\n",
(unsigned long long) slave->repldbsize);
aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
freeClient(slave);
continue;
}
}
}
}
//start the deferred bgsave for the slaves that missed the previous one
if (startbgsave) startBgsaveForReplication(mincapa);
}
发送生成的rdb文件给slave
/* Master side: write handler that streams the generated rdb file to a slave,
 * one PROTO_IOBUF_LEN chunk per writable event. "……" marks elided code. */
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
……//read a chunk of the rdb file via its file descriptor
buflen = read(slave->repldbfd,buf,PROTO_IOBUF_LEN);
//write the chunk to the slave's socket fd
if ((nwritten = write(fd,buf,buflen)) == -1) {
/* EAGAIN just means "try again on the next writable event" */
if (errno != EAGAIN) {
freeClient(slave);
}
return;
}
slave->repldboff += nwritten;
server.stat_net_output_bytes += nwritten;
if (slave->repldboff == slave->repldbsize) {
//the whole rdb has been handed to the socket fd's buffer
aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
putSlaveOnline(slave);//transfer done: switch to command propagation
}
}
/* Master side: mark a slave as ONLINE after the rdb transfer, and install the
 * write handler so the output buffer accumulated during the bgsave (and any
 * future propagated commands) is flushed to it. */
void putSlaveOnline(client *slave) {
slave->replstate = SLAVE_STATE_ONLINE;//the slave goes online
slave->repl_put_online_on_ack = 0;
slave->repl_ack_time = server.unixtime; /* install handler to flush buffered data to the slave */
if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
sendReplyToClient, slave) == AE_ERR) {
freeClient(slave);
return;
}
refreshGoodSlavesCount();
}
命令传播
命令传播则是主从同步完成后,在主服务器的数据库状态被修改,将相应的修改操作传播给从服务器,让从服务器与主服务器保持数据状态一致。
cmd执行函数
/* Command execution (excerpt): after running a command, propagate it to the
 * AOF and to the replication link unless propagation is suppressed. */
void call(client *c, int flags) {
/* Propagate the command into the AOF and replication link */
if (flags & CMD_CALL_PROPAGATE &&
(c->flags & CLIENT_PREVENT_PROP) != CLIENT_PREVENT_PROP)
{
……
if (propagate_flags != PROPAGATE_NONE)
propagate(c->cmd,c->db->id,c->argv,c->argc,propagate_flags);
}
}
命令传播函数
/* Propagate a command to the AOF (if enabled and requested) and/or to the
 * connected slaves, according to 'flags'. */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
int flags)
{
if (server.aof_state != AOF_OFF && flags & PROPAGATE_AOF)
feedAppendOnlyFile(cmd,dbid,argv,argc);
if (flags & PROPAGATE_REPL)
replicationFeedSlaves(server.slaves,dbid,argv,argc);
}
传播写命令给slaves
/* Command propagation: append the executed write command (preceded by a
 * SELECT when the target db changed) to every slave's output buffer.
 * Note: variable declarations are elided in this excerpt. */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
/* Send SELECT command to every slave if needed. */
if (server.slaveseldb != dictid) {
robj *selectcmd;
/* For a few DBs we have pre-computed SELECT command. */
if (dictid >= 0 && dictid < PROTO_SHARED_SELECT_CMDS) {
selectcmd = shared.select[dictid];
} else {
int dictid_len;
dictid_len = ll2string(llstr,sizeof(llstr),dictid);
selectcmd = createObject(OBJ_STRING,
sdscatprintf(sdsempty(),
"*2\r\n$6\r\nSELECT\r\n$%d\r\n%s\r\n",
dictid_len, llstr));
}
/* Send it to slaves. */
listRewind(slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
/* slaves still waiting for their bgsave to start get the data later */
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;
addReply(slave,selectcmd);
}
/* release the SELECT object if it was created ad hoc (not pre-shared) */
if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS)
decrRefCount(selectcmd);
}
server.slaveseldb = dictid;
listRewind(slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
/* Feed slaves whose bgsave has already started, so that the write
 * commands the master runs during the transfer reach the slave once the
 * full resynchronization completes. */
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;
/* append the command to the slave's output buffer in protocol form */
addReplyMultiBulkLen(slave,argc);
for (j = 0; j < argc; j++)
addReplyBulk(slave,argv[j]);
}
}
要点分析
1、部分重同步
问题:部分重同步时,会不会出现写请求过来不断增量到backlog,导致backlog无限传输?