二十一:redis主从复制实现原理
主从复制实现原理
本文使用源码redis 2.2
主从结构有两种:
1.一主多从
2. 级联结构
复制方式分为:
1.全量复制
2. 增量同步
全量复制: 发生在slave初始化阶段,以及slave断线后重新连接到master时
增量同步: 当全量复制完成后,主服务每执行一个写命令就会向从服务发送命令。
全量复制
1.当slave启动连接到master发送sync命令
2.master接收到sync后,开始执行bgsave命令生成RDB
3.master后续写入命令存入到缓冲区
4.slave与master建立连接,读取master的rdb数据
5.slave读取数据完成,清空自身的旧数据并加载rdb
6.准备接受master的后续写命令
slave端
--------------------------slave-------------------------------------------------------
main
/**
 * file: redis.c
 * function: main()
 * Only the part of main() that matters for master/slave replication is shown
 * here; everything else has been left out of this excerpt on purpose.
 **/
int main(int argc, char **argv) {
// Initialize the server configuration (the config file has not been read yet)
initServerConfig();
// Started as: redis-server /path/redis.conf
if (argc == 2) {
// Load the configuration file named by the single command-line argument
loadServerConfig(argv[1]);
}
}
loadServerConfig
/**
 * file: config.c
 * function: loadServerConfig()
 * NOTE(review): this is an excerpt. The argv/argc used below are not declared
 * in the visible code; presumably they hold the tokens of the config line
 * currently being parsed by code omitted here -- confirm against config.c.
 **/
void loadServerConfig(char *filename){
// When this instance is started as a slave, its config file carries the
// master's ip and port. (slaveof is the directive name before Redis 5.0;
// 5.0 and later use replicaof.)
if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
server.masterhost = sdsnew(argv[1]);
server.masterport = atoi(argv[2]);
// Mark the replication state as "needs to connect to the master"
server.replstate = REDIS_REPL_CONNECT;
}
}
replicationCron
/**
 * file: replication.c
 * function: replicationCron()
 * des: Periodic replication task, driven by the serverCron timer. As shown
 *      here it only acts while the slave still has to (re)connect to its
 *      master, i.e. after startup or after a failed transfer.
 **/
void replicationCron(void) {
    /* Only act while a connection to the master is still pending; this is
     * the state set when the "slaveof" directive was read from the config. */
    if (server.replstate != REDIS_REPL_CONNECT) return;

    redisLog(REDIS_NOTICE,"Connecting to MASTER...");

    /* syncWithMaster() connects and sends SYNC; on failure there is nothing
     * more to report from here. */
    if (syncWithMaster() != REDIS_OK) return;

    redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync started: SYNC sent");
}
syncWithMaster
/**
 * file: replication.c
 * function: syncWithMaster()
 *
 * Slave side. Connects to the configured master, authenticates when
 * server.masterauth is set, sends SYNC, creates a temporary file for the
 * incoming RDB payload and registers readSyncBulkPayload() as the readable
 * handler on the master socket.
 *
 * Returns REDIS_OK once the transfer has been set up (server.replstate is
 * then REDIS_REPL_TRANSFER), REDIS_ERR otherwise. On every error path the
 * resources acquired so far (socket, temp file descriptor, temp file on
 * disk) are released before returning.
 **/
int syncWithMaster(void) {
    char buf[1024], tmpfile[256], authcmd[1024];
    /* dfd: descriptor of the temp RDB file; maxtries: attempts to create it. */
    int fd, dfd = -1, maxtries = 5, open_errno = 0;

    /* TCP connection to the master. */
    fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
    if (fd == -1) {
        redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
            strerror(errno));
        return REDIS_ERR;
    }

    /* Authenticate first when a master password is configured. */
    if (server.masterauth) {
        int authlen = snprintf(authcmd,sizeof(authcmd),"AUTH %s\r\n",
            server.masterauth);

        /* A truncated command must not be sent, and its length must never
         * be derived from the untruncated password. */
        if (authlen < 0 || (size_t)authlen >= sizeof(authcmd)) {
            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: masterauth password is too long");
            close(fd);
            return REDIS_ERR;
        }
        /* Blocking write of the AUTH command (timeout argument: 5). */
        if (syncWrite(fd,authcmd,authlen,5) == -1) {
            /* Log before close() so errno still belongs to the write. */
            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
                strerror(errno));
            close(fd);
            return REDIS_ERR;
        }
        /* Blocking read of the AUTH reply (timeout argument: 3600). */
        if (syncReadLine(fd,buf,sizeof(buf),3600) == -1) {
            redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
                strerror(errno));
            close(fd);
            return REDIS_ERR;
        }
        /* Anything other than a '+' status reply means AUTH was rejected. */
        if (buf[0] != '+') {
            close(fd);
            redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
            return REDIS_ERR;
        }
    }

    /* Issue the SYNC command. */
    if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
        redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
            strerror(errno));
        close(fd);
        return REDIS_ERR;
    }

    /* Create the temp file that will receive the payload, retrying up to
     * maxtries times. O_EXCL makes open() fail if the name already exists;
     * the name embeds the current time, so it changes after the sleep. */
    while (maxtries--) {
        snprintf(tmpfile,sizeof(tmpfile),
            "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
        if (dfd != -1) break;
        /* Remember why open() failed; sleep() may overwrite errno. */
        open_errno = errno;
        sleep(1);
    }
    if (dfd == -1) {
        redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(open_errno));
        close(fd);
        return REDIS_ERR;
    }

    /* Download the payload without blocking: readSyncBulkPayload() runs
     * whenever the master socket becomes readable. */
    if (aeCreateFileEvent(server.el, fd, AE_READABLE, readSyncBulkPayload, NULL)
            == AE_ERR)
    {
        redisLog(REDIS_WARNING,"Can't create readable event for SYNC");
        close(fd);
        /* The temp file was already created: release it as well so that a
         * failed attempt leaks neither a descriptor nor a file on disk. */
        close(dfd);
        unlink(tmpfile);
        return REDIS_ERR;
    }

    /* Publish the transfer state read by readSyncBulkPayload(). A
     * repl_transfer_left of -1 means the bulk length is not known yet. */
    server.replstate = REDIS_REPL_TRANSFER;
    server.repl_transfer_left = -1;
    server.repl_transfer_s = fd;
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = time(NULL);
    server.repl_transfer_tmpfile = zstrdup(tmpfile);
    return REDIS_OK;
}
readSyncBulkPayload
/**
 * Slave side: the function that actually reads the master's RDB payload.
 * file: replication.c
 * function: readSyncBulkPayload
 *
 * Registered by syncWithMaster() as the readable handler on the master
 * socket. The first successful call parses the "$<length>" header line; each
 * later call copies at most sizeof(buf) bytes from the socket into the temp
 * file. When the remaining byte count reaches 0 the temp file is renamed to
 * server.dbfilename, loaded, and the socket is turned into server.master.
 * Every failure path calls replicationAbortSyncTransfer() and returns.
 **/
/* Asynchronously read the SYNC payload we receive from a master */
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
char buf[4096];
ssize_t nread, readlen;
REDIS_NOTUSED(el);
REDIS_NOTUSED(privdata);
REDIS_NOTUSED(mask);
// repl_transfer_left is -1 (as set by syncWithMaster) until the bulk length
// header has been read; in that case read the header first.
if (server.repl_transfer_left == -1) {
// Read one line from the master (timeout argument 3600)
if (syncReadLine(fd,buf,1024,3600) == -1) {
redisLog(REDIS_WARNING,
"I/O error reading bulk count from MASTER: %s",
strerror(errno));
replicationAbortSyncTransfer();
return;
}
// A line starting with '-' is an error reply: log it and abort the transfer
if (buf[0] == '-') {
redisLog(REDIS_WARNING,
"MASTER aborted replication with an error: %s",
buf+1);
replicationAbortSyncTransfer();
return;
// Empty line: only refresh the last-I/O timestamp and wait for the next
// readable event. NOTE(review): presumably a keep-alive newline sent by the
// master while its BGSAVE is still running -- confirm.
} else if (buf[0] == '\0') {
server.repl_transfer_lastio = time(NULL);
return;
} else if (buf[0] != '$') {
redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
replicationAbortSyncTransfer();
return;
}
// Parse the number of payload bytes that follows the '$'
server.repl_transfer_left = strtol(buf+1,NULL,10);
redisLog(REDIS_NOTICE,
"MASTER <-> SLAVE sync: receiving %ld bytes from master",
server.repl_transfer_left);
return;
}
// Size of this chunk: the remaining byte count, capped at sizeof(buf)
readlen = (server.repl_transfer_left < (signed)sizeof(buf)) ?
server.repl_transfer_left : (signed)sizeof(buf);
// Read the next chunk from the master
nread = read(fd,buf,readlen);
// Read error (-1) or connection closed (0): abort the transfer
if (nread <= 0) {
redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
(nread == -1) ? strerror(errno) : "connection lost");
replicationAbortSyncTransfer();
return;
}
server.repl_transfer_lastio = time(NULL);
// Append the chunk to the temp file (repl_transfer_fd); a short write is
// treated as an error
if (write(server.repl_transfer_fd,buf,nread) != nread) {
redisLog(REDIS_WARNING,"Write error or short write writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
replicationAbortSyncTransfer();
return;
}
server.repl_transfer_left -= nread;
// Check whether the whole payload has been received
if (server.repl_transfer_left == 0) {
// Rename the temp file to the slave's configured RDB file name
if (rename(server.repl_transfer_tmpfile,server.dbfilename) == -1) {
redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
replicationAbortSyncTransfer();
return;
}
// The transfer itself is complete; log that loading is about to start
redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory");
// Empty the slave's current dataset before loading the new one
emptyDb();
/* Before loading the DB into memory we need to delete the readable
* handler, otherwise it will get called recursively since
* rdbLoad() will call the event loop to process events from time to
* time for non blocking loading. */
// Remove this readable handler from the master socket
aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);
// Load the received RDB file
if (rdbLoad(server.dbfilename) != REDIS_OK) {
redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
replicationAbortSyncTransfer();
return;
}
// Free the temp file name, close the temp file descriptor, and wrap the
// master socket in a client object so the master's subsequent write
// commands can be received on it
zfree(server.repl_transfer_tmpfile);
close(server.repl_transfer_fd);
server.master = createClient(server.repl_transfer_s);
server.master->flags |= REDIS_MASTER;
server.master->authenticated = 1;
server.replstate = REDIS_REPL_CONNECTED;
redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Finished with success");
/* Rewrite the AOF file now that the dataset changed. */
if (server.appendonly) rewriteAppendOnlyFileBackground();
}
}
master
---------------------master-----------------------------------------------------------------
syncCommand
/**
 * master side
 * file: replication.c
 * function: syncCommand
 *
 * Handler for the SYNC command sent by a slave. Decides how the requesting
 * client obtains an RDB snapshot (start a BGSAVE now, share one already in
 * progress, or wait for the next one), then flags the client as a slave and
 * appends it to server.slaves.
 **/
void syncCommand(redisClient *c) {
    /* A client that is already flagged as a slave is ignored. */
    if (c->flags & REDIS_SLAVE) return;

    /* This instance has a master configured but its own replication link is
     * not in the connected state: refuse to serve SYNC. */
    if (server.masterhost && server.replstate != REDIS_REPL_CONNECTED) {
        addReplyError(c,"Can't SYNC while not connected with my master");
        return;
    }

    /* SYNC starts a full resynchronization, so the client's reply list is
     * required to be empty at this point. */
    if (listLength(c->reply) != 0) {
        addReplyError(c,"SYNC is invalid with pending input");
        return;
    }

    redisLog(REDIS_NOTICE,"Slave ask for synchronization");

    if (server.bgsavechildpid == -1) {
        /* No background save is running: start one for this slave. */
        redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
            redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
            addReplyError(c,"Unable to perform background save");
            return;
        }
        c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
    } else {
        /* A background save is already running. Look for another slave that
         * is waiting for that same save to finish. */
        redisClient *peer = NULL;
        listNode *node;
        listIter it;

        listRewind(server.slaves,&it);
        while ((node = listNext(&it)) != NULL) {
            redisClient *candidate = node->value;

            if (candidate->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
                peer = candidate;
                break;
            }
        }

        if (peer != NULL) {
            /* Share the running save: replace our reply list with a copy of
             * that slave's reply list and wait for the same BGSAVE to end. */
            listRelease(c->reply);
            c->reply = listDup(peer->reply);
            c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
            redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
        } else {
            /* The running save was not started on behalf of any slave, so
             * this client has to wait for the next BGSAVE to start. */
            c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
            redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
        }
    }

    /* Register the client as a slave of this master. */
    c->repldbfd = -1;
    c->flags |= REDIS_SLAVE;
    c->slaveseldb = 0;
    listAddNodeTail(server.slaves,c);
}
updateSlavesWaitingBgsave
/**
 * file: replication.c
 * function: updateSlavesWaitingBgsave
 *
 * des: Master side. Per the article, serverCron in redis.c notices that the
 *      BGSAVE child has finished and runs backgroundSaveDoneHandler, which
 *      calls this function (the call chain is outside this excerpt).
 *
 * bgsaveerr is REDIS_OK when the background save succeeded.
 *
 * For every slave in server.slaves:
 *   - WAIT_BGSAVE_START: the save that just ended is not usable for it, so
 *     it is moved to WAIT_BGSAVE_END and a new BGSAVE is started at the end.
 *   - WAIT_BGSAVE_END: on success the RDB file is opened and
 *     sendBulkToSlave is installed as the writable handler; on any failure
 *     the slave client is freed.
 **/
void updateSlavesWaitingBgsave(int bgsaveerr) {
    listNode *ln;
    listIter li;
    int startbgsave = 0;

    listRewind(server.slaves,&li);
    while((ln = listNext(&li))) {
        redisClient *slave = ln->value;

        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
            /* This slave arrived while an unrelated save was running: it
             * now waits for the end of the save started below. */
            startbgsave = 1;
            slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
        } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
            struct redis_stat buf;

            /* The save failed: drop the slave. */
            if (bgsaveerr != REDIS_OK) {
                freeClient(slave);
                redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
                continue;
            }

            /* The save succeeded: open the RDB file read-only and stat it
             * to learn how many bytes have to be sent. */
            slave->repldbfd = open(server.dbfilename,O_RDONLY);
            if (slave->repldbfd == -1 ||
                redis_fstat(slave->repldbfd,&buf) == -1) {
                /* Capture errno now; close() and freeClient() below may
                 * overwrite it before it is logged. */
                int saved_errno = errno;

                /* open() may have succeeded with only the stat failing.
                 * Close the descriptor here and reset the field so it can
                 * be neither leaked nor closed a second time. */
                if (slave->repldbfd != -1) {
                    close(slave->repldbfd);
                    slave->repldbfd = -1;
                }
                freeClient(slave);
                redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(saved_errno));
                continue;
            }

            slave->repldboff = 0;
            slave->repldbsize = buf.st_size;
            slave->replstate = REDIS_REPL_SEND_BULK;

            /* Replace whatever writable handler is installed with
             * sendBulkToSlave, which carries out the actual file transfer
             * through the event loop. */
            aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
            if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
                freeClient(slave);
                continue;
            }
        }
    }

    if (startbgsave) {
        /* At least one slave was waiting for a fresh save. If it cannot be
         * started, free every slave still in the WAIT_BGSAVE_START state. */
        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
            listRewind(server.slaves,&li);
            redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
            while((ln = listNext(&li))) {
                redisClient *slave = ln->value;

                if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
                    freeClient(slave);
            }
        }
    }
}
增量复制
增量复制:当全量复制完成后,主服务每执行一个写命令就会向从服务发送命令。
以set时判断key是否过期为例,分析增量复制:setGenericCommand —>expireIfNeeded
master
propagateExpire
/**
 * file: db.c
 * function: propagateExpire
 *
 * des: Called from expireIfNeeded once a key has been found to be expired.
 *      Builds a "DEL <key>" command and propagates it to the AOF file (when
 *      enabled) and to the connected slaves.
 **/
void propagateExpire(redisDb *db, robj *key) {
    /* argv[0] is a fresh "DEL" object; argv[1] borrows the caller's key. */
    robj *argv[2] = { createStringObject("DEL",3), key };
    int i;

    /* Take a reference on the key so both slots can be released uniformly. */
    incrRefCount(key);

    /* Append the command to the AOF when AOF is enabled. */
    if (server.appendonly) {
        feedAppendOnlyFile(server.delCommand,db->id,argv,2);
    }

    /* Send the DEL to the slaves when at least one is attached. */
    if (listLength(server.slaves)) {
        replicationFeedSlaves(server.slaves,db->id,argv,2);
    }

    for (i = 0; i < 2; i++) {
        decrRefCount(argv[i]);
    }
}
replicationFeedSlaves
/**
 * file: replication.c
 * function: replicationFeedSlaves
 *
 * des: Master side. Encodes the command argv[0..argc-1] as a multi-bulk
 *      request ("*<argc>\r\n" followed by "$<len>\r\n<arg>\r\n" per argument)
 *      and queues it, via addReply(), on every slave in `slaves` except those
 *      still in REDIS_REPL_WAIT_BGSAVE_START. A SELECT for `dictid` is queued
 *      first whenever the slave's last selected db differs.
 **/
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
listNode *ln;
listIter li;
int outc = 0, j;
robj **outv;
// Array holding the protocol pieces to send: one header plus three objects
// per argument. A stack array is used when argc is small enough.
// (Original note: REDIS_STATIC_ARGS = 8 -- defined elsewhere, confirm.)
robj *static_outv[REDIS_STATIC_ARGS*3+1];
robj *lenobj;
if (argc <= REDIS_STATIC_ARGS) {
outv = static_outv;
} else {
outv = zmalloc(sizeof(robj*)*(argc*3+1));
}
// Build the multi-bulk header: '*' followed by the number of arguments
lenobj = createObject(REDIS_STRING,
sdscatprintf(sdsempty(), "*%d\r\n", argc));
// refcount is zeroed so the blanket incrRefCount() pass below brings this
// temporary to 1 and the final decrRefCount() pass releases it
lenobj->refcount = 0;
outv[outc++] = lenobj;
// For each argument store: "$<len>\r\n", the argument object itself, "\r\n"
for (j = 0; j < argc; j++) {
lenobj = createObject(REDIS_STRING,
sdscatprintf(sdsempty(),"$%lu\r\n",
(unsigned long) stringObjectLen(argv[j])));
lenobj->refcount = 0;
outv[outc++] = lenobj;
outv[outc++] = argv[j];
outv[outc++] = shared.crlf;
}
// Take one reference on every piece for the duration of this function
for (j = 0; j < outc; j++) incrRefCount(outv[j]);
// Iterate over the slaves
listRewind(slaves,&li);
while((ln = listNext(&li))) {
redisClient *slave = ln->value;
// A slave still waiting for its BGSAVE to start has no full copy on the
// way yet, so incremental commands are not queued for it
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
// If the db last selected for this slave differs from the command's db,
// queue a SELECT first: a shared object for db 0-9, otherwise a freshly
// built "select <dictid>" string
if (slave->slaveseldb != dictid) {
robj *selectcmd;
switch(dictid) {
case 0: selectcmd = shared.select0; break;
case 1: selectcmd = shared.select1; break;
case 2: selectcmd = shared.select2; break;
case 3: selectcmd = shared.select3; break;
case 4: selectcmd = shared.select4; break;
case 5: selectcmd = shared.select5; break;
case 6: selectcmd = shared.select6; break;
case 7: selectcmd = shared.select7; break;
case 8: selectcmd = shared.select8; break;
case 9: selectcmd = shared.select9; break;
default:
selectcmd = createObject(REDIS_STRING,
sdscatprintf(sdsempty(),"select %d\r\n",dictid));
selectcmd->refcount = 0;
break;
}
addReply(slave,selectcmd);
slave->slaveseldb = dictid;
}
// Queue every protocol piece on this slave's reply
for (j = 0; j < outc; j++) addReply(slave,outv[j]);
}
// Drop the references taken above, then free the array if it was heap-allocated
for (j = 0; j < outc; j++) decrRefCount(outv[j]);
if (outv != static_outv) zfree(outv);
}
参考
https://www.cnblogs.com/fnlingnzb-learner/p/7040726.html
https://www.cnblogs.com/daofaziran/p/10978628.html