redis replication主从复制的源码分析(1)

        对于replication.c的源码分析,我将分两部分进行介绍:本文分析slave连接master、建立主从复制的过程;主从同步(数据复制)的实现见《redis replication主从复制的源码分析(2)》。

     redis-cli通过向从服务器发送slaveof命令,可以使从服务器去复制一个主服务器:

      slaveof <master_ip> <master_port>

     主从复制的详细的步骤如下:

    1、设置主服务器的地址和端口

    2、建立套接字连接

    3、发送ping命令

    4、身份验证

    5、发送端口信息

    6、同步

    7、命令传播

    replicationSetMaster() 设置主服务器的地址和端口,初始化replication状态

/* Point this instance at the master located at ip:port and reset the
 * replication state so that a fresh connection attempt can begin.
 *
 * ip   - master address; copied into server.masterhost with sdsnew().
 * port - master port; stored in server.masterport.
 *
 * The previous master client (if any), blocked clients and attached slaves
 * are disconnected, the cached master and the replication backlog are
 * dropped, and any handshake in progress is cancelled. */
void replicationSetMaster(char *ip, int port) {
    sdsfree(server.masterhost);
    server.masterhost = sdsnew(ip);
    server.masterport = port;
    if (server.master) freeClient(server.master);
    disconnectAllBlockedClients(); /* Clients blocked in master, now slave. */
    disconnectSlaves(); /* Force our slaves to resync with us as well. */
    replicationDiscardCachedMaster(); /* Don't try a PSYNC. */
    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
    cancelReplicationHandshake();
    server.repl_state = REPL_STATE_CONNECT;// replicationCron() connects to the master when it sees this state
    server.master_repl_offset = 0;// reset the replication offset
    server.repl_down_since = 0;
}

    connectWithMaster()建立套接字连接,设置事件回调syncWithMaster()

/* Start a connection to the configured master (server.masterhost /
 * server.masterport) and install syncWithMaster() as the event handler for
 * the new socket.
 *
 * Returns C_OK when the socket was created and the handler registered;
 * server.repl_state is then REPL_STATE_CONNECTING. Returns C_ERR when the
 * socket could not be created or the file event could not be registered
 * (in the latter case the socket is closed before returning). */
int connectWithMaster(void) {
    int sock = anetTcpNonBlockBestEffortBindConnect(NULL, server.masterhost,
                                                    server.masterport,
                                                    NET_FIRST_BIND_ADDR);
    if (sock == -1) return C_ERR;

    /* syncWithMaster() is invoked when the socket becomes readable or
     * writable. */
    int registered = aeCreateFileEvent(server.el, sock,
                                       AE_READABLE|AE_WRITABLE,
                                       syncWithMaster, NULL);
    if (registered == AE_ERR) {
        close(sock);
        return C_ERR;
    }

    /* Record the socket and the time of this activity; replicationCron()
     * compares repl_transfer_lastio against repl_timeout. */
    server.repl_transfer_s = sock;
    server.repl_transfer_lastio = server.unixtime;
    server.repl_state = REPL_STATE_CONNECTING;
    return C_OK;
}

    syncWithMaster()依次完成:发送ping命令、身份验证、发送端口信息(replconf listening-port)、通过"replconf capa eof"告知master自己支持的能力,最后发送psync并开始同步。基本流程如下:

/* File-event handler registered by connectWithMaster() on the socket to the
 * master. It drives the slave side of the handshake as a state machine keyed
 * on server.repl_state:
 *
 *   CONNECTING -> RECEIVE_PONG -> SEND_AUTH -> RECEIVE_AUTH -> SEND_PORT ->
 *   RECEIVE_PORT -> SEND_CAPA -> RECEIVE_CAPA -> SEND_PSYNC -> RECEIVE_PSYNC
 *   -> TRANSFER
 *
 * Each SEND_* step writes one command and returns; the matching RECEIVE_*
 * step runs on a later invocation and reads the reply.
 *
 * This listing is abridged: lines containing only "……" stand for code the
 * article omitted (including the declarations of err, psync_result, dfd and
 * tmpfile).
 *
 * NOTE(review): in this abridged listing the reply read in the RECEIVE_PONG,
 * RECEIVE_AUTH and RECEIVE_PORT steps is not passed to sdsfree() on the
 * success path, unlike the RECEIVE_CAPA step. Presumably the release was
 * dropped while abridging; confirm against the full source. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {    
    ……
    // No replication is in progress (REPL_STATE_NONE): close the socket and return.
    if (server.repl_state == REPL_STATE_NONE) {
        close(fd);
        return;
    }
    ……
    // Still in the CONNECTING state: stop watching for writability and send PING to the master.
    if (server.repl_state == REPL_STATE_CONNECTING) {
        aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
        server.repl_state = REPL_STATE_RECEIVE_PONG;// now waiting for the PING reply
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL);
        if (err) goto write_error;
        return;
    }// Read the reply to PING.
    if (server.repl_state == REPL_STATE_RECEIVE_PONG) {
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        // Accepted replies: any '+' status, "-NOAUTH", or "-ERR operation not permitted".
        if (err[0] != '+' &&strncmp(err,"-NOAUTH",7) != 0 &&
            strncmp(err,"-ERR operation not permitted",28) != 0)
        {
            ……
            goto error;
        } 
        server.repl_state = REPL_STATE_SEND_AUTH;// next step: send AUTH
    }
    // Authentication: send AUTH to the master when masterauth is configured.
    if (server.repl_state == REPL_STATE_SEND_AUTH) {
        if (server.masterauth) {
            err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);
            if (err) goto write_error;
            server.repl_state = REPL_STATE_RECEIVE_AUTH;
            return;
        } else {// no authentication configured: go straight to REPL_STATE_SEND_PORT
            server.repl_state = REPL_STATE_SEND_PORT;
        }
    }// Read the reply to AUTH; a '-' reply is a failure.
    if (server.repl_state == REPL_STATE_RECEIVE_AUTH) {        
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        if (err[0] == '-') {
            serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err);
            sdsfree(err);
            goto error;
        }
        server.repl_state = REPL_STATE_SEND_PORT;
    }// Send our listening port to the master.
    if (server.repl_state == REPL_STATE_SEND_PORT) {
        sds port = sdsfromlonglong(server.port);
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
                "listening-port",port, NULL);
        ……
        server.repl_state = REPL_STATE_RECEIVE_PORT;
        return;
    }  
    if (server.repl_state == REPL_STATE_RECEIVE_PORT) {
     // Read the reply to "REPLCONF listening-port".
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        if (err[0] == '-') {
            ……
        }
        server.repl_state = REPL_STATE_SEND_CAPA;
    }
    // Announce our capabilities to the master ("capa eof"; the article describes this as being able to parse the RDB format).
    if (server.repl_state == REPL_STATE_SEND_CAPA) {           
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
                "capa","eof",NULL);
        if (err) goto write_error;
        sdsfree(err);
        server.repl_state = REPL_STATE_RECEIVE_CAPA;
        return;
    }// Read the reply to "REPLCONF capa eof".
    if (server.repl_state == REPL_STATE_RECEIVE_CAPA) {       
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        if (err[0] == '-') {
            ……
        }
        sdsfree(err);
        server.repl_state = REPL_STATE_SEND_PSYNC;
    }
    // Send PSYNC to the master. Per the article, a partial resync is attempted when a
    // cached master exists and a full resync otherwise; that decision is made inside
    // slaveTryPartialResynchronization(), which is not shown here.
    if (server.repl_state == REPL_STATE_SEND_PSYNC) {
        if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
            err = sdsnew("Write error sending the PSYNC command.");
            goto write_error;
        }
        server.repl_state = REPL_STATE_RECEIVE_PSYNC;
        return;
    }
    /* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC. */
    if (server.repl_state != REPL_STATE_RECEIVE_PSYNC) {
        ……
        goto error;
    }
    // Read the master's reply to PSYNC and proceed with the synchronization.
    psync_result = slaveTryPartialResynchronization(fd,1);
    if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
    if (psync_result == PSYNC_CONTINUE) {
        serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Master accepted a Partial Resynchronization.");
        return;
    }
    disconnectSlaves(); /* Force our slaves to resync with us as well. */
    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
    // The master does not support PSYNC: fall back to SYNC (the older synchronization mechanism).
    if (psync_result == PSYNC_NOT_SUPPORTED) {
        serverLog(LL_NOTICE,"Retrying with SYNC...");
        if (syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
            ……
            goto error;
        }
    }
    ……
    // Register readSyncBulkPayload() as the readable handler that consumes the sync payload.
    if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
            == AE_ERR)
    {
        ……
    }
    // Enter the TRANSFER state and reset the transfer bookkeeping.
    server.repl_state = REPL_STATE_TRANSFER;
    server.repl_transfer_size = -1;
    server.repl_transfer_read = 0;
    server.repl_transfer_last_fsync_off = 0;
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = server.unixtime;
    server.repl_transfer_tmpfile = zstrdup(tmpfile);
    return;
error:
    ……
write_error: /* Handle sendSynchronousCommand(SYNC_CMD_WRITE) errors. */
    ……
}
       接下来看看主从复制的调度中心replicationCron,它主要负责监控主从复制过程中的各个状态,并根据不同情况作出不同处理。

// replicationCron() is the replication scheduler. serverCron() invokes it
// through run_with_period(1000), i.e. once per second. The article describes
// serverCron() as the callback of Redis's only time event.
// The "……" lines stand for code omitted from this excerpt.
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    ……
	run_with_period(1000) replicationCron();
	……
}
void replicationCron(void) {
    static long long replication_cron_loops = 0;
    //slave非阻塞连接超时
    if (server.masterhost &&
        (server.repl_state == REPL_STATE_CONNECTING ||
         slaveIsInHandshakeState()) &&
         (time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
    {
        cancelReplicationHandshake();
    }
    //slave receiving .rdb超时
    if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER &&
        (time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
    {
        cancelReplicationHandshake();
    }
    //slave连接上主服务器后出现交互超时
    if (server.masterhost && server.repl_state == REPL_STATE_CONNECTED &&
        (time(NULL)-server.master->lastinteraction) > server.repl_timeout)
    {
        freeClient(server.master);
    }
    //slave检查是否需要连接主服务器
    if (server.repl_state == REPL_STATE_CONNECT) {
        serverLog(LL_NOTICE,"Connecting to MASTER %s:%d",
            server.masterhost, server.masterport);
        //建立与主服务器的套接字连接
        if (connectWithMaster() == C_OK) {
            serverLog(LL_NOTICE,"MASTER <-> SLAVE sync started");
        }
    }// slave发送ack给master 
    if (server.masterhost && server.master &&
        !(server.master->flags & CLIENT_PRE_PSYNC))
        replicationSendAck();
    listIter li;
    listNode *ln;
    robj *ping_argv[1];
    //master周期性发生ping给slave
    if ((replication_cron_loops % server.repl_ping_slave_period) == 0) {
        ping_argv[0] = createStringObject("PING",4);
        replicationFeedSlaves(server.slaves, server.slaveseldb,
            ping_argv, 1);
        decrRefCount(ping_argv[0]);
    }	
    listRewind(server.slaves,&li);
    while((ln = listNext(&li))) {
        client *slave = ln->value;
		//master发送一个空行给每个符合下面两个条件的slave,refresh slave的last-io的timer
		//1、master需要产生一个rdb文件给slave
		//2、等待rdb文件完成,但还没发给slave
        if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
            (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END &&
             server.rdb_child_type != RDB_CHILD_TYPE_SOCKET))
        {
            if (write(slave->fd, "\n", 1) == -1) {
                /* Don't worry, it's just a ping. */
            }
        }
    }//master断开slave的连接
    if (listLength(server.slaves)) {
        listIter li;
        listNode *ln;
        listRewind(server.slaves,&li);
        while((ln = listNext(&li))) {
            client *slave = ln->value;
            if (slave->replstate != SLAVE_STATE_ONLINE) continue;
            if (slave->flags & CLIENT_PRE_PSYNC) continue;
            if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout)
            {
                freeClient(slave);
            }
        }
    }//master没有slave,就释放掉repl_backlog的内存
    if (listLength(server.slaves) == 0 && server.repl_backlog_time_limit &&
        server.repl_backlog)
    {
        time_t idle = server.unixtime - server.repl_no_slaves_since;

        if (idle > server.repl_backlog_time_limit) {
            freeReplicationBacklog();
        }
    }//master的aof功能关闭而且没有slaves,就释放scriptcache
    if (listLength(server.slaves) == 0 &&
        server.aof_state == AOF_OFF &&
        listLength(server.repl_scriptcache_fifo) != 0)
    {
        replicationScriptCacheFlush();
    }//master没有在进行持久化操作
    if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
        time_t idle, max_idle = 0;
        int slaves_waiting = 0;
        int mincapa = -1;
        listNode *ln;
        listIter li;
        listRewind(server.slaves,&li);
        //统计slaves中处于wait_bgsave_star的数量,最大超时时间和rdb解析能力
        while((ln = listNext(&li))) {
            client *slave = ln->value;
            if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
                idle = server.unixtime - slave->lastinteraction;
                if (idle > max_idle) max_idle = idle;
                slaves_waiting++;
                mincapa = (mincapa == -1) ? slave->slave_capa :
                                            (mincapa & slave->slave_capa);
            }
        }		
        if (slaves_waiting && max_idle > server.repl_diskless_sync_delay) {
            //有超时的处于SLAVE_STATE_WAIT_BGSAVE_START的slave
            startBgsaveForReplication(mincapa);
        }
    }
    //刷新延迟小于阈值的slave的数量
    refreshGoodSlavesCount();
    replication_cron_loops++; /* Incremented with frequency 1 HZ. */
}
replication.c的主要函数
 /* ---------------------------------- MASTER -------------------------------- */  
 /* NOTE(review): the signatures in this index use redisClient, while the
  * listings earlier in this article use client; the index appears to come
  * from a different Redis version. Confirm before relying on it. */
void createReplicationBacklog(void) /* Create the replication backlog buffer */  
void resizeReplicationBacklog(long long newsize) /* Resize the replication backlog buffer */  
void freeReplicationBacklog(void) /* Free the replication backlog buffer */  
void feedReplicationBacklog(void *ptr, size_t len) /* Append write-command data to the replication backlog */  
void feedReplicationBacklogWithObject(robj *o) /* Append write-command data to the replication backlog, taking an object as the argument */ 
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) /* Propagate a command to the slaves (replicationCron uses it to send PING) */  
void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, 
robj **argv, int argc) /* Send data to the monitor clients */  
long long addReplyReplicationBacklog(redisClient *c, long long offset) 
/* Add the backlog contents from offset to the end to the client's reply */
int masterTryPartialResynchronization(redisClient *c) /* Master side: try a partial resynchronization */  
void syncCommand(redisClient *c) /* Implementation of the sync command */  
void replconfCommand(redisClient *c) /* Lets a slave set parameters used by the replication process */  
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) /* Send the BULK data to a slave */  
void updateSlavesWaitingBgsave(int bgsaveerr, int type) /* Called when the background save is about to finish, to update the slaves */        
/* ----------------------------------- SLAVE -------------------------------- */  
void replicationAbortSyncTransfer(void) /* Abort the synchronization with the master */  
void replicationSendNewlineToMaster(void)  
void replicationEmptyDbCallback(void *privdata)  
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) 
/* Slave side: read the BULK payload of the synchronization */  
char *sendSynchronousCommand(int flags, int fd, ...)  /* Slave side: write a command to the master or read its reply during the handshake */  
int slaveTryPartialResynchronization(int fd) /* Slave side: try a partial resynchronization. NOTE(review): syncWithMaster above calls this with two arguments (fd plus a 0/1 flag), so this one-parameter signature looks out of date. */  
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) 
/* Handshake with the master: PING, authentication, listening port, then start the sync */  
int connectWithMaster(void) /* Connect to the master and install syncWithMaster as the event handler */  
void undoConnectWithMaster(void) /* Drop the connection to the master */  
int cancelReplicationHandshake(void) /* Abort a non-blocking replication attempt that is in progress, if there is one */  
void replicationSetMaster(char *ip, int port) /* Set the master's IP address and port */  
void replicationUnsetMaster(void)  
void slaveofCommand(redisClient *c)  
void roleCommand(redisClient *c)  
void replicationSendAck(void) /* Send an ACK to the master; per the article it reports the current replication offset */       
/* ---------------------- MASTER CACHING FOR PSYNC -------------------------- */  
void replicationCacheMaster(redisClient *c) /* Cache the master client's information */  
void replicationDiscardCachedMaster(void) /* Free the cached master information once it will no longer be used */  
void replicationResurrectCachedMaster(int newfd) /* Bring the cached master back into use */       
/* ------------------------- MIN-SLAVES-TO-WRITE  --------------------------- */  
void refreshGoodSlavesCount(void) /* Refresh the count of slaves whose lag is below the threshold */  
void replicationScriptCacheInit(void)  
void replicationScriptCacheFlush(void)  
void replicationScriptCacheAdd(sds sha1)  
int replicationScriptCacheExists(sds sha1)  
void replicationCron(void) // Replication scheduler

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值