目录
代码版本redis-6.2.6
1.节点间通信内容
消息类型:
/* Cluster message types. One of these values is stored in the 16-bit
 * `type` field of the message header (clusterMsg). */
#define CLUSTERMSG_TYPE_PING 0 /* Ping */
#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */
#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */
#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */
#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */
#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */
#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */
#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */
#define CLUSTERMSG_TYPE_COUNT 10 /* Total number of message types. */
结构体:
/* Gossip entry describing one node. An array of these entries forms the
 * payload of PING, MEET and PONG messages (see union clusterMsgData). */
typedef struct {
char nodename[CLUSTER_NAMELEN]; /* Node name */
uint32_t ping_sent; /* Time at which the ping was sent */
uint32_t pong_received; /* Time at which the pong was received */
char ip[NET_IP_STR_LEN]; /* Node IP address */
uint16_t port; /* Port used for communication between the node and clients */
uint16_t cport; /* Port used for node-to-node communication inside the cluster */
uint16_t flags; /* Node flags */
uint16_t pport; /* plaintext-port, when base port is TLS */
uint16_t notused1; /* Unused (name suggests reserved padding) */
} clusterMsgDataGossip;
/* Message payload. It is a union: which member is used depends on the
 * concrete message type. */
union clusterMsgData {
/* PING, MEET and PONG */
struct {
/* Array of N clusterMsgDataGossip structures */
clusterMsgDataGossip gossip[1];
} ping;
/* FAIL */
struct {
clusterMsgDataFail about;
} fail;
/* PUBLISH */
struct {
clusterMsgDataPublish msg;
} publish;
/* UPDATE */
struct {
clusterMsgDataUpdate nodecfg;
} update;
/* MODULE */
struct {
clusterMsgModule msg;
} module;
};
/* Cluster message: a fixed header describing the sender, followed by a
 * type-dependent payload in `data`. */
typedef struct {
char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */
uint32_t totlen; /* Length of the message */
uint16_t ver; /* Protocol version, currently set to 1. */
uint16_t port; /* TCP base port number. */
uint16_t type; /* Message type (one of CLUSTERMSG_TYPE_*) */
uint16_t count; /* Only used for some kind of messages. */
uint64_t currentEpoch; /* The epoch accordingly to the sending node. */
uint64_t configEpoch; /* The config epoch if it's a master, or the last
epoch advertised by its master if it is a
slave. */
uint64_t offset; /* Master replication offset if node is a master or
processed replication offset if node is a slave. */
char sender[CLUSTER_NAMELEN]; /* Name of the sending node */
unsigned char myslots[CLUSTER_SLOTS/8]; /* Slots the sending node is
                                          * responsible for (CLUSTER_SLOTS/8
                                          * bytes, presumably one bit per
                                          * slot) */
char slaveof[CLUSTER_NAMELEN]; /* Presumably the name of the sender's master
                                * when the sender is a slave -- TODO confirm */
char myip[NET_IP_STR_LEN]; /* IP address of the sending node */
char notused1[32]; /* 32 bytes reserved for future usage. */
uint16_t pport; /* Sender TCP plaintext port, if base port is TLS */
uint16_t cport; /* Sender port used for node-to-node cluster communication */
uint16_t flags; /* Sender node flags */
unsigned char state; /* Cluster state from the POV of the sender */
unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */
union clusterMsgData data; /* Message payload */
} clusterMsg;
2.集群节点集群功能实现
1.节点定时发送ping消息
void clusterCron(void)每秒执行10次,周期性地运行。每当iteration能被10整除时(即每10次调用执行一次,约每秒一次),执行以下代码:
/* Excerpt from clusterCron(): the periodic "ping one random node" step.
 * NOTE: this is a fragment. `iteration`, `de`, `min_pong_node` and
 * `min_pong` are declared outside the lines shown, and the function's
 * closing brace is not part of the excerpt. */
clusterCron
{
/* Run this step only once every 10 iterations of the cron. */
if (!(iteration % 10)) {
int j;
/* Sample 5 random nodes and, among the eligible ones, keep the node with
 * the smallest pong_received value, i.e. the one whose pong was received
 * the longest time ago. */
for (j = 0; j < 5; j++) {
de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
/* Don't ping nodes disconnected or with a ping currently active. */
if (this->link == NULL || this->ping_sent != 0) continue;
/* Skip ourselves and nodes flagged as still in handshake. */
if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
continue;
if (min_pong_node == NULL || min_pong > this->pong_received) {
min_pong_node = this;
min_pong = this->pong_received;
}
}
/* Ping the selected node, if any of the sampled nodes was eligible. */
if (min_pong_node) {
serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name);
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
}
}
每秒钟当前节点会随机选取5个节点,在其中符合以上要求的节点里找出最早给当前节点发送pong的节点(即pong_received最小的节点),并向其发送ping消息。也就是说,节点每秒会向1个其他节点发送ping消息。
2.节点收到ping消息回复pong消息
/* Process a message received on `link`.
 * NOTE: this is an excerpt. The dotted lines mark elided code; `type`,
 * `sender` and `hdr` are set up in the elided part. Only the handling of
 * PING and MEET messages is shown. */
int clusterProcessPacket(clusterLink *link) {
...........................
......................
/* Is the message a MEET or a PING? */
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) {
serverLog(LL_DEBUG,"Ping packet received: %p", (void*)link->node);
/* On a MEET (or when our own IP is still empty), and only if no
 * announce IP is configured, take the address reported by
 * connSockName() for this connection as our own IP when it differs
 * from the one currently stored. */
if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') &&
server.cluster_announce_ip == NULL)
{
char ip[NET_IP_STR_LEN];
if (connSockName(link->conn,ip,sizeof(ip),NULL) != -1 &&
strcmp(ip,myself->ip))
{
memcpy(myself->ip,ip,NET_IP_STR_LEN);
serverLog(LL_WARNING,"IP address for this node updated to %s",
myself->ip);
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
}
}
/* MEET from a node we do not know yet: create a node entry flagged as
 * HANDSHAKE, fill in its address and ports from the message header,
 * and add it to the cluster's node table. */
if (!sender && type == CLUSTERMSG_TYPE_MEET) {
clusterNode *node;
node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE);
nodeIp2String(node->ip,link,hdr->myip);
node->port = ntohs(hdr->port);
node->pport = ntohs(hdr->pport);
node->cport = ntohs(hdr->cport);
clusterAddNode(node);
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
}
/* For a MEET from an unknown sender, the gossip section of the message
 * is processed as well. */
if (!sender && type == CLUSTERMSG_TYPE_MEET)
clusterProcessGossipSection(hdr,link);
/* Reply with a PONG */
clusterSendPing(link,CLUSTERMSG_TYPE_PONG);
}
..............................
}
回复PONG消息的时候,报文中也会有部分其他节点信息,最终每个节点都有所有节点的信息
节点1和节点2进行pingpong,最终节点1和2互换节点信息,都可得到全部节点信息
3.节点加入集群,进行meet
向节点Node1发送meet命令,将节点Node2加入集群
4.故障转移
slave检测到所属master下线后,向集群中的节点请求投票(FAILOVER_AUTH_REQUEST)以发起故障转移。只有master才参与投票(FAILOVER_AUTH_ACK),slave获得的票数超过master总数的1/2后,执行以下函数成为新的主节点。
/* Promote this node from slave to master, taking the place of its current
 * master.
 *
 * Does nothing when this node is already a master or has no master set.
 * Otherwise it switches role, reassigns every slot of the old master to
 * itself, updates and persists the cluster state, broadcasts a PONG to
 * all nodes, and finally resets any manual failover state. */
void clusterFailoverReplaceYourMaster(void) {
    /* Remember the old master up front: it is still needed for the slot
     * takeover after the role switch below. */
    clusterNode *former_master = myself->slaveof;
    int slot;

    if (nodeIsMaster(myself)) return;
    if (former_master == NULL) return;

    /* 1) Turn this node into a master. */
    clusterSetNodeAsMaster(myself);
    replicationUnsetMaster();

    /* 2) Claim all the slots that were assigned to the old master. */
    for (slot = 0; slot < CLUSTER_SLOTS; slot++) {
        if (!clusterNodeGetSlotBit(former_master,slot)) continue;
        clusterDelSlot(slot);
        clusterAddSlot(myself,slot);
    }

    /* 3) Update the cluster state and save the configuration. */
    clusterUpdateState();
    clusterSaveConfigOrDie(1);

    /* 4) Send a PONG to all nodes so that they can update their state
     *    and detect that we switched to the master role. */
    clusterBroadcastPong(CLUSTER_BROADCAST_ALL);

    /* 5) If a manual failover was in progress, clear its state. */
    resetManualFailover();
}
3.主从同步
![asd](https://i-blog.csdnimg.cn/blog_migrate/aded249ba304ffdcac4c14a9e1ea60b9.png)
1.slave节点同步
![](https://i-blog.csdnimg.cn/blog_migrate/7addfdeeda185f347ae84ac33af127fb.png)
![](https://i-blog.csdnimg.cn/blog_migrate/9a776db37159564bcbfd4d708f65e775.png)
/* Make the current node a slave of the node at the given address.
 *
 * ip:   host of the new master (copied with sdsnew()).
 * port: port of the new master.
 *
 * The statement order below is significant; see the inline comments. */
void replicationSetMaster(char *ip, int port) {
/* We were a master if no master host was configured so far. */
int was_master = server.masterhost == NULL;
sdsfree(server.masterhost);
server.masterhost = NULL;
if (server.master) {
freeClient(server.master);
}
disconnectAllBlockedClients(); /* Clients blocked in master, now slave. */
/* Setting masterhost only after the call to freeClient since it calls
 * replicationHandleMasterDisconnection which can trigger a re-connect
 * directly from within that call. */
server.masterhost = sdsnew(ip);
server.masterport = port;
/* Update oom_score_adj */
setOOMScoreAdj(-1);
/* Disconnect all of our own slaves */
disconnectSlaves();
cancelReplicationHandshake(0);
/* Before destroying our master state, create a cached master using
 * our own parameters, to later PSYNC with the new master. */
if (was_master) {
/* Discard any cached master that may already exist */
replicationDiscardCachedMaster();
replicationCacheMasterUsingMyself();
}
/* Fire the role change modules event. */
moduleFireServerEvent(REDISMODULE_EVENT_REPLICATION_ROLE_CHANGED,
REDISMODULE_EVENT_REPLROLECHANGED_NOW_REPLICA,
NULL);
/* Fire the master link modules event. */
if (server.repl_state == REPL_STATE_CONNECTED)
moduleFireServerEvent(REDISMODULE_EVENT_MASTER_LINK_CHANGE,
REDISMODULE_SUBEVENT_MASTER_LINK_DOWN,
NULL);
/* Enter the CONNECT state and start connecting to the new master */
server.repl_state = REPL_STATE_CONNECT;
serverLog(LL_NOTICE,"Connecting to MASTER %s:%d",
server.masterhost, server.masterport);
connectWithMaster();
}
2.master节点同步
![](https://i-blog.csdnimg.cn/blog_migrate/f4b69d376ba6afa460b51224a6dd693a.png)
![](https://i-blog.csdnimg.cn/blog_migrate/111fdf1fb727aed0eb719e5b6658f7c9.png)
feedReplicationBacklog():master需要同步给Slave的数据通过此函数加入缓存区
addReplyReplicationBacklog():根据slave的offset,发送相应数据