环境说明:redis源码版本 5.0.3;我在阅读源码过程做了注释,git地址:https://gitee.com/xiaoangg/redis_annotation
如有错误欢迎指正
参考书籍:《redis的设计与实现》
文章推荐:
redis源码阅读-一--sds简单动态字符串
redis源码阅读--二-链表
redis源码阅读--三-redis散列表的实现
redis源码浅析--四-redis跳跃表的实现
redis源码浅析--五-整数集合的实现
redis源码浅析--六-压缩列表
redis源码浅析--七-redisObject对象(下)(内存回收、共享)
redis源码浅析--八-数据库的实现
redis源码浅析--九-RDB持久化
redis源码浅析--十-AOF(append only file)持久化
redis源码浅析--十一.事件(上)文件事件
redis源码浅析--十一.事件(下)时间事件
redis源码浅析--十二.单机数据库的实现-客户端
redis源码浅析--十三.单机数据库的实现-服务端 - 时间事件
redis源码浅析--十三.单机数据库的实现-服务端 - redis服务器的初始化
redis源码浅析--十四.多机数据库的实现(一)--新老版本复制功能的区别与实现原理
redis源码浅析--十四.多机数据库的实现(二)--复制的实现SLAVEOF、PSYNC
redis源码浅析--十五.哨兵sentinel的设计与实现
redis源码浅析--十六.cluster集群的设计与实现
redis源码浅析--十七.发布与订阅的实现
redis源码浅析--十八.事务的实现
redis源码浅析--十九.排序的实现
redis源码浅析--二十.BIT MAP的实现
redis源码浅析--二十一.慢查询日志的实现
redis源码浅析--二十二.监视器的实现
上一篇介绍了redis集群的基础概念、搭建方法。本文接着介绍redis集群的源码设计实现。
一 节点
1.1 节点数据结构
集群中每个节点都会使用clusterNode结构记录自己的状态,并为集群中的其他节点都创建一个clusterNode结构记录其他节点状态:
/* Descriptor of one cluster node: every node keeps a clusterNode for
 * itself (myself) and one for each other node it knows about. */
typedef struct clusterNode {
mstime_t ctime; /* Node object creation time. */
char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size (40 hex chars). */
int flags; /* CLUSTER_NODE_... bitmask recording the node role (master/slave) and state. */
uint64_t configEpoch; /* Last configEpoch observed for this node; used to implement failover. */
unsigned char slots[CLUSTER_SLOTS/8]; /* Bitmap of slots served by this node, CLUSTER_SLOTS/8 bytes long. */
int numslots; /* Number of slots handled by this node */
int numslaves; /* Number of slave nodes, if this is a master */
struct clusterNode **slaves; /* pointers to slave nodes */
struct clusterNode *slaveof; /* pointer to the master node. Note that it
may be NULL even if the node is a slave
if we don't have the master node in our
tables. */
mstime_t ping_sent; /* Unix time we sent latest ping */
mstime_t pong_received; /* Unix time we received the pong */
mstime_t fail_time; /* Unix time when FAIL flag was set */
mstime_t voted_time; /* Last time we voted for a slave of this master */
mstime_t repl_offset_time; /* Unix time we received offset for this node */
mstime_t orphaned_time; /* Starting time of orphaned master condition */
long long repl_offset; /* Last known repl offset for this node. */
char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */
int port; /* Latest known clients port of this node */
int cport; /* Latest known cluster port of this node. */
clusterLink *link; /* TCP/IP link with this node, holding the connection state */
list *fail_reports; /* List of nodes signaling this as failing */
} clusterNode;
clusterNode中link属性保存了连接节点所需要的有关信息:
/* clusterLink encapsulates everything needed to talk with a remote node
 * over the cluster bus. */
typedef struct clusterLink {
mstime_t ctime; /* Link creation time */
int fd; /* TCP socket file descriptor */
sds sndbuf; /* Packet send buffer: data waiting to be sent to the peer node */
sds rcvbuf; /* Packet reception buffer: data received from the peer node */
struct clusterNode *node; /* Node related to this link if any, or NULL */
} clusterLink;
每个服务器都会保存一个clusterState结构(入口server.h/redisServer --> cluster.h/clusterState);用来记录当前集群所处的状态;
/* Per-server cluster state (reachable via server.cluster, see
 * server.h/redisServer): records the state of the whole cluster as seen
 * from this node. */
typedef struct clusterState {
clusterNode *myself;  /* This node */
uint64_t currentEpoch; /* Cluster current epoch, used to implement failover. */
int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */
int size; /* Num of master nodes with at least one slot */
dict *nodes; /* Hash table of name -> clusterNode structures */
dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
clusterNode *migrating_slots_to[CLUSTER_SLOTS]; /* slot -> node this node is migrating the slot to, or NULL. */
clusterNode *importing_slots_from[CLUSTER_SLOTS]; /* slot -> node this node is importing the slot from, or NULL. */
clusterNode *slots[CLUSTER_SLOTS]; /* slot -> node responsible for it, NULL if unassigned. */
uint64_t slots_keys_count[CLUSTER_SLOTS]; /* Per-slot key counters (see slots_to_keys). */
rax *slots_to_keys; /* Radix tree mapping slots to the keys they contain. */
/* The following fields are used to take the slave state on elections. */
mstime_t failover_auth_time; /* Time of previous or next election. */
int failover_auth_count; /* Number of votes received so far. */
int failover_auth_sent; /* True if we already asked for votes. */
int failover_auth_rank; /* This slave rank for current auth request. */
uint64_t failover_auth_epoch; /* Epoch of the current election. */
int cant_failover_reason; /* Why a slave is currently not able to
failover. See the CANT_FAILOVER_* macros. */
/* Manual failover state in common. */
mstime_t mf_end; /* Manual failover time limit (ms unixtime).
It is zero if there is no MF in progress. */
/* Manual failover state of master. */
clusterNode *mf_slave; /* Slave performing the manual failover. */
/* Manual failover state of slave. */
long long mf_master_offset; /* Master offset the slave needs to start MF
or zero if still not received. */
int mf_can_start; /* If non-zero signal that the manual failover
can start requesting masters vote. */
/* The following fields are used by masters to take state on elections. */
uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */
int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
/* Messages received and sent by type. */
long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
long long stats_pfail_nodes; /* Number of nodes in PFAIL status,
excluding nodes without address. */
} clusterState;
1.2 cluster meet命令的实现
向节点A发送cluster meet将节点B加入到集群中;
节点握手过程如下图:
cluster meet 命令的入口位于 cluster.c/clusterCommand()
/* CLUSTER command implementation (excerpt showing the MEET subcommand). */
void clusterCommand(client *c) {
if (server.cluster_enabled == 0) {
addReplyError(c,"This instance has cluster support disabled");
return;
}
if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) {
//.................
} else if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) {
/* CLUSTER MEET <ip> <port> [cport] */
long long port, cport;
/* Parse the base (client) TCP port. */
if (getLongLongFromObject(c->argv[3], &port) != C_OK) {
addReplyErrorFormat(c,"Invalid TCP base port specified: %s",
(char*)c->argv[3]->ptr);
return;
}
/* Every node in the cluster opens a dedicated TCP channel (the
 * cluster bus) used for node-to-node communication. By default the
 * bus port is the base port plus 10000 (CLUSTER_PORT_INCR), but it
 * can also be given explicitly as the optional [cport] argument. */
if (c->argc == 5) {
if (getLongLongFromObject(c->argv[4], &cport) != C_OK) {
addReplyErrorFormat(c,"Invalid TCP bus port specified: %s",
(char*)c->argv[4]->ptr);
return;
}
} else {
cport = port + CLUSTER_PORT_INCR;
}
/* Start the handshake with the target node. */
if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 &&
errno == EINVAL)
{
addReplyErrorFormat(c,"Invalid node address specified: %s:%s",
(char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr);
} else {
addReply(c,shared.ok);
}
}
//.......................
}
二 槽指派
2.1节点记录槽信息
clusterNode结构体中的slots属性是个二进制数组,记录了节点具体负责了哪些槽;例如,slots[111] = 1,表示该节点负责了编号111的槽;
numslots属性则记录了节点负责槽的数量;
2.2.传播节点的槽指派信息
节点除了会记录自己负责的槽信息,还会将自己负责的槽信息发送给其他节点;就是通过发送slots数组信息实现的;
2.3.记录集群所有槽的指派信息
clusterState结构中的slots是clusterNode数组,值是一个指向clusterNode的指针,记录了每个槽是由哪个节点负责的;
如果槽没有节点负责,则slot对应的数组值是NULL;
如果需要检查某个槽是否有节点负责,可以通过查询clusterState的slots高效解决;
2.4.CLUSTER ADDSLOTS命令的实现
cluster addslots 命令接受一个或者多个槽作为参数,并将输入的槽指派给接受该命令的节点;
cluster addslots 主要操作可以分为两步:
- 更新myself节点的 slots属性bit位置标记为1。标识该槽由myself节点负责
- 更新clusterStatus属性的slots 数组对应的槽指向 myself节点;标识对应槽是由哪个节点负责的
代码入口 cluster.c/clusterCommand() ==> cluster.c/clusterAddSlot()
cluster.c/clusterCommand()
/* CLUSTER command implementation (excerpt showing the ADDSLOTS and
 * DELSLOTS subcommands). */
void clusterCommand(client *c) {
//....................
else if ((!strcasecmp(c->argv[1]->ptr,"addslots") ||
!strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3)
{ /* CLUSTER ADDSLOTS / CLUSTER DELSLOTS implementation. */
/* CLUSTER ADDSLOTS <slot> [slot] ... */
/* CLUSTER DELSLOTS <slot> [slot] ... */
int j, slot;
unsigned char *slots = zmalloc(CLUSTER_SLOTS);
int del = !strcasecmp(c->argv[1]->ptr,"delslots"); /* non-zero when this is a DELSLOTS operation */
memset(slots,0,CLUSTER_SLOTS);
/* Check that all the arguments are parseable and that all the
 * slots are not already busy. */
for (j = 2; j < c->argc; j++) {
/* Validate the slot number given on the command line. */
if ((slot = getSlotOrReply(c,c->argv[j])) == -1) {
zfree(slots);
return;
}
if (del && server.cluster->slots[slot] == NULL) { /* deleting a slot that is not assigned: error */
addReplyErrorFormat(c,"Slot %d is already unassigned", slot);
zfree(slots);
return;
} else if (!del && server.cluster->slots[slot]) { /* adding a slot that is already assigned: error */
addReplyErrorFormat(c,"Slot %d is already busy", slot);
zfree(slots);
return;
}
if (slots[slot]++ == 1) { /* the same slot was specified more than once: error */
addReplyErrorFormat(c,"Slot %d specified multiple times",
(int)slot);
zfree(slots);
return;
}
}
/* All the arguments are valid at this point. */
for (j = 0; j < CLUSTER_SLOTS; j++) {
if (slots[j]) {
int retval;
/* If this slot was set as importing we can clear this
 * state as now we are the real owner of the slot. */
if (server.cluster->importing_slots_from[j])
server.cluster->importing_slots_from[j] = NULL;
/* ADDSLOTS does two things (see clusterAddSlot):
 * 1. set the slot bit in myself's slots bitmap, marking the
 *    slot as served by this node;
 * 2. point clusterState->slots[j] at myself, recording which
 *    node is responsible for the slot. */
retval = del ? clusterDelSlot(j) :
clusterAddSlot(myself,j);
serverAssertWithInfo(c,NULL,retval == C_OK);
}
}
zfree(slots);
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
addReply(c,shared.ok);
}
//.............
}
cluster.c/clusterAddSlot()
/* Add the specified slot to the list of slots that node 'n' will
 * serve. Return C_OK if the operation ended with success.
 * If the slot is already assigned to another instance this is considered
 * an error and C_ERR is returned. */
int clusterAddSlot(clusterNode *n, int slot) {
/* Error: the slot is already assigned to some node. */
if (server.cluster->slots[slot]) return C_ERR;
/* Set the slot bit in n's slots bitmap: slot is now served by 'n'. */
clusterNodeSetSlotBit(n,slot);
/* Record in the cluster-wide slot table which node serves this slot. */
server.cluster->slots[slot] = n;
return C_OK;
}
三 集群中执行命令
在集群中执行命令,整体流程如下
命令执行的入口位于:server.c/processCommand() ==> cluster.c/getNodeByQuery()
计算键属于哪个槽、判断槽是否由当前节点负责、MOVED错误 几个关键操作都是由 getNodeByQuery() 完成的;
/* Return the cluster node able to serve the query of client 'c', after
 * computing the hash slot of the command's key(s). When the query cannot
 * be served locally, NULL (or a foreign node) is returned and *error_code
 * is set to one of the CLUSTER_REDIR_* codes (MOVED, ASK, CROSS_SLOT,
 * UNSTABLE, DOWN_*). If 'hashslot' is not NULL the computed slot is also
 * returned by reference. */
clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) {
clusterNode *n = NULL;
robj *firstkey = NULL;
int multiple_keys = 0;
multiState *ms, _ms;
multiCmd mc;
int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0;
/* Allow any key to be set if a module disabled cluster redirections. */
if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
return myself;
/* Set error code optimistically for the base case. */
if (error_code) *error_code = CLUSTER_REDIR_NONE;
/* Modules can turn off Redis Cluster redirection: this is useful
 * when writing a module that implements a completely different
 * distributed system. */
/* We handle all the cases as if they were EXEC commands, so we have
 * a common code path for everything */
if (cmd->proc == execCommand) {
/* If CLIENT_MULTI flag is not set EXEC is just going to return an
 * error. */
if (!(c->flags & CLIENT_MULTI)) return myself;
ms = &c->mstate;
} else {
/* In order to have a single codepath create a fake Multi State
 * structure if the client is not in MULTI/EXEC state, this way
 * we have a single codepath below. */
ms = &_ms;
_ms.commands = &mc;
_ms.count = 1;
mc.argv = argv;
mc.argc = argc;
mc.cmd = cmd;
}
/* Check that all the keys are in the same hash slot, and obtain this
 * slot and the node associated. */
for (i = 0; i < ms->count; i++) {
struct redisCommand *mcmd;
robj **margv;
int margc, *keyindex, numkeys, j;
mcmd = ms->commands[i].cmd;
margc = ms->commands[i].argc;
margv = ms->commands[i].argv;
keyindex = getKeysFromCommand(mcmd,margv,margc,&numkeys);
for (j = 0; j < numkeys; j++) {
robj *thiskey = margv[keyindex[j]];
/* Compute the hash slot this key belongs to. */
int thisslot = keyHashSlot((char*)thiskey->ptr,
sdslen(thiskey->ptr));
if (firstkey == NULL) {
/* This is the first key we see. Check what is the slot
 * and node. */
firstkey = thiskey;
slot = thisslot;
/* n = node responsible for this slot. */
n = server.cluster->slots[slot];
/* Error: If a slot is not served, we are in "cluster down"
 * state. However the state is yet to be updated, so this was
 * not trapped earlier in processCommand(). Report the same
 * error to the client. */
if (n == NULL) {
getKeysFreeResult(keyindex);
if (error_code)
*error_code = CLUSTER_REDIR_DOWN_UNBOUND;
return NULL;
}
/* If we are migrating or importing this slot, we need to check
 * if we have all the keys in the request (the only way we
 * can safely serve the request, otherwise we return a TRYAGAIN
 * error). To do so we set the importing/migrating state and
 * increment a counter for every missing key. */
if (n == myself &&
server.cluster->migrating_slots_to[slot] != NULL)
{
migrating_slot = 1;
} else if (server.cluster->importing_slots_from[slot] != NULL) {
importing_slot = 1;
}
} else {
/* If it is not the first key, make sure it is exactly
 * the same key as the first we saw. */
if (!equalStringObjects(firstkey,thiskey)) {
/* Error: multiple keys from different slots. */
if (slot != thisslot) {
getKeysFreeResult(keyindex);
if (error_code)
*error_code = CLUSTER_REDIR_CROSS_SLOT;
return NULL;
} else {
/* Flag this request as one with multiple different
 * keys. */
multiple_keys = 1;
}
}
}
/* Migrating / Importing slot? Count keys we don't have. */
if ((migrating_slot || importing_slot) &&
lookupKeyRead(&server.db[0],thiskey) == NULL)
{
missing_keys++;
}
}
/* Release the key index array. */
getKeysFreeResult(keyindex);
}
/* No key at all in command? then we can serve the request
 * without redirections or errors in all the cases. */
if (n == NULL) return myself;
/* Cluster is globally down but we got keys? We can't serve the request. */
if (server.cluster->state != CLUSTER_OK) {
if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE;
return NULL;
}
/* Return the hashslot by reference. */
if (hashslot) *hashslot = slot;
/* MIGRATE always works in the context of the local node if the slot
 * is open (migrating or importing state). We need to be able to freely
 * move keys among instances in this case. */
if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand)
return myself;
/* If we don't have all the keys and we are migrating the slot, send
 * an ASK redirection. */
if (migrating_slot && missing_keys) {
if (error_code) *error_code = CLUSTER_REDIR_ASK;
return server.cluster->migrating_slots_to[slot];
}
/* If we are receiving the slot, and the client correctly flagged the
 * request as "ASKING", we can serve the request. However if the request
 * involves multiple keys and we don't have them all, the only option is
 * to send a TRYAGAIN error. */
if (importing_slot &&
(c->flags & CLIENT_ASKING || cmd->flags & CMD_ASKING))
{
if (multiple_keys && missing_keys) {
if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE;
return NULL;
} else {
return myself;
}
}
/* Handle the read-only client case reading from a slave: if this
 * node is a slave and the request is about an hash slot our master
 * is serving, we can reply without redirection. */
if (c->flags & CLIENT_READONLY &&
(cmd->flags & CMD_READONLY || cmd->proc == evalCommand ||
cmd->proc == evalShaCommand) &&
nodeIsSlave(myself) &&
myself->slaveof == n)
{
return myself;
}
/* Base case: just return the right node. However if this node is not
 * myself, set error_code to MOVED since we need to issue a redirection. */
if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED;
return n;
}
四 复制与故障转移
Redis中节点分为主节点和从节点,主节点用于处理槽,从节点用于复制某个主节点;
当主节点下线时,从节点将成为主节点,替代主节点处理命令请求;下线主节点重新上线时,将成为从节点;
4.1设置从节点 CLUSTER REPLICATE <node_id>
CLUSTER REPLICATE <node_id> 让接受到命令的节点设置为从节点, 复制node_id节点;
CLUSTER REPLICATE命令的实现入口位于:cluster.c/clusterCommand ==> replicate
/* CLUSTER command implementation (excerpt showing the REPLICATE
 * subcommand; surrounding code elided by the article). */
void clusterCommand(){
//............
else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) {
/* CLUSTER REPLICATE <NODE ID> */
/* Lookup the specified node in our table. */
clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
/* Error: unknown node id. */
if (!n) {
addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
return;
}
/* I can't replicate myself. */
if (n == myself) {
addReplyError(c,"Can't replicate myself");
return;
}
/* Can't replicate a slave: the target must be a master. */
if (nodeIsSlave(n)) {
addReplyError(c,"I can only replicate a master, not a replica.");
return;
}
/* If the instance is currently a master, it should have no assigned
 * slots nor keys to accept to replicate some other node.
 * Slaves can switch to another master without issues. */
if (nodeIsMaster(myself) &&
(myself->numslots != 0 || dictSize(server.db[0].dict) != 0)) {
addReplyError(c,
"To set a master the node must be empty and "
"without assigned slots.");
return;
}
/* Set node 'n' as the master of this node. */
/* Set the master. */
clusterSetMaster(n);
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
addReply(c,shared.ok);
}
//...............
/* Set the specified node 'n' as master for this node.
 * If this node is currently a master, it is turned into a slave. */
void clusterSetMaster(clusterNode *n) {
/* A node cannot replicate itself. */
serverAssert(n != myself);
/* The caller must guarantee this node serves no slots. */
serverAssert(myself->numslots == 0);
if (nodeIsMaster(myself)) {
/* Drop the MASTER and MIGRATE_TO flags and mark ourselves as a
 * SLAVE: this node is no longer a master and may no longer be a
 * migration target. */
myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO);
myself->flags |= CLUSTER_NODE_SLAVE;
clusterCloseAllSlots();
} else { /* Already a slave: detach from the previous master, if any. */
if (myself->slaveof)
clusterNodeRemoveSlave(myself->slaveof,myself);
}
/* Record 'n' as our master ... */
myself->slaveof = n;
/* ... and register ourselves in n's slave list. */
clusterNodeAddSlave(n,myself);
replicationSetMaster(n->ip, n->port);
resetManualFailover();
}
clusterCommand函数接收到replicate命令后主要做了以下几件事:
- 在自己的clusterState的nodes属性中查找参数的node节点;
- 判断当前节点是否可以设置复制。
是否是自己复制自己?
参数中的node节点是否是从节点?
当前节点是否分配了槽?
......
符合设置条件后,调用clusterSetMaster函数,clusterSetMaster主要用于修正clusterState中的属性值(可以参考上面贴出的注释);
4.2故障检测
集群中每个节点会定期向其他节点发送PING消息,如果规定时间内没有返回PONG消息,则会将该节点标记为P_FAIL(疑似下线 probable fail);
如果集群中超过半数以上的主节点 将某个节点标记为P_FAIL,那么这个节点将被标记为FAIL(客观下线);并向集群广播一条 x节点的FAIL下线消息;
4.3故障转移
当一个从节点发现自己复制的master进入FAIL下线状态时,从节点将开始对主节点进行故障转移:
- 从下线master的slave中选举出一个新的主节点;
- 被选中的slave执行 slaveof no one命令,成为新的master
- 新的master会撤销 已下线master的槽指派,将槽指派给自己;
- 新的master向集群广播一条PONG消息,告知新的master已经上线;
- 新的节点接受命令,故障转移完成;
五 集群间消息
集群各节点通过互相发送消息来进行通信, 集群间的消息可以大致分为以下几种:
- MEET消息:当节点接收到CLUSTER MEET命令时,节点会发送meet消息,请求加入到集群;
- PING消息:集群中每个节点 默认每间隔一秒,会从节点列表中随机选举出5个节点,再从这5个节点中选一个最长时间没有通信的,发送PING消息;
- PONG消息:当节点接受到PING 、MEET消息时,会向发送者回复PONG消息; 节点可以主动向集群广播PONG消息,通知其他节点更新该节点信息;
- FAIL消息:当一个主节点判断另一个主节点已经下线时,节点会向集群广播一条FAIL消息;
- PUBLISH消息:当节点接受到一个PUBLISH命令时,节点会立即执行该命令,并向集群广播PUBLISH消息;
5.1消息头
节点消息都会包裹在一个消息头,消息头的定义位于cluster.h/clusterMsg
/* Cluster bus message header: every cluster message starts with this
 * header, followed by a type-dependent body held in the 'data' union. */
typedef struct {
char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */
uint32_t totlen; /* Total length of this message */
uint16_t ver; /* Protocol version, currently set to 1. */
uint16_t port; /* TCP base port number. */
uint16_t type; /* Message type (MEET / PING / PONG / FAIL / ...) */
uint16_t count; /* Only used for some kind of messages: number of
 gossip entries carried in the body (MEET/PING/PONG). */
uint64_t currentEpoch; /* The epoch accordingly to the sending node. */
uint64_t configEpoch; /* The config epoch if it's a master, or the last
epoch advertised by its master if it is a
slave. */
uint64_t offset; /* Master replication offset if node is a master or
processed replication offset if node is a slave. */
char sender[CLUSTER_NAMELEN]; /* Name of the sender node */
unsigned char myslots[CLUSTER_SLOTS/8]; /* Sender's slot assignment bitmap. */
char slaveof[CLUSTER_NAMELEN]; /* Name of the sender's master, if the sender is a slave. */
char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */
char notused1[34]; /* 34 bytes reserved for future usage. */
uint16_t cport; /* Sender TCP cluster bus port */
uint16_t flags; /* Sender node flags */
unsigned char state; /* Cluster state from the POV of the sender */
unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */
union clusterMsgData data; /* Type-dependent message body. */
} clusterMsg;
消息头中clusterMsgData 联合体 记录消息的正文;
5.2 MEET PING PONG消息的实现
Redis集群中各节点是通过GOSSIP协议来交换各个节点状态信息,gossip协议由MEET、PING、PONG三种消息组成的:
/* Type-dependent body of a cluster message; which member is valid is
 * determined by the 'type' field of the clusterMsg header. */
union clusterMsgData {
/* PING, MEET and PONG */
struct {
/* Array of N clusterMsgDataGossip structures */
clusterMsgDataGossip gossip[1];
} ping;
/* FAIL */
struct {
clusterMsgDataFail about;
} fail;
/* PUBLISH */
struct {
clusterMsgDataPublish msg;
} publish;
/* UPDATE */
struct {
clusterMsgDataUpdate nodecfg;
} update;
/* MODULE */
struct {
clusterMsgModule msg;
} module;
};
因为MEET 、PING、PONG是相同的消息体,所以节点是通过消息头中的type属性来区分PING、PONG、MEET消息的;
发送pong 、ping、meet消息的入口位于cluster.c/clusterCron ==>cluster.c/clusterSendPing; (ping、pong、meet消息的发送都是通过clusterSendPing函数实现的)
5.3FAIL消息的实现
在集群节点数量较多时,使用Gossip协议来传播下线消息 会使下线消息传播延迟;
而发送FAIL消息可以让集群中所有节点知道节点下线,从而尽快判断集群是否下线,对下线主节点执行故障转移操作;
FAIL消息体的定义位于cluster.h/clusterMsgDataFail
/* Body of a FAIL message. Node names are unique cluster-wide, so the
 * failing node's name alone is enough to identify which node went
 * down. */
typedef struct {
char nodename[CLUSTER_NAMELEN];
} clusterMsgDataFail;
5.4PUBLISH消息的实现
当集群节点接收到PUBLISH 命令,不仅会向channel发送message消息,还会向集群广播一条PUBLISH消息;
publish消息的消息体定义位于cluster.h/clusterMsgDataPublish
/* Body of a PUBLISH message. */
typedef struct {
uint32_t channel_len; /* Length of the channel name. */
uint32_t message_len; /* Length of the message payload. */
/* Declared as 8 bytes only as a placeholder: the real payload length
 * is channel_len + message_len, appended after this header. */
unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */
} clusterMsgDataPublish;
处理publish命令的入口位于pubsub.c/publishCommand ==>cluster.c/clusterPropagatePublish
/* PUBLISH command entry point: deliver the message to the local
 * subscribers, then either broadcast it on the cluster bus (cluster
 * mode) or force replication propagation (standalone mode), and reply
 * with the number of local receivers. */
void publishCommand(client *c) {
    robj *channel = c->argv[1];
    robj *message = c->argv[2];
    int receivers = pubsubPublishMessage(channel,message);
    if (!server.cluster_enabled) {
        forceCommandPropagation(c,PROPAGATE_REPL);
    } else {
        clusterPropagatePublish(channel,message);
    }
    addReplyLongLong(c,receivers);
}