前几部分讲解了哨兵监控master,当master出现异常时将自动切换,从replica中选择一个最优的replica升级为新的master,然后通知其他replica从新的master进行数据同步。
然而只有一个哨兵进行监控,当网络波动或者哨兵本身出现问题时,这样非常容易出现误判,导致master切换。所以本文主要讲解哨兵集群,多个哨兵共同监控master,多个哨兵都认为master下线才确定master下线。
一、如何发现其他哨兵
每个哨兵配置中,只配置了需要监控的master的地址,并没有配置其他哨兵,如何发现呢?这里发布订阅就派上用场了。
1.1 订阅hello通道
在哨兵和master建立命令连接时,同时也建立了一个专门用于发布订阅的连接,并且订阅了__sentinel__:hello通道。
sentinelTimer() ->
sentinelHandleDictOfRedisInstances() ->
sentinelHandleRedisInstance() ->
sentinelReconnectInstance()
#define SENTINEL_HELLO_CHANNEL "__sentinel__:hello"
void sentinelReconnectInstance(sentinelRedisInstance *ri) {
...
/* Pub / Sub */
if ((ri->flags & (SRI_MASTER|SRI_SLAVE)) && link->pc == NULL) {
link->pc = redisAsyncConnectBind(ri->addr->ip,ri->addr->port,NET_FIRST_BIND_ADDR);
...
sentinelSetClientName(ri,link->pc,"pubsub");
/* Now we subscribe to the Sentinels "Hello" channel. */
retval = redisAsyncCommand(link->pc,
sentinelReceiveHelloMessages, ri, "%s %s",
sentinelInstanceMapCommand(ri,"SUBSCRIBE"),
SENTINEL_HELLO_CHANNEL);
...
}
}
1.2 不间断发送哨兵自身消息
哨兵定期地将自身信息以及监控的master信息通过__sentinel__:hello通道发出去,所有订阅了此通道的哨兵都会收到信息,这样哨兵就知道了其他的哨兵。可以看到是每2s(SENTINEL_PUBLISH_PERIOD)发送一次当前哨兵信息。
sentinelTimer() ->
sentinelHandleDictOfRedisInstances() ->
sentinelHandleRedisInstance() ->
sentinelSendPeriodicCommands()
#define SENTINEL_PUBLISH_PERIOD 2000
/* Excerpt of the periodic command sender for one monitored instance 'ri'.
 * Only the Hello publishing step is shown; the rest of the function is
 * elided with "...". */
void sentinelSendPeriodicCommands(sentinelRedisInstance *ri) {
...
/* PUBLISH hello messages to all the three kinds of instances. */
/* A Hello is sent only once more than SENTINEL_PUBLISH_PERIOD has elapsed
 * since ri->last_pub_time. NOTE(review): 'now' is set in the elided part;
 * presumably a millisecond timestamp -- confirm. */
if ((now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) {
sentinelSendHello(ri);
}
}
int sentinelSendHello(sentinelRedisInstance *ri) {
char ip[NET_IP_STR_LEN];
char payload[NET_IP_STR_LEN+1024];
...
/* Format and send the Hello message. */
snprintf(payload,sizeof(payload),
"%s,%d,%s,%llu," /* Info about this sentinel. */
"%s,%s,%d,%llu", /* Info about current master. */
announce_ip, announce_port, sentinel.myid,
(unsigned long long) sentinel.current_epoch,
/* --- */
master->name,announceSentinelAddr(master_addr),master_addr->port,
(unsigned long long) master->config_epoch);
retval = redisAsyncCommand(ri->link->cc,
sentinelPublishReplyCallback, ri, "%s %s %s",
sentinelInstanceMapCommand(ri,"PUBLISH"),
SENTINEL_HELLO_CHANNEL,payload);
...
return C_OK;
}
1.3 哨兵接收处理hello消息
在订阅__sentinel__:hello
通道时,注册了回调函数sentinelReceiveHelloMessages进行处理其他哨兵发布的hello消息。
/* Pub/Sub callback attached to the Hello channel subscription.
 *
 * 'privdata' is the instance the subscription was issued on and 'reply' is
 * the message delivered on the Pub/Sub link. Every non-NULL reply refreshes
 * the link's pc_last_activity timestamp. Well-formed "message" replies whose
 * payload does not contain our own runid are handed over to
 * sentinelProcessHelloMessage(), which is how other Sentinels attached to
 * the same master get discovered. */
void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privdata) {
    sentinelRedisInstance *ri = privdata;
    redisReply *msg = reply;
    UNUSED(c);

    if (msg == NULL || ri == NULL) return;

    /* We receive our own messages too, so this timestamp can also be used
     * to detect a Pub/Sub link that is probably disconnected even if it
     * looks otherwise. */
    ri->link->pc_last_activity = mstime();

    /* Validate the reply shape once, so the code below can use the
     * elements without further checks: an array of three strings whose
     * first element is "message". */
    if (msg->type != REDIS_REPLY_ARRAY) return;
    if (msg->elements != 3) return;
    for (int i = 0; i < 3; i++) {
        if (msg->element[i]->type != REDIS_REPLY_STRING) return;
    }
    if (strcmp(msg->element[0]->str,"message") != 0) return;

    redisReply *payload = msg->element[2];

    /* Skip Hello messages that carry our own runid. */
    if (strstr(payload->str,sentinel.myid) != NULL) return;

    sentinelProcessHelloMessage(payload->str, payload->len);
}
从消息中获取哨兵的IP,port,runid,epoch,master的name,IP,port,epoch,并建立哨兵对象
/* Process one Hello payload received on the Pub/Sub channel.
 *
 * 'hello' / 'hello_len' are the raw payload, which is split on ','.
 * Payloads that do not have exactly 8 tokens are ignored, and payloads
 * about a master name that sentinelGetMasterByName() cannot find are
 * skipped. Otherwise the function:
 *
 * 1) Registers the sending Sentinel under that master when it is not yet
 *    known by (ip, port, runid).
 * 2) Raises the local sentinel.current_epoch if the advertised one is
 *    greater.
 * 3) Adopts the advertised master config epoch when it is newer and, if the
 *    advertised master address differs, switches the master to it.
 * 4) Refreshes the sender's last_hello_time.
 *
 * The token array is released on every path through the 'cleanup' label. */
void sentinelProcessHelloMessage(char *hello, int hello_len) {
/* Format is composed of 8 tokens:
* 0=ip,1=port,2=runid,3=current_epoch,4=master_name,
* 5=master_ip,6=master_port,7=master_config_epoch. */
int numtokens, port, removed, master_port;
uint64_t current_epoch, master_config_epoch;
char **token = sdssplitlen(hello, hello_len, ",", 1, &numtokens);
sentinelRedisInstance *si, *master;
if (numtokens == 8) {
/* Obtain a reference to the master this hello message is about */
master = sentinelGetMasterByName(token[4]);
if (!master) goto cleanup; /* Unknown master, skip the message. */
/* First, try to see if we already have this sentinel. */
/* NOTE(review): atoi() gives no error indication, so malformed port
 * tokens are not detected here. */
port = atoi(token[1]);
master_port = atoi(token[6]);
si = getSentinelRedisInstanceByAddrAndRunID(
master->sentinels,token[0],port,token[2]);
current_epoch = strtoull(token[3],NULL,10);
master_config_epoch = strtoull(token[7],NULL,10);
if (!si) {
/* If not, remove all the sentinels that have the same runid
* because there was an address change, and add the same Sentinel
* with the new address back. */
/* 'removed' is assigned only on this branch and is read only inside
 * this same branch. */
removed = removeMatchingSentinelFromMaster(master,token[2]);
if (removed) {
sentinelEvent(LL_NOTICE,"+sentinel-address-switch",master,
"%@ ip %s port %d for %s", token[0],port,token[2]);
} else {
/* Check if there is another Sentinel with the same address this
* new one is reporting. What we do if this happens is to set its
* port to 0, to signal the address is invalid. We'll update it
* later if we get an HELLO message. */
sentinelRedisInstance *other =
getSentinelRedisInstanceByAddrAndRunID(
master->sentinels, token[0],port,NULL);
if (other) {
sentinelEvent(LL_NOTICE,"+sentinel-invalid-addr",other,"%@");
other->addr->port = 0; /* It means: invalid address. */
sentinelUpdateSentinelAddressInAllMasters(other);
}
}
/* Add the new sentinel. */
si = createSentinelRedisInstance(token[2],SRI_SENTINEL,
token[0],port,master->quorum,master);
/* Creation may fail: 'si' stays NULL in that case and the steps below
 * that need a Sentinel instance are skipped. */
if (si) {
if (!removed) sentinelEvent(LL_NOTICE,"+sentinel",si,"%@");
/* The runid is NULL after a new instance creation and
* for Sentinels we don't have a later chance to fill it,
* so do it now. */
si->runid = sdsnew(token[2]);
sentinelTryConnectionSharing(si);
if (removed) sentinelUpdateSentinelAddressInAllMasters(si);
sentinelFlushConfig();
}
}
/* Update local current_epoch if received current_epoch is greater.*/
if (current_epoch > sentinel.current_epoch) {
sentinel.current_epoch = current_epoch;
sentinelFlushConfig();
sentinelEvent(LL_WARNING,"+new-epoch",master,"%llu",
(unsigned long long) sentinel.current_epoch);
}
/* Update master info if received configuration is newer. */
/* The configuration is applied only when the sender is a known (or just
 * created) Sentinel instance, i.e. 'si' is not NULL. */
if (si && master->config_epoch < master_config_epoch) {
master->config_epoch = master_config_epoch;
if (master_port != master->addr->port ||
!sentinelAddrEqualsHostname(master->addr, token[5]))
{
sentinelAddr *old_addr;
sentinelEvent(LL_WARNING,"+config-update-from",si,"%@");
sentinelEvent(LL_WARNING,"+switch-master",
master,"%s %s %d %s %d",
master->name,
announceSentinelAddr(master->addr), master->addr->port,
token[5], master_port);
/* Keep a copy of the current address: master->addr is read again after
 * the reset call below (presumably holding the new address by then --
 * confirm), and both are passed to the reconfiguration script. */
old_addr = dupSentinelAddr(master->addr);
sentinelResetMasterAndChangeAddress(master, token[5], master_port);
sentinelCallClientReconfScript(master,
SENTINEL_OBSERVER,"start",
old_addr,master->addr);
releaseSentinelAddr(old_addr);
}
}
/* Update the state of the Sentinel. */
if (si) si->last_hello_time = mstime();
}
cleanup:
sdsfreesplitres(token,numtokens);
}
1.4 为啥要不间断的发送hello消息呢?
- 对于发布订阅来说,消息是不存储的,消息发布的时候,如果此时你没有订阅,你是收不到消息的,即使你后续订阅了,也收不到那条消息,相当于消息丢失,所以需要发送多次
- 对于哨兵来说随时都可能加入新的哨兵,所以需要不时的发送消息,快速的将哨兵加入到集群中来,完成拓扑关系
上图过程中,sentinel1发布消息时,sentinel3还没有订阅,所以需要后续再发布信息,经过一段时间后,每个哨兵都包含了其他所有哨兵的信息。
并且和其他哨兵也建立了连接。
二、如何确定master下线
下线状态分为主观下线和客观下线。
主观下线:某个哨兵认为监控的某个master异常
客观下线:认为监控的某个master异常的哨兵数量(包括当前哨兵自己)达到了配置的quorum
只有客观下线的master才被认为异常,开始进行故障切换。
2.1 判断下线
sentinelTimer() ->
sentinelHandleDictOfRedisInstances() ->
sentinelHandleRedisInstance()
2.1.1 主观下线
void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
...
sentinelCheckSubjectivelyDown(ri);
...
}
void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
...
/* Update the SDOWN flag. We believe the instance is SDOWN if:
*
* 1) It is not replying.
* 2) We believe it is a master, it reports to be a slave for enough time
* to meet the down_after_period, plus enough time to get two times
* INFO report from the instance. */
if (elapsed > ri->down_after_period ||
(ri->flags & SRI_MASTER &&
ri->role_reported == SRI_SLAVE &&
mstime() - ri->role_reported_time >
(ri->down_after_period+SENTINEL_INFO_PERIOD*2)))
{
/* Is subjectively down */
if ((ri->flags & SRI_S_DOWN) == 0) {
sentinelEvent(LL_WARNING,"+sdown",ri,"%@");
ri->s_down_since_time = mstime();
ri->flags |= SRI_S_DOWN;
}
}
...
}
2.1.2 客观下线
/* Excerpt of the per-instance handler showing where the objectively-down
 * check is triggered; the rest of the function is elided with "...". */
void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
...
/* Only masters */
if (ri->flags & SRI_MASTER) {
/* Evaluate the objectively-down (ODOWN) condition for this master. */
sentinelCheckObjectivelyDown(ri);
...
}
遍历监控当前master的所有哨兵,看他们监控的状态, 当大于等于quorum个哨兵都认为master下线,则达成统一,一致认为下线。而quorum是配置的,
sentinel monitor <master-name> <ip> <redis-port> <quorum>
,只有大于等于quorum个哨兵认为master下线才确定master下线。
/* Decide whether 'master' is objectively down (ODOWN) and update its
 * SRI_O_DOWN flag accordingly.
 *
 * The check only runs when this Sentinel already considers the master
 * subjectively down. In that case our own opinion counts as one vote, and
 * each known Sentinel flagged SRI_MASTER_DOWN adds one more. The master is
 * ODOWN when the total reaches master->quorum. Events are emitted only on
 * state transitions ("+odown" / "-odown"). */
void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) {
    unsigned int down_votes = 0;
    int reached = 0;

    if (master->flags & SRI_S_DOWN) {
        dictIterator *it = dictGetIterator(master->sentinels);
        dictEntry *entry;

        down_votes = 1; /* This Sentinel's own opinion. */
        for (entry = dictNext(it); entry != NULL; entry = dictNext(it)) {
            sentinelRedisInstance *peer = dictGetVal(entry);
            if (peer->flags & SRI_MASTER_DOWN) down_votes++;
        }
        dictReleaseIterator(it);
        reached = (down_votes >= master->quorum);
    }

    int flagged = (master->flags & SRI_O_DOWN) != 0;

    if (reached && !flagged) {
        /* Transition into ODOWN. */
        sentinelEvent(LL_WARNING,"+odown",master,"%@ #quorum %d/%d",
            down_votes, master->quorum);
        master->flags |= SRI_O_DOWN;
        master->o_down_since_time = mstime();
    } else if (!reached && flagged) {
        /* Transition out of ODOWN. */
        sentinelEvent(LL_WARNING,"-odown",master,"%@");
        master->flags &= ~SRI_O_DOWN;
    }
}
2.2 询问其他哨兵意见
在2.1中判断客观下线时使用了其他哨兵对于master的监控状态,那这些状态是什么时候获取的呢?
哨兵主动发送请求给其他哨兵
2.2.1 发送询问请求
sentinelTimer() ->
sentinelHandleDictOfRedisInstances() ->
sentinelHandleRedisInstance() ->
sentinelAskMasterStateToOtherSentinels()
遍历监控当前master的所有哨兵,然后逐一发送is-master-down-by-addr
命令。
#define SENTINEL_ASK_FORCED (1<<0)
void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int flags) {
dictIterator *di;
dictEntry *de;
di = dictGetIterator(master->sentinels);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *ri = dictGetVal(de);
mstime_t elapsed = mstime() - ri->last_master_down_reply_time;
char port[32];
int retval;
...
/* Ask */
ll2string(port,sizeof(port),master->addr->port);
retval = redisAsyncCommand(ri->link->cc,
sentinelReceiveIsMasterDownReply, ri,
"%s is-master-down-by-addr %s %s %llu %s",
sentinelInstanceMapCommand(ri,"SENTINEL"),
announceSentinelAddr(master->addr), port,
sentinel.current_epoch,
(master->failover_state > SENTINEL_FAILOVER_STATE_NONE) ?
sentinel.myid : "*");
...
}
dictReleaseIterator(di);
}
2.2.2 处理响应
根据注册的响应处理函数,从响应中获取状态。返回0则表示哨兵认为master在线,返回1则表示哨兵认为master下线。
void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *privdata) {
sentinelRedisInstance *ri = privdata;
instanceLink *link = c->data;
redisReply *r;
if (!reply || !link) return;
link->pending_commands--;
r = reply;
/* Ignore every error or unexpected reply.
* Note that if the command returns an error for any reason we'll
* end clearing the SRI_MASTER_DOWN flag for timeout anyway. */
if (r->type == REDIS_REPLY_ARRAY && r->elements == 3 &&
r->element[0]->type == REDIS_REPLY_INTEGER &&
r->element[1]->type == REDIS_REPLY_STRING &&
r->element[2]->type == REDIS_REPLY_INTEGER)
{
ri->last_master_down_reply_time = mstime();
if (r->element[0]->integer == 1) {
ri->flags |= SRI_MASTER_DOWN;
} else {
ri->flags &= ~SRI_MASTER_DOWN;
}
...
}
}
2.2.3 其他哨兵处理询问请求
对于其他哨兵收到请求的处理
struct redisCommand sentinelcmds[] = {
...
{"sentinel",sentinelCommand,-2,"admin",0,NULL,0,0,0,0,0},
...
};
void sentinelCommand(client *c) {
...
else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) {
/* SENTINEL IS-MASTER-DOWN-BY-ADDR <ip> <port> <current-epoch> <runid>
*
* Arguments:
*
* ip and port are the ip and port of the master we want to be
* checked by Sentinel. Note that the command will not check by
* name but just by master, in theory different Sentinels may monitor
* different masters with the same name.
*
* current-epoch is needed in order to understand if we are allowed
* to vote for a failover leader or not. Each Sentinel can vote just
* one time per epoch.
*
* runid is "*" if we are not seeking for a vote from the Sentinel
* in order to elect the failover leader. Otherwise it is set to the
* runid we want the Sentinel to vote if it did not already voted.
*/
sentinelRedisInstance *ri;
long long req_epoch;
uint64_t leader_epoch = 0;
char *leader = NULL;
long port;
int isdown = 0;
if (c->argc != 6) goto numargserr;
if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != C_OK ||
getLongLongFromObjectOrReply(c,c->argv[4],&req_epoch,NULL)
!= C_OK)
return;
ri = getSentinelRedisInstanceByAddrAndRunID(sentinel.masters,
c->argv[2]->ptr,port,NULL);
/* It exists? Is actually a master? Is subjectively down? It's down.
* Note: if we are in tilt mode we always reply with "0". */
if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) &&
(ri->flags & SRI_MASTER))
isdown = 1;
...
/* Reply with a three-elements multi-bulk reply:
* down state, leader, vote epoch. */
addReplyArrayLen(c,3);
addReply(c, isdown ? shared.cone : shared.czero);
addReplyBulkCString(c, leader ? leader : "*");
addReplyLongLong(c, (long long)leader_epoch);
if (leader) sdsfree(leader);
}
三、谁来主持故障切换
一个master有多个哨兵在监控,当master下线时,那由哪个哨兵来进行切换呢?如果都来切换一切都乱套了,所以必须有一个老大来处理,那谁来当老大呢?都不服谁,那就来个投票吧,少数服从多数。
3.1 投票选举
当故障后,依然使用的是sentinel is-master-down-by-addr命令进行投票,和状态获取不同的是runid字段传递的是发起请求的哨兵自己的runid(sentinel.myid),不是*。
3.1.1 哨兵投票请求
检查是否需要故障切换
sentinelStartFailoverIfNeeded(sentinelRedisInstance *master)
{
...
sentinelStartFailover(master);
...
}
需要故障切换,设置状态机为SENTINEL_FAILOVER_STATE_WAIT_START,并且epoch自增,这个epoch很重要,类似于版本号,后续epoch越大的master就会被认可,所以epoch也相当于投票
/* Excerpt: put 'master' into the failover state machine. Only the state
 * changes relevant to the election are shown; the rest is elided. */
void sentinelStartFailover(sentinelRedisInstance *master) {
...
master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;
master->flags |= SRI_FAILOVER_IN_PROGRESS;
/* Bump the global current epoch first, then record the new value as the
 * epoch of this failover attempt. */
master->failover_epoch = ++sentinel.current_epoch;
...
}
当开始故障切换时,调用sentinelAskMasterStateToOtherSentinels就是在拉票:此时failover_state大于SENTINEL_FAILOVER_STATE_NONE,runid字段传输的是当前哨兵自己的runid(sentinel.myid,唯一标识符),而不是*,表示开始投票啦,我投我一票(runid是自己),看你们投给谁?
void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int flags) {
...
/* Ask */
ll2string(port,sizeof(port),master->addr->port);
retval = redisAsyncCommand(ri->link->cc,
sentinelReceiveIsMasterDownReply, ri,
"%s is-master-down-by-addr %s %s %llu %s",
sentinelInstanceMapCommand(ri,"SENTINEL"),
announceSentinelAddr(master->addr), port,
sentinel.current_epoch,
(master->failover_state > SENTINEL_FAILOVER_STATE_NONE) ?
sentinel.myid : "*");
3.1.2 其他哨兵处理投票请求
对于其他哨兵处理is-master-down-by-addr请求时,会判断runid是不是*,如果不是*,则会进行leader的投票。
void sentinelCommand(client *c) {
...
} else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) {
...
/* Vote for the master (or fetch the previous vote) if the request
* includes a runid, otherwise the sender is not seeking for a vote. */
if (ri && ri->flags & SRI_MASTER && strcasecmp(c->argv[5]->ptr,"*")) {
leader = sentinelVoteLeader(ri,(uint64_t)req_epoch,
c->argv[5]->ptr,
&leader_epoch);
}
...
}
...
}
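根据epoch值判断是否投票:只有当请求的epoch大于该master上已记录的leader_epoch,并且不小于当前哨兵的current_epoch时,才把票投给发起请求的哨兵(因此同一个epoch内只会投一次票);否则保持之前的投票结果不变。最后将被投票的哨兵的runid以及对应的leader_epoch返回。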
/* Vote for 'req_runid' as failover leader of 'master' in 'req_epoch', or
 * report the vote that was already cast.
 *
 * If 'req_epoch' is ahead of the local current epoch, the local epoch is
 * advanced first. A new vote is recorded only when the last recorded
 * leader_epoch for this master is older than 'req_epoch' and the local
 * current epoch is not ahead of 'req_epoch'.
 *
 * On return '*leader_epoch' holds the epoch of the recorded vote. The
 * return value is a newly created copy of the recorded leader runid, or
 * NULL when no leader is recorded. */
char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) {
    /* Catch up with a newer epoch seen in the request. */
    if (sentinel.current_epoch < req_epoch) {
        sentinel.current_epoch = req_epoch;
        sentinelFlushConfig();
        sentinelEvent(LL_WARNING,"+new-epoch",master,"%llu",
            (unsigned long long) sentinel.current_epoch);
    }

    int vote_is_stale = master->leader_epoch < req_epoch;
    int epoch_not_ahead = sentinel.current_epoch <= req_epoch;

    if (vote_is_stale && epoch_not_ahead) {
        sdsfree(master->leader);
        master->leader = sdsnew(req_runid);
        master->leader_epoch = sentinel.current_epoch;
        sentinelFlushConfig();
        sentinelEvent(LL_WARNING,"+vote-for-leader",master,"%s %llu",
            master->leader, (unsigned long long) master->leader_epoch);

        /* When the vote went to another Sentinel, push our own failover
         * start time forward by a random amount, forcing a delay before we
         * can start a failover for the same master. */
        int voted_for_other = strcasecmp(master->leader,sentinel.myid) != 0;
        if (voted_for_other) {
            master->failover_start_time = mstime()+rand()%SENTINEL_MAX_DESYNC;
        }
    }

    *leader_epoch = master->leader_epoch;
    if (master->leader == NULL) return NULL;
    return sdsnew(master->leader);
}
3.1.3 哨兵处理其他哨兵响应
响应中的runid就是对方哨兵所投票的哨兵的标识。
void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *privdata) {
...
if (strcmp(r->element[1]->str,"*")) {
/* If the runid in the reply is not "*" the Sentinel actually
* replied with a vote. */
sdsfree(ri->leader);
if ((long long)ri->leader_epoch != r->element[2]->integer)
serverLog(LL_WARNING,
"%s voted for %s %llu", ri->name,
r->element[1]->str,
(unsigned long long) r->element[2]->integer);
ri->leader = sdsnew(r->element[1]->str);
ri->leader_epoch = r->element[2]->integer;
}
...
}
3.1.4 判断谁是哨兵leader
切换开始,进入故障切换的状态机中
sentinelHandleRedisInstance() ->
sentinelFailoverStateMachine()
/* Run one step of the failover state machine for master 'ri'.
 *
 * Asserts that 'ri' is a master and does nothing unless a failover is in
 * progress. Otherwise it calls the handler matching the current
 * failover_state; states without a handler here are ignored. */
void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
    serverAssert(ri->flags & SRI_MASTER);

    if ((ri->flags & SRI_FAILOVER_IN_PROGRESS) == 0) return;

    int state = ri->failover_state;

    if (state == SENTINEL_FAILOVER_STATE_WAIT_START) {
        sentinelFailoverWaitStart(ri);
    } else if (state == SENTINEL_FAILOVER_STATE_SELECT_SLAVE) {
        sentinelFailoverSelectSlave(ri);
    } else if (state == SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE) {
        sentinelFailoverSendSlaveOfNoOne(ri);
    } else if (state == SENTINEL_FAILOVER_STATE_WAIT_PROMOTION) {
        sentinelFailoverWaitPromotion(ri);
    } else if (state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES) {
        sentinelFailoverReconfNextSlave(ri);
    }
}
现在故障切换的状态进入到SENTINEL_FAILOVER_STATE_WAIT_START,所以进入到sentinelFailoverWaitStart函数,而sentinelFailoverWaitStart最重要的一件事就是去选举哨兵的leader。
/* Excerpt of the WAIT_START handler: work out whether this Sentinel is the
 * leader for the failover epoch and, further down, advance the state
 * machine. The code between the two visible parts is elided with "...". */
void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
char *leader;
int isleader;
/* Check if we are the leader for the failover epoch. */
leader = sentinelGetLeader(ri, ri->failover_epoch);
/* The runid comparison is case-insensitive; a NULL leader means no leader
 * was returned for this epoch. */
isleader = leader && strcasecmp(leader,sentinel.myid) == 0;
sdsfree(leader);
...
/* The Sentinel leader has been determined: move on to the next state. */
ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
ri->failover_state_change_time = mstime();
...
}
3.1.4.1 计算各个哨兵的票数
使用字典(hash表)进行统计,key为哨兵的runid,值为该哨兵获得的票数
...
counters = dictCreate(&leaderVotesDictType,NULL);
...
voters = dictSize(master->sentinels)+1; /* All the other sentinels and me.*/
/* Count other sentinels votes */
di = dictGetIterator(master->sentinels);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *ri = dictGetVal(de);
if (ri->leader != NULL && ri->leader_epoch == sentinel.current_epoch)
sentinelLeaderIncr(counters,ri->leader);
}
dictReleaseIterator(di);
...
/* Add one vote for 'runid' in the 'counters' dictionary and return the
 * resulting vote count for that runid.
 *
 * A runid seen for the first time gets a new entry with count 1. For an
 * existing entry the stored unsigned counter is incremented. */
int sentinelLeaderIncr(dict *counters, char *runid) {
    dictEntry *found = NULL;
    dictEntry *added = dictAddRaw(counters,runid,&found);

    if (found == NULL) {
        /* First vote for this runid: the entry was just inserted. */
        serverAssert(added != NULL);
        dictSetUnsignedIntegerVal(added,1);
        return 1;
    }

    uint64_t bumped = dictGetUnsignedIntegerVal(found) + 1;
    dictSetUnsignedIntegerVal(found,bumped);
    return bumped;
}
3.1.4.2 选择最大票数的哨兵候选人
遍历counters计数,筛选票数最大的对象,虽然是最大值,但是不一定满足条件。
char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) {
...
dictIterator *di;
dictEntry *de;
unsigned int voters = 0, voters_quorum;
char *winner = NULL;
uint64_t max_votes = 0;
...
di = dictGetIterator(counters);
while((de = dictNext(di)) != NULL) {
uint64_t votes = dictGetUnsignedIntegerVal(de);
if (votes > max_votes) {
max_votes = votes;
winner = dictGetKey(de);
}
}
dictReleaseIterator(di);
...
}
3.1.4.3 当前哨兵投票
如果当前哨兵还没有投票,则进行投票。
/* Count this Sentinel vote:
* if this Sentinel did not voted yet, either vote for the most
* common voted sentinel, or for itself if no vote exists at all. */
if (winner)
myvote = sentinelVoteLeader(master,epoch,winner,&leader_epoch);
else
myvote = sentinelVoteLeader(master,epoch,sentinel.myid,&leader_epoch);
if (myvote && leader_epoch == epoch) {
uint64_t votes = sentinelLeaderIncr(counters,myvote);
if (votes > max_votes) {
max_votes = votes;
winner = myvote;
}
}
3.1.4.4 判断票数是否满足要求
可以看出投票数必须大于哨兵个数的一半
投票数必须大于等于quorum
这两个条件都满足时,才确定了leader。
/* Check what's the winner. For the winner to win, it needs two conditions:
* 1) Absolute majority between voters (50% + 1).
* 2) And anyway at least master->quorum votes. */
voters_quorum = voters/2+1;
if (winner && (max_votes < voters_quorum || max_votes < master->quorum))
winner = NULL;
3.2 投票失败怎么办?
当投票失败,超时后,将终止切换。然后下一轮重新开始投票。
为了防止同时发起投票导致一直都选举不出leader,每个哨兵都会随机修改调度时间,增加选举leader成功率。哨兵发现异常时,如果自己没有投票,则投票给自己,每一轮投票中只能投一票。如果多个哨兵同时发起投票请求,他们都投自己一票,则票数不能超过一半,投票失败。
超时终止
sentinelFailoverWaitStart() ->
sentinelAbortFailover()
改变定时器频率
void sentinelTimer(void) {
...
/* We continuously change the frequency of the Redis "timer interrupt"
* in order to desynchronize every Sentinel from every other.
* This non-determinism avoids that Sentinels started at the same time
* exactly continue to stay synchronized asking to be voted at the
* same time again and again (resulting in nobody likely winning the
* election because of split brain voting). */
server.hz = CONFIG_DEFAULT_HZ + rand() % CONFIG_DEFAULT_HZ;
}