一、选谁上位呢?
1.1 你没资格
哨兵遍历当前被判断为下线状态的master下的所有replicate,逐一的判断是否有资格被提升为新的master,组成一个候选人队列。
满足如下的任意一个条件的都没有资格成为候选人。
- 被哨兵认为是下线状态的
- 和哨兵连接断开的
- 上一次响应哨兵的时间超过了SENTINEL_PING_PERIOD*5,即超过5秒没有响应的
- replicate的优先级配置为0的,可通过 replica-priority 100 进行配置,默认为100
- 上一次响应哨兵INFO命令的时间超过了有效期的:master处于主观下线状态时有效期为SENTINEL_PING_PERIOD*5,否则为SENTINEL_INFO_PERIOD*3
- replicate和master的断开时间已经超过了master->down_after_period * 10 + master被判断下线到现在的时间间隔, 其中down_after_period通过配置获得
down-after-milliseconds <name> <milliseconds>,默认30s,
sentinelTimer() ->
sentinelHandleDictOfRedisInstances() ->
sentinelHandleRedisInstance() ->
sentinelFailoverStateMachine() ->
sentinelFailoverSelectSlave() ->
sentinelSelectSlave()
/* Excerpt: walks master->slaves and collects into instance[] every replica
 * that passes all of the eligibility checks below. The parts replaced by
 * "..." (declarations, allocation of instance[], the final selection) are
 * not shown in this excerpt. */
sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
...
/* Longest tolerated replica<->master link outage: ten times
 * down_after_period, plus the time elapsed since s_down_since_time when
 * the master is flagged SRI_S_DOWN. */
mstime_t max_master_down_time = 0;
if (master->flags & SRI_S_DOWN)
max_master_down_time += mstime() - master->s_down_since_time;
max_master_down_time += master->down_after_period * 10;
di = dictGetIterator(master->slaves);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
mstime_t info_validity_time;
/* Skip replicas flagged as subjectively or objectively down. */
if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN)) continue;
/* Skip replicas whose link is marked disconnected. */
if (slave->link->disconnected) continue;
/* Skip replicas whose last_avail_time is older than 5 ping periods. */
if (mstime() - slave->link->last_avail_time > SENTINEL_PING_PERIOD*5) continue;
/* Priority 0 excludes a replica from promotion. */
if (slave->slave_priority == 0) continue;
/* If the master is in SDOWN state we get INFO for slaves every second.
* Otherwise we get it with the usual period so we need to account for
* a larger delay. */
if (master->flags & SRI_S_DOWN)
info_validity_time = SENTINEL_PING_PERIOD*5;
else
info_validity_time = SENTINEL_INFO_PERIOD*3;
/* Skip replicas whose INFO data is older than the validity window. */
if (mstime() - slave->info_refresh > info_validity_time) continue;
/* Skip replicas whose link to the master has been down for too long. */
if (slave->master_link_down_time > max_master_down_time) continue;
/* All checks passed: this replica is a promotion candidate. */
instance[instances++] = slave;
}
dictReleaseIterator(di);
...
}
1.2 候选人们来个排序
将有资格提升为master的候选者们通过自定义排序算法进行排序。
/* Excerpt: once the candidates have been collected into instance[], sort
 * them and pick the first one. Everything replaced by "..." is not shown. */
sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
...
/* With no candidates, 'selected' keeps whatever value the elided code
 * gave it. */
if (instances) {
/* Order the candidates with compareSlavesForPromotion so that the
 * preferred one ends up at index 0. */
qsort(instance,instances,sizeof(sentinelRedisInstance*),
compareSlavesForPromotion);
selected = instance[0];
}
...
}
排序规则:
- 优先级, 值越小越往前排
- 同步偏移量,偏移值越大的往前排
- 比较runid,值小的往前排
/* qsort() comparator ranking replicas for promotion.
 *
 * 'a' and 'b' each point to an element of an array of
 * sentinelRedisInstance pointers. The return value is negative when *a must
 * sort before *b, positive when it must sort after, and 0 when the two are
 * equivalent.
 *
 * Ordering criteria, in decreasing importance:
 *   1. lower slave_priority first;
 *   2. greater slave_repl_offset first;
 *   3. smaller runid first, compared case-insensitively with strcasecmp();
 *      a NULL runid sorts after any non-NULL runid. */
int compareSlavesForPromotion(const void *a, const void *b) {
    sentinelRedisInstance **sa = (sentinelRedisInstance **)a,
                          **sb = (sentinelRedisInstance **)b;
    char *sa_runid, *sb_runid;

    /* Compare explicitly instead of returning the difference: subtracting
     * two signed values can overflow, which is undefined behavior and could
     * yield a result with the wrong sign. */
    if ((*sa)->slave_priority != (*sb)->slave_priority)
        return ((*sa)->slave_priority < (*sb)->slave_priority) ? -1 : 1;

    /* If priority is the same, select the slave with greater replication
     * offset (processed more data from the master). */
    if ((*sa)->slave_repl_offset > (*sb)->slave_repl_offset) {
        return -1; /* a < b */
    } else if ((*sa)->slave_repl_offset < (*sb)->slave_repl_offset) {
        return 1; /* a > b */
    }

    /* If the replication offset is the same select the slave that has the
     * smaller runid (case-insensitive comparison). Note that we try to
     * handle runid == NULL as there are old Redis versions that don't
     * publish runid in INFO. A NULL runid is considered bigger than any
     * other runid. */
    sa_runid = (*sa)->runid;
    sb_runid = (*sb)->runid;
    if (sa_runid == NULL && sb_runid == NULL) return 0;
    else if (sa_runid == NULL) return 1;  /* a > b */
    else if (sb_runid == NULL) return -1; /* a < b */
    return strcasecmp(sa_runid, sb_runid);
}
排序完成后,选择了队头的那个候选人,然后其他人就没有然后了。
二、登基上位
太子选出来了,就开始进行升级,走上红地毯开始登基准备。
2.1 哨兵通知replica登基
哨兵通知被选择的replica,让其从replicate状态切换为master。
sentinelTimer()->
sentinelHandleDictOfRedisInstances()->
sentinelHandleRedisInstance() ->
sentinelFailoverStateMachine() ->
sentinelFailoverSendSlaveOfNoOne()
/* Excerpt: failover step that asks the replica chosen for promotion
 * (ri->promoted_slave) to become a master. Everything replaced by "..." is
 * not shown in this excerpt. */
void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
...
/* A NULL address makes sentinelSendSlaveOf() send SLAVEOF NO ONE. */
retval = sentinelSendSlaveOf(ri->promoted_slave,NULL);
...
}
/* Queue on ri's command link a MULTI/EXEC transaction that changes the
 * replication target of the instance.
 *
 * With addr == NULL the instance receives SLAVEOF NO ONE, turning it into a
 * master; otherwise it receives SLAVEOF <host> <port> built from 'addr'.
 *
 * The transaction performs, in order:
 *   1) SLAVEOF with the host/port arguments chosen above;
 *   2) CONFIG REWRITE;
 *   3) CLIENT KILL TYPE normal and CLIENT KILL TYPE pubsub, so that
 *      connected clients (but not the one sending the command) are
 *      disconnected and go through the ask-master-on-reconnection
 *      protocol.
 *
 * Replies are discarded: the effects are observed in the next INFO output
 * instead.
 *
 * Returns C_OK when every command was queued. As soon as one
 * redisAsyncCommand() call returns C_ERR, that value is returned and the
 * remaining commands are not sent; link->pending_commands is incremented
 * only for the commands queued successfully. */
int sentinelSendSlaveOf(sentinelRedisInstance *ri, const sentinelAddr *addr) {
    const char *kill_types[2] = { "normal", "pubsub" };
    const char *target_host;
    char target_port[32];
    int rc;

    if (addr) {
        target_host = announceSentinelAddr(addr);
        ll2string(target_port,sizeof(target_port),addr->port);
    } else {
        /* No address: the arguments spell out "NO ONE". */
        target_host = "NO";
        memcpy(target_port,"ONE",4);
    }

    /* Open the transaction. */
    rc = redisAsyncCommand(ri->link->cc,
        sentinelDiscardReplyCallback, ri, "%s",
        sentinelInstanceMapCommand(ri,"MULTI"));
    if (rc == C_ERR) return rc;
    ri->link->pending_commands++;

    /* 1) Reconfigure replication. */
    rc = redisAsyncCommand(ri->link->cc,
        sentinelDiscardReplyCallback, ri, "%s %s %s",
        sentinelInstanceMapCommand(ri,"SLAVEOF"),
        target_host, target_port);
    if (rc == C_ERR) return rc;
    ri->link->pending_commands++;

    /* 2) Rewrite the configuration. */
    rc = redisAsyncCommand(ri->link->cc,
        sentinelDiscardReplyCallback, ri, "%s REWRITE",
        sentinelInstanceMapCommand(ri,"CONFIG"));
    if (rc == C_ERR) return rc;
    ri->link->pending_commands++;

    /* 3) Disconnect clients, one CLIENT KILL per client type.
     *
     * CLIENT KILL TYPE <type> is only supported starting from Redis 2.8.12,
     * however sending it to an instance not understanding this command is
     * not an issue because CLIENT is a variadic command, so Redis will not
     * treat it as a syntax error, and the transaction will not fail (only
     * the unsupported command will fail). */
    for (int i = 0; i < 2; i++) {
        rc = redisAsyncCommand(ri->link->cc,
            sentinelDiscardReplyCallback, ri, "%s KILL TYPE %s",
            sentinelInstanceMapCommand(ri,"CLIENT"),
            kill_types[i]);
        if (rc == C_ERR) return rc;
        ri->link->pending_commands++;
    }

    /* Run everything queued since MULTI. */
    rc = redisAsyncCommand(ri->link->cc,
        sentinelDiscardReplyCallback, ri, "%s",
        sentinelInstanceMapCommand(ri,"EXEC"));
    if (rc == C_ERR) return rc;
    ri->link->pending_commands++;

    return C_OK;
}
因为升级过程需要几条命令,所以为了能原子完成一系列的操作,使用了MULTI命令,在EXEC命令前,传输的命令都是缓存起来的,没有执行,EXEC命令才开始全部执行。整个提升步骤如下:
- SLAVEOF NO ONE ,解脱replicate的身份,变为了master,一朝变凤凰
- CONFIG REWRITE,将配置持久化到配置文件,防止睡一觉后睁眼一切都是梦
- CLIENT KILL TYPE normal/pubsub,杀掉原来的client连接,兔死狗烹鸟尽弓藏
2.2 哨兵发布诏书昭告天下
将给其他的replicate依次发送新的master地址,从新的master进行同步数据。
sentinelFailoverStateMachine() ->
sentinelFailoverReconfNextSlave()
/* Send SLAVE OF <new master address> to all the remaining slaves that
* still don't appear to have the configuration updated.
*
* Excerpt: the parts replaced by "..." are not shown. In the visible code
* the loop stops as soon as in_progress reaches master->parallel_syncs, and
* sentinelFailoverDetectEnd() is always called after the loop. */
void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
dictIterator *di;
dictEntry *de;
/* In the visible code this counts the slaves that were sent SLAVEOF by
 * this call; the elided code may update it as well. */
int in_progress = 0;
...
di = dictGetIterator(master->slaves);
while(in_progress < master->parallel_syncs &&
(de = dictNext(di)) != NULL)
{
sentinelRedisInstance *slave = dictGetVal(de);
int retval;
/* Skip the promoted slave, and already configured slaves. */
if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;
...
/* Nothing to do for instances that are disconnected or already
* in RECONF_SENT state. */
if (slave->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG)) continue;
if (slave->link->disconnected) continue;
/* Send SLAVEOF <new master>. */
retval = sentinelSendSlaveOf(slave,master->promoted_slave->addr);
/* Only on success: flag the slave, record the send time, emit the
 * +slave-reconf-sent event and count it against parallel_syncs. */
if (retval == C_OK) {
slave->flags |= SRI_RECONF_SENT;
slave->slave_reconf_sent_time = mstime();
sentinelEvent(LL_NOTICE,"+slave-reconf-sent",slave,"%@");
in_progress++;
}
}
dictReleaseIterator(di);
/* Check if all the slaves are reconfigured and handle timeout. */
sentinelFailoverDetectEnd(master);
}
- 遍历下线master下的replicate
- 跳过已经登基的replicate,以及已经完成重新配置的replicate
- 跳过已经发送诏书的replicate
- 跳过已经和哨兵断开连接的replicate
- 发送诏书给replicate,让replicate从新的master进行同步数据
三、哨兵承认你的地位
切换成功后,即所有的replicate都从新的master开始同步数据,哨兵将更新本地配置,然后将新的配置写入配置文件中进行持久化。
3.1 哨兵更新配置
sentinelTimer() ->
sentinelHandleDictOfRedisInstances() ->
sentinelFailoverSwitchToPromotedSlave() ->
sentinelResetMasterAndChangeAddress()
3.1.1 重置master地址
将升级的replicate的地址替换原来那个下线的master地址。
/* Reset and switch address. */
sentinelResetMaster(master,SENTINEL_RESET_NO_SENTINELS);
/* Keep the previous address in oldaddr before overwriting it; what happens
 * to oldaddr afterwards is not shown in this fragment. */
oldaddr = master->addr;
master->addr = newaddr;
/* Clear both down-state timestamps on the master entry. */
master->o_down_since_time = 0;
master->s_down_since_time = 0;
3.1.2 重建文武百官
将下线的master的replicate获取到,然后重新创建到新的master下面。
...
/* There can be only 0 or 1 slave that has the newaddr.
* and It can add old master 1 more slave.
* so It allocates dictSize(master->slaves) + 1 */
slaves = zmalloc(sizeof(sentinelAddr*)*(dictSize(master->slaves) + 1));
/* Don't include the one having the address we are switching to. */
di = dictGetIterator(master->slaves);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
if (sentinelAddrIsEqual(slave->addr,newaddr)) continue;
/* Store a duplicate of the address; it is released in the loop below. */
slaves[numslaves++] = dupSentinelAddr(slave->addr);
}
dictReleaseIterator(di);
/* If we are switching to a different address, include the old address
* as a slave as well, so that we'll be able to sense / reconfigure
* the old master.
*
* NOTE(review): this test is only meaningful while master->addr still holds
* the old address, i.e. if it runs before the "master->addr = newaddr"
* assignment shown in the earlier fragment of this article - confirm the
* real statement order against the full function. */
if (!sentinelAddrIsEqual(newaddr,master->addr)) {
slaves[numslaves++] = dupSentinelAddr(master->addr);
}
...
/* Add slaves back. */
for (j = 0; j < numslaves; j++) {
sentinelRedisInstance *slave;
/* Create a slave instance for the saved address under 'master', passing
 * master->quorum as the quorum argument. */
slave = createSentinelRedisInstance(NULL,SRI_SLAVE,slaves[j]->hostname,
slaves[j]->port, master->quorum, master);
/* The duplicated address is released whether or not creation succeeded. */
releaseSentinelAddr(slaves[j]);
/* The +slave event is emitted only for instances actually created. */
if (slave) sentinelEvent(LL_NOTICE,"+slave",slave,"%@");
}
3.2 哨兵重写配置
哨兵将更新后的配置重新写入配置文件,防止重启后新的master丢失。
/* Excerpt: tail of the function. After the elided work ("...") it calls
 * sentinelFlushConfig() and returns C_OK. */
int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *hostname, int port) {
...
/* Write the updated configuration to disk. */
sentinelFlushConfig();
return C_OK;
}
/* Rewrite the configuration file and fsync it to disk.
 *
 * server.hz is temporarily replaced by CONFIG_DEFAULT_HZ while
 * rewriteConfig() runs and is restored right after, whatever the outcome.
 *
 * The function returns nothing: when the rewrite, open, fsync or close step
 * fails, a warning including strerror(errno) is logged and the function
 * returns normally. */
void sentinelFlushConfig(void) {
    int fd = -1;
    int saved_hz = server.hz;
    int rewrite_status;
    int close_status;

    server.hz = CONFIG_DEFAULT_HZ;
    rewrite_status = rewriteConfig(server.configfile, 0);
    server.hz = saved_hz;

    if (rewrite_status == -1) goto werr;
    if ((fd = open(server.configfile,O_RDONLY)) == -1) goto werr;
    if (fsync(fd) == -1) goto werr;

    /* Forget the descriptor before inspecting the result of close(): if
     * close() fails we must not call it a second time from the error path,
     * since the descriptor's state is then unspecified and its number may
     * already have been reused. A plain assignment does not touch errno. */
    close_status = close(fd);
    fd = -1;
    if (close_status == -1) goto werr;
    return;

werr:
    serverLog(LL_WARNING,"WARNING: Sentinel was not able to save the new configuration on disk!!!: %s", strerror(errno));
    /* Only reached with an open fd when fsync() failed. */
    if (fd != -1) close(fd);
}
到此,新的皇帝正式坐稳了。后续哨兵按照原有的监控方式继续的监控新的master以及replicate。


4486

被折叠的 条评论
为什么被折叠?



