一、为什么需要哨兵?
当使用主从模式进行高可用的部署时,如上图所示。
这样部署会带来一个问题:master和replica的状态都需要由client来维护——client要判断master是否出现异常;master出现异常时,要选取一个replica升级为master;还要通知其他的replica从新的master同步数据,等等。这样client就会非常臃肿,工作量大、开发效率低、不易维护。因此redis自己提供了哨兵机制,解放了程序员。
二、什么是哨兵?
哨兵是一个特殊状态下的redis服务实例,主要有四大功能点:
- 监控
不断地检查master和replica的状态是否正常。
- 通知
当某个被监控的redis实例出现异常时,哨兵通知系统管理员或者其他程序。
- 故障转移
当某个被监控的master异常时,哨兵将开始进行故障转移,提升某个replica成为新的master,并且通知其他replica向新的master连接进行数据同步。
- 服务发现
哨兵为client提供master的地址。client询问哨兵master的地址,哨兵返回给client,client根据获取到的地址进行读写操作;当master切换后,将返回新的master地址给client。
三、我是谁的哨兵?
通过配置,告诉哨兵需要监控哪个master。
3.1 如何配置
配置监控的master的IP为127.0.0.1,端口是6379,这样哨兵就知道监控哪个master实例了。
sentinel monitor mymaster 127.0.0.1 6379 2
3.2 如何启动哨兵
可以通过如下两种方式启动哨兵
$ redis-sentinel /path/to/sentinel.conf
$ redis-server /path/to/sentinel.conf --sentinel
计算redis-sentinel和redis-server这两个二进制文件的MD5,可以确认它们是同一个文件,所以不同的功能是根据启动参数(以及可执行文件名)来区分的。
从Makefile文件中也能看出,redis-sentinel只是redis-server的一份拷贝
# redis-server
$(REDIS_SERVER_NAME): $(REDIS_SERVER_OBJ)
$(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a $(FINAL_LIBS)
# redis-sentinel
$(REDIS_SENTINEL_NAME): $(REDIS_SERVER_NAME)
$(REDIS_INSTALL) $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME)
3.3 哨兵初始化过程
因为哨兵只是特殊的一个redis实例,所以初始化的开头部分和redis实例初始化过程一样。
但是哨兵不需要使用数据,所以不会进行rdb或者aof的加载,并且只支持少量命令。
3.3.1 获取哨兵模式
int main(int argc, char **argv) {
//正常的初始化步骤
...
//判断是否是哨兵模式
server.sentinel_mode = checkForSentinelMode(argc,argv);
...
}
从代码也能看出,哨兵模式的判断方式和我们刚开始说的一致。
/* Decide whether this process has to run as a Sentinel.
 *
 * Returns 1 when the program name (argv[0]) contains the substring
 * "redis-sentinel", or when any argument after argv[0] is exactly
 * "--sentinel". Returns 0 otherwise. */
int checkForSentinelMode(int argc, char **argv) {
    /* Launched through an executable whose name mentions redis-sentinel. */
    if (strstr(argv[0], "redis-sentinel")) {
        return 1;
    }

    /* Otherwise look for the explicit option; argv[0] is not an option. */
    int idx = 1;
    while (idx < argc) {
        if (strcmp(argv[idx], "--sentinel") == 0) {
            return 1;
        }
        idx++;
    }
    return 0;
}
3.3.2 初始化哨兵
这里主要调用了两个函数
if (server.sentinel_mode) {
initSentinelConfig();
initSentinel();
}
- 调整哨兵的默认监听端口为26379
/* TCP port a Sentinel listens on unless configured otherwise. */
#define REDIS_SENTINEL_PORT 26379

/* Override the server settings that differ in Sentinel mode: protected
 * mode is switched off and the listening port becomes
 * REDIS_SENTINEL_PORT. */
void initSentinelConfig(void) {
    /* Sentinel must be exposed. */
    server.protected_mode = 0;
    server.port = REDIS_SENTINEL_PORT;
}
- 构建哨兵的命令表以及全局变量初始化
/* Prepare the process for Sentinel mode.
 *
 * First the regular command tables are emptied and refilled with the
 * entries of sentinelcmds only; then every field of the global `sentinel`
 * state is reset to its start-up value. */
void initSentinel(void) {
    /* Drop every regular Redis command (and the ACL command IDs); only
     * the Sentinel command set is registered below. */
    dictEmpty(server.commands,NULL);
    dictEmpty(server.orig_commands,NULL);
    ACLClearCommandID();

    const unsigned int ncmds = sizeof(sentinelcmds)/sizeof(sentinelcmds[0]);
    for (unsigned int idx = 0; idx < ncmds; idx++) {
        struct redisCommand *entry = &sentinelcmds[idx];

        /* Assign the ID used for ACL. */
        entry->id = ACLGetCommandID(entry->name);

        /* Register the command in both tables; a failed insert is fatal. */
        int added = dictAdd(server.commands, sdsnew(entry->name), entry);
        serverAssert(added == DICT_OK);
        added = dictAdd(server.orig_commands, sdsnew(entry->name), entry);
        serverAssert(added == DICT_OK);

        /* Turn the textual flags description into the actual flag set. */
        if (populateCommandTableParseFlags(entry,entry->sflags) == C_ERR) {
            serverPanic("Unsupported command flag");
        }
    }

    /* Epoch and monitored masters. */
    sentinel.current_epoch = 0;
    sentinel.masters = dictCreate(&instancesDictType,NULL);

    /* Tilt state and timing. */
    sentinel.tilt = 0;
    sentinel.tilt_start_time = 0;
    sentinel.previous_time = mstime();

    /* Script execution bookkeeping. */
    sentinel.running_scripts = 0;
    sentinel.scripts_queue = listCreate();

    /* Announce settings, nothing configured yet. */
    sentinel.announce_ip = NULL;
    sentinel.announce_port = 0;

    /* Remaining options start from their defaults. */
    sentinel.simfailure_flags = SENTINEL_SIMFAILURE_NONE;
    sentinel.deny_scripts_reconfig = SENTINEL_DEFAULT_DENY_SCRIPTS_RECONFIG;
    sentinel.sentinel_auth_pass = NULL;
    sentinel.sentinel_auth_user = NULL;
    sentinel.resolve_hostnames = SENTINEL_DEFAULT_RESOLVE_HOSTNAMES;
    sentinel.announce_hostnames = SENTINEL_DEFAULT_ANNOUNCE_HOSTNAMES;

    /* The Sentinel ID starts out as all zero bytes. */
    memset(sentinel.myid,0,sizeof(sentinel.myid));

    /* No queued "sentinel ..." configuration lines yet. */
    server.sentinel_config = NULL;
}
可以看出哨兵只支持如下15个命令
/* Command table used in Sentinel mode: initSentinel() empties the regular
 * command dictionaries and registers exactly these entries.
 *
 * Each row begins with the command name and its handler function. The
 * quoted string that follows the integer is presumably the `sflags` flags
 * description that initSentinel() passes to
 * populateCommandTableParseFlags() -- TODO confirm against the
 * struct redisCommand declaration, which is not visible here. All other
 * fields are written as 0/NULL.
 *
 * NOTE(review): the "acl" and "command" rows carry one more trailing 0
 * than the other rows; confirm whether that is intentional. */
struct redisCommand sentinelcmds[] = {
{"ping",pingCommand,1,"fast @connection",0,NULL,0,0,0,0,0},
{"sentinel",sentinelCommand,-2,"admin",0,NULL,0,0,0,0,0},
{"subscribe",subscribeCommand,-2,"pub-sub",0,NULL,0,0,0,0,0},
{"unsubscribe",unsubscribeCommand,-1,"pub-sub",0,NULL,0,0,0,0,0},
{"psubscribe",psubscribeCommand,-2,"pub-sub",0,NULL,0,0,0,0,0},
{"punsubscribe",punsubscribeCommand,-1,"pub-sub",0,NULL,0,0,0,0,0},
{"publish",sentinelPublishCommand,3,"pub-sub fast",0,NULL,0,0,0,0,0},
{"info",sentinelInfoCommand,-1,"random @dangerous",0,NULL,0,0,0,0,0},
{"role",sentinelRoleCommand,1,"fast read-only @dangerous",0,NULL,0,0,0,0,0},
{"client",clientCommand,-2,"admin random @connection",0,NULL,0,0,0,0,0},
{"shutdown",shutdownCommand,-1,"admin",0,NULL,0,0,0,0,0},
{"auth",authCommand,-2,"no-auth fast @connection",0,NULL,0,0,0,0,0},
{"hello",helloCommand,-1,"no-auth fast @connection",0,NULL,0,0,0,0,0},
{"acl",aclCommand,-2,"admin",0,NULL,0,0,0,0,0,0},
{"command",commandCommand,-1, "random @connection", 0,NULL,0,0,0,0,0,0}
};
3.3.3 加载哨兵配置
if (server.sentinel_mode) loadSentinelConfigFromQueue();
从三个链表中加载配置进行处理。
/* This function is used for loading the sentinel configuration from
* pre_monitor_cfg, monitor_cfg and post_monitor_cfg list */
void loadSentinelConfigFromQueue(void) {
const char *err = NULL;
listIter li;
listNode *ln;
int linenum = 0;
sds line = NULL;
/* if there is no sentinel_config entry, we can return immediately */
if (server.sentinel_config == NULL) return;
/* loading from pre monitor config queue first to avoid dependency issues */
listRewind(server.sentinel_config->pre_monitor_cfg,&li);
while((ln = listNext(&li))) {
struct sentinelLoadQueueEntry *entry = ln->value;
err = sentinelHandleConfiguration(entry->argv,entry->argc);
...
}
/* loading from monitor config queue */
listRewind(server.sentinel_config->monitor_cfg,&li);
while((ln = listNext(&li))) {
struct sentinelLoadQueueEntry *entry = ln->value;
err = sentinelHandleConfiguration(entry->argv,entry->argc);
...
}
/* loading from the post monitor config queue */
listRewind(server.sentinel_config->post_monitor_cfg,&li);
while((ln = listNext(&li))) {
struct sentinelLoadQueueEntry *entry = ln->value;
err = sentinelHandleConfiguration(entry->argv,entry->argc);
...
}
/* free sentinel_config when config loading is finished */
freeSentinelConfig();
return;
...
}
这三个链表在哪里赋值的呢?
从配置中加载出来的,然后加入到链表中。
main() ->
loadServerConfig(server.configfile, config_from_stdin, options) ->
loadServerConfigFromString()
...
for (i = 0; i < totlines; i++) {
...
else if (!strcasecmp(argv[0],"sentinel")) {
...
queueSentinelConfig(argv+1,argc-1,linenum,lines[i]);
}
}
...
将每行解析成argc,argv形式,然后加入到链表中。
/* Queue one "sentinel ..." configuration statement for later processing.
 *
 * argv/argc hold the statement's tokens (argv[0] is the directive name),
 * linenum is stored alongside them, and line is the raw text of the
 * statement. Every string is duplicated, so the queued entry does not
 * alias the caller's buffers. The entry is appended to one of the three
 * lists of server.sentinel_config, chosen by directive name. */
void queueSentinelConfig(sds *argv, int argc, int linenum, sds line) {
    /* The container is created on the first queued statement. */
    if (!server.sentinel_config) {
        initializeSentinelConfig();
    }

    /* Build the entry from private copies of the caller's data. */
    struct sentinelLoadQueueEntry *item = zmalloc(sizeof(*item));
    item->argc = argc;
    item->linenum = linenum;
    item->line = sdsdup(line);
    item->argv = zmalloc(sizeof(char*)*argc);
    int pos = 0;
    while (pos < argc) {
        item->argv[pos] = sdsdup(argv[pos]);
        pos++;
    }

    /* Statements are split into "before monitor", "monitor" and "after
     * monitor" groups so that they can later be applied in dependency
     * order. */
    if (strcasecmp(argv[0],"monitor") == 0) {
        listAddNodeTail(server.sentinel_config->monitor_cfg,item);
    } else if (searchPreMonitorCfgName(argv[0])) {
        listAddNodeTail(server.sentinel_config->pre_monitor_cfg,item);
    } else {
        listAddNodeTail(server.sentinel_config->post_monitor_cfg,item);
    }
}
根据不同的配置放入不同的链表中。
monitor配置放入monitor_cfg链表,而对于如下的配置,放入到pre_monitor_cfg链表中,其他的配置则放入到post_monitor_cfg链表中。
/* Directive names that queueSentinelConfig() routes to the pre_monitor_cfg
 * list, i.e. the ones applied before any "monitor" statement.
 *
 * The table is defined ahead of searchPreMonitorCfgName() because that
 * function applies sizeof to it, which needs the complete array type to be
 * visible at the point of use. */
const char *preMonitorCfgName[] = {
    "announce-ip",
    "announce-port",
    "deny-scripts-reconfig",
    "sentinel-user",
    "sentinel-pass",
    "current-epoch",
    "myid",
    "resolve-hostnames",
    "announce-hostnames"
};

/* Return 1 if `name` matches, ignoring case, one of the entries of
 * preMonitorCfgName; return 0 otherwise. */
int searchPreMonitorCfgName(const char *name) {
    const unsigned int total = sizeof(preMonitorCfgName)/sizeof(preMonitorCfgName[0]);
    for (unsigned int i = 0; i < total; i++) {
        if (strcasecmp(preMonitorCfgName[i],name) == 0) return 1;
    }
    return 0;
}
为啥需要三个链表?
因为各个配置之间有前后依赖关系,为了简化处理过程,将配置分为三个链表,把依赖关系不同的配置加入到不同的链表中,后续处理时只需要按照三个链表的依赖关系先后处理即可。这种方式值得在我们自己的设计中借鉴。
/* loading from pre monitor config queue first to avoid dependency issues */
listRewind(server.sentinel_config->pre_monitor_cfg,&li);
while((ln = listNext(&li))) {
...
}
/* loading from monitor config queue */
listRewind(server.sentinel_config->monitor_cfg,&li);
while((ln = listNext(&li))) {
...
}
/* loading from the post monitor config queue */
listRewind(server.sentinel_config->post_monitor_cfg,&li);
while((ln = listNext(&li))) {
...
}
配置的主要解析函数
/* Apply a single, already tokenized "sentinel <directive> ..." statement.
 *
 * argv[0] is the directive name, compared case-insensitively; argc counts
 * argv[0] plus its arguments. A directive is only recognized when argc
 * matches the number of tokens it expects.
 *
 * Returns NULL on success, or an error message describing why the
 * statement was rejected (including the case of an unknown directive or a
 * wrong number of arguments).
 *
 * Directives that take a <name> argument resolve it with
 * sentinelGetMasterByName() and fail with "No such master with specified
 * name." when that lookup returns NULL. */
const char *sentinelHandleConfiguration(char **argv, int argc) {
    sentinelRedisInstance *ri;

    if (!strcasecmp(argv[0],"monitor") && argc == 5) {
        /* monitor <name> <host> <port> <quorum> */
        int quorum = atoi(argv[4]);

        if (quorum <= 0) return "Quorum must be 1 or greater.";
        if (createSentinelRedisInstance(argv[1],SRI_MASTER,argv[2],
                                        atoi(argv[3]),quorum,NULL) == NULL)
        {
            return sentinelCheckCreateInstanceErrors(SRI_MASTER);
        }
    } else if (!strcasecmp(argv[0],"down-after-milliseconds") && argc == 3) {
        /* down-after-milliseconds <name> <milliseconds> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        /* Validate first, so a rejected value is never stored in the
         * master instance. */
        int down_after = atoi(argv[2]);
        if (down_after <= 0)
            return "negative or zero time parameter.";
        ri->down_after_period = down_after;
        sentinelPropagateDownAfterPeriod(ri);
    } else if (!strcasecmp(argv[0],"failover-timeout") && argc == 3) {
        /* failover-timeout <name> <milliseconds> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        /* Same rule as above: only a validated value is stored. */
        int failover_timeout = atoi(argv[2]);
        if (failover_timeout <= 0)
            return "negative or zero time parameter.";
        ri->failover_timeout = failover_timeout;
    } else if (!strcasecmp(argv[0],"parallel-syncs") && argc == 3) {
        /* parallel-syncs <name> <numreplicas> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        ri->parallel_syncs = atoi(argv[2]);
    } else if (!strcasecmp(argv[0],"notification-script") && argc == 3) {
        /* notification-script <name> <path> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        /* The script has to exist and be executable right now. */
        if (access(argv[2],X_OK) == -1)
            return "Notification script seems non existing or non executable.";
        ri->notification_script = sdsnew(argv[2]);
    } else if (!strcasecmp(argv[0],"client-reconfig-script") && argc == 3) {
        /* client-reconfig-script <name> <path> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        if (access(argv[2],X_OK) == -1)
            return "Client reconfiguration script seems non existing or "
                   "non executable.";
        ri->client_reconfig_script = sdsnew(argv[2]);
    } else if (!strcasecmp(argv[0],"auth-pass") && argc == 3) {
        /* auth-pass <name> <password> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        ri->auth_pass = sdsnew(argv[2]);
    } else if (!strcasecmp(argv[0],"auth-user") && argc == 3) {
        /* auth-user <name> <username> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        ri->auth_user = sdsnew(argv[2]);
    } else if (!strcasecmp(argv[0],"current-epoch") && argc == 2) {
        /* current-epoch <epoch> */
        unsigned long long current_epoch = strtoull(argv[1],NULL,10);
        /* The stored epoch is only ever raised, never lowered. */
        if (current_epoch > sentinel.current_epoch)
            sentinel.current_epoch = current_epoch;
    } else if (!strcasecmp(argv[0],"myid") && argc == 2) {
        /* myid <id>: the ID must be exactly CONFIG_RUN_ID_SIZE chars. */
        if (strlen(argv[1]) != CONFIG_RUN_ID_SIZE)
            return "Malformed Sentinel id in myid option.";
        memcpy(sentinel.myid,argv[1],CONFIG_RUN_ID_SIZE);
    } else if (!strcasecmp(argv[0],"config-epoch") && argc == 3) {
        /* config-epoch <name> <epoch> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        ri->config_epoch = strtoull(argv[2],NULL,10);
        /* The following update of current_epoch is not really useful as
         * now the current epoch is persisted on the config file, but
         * we leave this check here for redundancy. */
        if (ri->config_epoch > sentinel.current_epoch)
            sentinel.current_epoch = ri->config_epoch;
    } else if (!strcasecmp(argv[0],"leader-epoch") && argc == 3) {
        /* leader-epoch <name> <epoch> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        ri->leader_epoch = strtoull(argv[2],NULL,10);
    } else if ((!strcasecmp(argv[0],"known-slave") ||
                !strcasecmp(argv[0],"known-replica")) && argc == 4)
    {
        /* known-replica <name> <ip> <port> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        /* Only success or failure of the creation matters here; the new
         * instance is not used further in this function. */
        if (createSentinelRedisInstance(NULL,SRI_SLAVE,argv[2],
                                        atoi(argv[3]), ri->quorum, ri) == NULL)
        {
            return sentinelCheckCreateInstanceErrors(SRI_SLAVE);
        }
    } else if (!strcasecmp(argv[0],"known-sentinel") &&
               (argc == 4 || argc == 5)) {
        sentinelRedisInstance *si;

        if (argc == 5) { /* Ignore the old form without runid. */
            /* known-sentinel <name> <ip> <port> [runid] */
            ri = sentinelGetMasterByName(argv[1]);
            if (!ri) return "No such master with specified name.";
            if ((si = createSentinelRedisInstance(argv[4],SRI_SENTINEL,argv[2],
                        atoi(argv[3]), ri->quorum, ri)) == NULL)
            {
                return sentinelCheckCreateInstanceErrors(SRI_SENTINEL);
            }
            si->runid = sdsnew(argv[4]);
            sentinelTryConnectionSharing(si);
        }
    } else if (!strcasecmp(argv[0],"rename-command") && argc == 4) {
        /* rename-command <name> <command> <renamed-command> */
        ri = sentinelGetMasterByName(argv[1]);
        if (!ri) return "No such master with specified name.";
        sds oldcmd = sdsnew(argv[2]);
        sds newcmd = sdsnew(argv[3]);
        /* When the insert fails the two strings are still ours to free. */
        if (dictAdd(ri->renamed_commands,oldcmd,newcmd) != DICT_OK) {
            sdsfree(oldcmd);
            sdsfree(newcmd);
            return "Same command renamed multiple times with rename-command.";
        }
    } else if (!strcasecmp(argv[0],"announce-ip") && argc == 2) {
        /* announce-ip <ip-address>; an empty value is ignored. */
        if (strlen(argv[1]))
            sentinel.announce_ip = sdsnew(argv[1]);
    } else if (!strcasecmp(argv[0],"announce-port") && argc == 2) {
        /* announce-port <port> */
        sentinel.announce_port = atoi(argv[1]);
    } else if (!strcasecmp(argv[0],"deny-scripts-reconfig") && argc == 2) {
        /* deny-scripts-reconfig <yes|no> */
        if ((sentinel.deny_scripts_reconfig = yesnotoi(argv[1])) == -1) {
            return "Please specify yes or no for the "
                   "deny-scripts-reconfig options.";
        }
    } else if (!strcasecmp(argv[0],"sentinel-user") && argc == 2) {
        /* sentinel-user <user-name>; an empty value is ignored. */
        if (strlen(argv[1]))
            sentinel.sentinel_auth_user = sdsnew(argv[1]);
    } else if (!strcasecmp(argv[0],"sentinel-pass") && argc == 2) {
        /* sentinel-pass <password>; an empty value is ignored. */
        if (strlen(argv[1]))
            sentinel.sentinel_auth_pass = sdsnew(argv[1]);
    } else if (!strcasecmp(argv[0],"resolve-hostnames") && argc == 2) {
        /* resolve-hostnames <yes|no> */
        if ((sentinel.resolve_hostnames = yesnotoi(argv[1])) == -1) {
            return "Please specify yes or no for the resolve-hostnames option.";
        }
    } else if (!strcasecmp(argv[0],"announce-hostnames") && argc == 2) {
        /* announce-hostnames <yes|no> */
        if ((sentinel.announce_hostnames = yesnotoi(argv[1])) == -1) {
            return "Please specify yes or no for the announce-hostnames option.";
        }
    } else {
        return "Unrecognized sentinel configuration statement.";
    }
    return NULL;
}