一、集群
再强大的系统也有当机的时候,可能是自身原因,也可能是外在原因。那么单机部署任何一个软件,都会有这种风险,而有的时候儿,这种风险是不可承受的或者说成本太高了。所以,支持分布式的部署几乎是所有大型软件的必备,Redis同样也支持,这就是集群模式。
Redis的分布式集群模式不是一步而就的,一开始是简单的主从模式,一主N备,发现主设备挂掉,需要人工参与将Slaver提升到主;后来在2.8版本中,增加了哨兵模式,也就是简单化了主从的转化,但仍然没有解决主从模式中从节点下线的人工干预,也无法进行自动伸缩;从3.0开始,提供了集群模式,P2P中,没有了明显的主从,自动故障转移,数据迁移,动态扩容。当然,做为新的东西,总是需要严苛的考验后才能说真正为大家接受。
集群的分布式部署,一方面减小了灾害的风险,另外一方面,增强了负载均衡的能力,使得整个软件更稳定,安全。那么分布式理论中要求对多节点进行分区(对于分布式的理论比如2PC,3PC等,请大家自行学习,这里用到啥说啥,不深入展开)。常见的分区有哈希分区和顺序分区,而Redis使用的是前者,而哈希分区又分为三种即:节点取余分区;一致性哈希分区和虚拟槽分区。而Redis使用了虚拟槽分区,每个槽的范围是0~16383,请一定记清,槽是Redis的数据管理的基本单位,每个节点负责一定数量的槽。它的计算公式:
slot = CRC16(key)&16383
为什么叫虚拟槽?因为他和节点的关系并不是固定的,是可以动态伸缩的。槽和节点通过一定的关系进行映射,可以通过设定的各种情况进行数据的在线自动伸缩和数据路由。不过,它也是有一定的限制的:
对Key的批量操作支持有限制,只支持相同槽的执行批量动作;
支持的事物有好用,只支持同一节点的事务,也就是说,不是完全的分布式事务;
键值对不能映射到不同的节点。
不支持多数据空间;单机支持16个DB,而集群下只有一个;
不支持嵌套复制;
集群是通过Gossip协议进行通信的,这也是P2P常见的一种通信协议。
二、网络通信
回到网络通信,分为以下几部分:
1、数据结构
支持集群的相关结构体都在cluster.h文件中定义,其中最主要的有三个数据结构,其它的都是相关传递的数据消息的结构体。
/* clusterLink encapsulates everything needed to talk with a remote node. */
//描述集群连接的节点资源
typedef struct clusterLink {
mstime_t ctime; /* Link creation time */
connection *conn; /* Connection to remote node */
sds sndbuf; /* Packet send buffer */
sds rcvbuf; /* Packet reception buffer */
struct clusterNode *node; /* Node related to this link if any, or NULL 关联结点*/
} clusterLink;
//描述连接节点的具体内容数据结构
typedef struct clusterNode {
mstime_t ctime; /* Node object creation time. */
char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */
int flags; /* CLUSTER_NODE_... */
uint64_t configEpoch; /* Last configEpoch observed for this node */
unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */
int numslots; /* Number of slots handled by this node */
int numslaves; /* Number of slave nodes, if this is a master */
struct clusterNode **slaves; /* pointers to slave nodes */
struct clusterNode *slaveof; /* pointer to the master node. Note that it
may be NULL even if the node is a slave
if we don't have the master node in our
tables. */
mstime_t ping_sent; /* Unix time we sent latest ping */
mstime_t pong_received; /* Unix time we received the pong */
mstime_t fail_time; /* Unix time when FAIL flag was set */
mstime_t voted_time; /* Last time we voted for a slave of this master */
mstime_t repl_offset_time; /* Unix time we received offset for this node */
mstime_t orphaned_time; /* Starting time of orphaned master condition */
long long repl_offset; /* Last known repl offset for this node. */
char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */
int port; /* Latest known clients port of this node */
int cport; /* Latest known cluster port of this node. */
clusterLink *link; /* TCP/IP link with this node */
list *fail_reports; /* List of nodes signaling this as failing */
} clusterNode;
//当前集群状态数据结构体
typedef struct clusterState {
clusterNode *myself; /* This node */
uint64_t currentEpoch;//学过算法的可能更理解这个Epoch,轮,次,纪元
int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */
int size; /* Num of master nodes with at least one slot */
dict *nodes; /* Hash table of name -> clusterNode structures */
dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
clusterNode *migrating_slots_to[CLUSTER_SLOTS];//导入和导出到槽的目标节点数组
clusterNode *importing_slots_from[CLUSTER_SLOTS];
clusterNode *slots[CLUSTER_SLOTS];//槽和槽节点的映射
uint64_t slots_keys_count[CLUSTER_SLOTS];//槽映射到键的有序集合
rax *slots_to_keys; //用有序字典数槽和KEY的映射,利用rax的key是按key前缀挂接的来快速遍历
/* The following fields are used to take the slave state on elections. */
mstime_t failover_auth_time; /* Time of previous or next election. */
int failover_auth_count; /* Number of votes received so far. */
int failover_auth_sent; /* True if we already asked for votes. */
int failover_auth_rank; /* This slave rank for current auth request. */
uint64_t failover_auth_epoch; /* Epoch of the current election. */
int cant_failover_reason; /* Why a slave is currently not able to
failover. See the CANT_FAILOVER_* macros. */
/* Manual failover state in common. */
mstime_t mf_end; /* Manual failover time limit (ms unixtime).
It is zero if there is no MF in progress. */
/* Manual failover state of master. */
clusterNode *mf_slave; /* Slave performing the manual failover. */
/* Manual failover state of slave. */
long long mf_master_offset; /* Master offset the slave needs to start MF
or zero if stil not received. */
int mf_can_start; /* If non-zero signal that the manual failover
can start requesting masters vote. */
/* The followign fields are used by masters to take state on elections. */
uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */
int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
/* Messages received and sent by type. */
long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
long long stats_pfail_nodes; /* Number of nodes in PFAIL status,
excluding nodes without address. * /
} clusterState;
2、通信流程
在前边的网络通信中,有意忽略了InitServer中的集群相关代码,包括时间事件都忽略了。而集群部分的代码在下面的流程中体现:
int main()
{
......
initServer();
if (background || server.pidfile) createPidFile();
redisSetProcTitle(argv[0]); //这里来配置模式
redisAsciiArt();
checkTcpBacklogSettings();......
}
void redisSetProcTitle(char *title) {
#ifdef USE_SETPROCTITLE
char *server_mode = "";
if (server.cluster_enabled) server_mode = " [cluster]";
else if (server.sentinel_mode) server_mode = " [sentinel]";
setproctitle("%s %s:%d%s",
title,
server.bindaddr_count ? server.bindaddr[0] : "*",
server.port ? server.port : server.tls_port,
server_mode);
#else
UNUSED(title);
#endif
}
void initServer(void) {
......
/* Create the timer callback, this is our way to process many background
* operations incrementally, like clients timeout, eviction of unaccessed
* expired keys and so forth. * /
if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
serverPanic("Can't create event loop timers.");
exit(1);
}
......
if (server.cluster_enabled) clusterInit();
......
}
long long aeCreateTimeEvent(aeEventLoop * eventLoop, long long milliseconds,
aeTimeProc * proc, void * clientData,
aeEventFinalizerProc * finalizerProc)
{
long long id = eventLoop->timeEventNextId++;
aeTimeEvent * te;
te = zmalloc(sizeof(*te));
if (te == NULL) return AE_ERR;
te->id = id;
aeAddMillisecondsToNow(milliseconds,&te->when_sec,&te->when_ms);
te->timeProc = proc;
te->finalizerProc = finalizerProc;
te->clientData = clientData;
te->prev = NULL;
te->next = eventLoop->timeEventHead;
if (te->next)
te->next->prev = te;
eventLoop->timeEventHead = te;
return id;
}
......
}
其实最主要的还是serverCron这个函数,在它的注释里写明了,它的工作可不少,异步任务都需要它来控制,包括过期KEY,软狗,更新统计数据,再哈希的自增控制,触发后台的一些数据存储 ,客户端的超时,重新连接等等,看一下代码:
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
int j;
UNUSED(eventLoop);
UNUSED(id);
UNUSED(clientData);
/* Software watchdog: deliver the SIGALRM that will reach the signal
* handler if we don't return here fast enough. */
if (server.watchdog_period) watchdogScheduleSignal(server.watchdog_period);
/* Update the time cache. */
updateCachedTime(1);
server.hz = server.config_hz;
/* Adapt the server.hz value to the number of configured clients. If we have
* many clients, we want to call serverCron() with an higher frequency. */
if (server.dynamic_hz) {
while (listLength(server.clients) / server.hz >
MAX_CLIENTS_PER_CLOCK_TICK)
{
server.hz *= 2;
if (server.hz > CONFIG_MAX_HZ) {
server.hz = CONFIG_MAX_HZ;
break;
}
}
}
run_with_period(100) {
trackInstantaneousMetric(STATS_METRIC_COMMAND,server.stat_numcommands);
trackInstantaneousMetric(STATS_METRIC_NET_INPUT,
server.stat_net_input_bytes);
trackInstantaneousMetric(STATS_METRIC_NET_OUTPUT,
server.stat_net_output_bytes);
}
/* We have just LRU_BITS bits per object for LRU information.
* So we use an (eventually wrapping) LRU clock.
*
* Note that even if the counter wraps it's not a big problem,
* everything will still work but some object will appear younger
* to Redis. However for this to happen a given object should never be
* touched for all the time needed to the counter to wrap, which is
* not likely.
*
* Note that you can change the resolution altering the
* LRU_CLOCK_RESOLUTION define. */
server.lruclock = getLRUClock();
/* Record the max memory used since the server was started. */
if (zmalloc_used_memory() > server.stat_peak_memory)
server.stat_peak_memory = zmalloc_used_memory();
run_with_period(100) {
/* Sample the RSS and other metrics here since this is a relatively slow call.
* We must sample the zmalloc_used at the same time we take the rss, otherwise
* the frag ratio calculate may be off (ratio of two samples at different times) */
server.cron_malloc_stats.process_rss = zmalloc_get_rss();
server.cron_malloc_stats.zmalloc_used = zmalloc_used_memory();
/* Sampling the allcator info can be slow too.
* The fragmentation ratio it'll show is potentically more accurate
* it excludes other RSS pages such as: shared libraries, LUA and other non-zmalloc
* allocations, and allocator reserved pages that can be pursed (all not actual frag) */
zmalloc_get_allocator_info(&server.cron_malloc_stats.allocator_allocated,
&server.cron_malloc_stats.allocator_active,
&server.cron_malloc_stats.allocator_resident);
/* in case the allocator isn't providing these stats, fake them so that
* fragmention info still shows some (inaccurate metrics) */
if (!server.cron_malloc_stats.allocator_resident) {
/* LUA memory isn't part of zmalloc_used, but it is part of the process RSS,
* so we must desuct it in order to be able to calculate correct
* "allocator fragmentation" ratio */
size_t lua_memory = lua_gc(server.lua,LUA_GCCOUNT,0)*1024LL;
server.cron_malloc_stats.allocator_resident = server.cron_malloc_stats.process_rss - lua_memory;
}
if (!server.cron_malloc_stats.allocator_active)
server.cron_malloc_stats.allocator_active = server.cron_malloc_stats.allocator_resident;
if (!server.cron_malloc_stats.allocator_allocated)
server.cron_malloc_stats.allocator_allocated = server.cron_malloc_stats.zmalloc_used;
}
/* We received a SIGTERM, shutting down here in a safe way, as it is
* not ok doing so inside the signal handler. */
if (server.shutdown_asap) {
if (prepareForShutdown(SHUTDOWN_NOFLAGS) == C_OK) exit(0);
serverLog(LL_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
server.shutdown_asap = 0;
}
/* Show some info about non-empty databases */
run_with_period(5000) {
for (j = 0; j < server.dbnum; j++) {
long long size, used, vkeys;
size = dictSlots(server.db[j].dict);
used = dictSize(server.db[j].dict);
vkeys = dictSize(server.db[j].expires);
if (used || vkeys) {
serverLog(LL_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
/* dictPrintStats(server.dict); */
}
}
}
/* Show information about connected clients */
if (!server.sentinel_mode) {
run_with_period(5000) {
serverLog(LL_VERBOSE,
"%lu clients connected (%lu replicas), %zu bytes in use",
listLength(server.clients)-listLength(server.slaves),
listLength(server.slaves),
zmalloc_used_memory());
}
}
/* We need to do a few operations on clients asynchronously. */
//客户端管理
clientsCron();
/* Handle background operations on Redis databases. */
databasesCron();
/* Start a scheduled AOF rewrite if this was requested by the user while
* a BGSAVE was in progress. */
//后台存储
if (!hasActiveChildProcess() &&
server.aof_rewrite_scheduled)
{
rewriteAppendOnlyFileBackground();
}
/* Check if a background saving or AOF rewrite in progress terminated. */
if (hasActiveChildProcess() || ldbPendingChildren())
{
checkChildrenDone();
} else {
/* If there is not a background saving/rewrite in progress check if
* we have to save/rewrite now. */
for (j = 0; j < server.saveparamslen; j++) {
struct saveparam *sp = server.saveparams+j;
/* Save if we reached the given amount of changes,
* the given amount of seconds, and if the latest bgsave was
* successful or if, in case of an error, at least
* CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */
if (server.dirty >= sp->changes &&
server.unixtime-server.lastsave > sp->seconds &&
(server.unixtime-server.lastbgsave_try >
CONFIG_BGSAVE_RETRY_DELAY ||
server.lastbgsave_status == C_OK))
{
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
sp->changes, (int)sp->seconds);
rdbSaveInfo rsi, *rsiptr;
rsiptr = rdbPopulateSaveInfo(&rsi);
rdbSaveBackground(server.rdb_filename,rsiptr);
break;
}
}
/* Trigger an AOF rewrite if needed. */
if (server.aof_state == AOF_ON &&
!hasActiveChildProcess() &&
server.aof_rewrite_perc &&
server.aof_current_size > server.aof_rewrite_min_size)
{
long long base = server.aof_rewrite_base_size ?
server.aof_rewrite_base_size : 1;
long long growth = (server.aof_current_size*100/base) - 100;
if (growth >= server.aof_rewrite_perc) {
serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
rewriteAppendOnlyFileBackground();
}
}
}
/* AOF postponed flush: Try at every cron cycle if the slow fsync
* completed. */
if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);
/* AOF write errors: in this case we have a buffer to flush as well and
* clear the AOF error in case of success to make the DB writable again,
* however to try every second is enough in case of 'hz' is set to
* an higher frequency. */
run_with_period(1000) {
if (server.aof_last_write_status == C_ERR)
flushAppendOnlyFile(0);
}
/* Clear the paused clients flag if needed. */
clientsArePaused(); /* Don't check return value, just use the side effect.*/
//下面是关于集群的
/* Replication cron function -- used to reconnect to master,
* detect transfer failures, start background RDB transfers and so forth. */
run_with_period(1000) replicationCron();
/* Run the Redis Cluster cron. */
//此处运行集群管理
run_with_period(100) {
if (server.cluster_enabled) clusterCron();
}
/* Run the Sentinel timer if we are in sentinel mode. */
if (server.sentinel_mode) sentinelTimer();
/* Cleanup expired MIGRATE cached sockets. */
run_with_period(1000) {
migrateCloseTimedoutSockets();
}
/* Stop the I/O threads if we don't have enough pending work. */
stopThreadedIOIfNeeded();
/* Start a scheduled BGSAVE if the corresponding flag is set. This is
* useful when we are forced to postpone a BGSAVE because an AOF
* rewrite is in progress.
*
* Note: this code must be after the replicationCron() call above so
* make sure when refactoring this file to keep this order. This is useful
* because we want to give priority to RDB savings for replication. */
if (!hasActiveChildProcess() &&
server.rdb_bgsave_scheduled &&
(server.unixtime-server.lastbgsave_try > CONFIG_BGSAVE_RETRY_DELAY ||
server.lastbgsave_status == C_OK))
{
rdbSaveInfo rsi, *rsiptr;
rsiptr = rdbPopulateSaveInfo(&rsi);
if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK)
server.rdb_bgsave_scheduled = 0;
}
/* Fire the cron loop modules event. * /
RedisModuleCronLoopV1 ei = {REDISMODULE_CRON_LOOP_VERSION,server.hz};
moduleFireServerEvent(REDISMODULE_EVENT_CRON_LOOP,
0,
&ei);
server.cronloops++;
return 1000/server.hz;
}
重点看一下clusterCron()这个函数:
//每秒执行10次
void clusterCron(void) {
dictIterator *di;
dictEntry *de;
int update_state = 0;
int orphaned_masters; /* How many masters there are without ok slaves. */
int max_slaves; /* Max number of ok slaves for a single master. */
int this_slaves; /* Number of ok slaves for our master (if we are slave). */
mstime_t min_pong = 0, now = mstime();
clusterNode *min_pong_node = NULL;
static unsigned long long iteration = 0;
mstime_t handshake_timeout;
iteration++; /* Number of times this function was called so far. */
/* We want to take myself->ip in sync with the cluster-announce-ip option.
* The option can be set at runtime via CONFIG SET, so we periodically check
* if the option changed to reflect this into myself->ip. */
{
static char *prev_ip = NULL;
char *curr_ip = server.cluster_announce_ip;
int changed = 0;
if (prev_ip == NULL && curr_ip != NULL) changed = 1;
else if (prev_ip != NULL && curr_ip == NULL) changed = 1;
else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1;
if (changed) {
if (prev_ip) zfree(prev_ip);
prev_ip = curr_ip;
if (curr_ip) {
/* We always take a copy of the previous IP address, by
* duplicating the string. This way later we can check if
* the address really changed. */
prev_ip = zstrdup(prev_ip);
strncpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN);
myself->ip[NET_IP_STR_LEN-1] = '\0';
} else {
myself->ip[0] = '\0'; /* Force autodetection. */
}
}
}
/* The handshake timeout is the time after which a handshake node that was
* not turned into a normal node is removed from the nodes. Usually it is
* just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use
* the value of 1 second. */
handshake_timeout = server.cluster_node_timeout;
if (handshake_timeout < 1000) handshake_timeout = 1000;
/* Update myself flags. */
clusterUpdateMyselfFlags();
/* Check if we have disconnected nodes and re-establish the connection.
* Also update a few stats while we are here, that can be used to make
* better decisions in other part of the code. */
di = dictGetSafeIterator(server.cluster->nodes);
server.cluster->stats_pfail_nodes = 0;
while((de = dictNext(di)) != NULL) {
clusterNode *node = dictGetVal(de);
/* Not interested in reconnecting the link with myself or nodes
* for which we have no address. */
if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) continue;
if (node->flags & CLUSTER_NODE_PFAIL)
server.cluster->stats_pfail_nodes++;
/* A Node in HANDSHAKE state has a limited lifespan equal to the
* configured node timeout. */
if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) {
clusterDelNode(node);
continue;
}
if (node->link == NULL) {
clusterLink *link = createClusterLink(node);
link->conn = server.tls_cluster ? connCreateTLS() : connCreateSocket();
connSetPrivateData(link->conn, link);
if (connConnect(link->conn, node->ip, node->cport, NET_FIRST_BIND_ADDR,
clusterLinkConnectHandler) == -1) {
/* We got a synchronous error from connect before
* clusterSendPing() had a chance to be called.
* If node->ping_sent is zero, failure detection can't work,
* so we claim we actually sent a ping now (that will
* be really sent as soon as the link is obtained). */
if (node->ping_sent == 0) node->ping_sent = mstime();
serverLog(LL_DEBUG, "Unable to connect to "
"Cluster Node [%s]:%d -> %s", node->ip,
node->cport, server.neterr);
freeClusterLink(link);
continue;
}
node->link = link;
}
}
dictReleaseIterator(di);
/* Ping some random node 1 time every 10 iterations, so that we usually ping
* one random node every second. */
if (!(iteration % 10)) {
int j;
/* Check a few random nodes and ping the one with the oldest
* pong_received time. */
for (j = 0; j < 5; j++) {
de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
/* Don't ping nodes disconnected or with a ping currently active. */
if (this->link == NULL || this->ping_sent != 0) continue;
if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
continue;
if (min_pong_node == NULL || min_pong > this->pong_received) {
min_pong_node = this;
min_pong = this->pong_received;
}
}
if (min_pong_node) {
serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name);
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
}
}
/* Iterate nodes to check if we need to flag something as failing.
* This loop is also responsible to:
* 1) Check if there are orphaned masters (masters without non failing
* slaves).
* 2) Count the max number of non failing slaves for a single master.
* 3) Count the number of slaves for our master, if we are a slave. */
orphaned_masters = 0;
max_slaves = 0;
this_slaves = 0;
di = dictGetSafeIterator(server.cluster->nodes);
while((de = dictNext(di)) != NULL) {
clusterNode *node = dictGetVal(de);
now = mstime(); /* Use an updated time at every iteration. */
mstime_t delay;
if (node->flags &
(CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE))
continue;
/* Orphaned master check, useful only if the current instance
* is a slave that may migrate to another master. */
if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) {
int okslaves = clusterCountNonFailingSlaves(node);
/* A master is orphaned if it is serving a non-zero number of
* slots, have no working slaves, but used to have at least one
* slave, or failed over a master that used to have slaves. */
if (okslaves == 0 && node->numslots > 0 &&
node->flags & CLUSTER_NODE_MIGRATE_TO)
{
orphaned_masters++;
}
if (okslaves > max_slaves) max_slaves = okslaves;
if (nodeIsSlave(myself) && myself->slaveof == node)
this_slaves = okslaves;
}
/* If we are waiting for the PONG more than half the cluster
* timeout, reconnect the link: maybe there is a connection
* issue even if the node is alive. */
if (node->link && /* is connected */
now - node->link->ctime >
server.cluster_node_timeout && /* was not already reconnected */
node->ping_sent && /* we already sent a ping */
node->pong_received < node->ping_sent && /* still waiting pong */
/* and we are waiting for the pong more than timeout/2 */
now - node->ping_sent > server.cluster_node_timeout/2)
{
/* Disconnect the link, it will be reconnected automatically. */
freeClusterLink(node->link);
}
/* If we have currently no active ping in this instance, and the
* received PONG is older than half the cluster timeout, send
* a new ping now, to ensure all the nodes are pinged without
* a too big delay. */
if (node->link &&
node->ping_sent == 0 &&
(now - node->pong_received) > server.cluster_node_timeout/2)
{
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
}
/* If we are a master and one of the slaves requested a manual
* failover, ping it continuously. */
if (server.cluster->mf_end &&
nodeIsMaster(myself) &&
server.cluster->mf_slave == node &&
node->link)
{
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
}
/* Check only if we have an active ping for this instance. */
if (node->ping_sent == 0) continue;
/* Compute the delay of the PONG. Note that if we already received
* the PONG, then node->ping_sent is zero, so can't reach this
* code at all. */
delay = now - node->ping_sent;
if (delay > server.cluster_node_timeout) {
/* Timeout reached. Set the node as possibly failing if it is
* not already in this state. */
if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
serverLog(LL_DEBUG,"*** NODE %.40s possibly failing",
node->name);
node->flags |= CLUSTER_NODE_PFAIL;
update_state = 1;
}
}
}
dictReleaseIterator(di);
/* If we are a slave node but the replication is still turned off,
* enable it if we know the address of our master and it appears to
* be up. */
if (nodeIsSlave(myself) &&
server.masterhost == NULL &&
myself->slaveof &&
nodeHasAddr(myself->slaveof))
{
replicationSetMaster(myself->slaveof->ip, myself->slaveof->port);
}
/* Abourt a manual failover if the timeout is reached. */
manualFailoverCheckTimeout();
if (nodeIsSlave(myself)) {
clusterHandleManualFailover();
if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
clusterHandleSlaveFailover();
/* If there are orphaned slaves, and we are a slave among the masters
* with the max number of non-failing slaves, consider migrating to
* the orphaned masters. Note that it does not make sense to try
* a migration if there is no master with at least * two * working
* slaves. * /
if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves)
clusterHandleSlaveMigration(max_slaves);
}
if (update_state || server.cluster->state == CLUSTER_FAIL)
clusterUpdateState();
}
这个函数里比较多,既有老的哨兵,也有新的集群,都得在这里进行处理,包括其它前面提到的相关的功能,都在此处进行循环处理。里面的英文注释很详细,包括握手和PING,PONG都可以看到。
再回到InitServer,看一下clusterInit():
void clusterInit(void) {
int saveconf = 0;
server.cluster = zmalloc(sizeof(clusterState));
server.cluster->myself = NULL;
server.cluster->currentEpoch = 0;
server.cluster->state = CLUSTER_FAIL;
server.cluster->size = 1;
server.cluster->todo_before_sleep = 0;
server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL);
server.cluster->nodes_black_list =
dictCreate(&clusterNodesBlackListDictType,NULL);
server.cluster->failover_auth_time = 0;
server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_rank = 0;
server.cluster->failover_auth_epoch = 0;
server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
server.cluster->lastVoteEpoch = 0;
for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
server.cluster->stats_bus_messages_sent[i] = 0;
server.cluster->stats_bus_messages_received[i] = 0;
}
server.cluster->stats_pfail_nodes = 0;
memset(server.cluster->slots,0, sizeof(server.cluster->slots));
clusterCloseAllSlots();
/* Lock the cluster config file to make sure every node uses
* its own nodes.conf. */
if (clusterLockConfig(server.cluster_configfile) == C_ERR)
exit(1);
/* Load or create a new nodes configuration. */
if (clusterLoadConfig(server.cluster_configfile) == C_ERR) {
/* No configuration found. We will just use the random name provided
* by the createClusterNode() function. */
myself = server.cluster->myself =
createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER);
serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s",
myself->name);
clusterAddNode(myself);
saveconf = 1;
}
if (saveconf) clusterSaveConfigOrDie(1);
/* We need a listening TCP port for our cluster messaging needs. */
server.cfd_count = 0;
/* Port sanity check II
* The other handshake port check is triggered too late to stop
* us from trying to use a too-high cluster port number. */
int port = server.tls_cluster ? server.tls_port : server.port;
if (port > (65535-CLUSTER_PORT_INCR)) {
serverLog(LL_WARNING, "Redis port number too high. "
"Cluster communication port is 10,000 port "
"numbers higher than your Redis port. "
"Your Redis port number must be "
"lower than 55535.");
exit(1);
}
if (listenToPort(port+CLUSTER_PORT_INCR,
server.cfd,&server.cfd_count) == C_ERR)
{
exit(1);
} else {
int j;
for (j = 0; j < server.cfd_count; j++) {
if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE,
clusterAcceptHandler, NULL) == AE_ERR)
serverPanic("Unrecoverable error creating Redis Cluster "
"file event.");
}
}
/* The slots -> keys map is a radix tree. Initialize it here. * /
server.cluster->slots_to_keys = raxNew();
memset(server.cluster->slots_keys_count,0,
sizeof(server.cluster->slots_keys_count));
/* Set myself->port / cport to my listening ports, we'll just need to
* discover the IP address via MEET messages. * /
myself->port = port;
myself->cport = port+CLUSTER_PORT_INCR;
if (server.cluster_announce_port)
myself->port = server.cluster_announce_port;
if (server.cluster_announce_bus_port)
myself->cport = server.cluster_announce_bus_port;
server.cluster->mf_end = 0;
resetManualFailover();
clusterUpdateMyselfFlags();
}
这个没有啥可详细展开的,基本就是由配置文件中的数据对相关的端口、IP服务等进行初始化和处理。
然后在前面提到的beforeSleep()中执行clusterBeforeSleep():
void clusterBeforeSleep(void) {
/* Handle failover, this is needed when it is likely that there is already
* the quorum from masters in order to react fast. */
if (server.cluster->todo_before_sleep & CLUSTER_TODO_HANDLE_FAILOVER)
clusterHandleSlaveFailover();
/* Update the cluster state. */
if (server.cluster->todo_before_sleep & CLUSTER_TODO_UPDATE_STATE)
clusterUpdateState();
/* Save the config, possibly using fsync. */
if (server.cluster->todo_before_sleep & CLUSTER_TODO_SAVE_CONFIG) {
int fsync = server.cluster->todo_before_sleep &
CLUSTER_TODO_FSYNC_CONFIG;
clusterSaveConfigOrDie(fsync);
}
/* Reset our flags (not strictly needed since every single function
* called for flags set should be able to clear its flag). * /
server.cluster->todo_before_sleep = 0;
}
这个的功能和它的调用函数一样,是在进入睡眠前再一次检查一下有没有可执行的工作并处理相关事项。
3、槽位管理
在前面的clusterNode定义中有槽位分配的数组,unsigned char slots[CLUSTER_SLOTS/8];同时在状态数据结构钵中还定义了各种映射关系,用来保证在重新分片时导入导出时的映射状态数据。CLUSTER_SLOTS/8,使用C和C++编程的很容易想到这是在用位操作,也就是说一个字节是8位,每一位如果1则代表管理这个槽位,如果为0则表示未管理。槽是由Master节点来管理的,Slave只能使用。
在Redis的实际应用中,会有重新分片的情况,看到上面中的映射导入导出的数组没,就是是Resharding过程中,槽和集群节点的映射关系要改变,但是槽和Key的关系不做修改。所以在迁移的过程中分为两步,一步是槽迁移,然后是键迁移。
槽迁移时,是一个主机A向另外一个主机B迁移,这时,主机A中的预备迁移的槽状态为MIGRATING,如果此时有客户端向其发出命令,会有以下几种情况:
1、如果尚未迁走,KEY仍然在,则直接执行。
2、如果Key不存在,则返回ASK,只转移本次的转向另外一个节点,但以后再请示此Key时仍然选择A节点。
3、如果Key有多条命令,都存在,则如1一样处理,都不存在则返回ASK。部分存在则返回TRYAGAIN,这样客户端命令会在迁移完成后得到一次ASK,然后重定向到迁移后的B主机。
而B主机会处于IMPORTING状态,同样对于客户端的命令影响如下:
1、正常情况下,尚未迁移的访问会返回MOVED并重定向,如果ASKING命令则会被执行。
2、Key不存在则创建
3、没有ASKING的请示同样MOVED,保证节点和槽的映射关系不变。
然后是键的迁移,它有三个命令:DUMP,RESTROE,DEL:
1、第一步是DUMP,就是在A主机拷贝复制相关键。
2、第二步在B主机Restore
3、第三步,在A主机DEL原有键
需要注意的是,这几个步骤并不是原子或者说加锁的,有可能出现错误,出现错误会导致键在两个节点都存在或者只在一个节点。
明白了上面的原则,看一下代码:
int bitmapTestBit(unsigned char *bitmap, int pos) {
off_t byte = pos/8;
int bit = pos&7;
return (bitmap[byte] & (1<<bit)) != 0;
}
/* Set the bit at position 'pos' in a bitmap. */
void bitmapSetBit(unsigned char *bitmap, int pos) {
off_t byte = pos/8;
int bit = pos&7;
bitmap[byte] |= 1<<bit;
}
/* Clear the bit at position 'pos' in a bitmap. */
void bitmapClearBit(unsigned char *bitmap, int pos) {
off_t byte = pos/8;
int bit = pos&7;
bitmap[byte] &= ~(1<<bit);
}
/* Set the slot bit and return the old value. */
int clusterNodeSetSlotBit(clusterNode *n, int slot) {
int old = bitmapTestBit(n->slots,slot);
bitmapSetBit(n->slots,slot);
if (!old) {
n->numslots++;
/* When a master gets its first slot, even if it has no slaves,
* it gets flagged with MIGRATE_TO, that is, the master is a valid
* target for replicas migration, if and only if at least one of
* the other masters has slaves right now.
*
* Normally masters are valid targerts of replica migration if:
* 1. The used to have slaves (but no longer have).
* 2. They are slaves failing over a master that used to have slaves.
*
* However new masters with slots assigned are considered valid
* migration tagets if the rest of the cluster is not a slave-less.
*
* See https://github.com/antirez/redis/issues/3043 for more info. */
if (n->numslots == 1 && clusterMastersHaveSlaves())
n->flags |= CLUSTER_NODE_MIGRATE_TO;
}
return old;
}
/* Clear the slot bit and return the old value. */
int clusterNodeClearSlotBit(clusterNode *n, int slot) {
int old = bitmapTestBit(n->slots,slot);
bitmapClearBit(n->slots,slot);
if (old) n->numslots--;
return old;
}
/* Return the slot bit from the cluster node structure. */
int clusterNodeGetSlotBit(clusterNode *n, int slot) {
return bitmapTestBit(n->slots,slot);
}
/* Add the specified slot to the list of slots that node 'n' will
* serve. Return C_OK if the operation ended with success.
* If the slot is already assigned to another instance this is considered
* an error and C_ERR is returned. */
int clusterAddSlot(clusterNode *n, int slot) {
if (server.cluster->slots[slot]) return C_ERR;
clusterNodeSetSlotBit(n,slot);
server.cluster->slots[slot] = n;
return C_OK;
}
/* Delete the specified slot marking it as unassigned.
* Returns C_OK if the slot was assigned, otherwise if the slot was
* already unassigned C_ERR is returned. */
int clusterDelSlot(int slot) {
clusterNode *n = server.cluster->slots[slot];
if (!n) return C_ERR;
serverAssert(clusterNodeClearSlotBit(n,slot) == 1);
server.cluster->slots[slot] = NULL;
return C_OK;
}
/* Delete all the slots associated with the specified node.
* The number of deleted slots is returned. * /
int clusterDelNodeSlots(clusterNode *node) {
int deleted = 0, j;
for (j = 0; j < CLUSTER_SLOTS; j++) {
if (clusterNodeGetSlotBit(node,j)) {
clusterDelSlot(j);
deleted++;
}
}
return deleted;
}
/* Clear the migrating / importing state for all the slots.
* This is useful at initialization and when turning a master into slave. * /
void clusterCloseAllSlots(void) {
memset(server.cluster->migrating_slots_to,0,
sizeof(server.cluster->migrating_slots_to));
memset(server.cluster->importing_slots_from,0,
sizeof(server.cluster->importing_slots_from));
}
这里是底层的槽管理,看一下谈到的Restroe相关,其它类似在Server.h中有定义:
//cluster.c
void dumpCommand(client *c) {
robj *o;
rio payload;
/* Check if the key is here. */
if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) {
addReplyNull(c);
return;
}
/* Create the DUMP encoded representation. */
createDumpPayload(&payload,o,c->argv[1]);
/* Transfer to the client */
addReplyBulkSds(c,payload.io.buffer.ptr);
return;
}
void restoreCommand(client *c) {
long long ttl, lfu_freq = -1, lru_idle = -1, lru_clock = -1;
rio payload;
int j, type, replace = 0, absttl = 0;
robj *obj;
/* Parse additional options */
for (j = 4; j < c->argc; j++) {
int additional = c->argc-j-1;
if (!strcasecmp(c->argv[j]->ptr,"replace")) {
replace = 1;
} else if (!strcasecmp(c->argv[j]->ptr,"absttl")) {
absttl = 1;
} else if (!strcasecmp(c->argv[j]->ptr,"idletime") && additional >= 1 &&
lfu_freq == -1)
{
if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL)
!= C_OK) return;
if (lru_idle < 0) {
addReplyError(c,"Invalid IDLETIME value, must be >= 0");
return;
}
lru_clock = LRU_CLOCK();
j++; /* Consume additional arg. */
} else if (!strcasecmp(c->argv[j]->ptr,"freq") && additional >= 1 &&
lru_idle == -1)
{
if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL)
!= C_OK) return;
if (lfu_freq < 0 || lfu_freq > 255) {
addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255");
return;
}
j++; /* Consume additional arg. */
} else {
addReply(c,shared.syntaxerr);
return;
}
}
/* Make sure this key does not already exist here... */
if (!replace && lookupKeyWrite(c->db,c->argv[1]) != NULL) {
addReply(c,shared.busykeyerr);
return;
}
/* Check if the TTL value makes sense */
if (getLongLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != C_OK) {
return;
} else if (ttl < 0) {
addReplyError(c,"Invalid TTL value, must be >= 0");
return;
}
/* Verify RDB version and data checksum. */
if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr)) == C_ERR)
{
addReplyError(c,"DUMP payload version or checksum are wrong");
return;
}
rioInitWithBuffer(&payload,c->argv[3]->ptr);
if (((type = rdbLoadObjectType(&payload)) == -1) ||
((obj = rdbLoadObject(type,&payload,c->argv[1])) == NULL))
{
addReplyError(c,"Bad data format");
return;
}
/* Remove the old key if needed. */
if (replace) dbDelete(c->db,c->argv[1]);
/* Create the key and set the TTL if any */
dbAdd(c->db,c->argv[1],obj);
if (ttl) {
if (!absttl) ttl+=mstime();
setExpire(c,c->db,c->argv[1],ttl);
}
objectSetLRUOrLFU(obj,lfu_freq,lru_idle,lru_clock,1000);
signalModifiedKey(c->db,c->argv[1]);
addReply(c,shared.ok);
server.dirty++;
}
void clusterCommand(client *c) {
if (server.cluster_enabled == 0) {
addReplyError(c,"This instance has cluster support disabled");
return;
}
if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) {
const char *help[] = {
"ADDSLOTS <slot> [slot ...] -- Assign slots to current node.",
"BUMPEPOCH -- Advance the cluster config epoch.",
"COUNT-failure-reports <node-id> -- Return number of failure reports for <node-id>.",
"COUNTKEYSINSLOT <slot> - Return the number of keys in <slot>.",
"DELSLOTS <slot> [slot ...] -- Delete slots information from current node.",
"FAILOVER [force|takeover] -- Promote current replica node to being a master.",
"FORGET <node-id> -- Remove a node from the cluster.",
"GETKEYSINSLOT <slot> <count> -- Return key names stored by current node in a slot.",
"FLUSHSLOTS -- Delete current node own slots information.",
"INFO - Return onformation about the cluster.",
"KEYSLOT <key> -- Return the hash slot for <key>.",
"MEET <ip> <port> [bus-port] -- Connect nodes into a working cluster.",
"MYID -- Return the node id.",
"NODES -- Return cluster configuration seen by node. Output format:",
" <id> <ip:port> <flags> <master> <pings> <pongs> <epoch> <link> <slot> ... <slot>",
"REPLICATE <node-id> -- Configure current node as replica to <node-id>.",
"RESET [hard|soft] -- Reset current node (default: soft).",
"SET-config-epoch <epoch> - Set config epoch of current node.",
"SETSLOT <slot> (importing|migrating|stable|node <node-id>) -- Set slot state.",
"REPLICAS <node-id> -- Return <node-id> replicas.",
"SLOTS -- Return information about slots range mappings. Each range is made of:",
" start, end, master and replicas IP addresses, ports and ids",
NULL
};
addReplyHelp(c, help);
} else if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) {
/* CLUSTER MEET <ip> <port> [cport] */
long long port, cport;
if (getLongLongFromObject(c->argv[3], &port) != C_OK) {
addReplyErrorFormat(c,"Invalid TCP base port specified: %s",
(char*)c->argv[3]->ptr);
return;
}
if (c->argc == 5) {
if (getLongLongFromObject(c->argv[4], &cport) != C_OK) {
addReplyErrorFormat(c,"Invalid TCP bus port specified: %s",
(char*)c->argv[4]->ptr);
return;
}
} else {
cport = port + CLUSTER_PORT_INCR;
}
if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 &&
errno == EINVAL)
{
addReplyErrorFormat(c,"Invalid node address specified: %s:%s",
(char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr);
} else {
addReply(c,shared.ok);
}
} else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) {
/* CLUSTER NODES */
sds nodes = clusterGenNodesDescription(0);
addReplyVerbatim(c,nodes,sdslen(nodes),"txt");
sdsfree(nodes);
} else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) {
/* CLUSTER MYID */
addReplyBulkCBuffer(c,myself->name, CLUSTER_NAMELEN);
} else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) {
/* CLUSTER SLOTS */
clusterReplyMultiBulkSlots(c);
} else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) {
/* CLUSTER FLUSHSLOTS */
if (dictSize(server.db[0].dict) != 0) {
addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS.");
return;
}
clusterDelNodeSlots(myself);
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
addReply(c,shared.ok);
} else if ((!strcasecmp(c->argv[1]->ptr,"addslots") ||
!strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3)
{
/* CLUSTER ADDSLOTS <slot> [slot] ... */
/* CLUSTER DELSLOTS <slot> [slot] ... */
int j, slot;
unsigned char *slots = zmalloc(CLUSTER_SLOTS);
int del = !strcasecmp(c->argv[1]->ptr,"delslots");
memset(slots,0,CLUSTER_SLOTS);
/* Check that all the arguments are parseable and that all the
* slots are not already busy. */
for (j = 2; j < c->argc; j++) {
if ((slot = getSlotOrReply(c,c->argv[j])) == -1) {
zfree(slots);
return;
}
if (del && server.cluster->slots[slot] == NULL) {
addReplyErrorFormat(c,"Slot %d is already unassigned", slot);
zfree(slots);
return;
} else if (!del && server.cluster->slots[slot]) {
addReplyErrorFormat(c,"Slot %d is already busy", slot);
zfree(slots);
return;
}
if (slots[slot]++ == 1) {
addReplyErrorFormat(c,"Slot %d specified multiple times",
(int)slot);
zfree(slots);
return;
}
}
for (j = 0; j < CLUSTER_SLOTS; j++) {
if (slots[j]) {
int retval;
/* If this slot was set as importing we can clear this
* state as now we are the real owner of the slot. */
if (server.cluster->importing_slots_from[j])
server.cluster->importing_slots_from[j] = NULL;
retval = del ? clusterDelSlot(j) :
clusterAddSlot(myself,j);
serverAssertWithInfo(c,NULL,retval == C_OK);
}
}
zfree(slots);
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) {
/* SETSLOT 10 MIGRATING <node ID> */
/* SETSLOT 10 IMPORTING <node ID> */
/* SETSLOT 10 STABLE */
/* SETSLOT 10 NODE <node ID> */
int slot;
clusterNode *n;
if (nodeIsSlave(myself)) {
addReplyError(c,"Please use SETSLOT only with masters.");
return;
}
if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return;
if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) {
if (server.cluster->slots[slot] != myself) {
addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot);
return;
}
if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
addReplyErrorFormat(c,"I don't know about node %s",
(char*)c->argv[4]->ptr);
return;
}
server.cluster->migrating_slots_to[slot] = n;
} else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) {
if (server.cluster->slots[slot] == myself) {
addReplyErrorFormat(c,
"I'm already the owner of hash slot %u",slot);
return;
}
if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
addReplyErrorFormat(c,"I don't know about node %s",
(char*)c->argv[4]->ptr);
return;
}
server.cluster->importing_slots_from[slot] = n;
} else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) {
/* CLUSTER SETSLOT <SLOT> STABLE */
server.cluster->importing_slots_from[slot] = NULL;
server.cluster->migrating_slots_to[slot] = NULL;
} else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) {
/* CLUSTER SETSLOT <SLOT> NODE <NODE ID> */
clusterNode *n = clusterLookupNode(c->argv[4]->ptr);
if (!n) {
addReplyErrorFormat(c,"Unknown node %s",
(char*)c->argv[4]->ptr);
return;
}
/* If this hash slot was served by 'myself' before to switch
* make sure there are no longer local keys for this hash slot. */
if (server.cluster->slots[slot] == myself && n != myself) {
if (countKeysInSlot(slot) != 0) {
addReplyErrorFormat(c,
"Can't assign hashslot %d to a different node "
"while I still hold keys for this hash slot.", slot);
return;
}
}
/* If this slot is in migrating status but we have no keys
* for it assigning the slot to another node will clear
* the migratig status. */
if (countKeysInSlot(slot) == 0 &&
server.cluster->migrating_slots_to[slot])
server.cluster->migrating_slots_to[slot] = NULL;
/* If this node was importing this slot, assigning the slot to
* itself also clears the importing status. */
if (n == myself &&
server.cluster->importing_slots_from[slot])
{
/* This slot was manually migrated, set this node configEpoch
* to a new epoch so that the new version can be propagated
* by the cluster.
*
* Note that if this ever results in a collision with another
* node getting the same configEpoch, for example because a
* failover happens at the same time we close the slot, the
* configEpoch collision resolution will fix it assigning
* a different epoch to each node. */
if (clusterBumpConfigEpochWithoutConsensus() == C_OK) {
serverLog(LL_WARNING,
"configEpoch updated after importing slot %d", slot);
}
server.cluster->importing_slots_from[slot] = NULL;
}
clusterDelSlot(slot);
clusterAddSlot(n,slot);
} else {
addReplyError(c,
"Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP");
return;
}
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE);
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) {
/* CLUSTER BUMPEPOCH */
int retval = clusterBumpConfigEpochWithoutConsensus();
sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n",
(retval == C_OK) ? "BUMPED" : "STILL",
(unsigned long long) myself->configEpoch);
addReplySds(c,reply);
} else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) {
/* CLUSTER INFO */
char *statestr[] = {"ok","fail","needhelp"};
int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0;
uint64_t myepoch;
int j;
for (j = 0; j < CLUSTER_SLOTS; j++) {
clusterNode *n = server.cluster->slots[j];
if (n == NULL) continue;
slots_assigned++;
if (nodeFailed(n)) {
slots_fail++;
} else if (nodeTimedOut(n)) {
slots_pfail++;
} else {
slots_ok++;
}
}
myepoch = (nodeIsSlave(myself) && myself->slaveof) ?
myself->slaveof->configEpoch : myself->configEpoch;
sds info = sdscatprintf(sdsempty(),
"cluster_state:%s\r\n"
"cluster_slots_assigned:%d\r\n"
"cluster_slots_ok:%d\r\n"
"cluster_slots_pfail:%d\r\n"
"cluster_slots_fail:%d\r\n"
"cluster_known_nodes:%lu\r\n"
"cluster_size:%d\r\n"
"cluster_current_epoch:%llu\r\n"
"cluster_my_epoch:%llu\r\n"
, statestr[server.cluster->state],
slots_assigned,
slots_ok,
slots_pfail,
slots_fail,
dictSize(server.cluster->nodes),
server.cluster->size,
(unsigned long long) server.cluster->currentEpoch,
(unsigned long long) myepoch
);
/* Show stats about messages sent and received. */
long long tot_msg_sent = 0;
long long tot_msg_received = 0;
for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
if (server.cluster->stats_bus_messages_sent[i] == 0) continue;
tot_msg_sent += server.cluster->stats_bus_messages_sent[i];
info = sdscatprintf(info,
"cluster_stats_messages_%s_sent:%lld\r\n",
clusterGetMessageTypeString(i),
server.cluster->stats_bus_messages_sent[i]);
}
info = sdscatprintf(info,
"cluster_stats_messages_sent:%lld\r\n", tot_msg_sent);
for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
if (server.cluster->stats_bus_messages_received[i] == 0) continue;
tot_msg_received += server.cluster->stats_bus_messages_received[i];
info = sdscatprintf(info,
"cluster_stats_messages_%s_received:%lld\r\n",
clusterGetMessageTypeString(i),
server.cluster->stats_bus_messages_received[i]);
}
info = sdscatprintf(info,
"cluster_stats_messages_received:%lld\r\n", tot_msg_received);
/* Produce the reply protocol. */
addReplyVerbatim(c,info,sdslen(info),"txt");
sdsfree(info);
} else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) {
int retval = clusterSaveConfig(1);
if (retval == 0)
addReply(c,shared.ok);
else
addReplyErrorFormat(c,"error saving the cluster node config: %s",
strerror(errno));
} else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) {
/* CLUSTER KEYSLOT <key> */
sds key = c->argv[2]->ptr;
addReplyLongLong(c,keyHashSlot(key,sdslen(key)));
} else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) {
/* CLUSTER COUNTKEYSINSLOT <slot> */
long long slot;
if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK)
return;
if (slot < 0 || slot >= CLUSTER_SLOTS) {
addReplyError(c,"Invalid slot");
return;
}
addReplyLongLong(c,countKeysInSlot(slot));
} else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) {
/* CLUSTER GETKEYSINSLOT <slot> <count> */
long long maxkeys, slot;
unsigned int numkeys, j;
robj **keys;
if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK)
return;
if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL)
!= C_OK)
return;
if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) {
addReplyError(c,"Invalid slot or number of keys");
return;
}
/* Avoid allocating more than needed in case of large COUNT argument
* and smaller actual number of keys. */
unsigned int keys_in_slot = countKeysInSlot(slot);
if (maxkeys > keys_in_slot) maxkeys = keys_in_slot;
keys = zmalloc(sizeof(robj*)*maxkeys);
numkeys = getKeysInSlot(slot, keys, maxkeys);
addReplyArrayLen(c,numkeys);
for (j = 0; j < numkeys; j++) {
addReplyBulk(c,keys[j]);
decrRefCount(keys[j]);
}
zfree(keys);
} else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) {
/* CLUSTER FORGET <NODE ID> */
clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
if (!n) {
addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
return;
} else if (n == myself) {
addReplyError(c,"I tried hard but I can't forget myself...");
return;
} else if (nodeIsSlave(myself) && myself->slaveof == n) {
addReplyError(c,"Can't forget my master!");
return;
}
clusterBlacklistAddNode(n);
clusterDelNode(n);
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|
CLUSTER_TODO_SAVE_CONFIG);
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) {
/* CLUSTER REPLICATE <NODE ID> */
clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
/* Lookup the specified node in our table. */
if (!n) {
addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
return;
}
/* I can't replicate myself. */
if (n == myself) {
addReplyError(c,"Can't replicate myself");
return;
}
/* Can't replicate a slave. */
if (nodeIsSlave(n)) {
addReplyError(c,"I can only replicate a master, not a replica.");
return;
}
/* If the instance is currently a master, it should have no assigned
* slots nor keys to accept to replicate some other node.
* Slaves can switch to another master without issues. */
if (nodeIsMaster(myself) &&
(myself->numslots != 0 || dictSize(server.db[0].dict) != 0)) {
addReplyError(c,
"To set a master the node must be empty and "
"without assigned slots.");
return;
}
/* Set the master. */
clusterSetMaster(n);
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
addReply(c,shared.ok);
} else if ((!strcasecmp(c->argv[1]->ptr,"slaves") ||
!strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) {
/* CLUSTER SLAVES <NODE ID> */
clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
int j;
/* Lookup the specified node in our table. */
if (!n) {
addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
return;
}
if (nodeIsSlave(n)) {
addReplyError(c,"The specified node is not a master");
return;
}
addReplyArrayLen(c,n->numslaves);
for (j = 0; j < n->numslaves; j++) {
sds ni = clusterGenNodeDescription(n->slaves[j]);
addReplyBulkCString(c,ni);
sdsfree(ni);
}
} else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") &&
c->argc == 3)
{
/* CLUSTER COUNT-FAILURE-REPORTS <NODE ID> */
clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
if (!n) {
addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
return;
} else {
addReplyLongLong(c,clusterNodeFailureReportsCount(n));
}
} else if (!strcasecmp(c->argv[1]->ptr,"failover") &&
(c->argc == 2 || c->argc == 3))
{
/* CLUSTER FAILOVER [FORCE|TAKEOVER] */
int force = 0, takeover = 0;
if (c->argc == 3) {
if (!strcasecmp(c->argv[2]->ptr,"force")) {
force = 1;
} else if (!strcasecmp(c->argv[2]->ptr,"takeover")) {
takeover = 1;
force = 1; /* Takeover also implies force. */
} else {
addReply(c,shared.syntaxerr);
return;
}
}
/* Check preconditions. */
if (nodeIsMaster(myself)) {
addReplyError(c,"You should send CLUSTER FAILOVER to a replica");
return;
} else if (myself->slaveof == NULL) {
addReplyError(c,"I'm a replica but my master is unknown to me");
return;
} else if (!force &&
(nodeFailed(myself->slaveof) ||
myself->slaveof->link == NULL))
{
addReplyError(c,"Master is down or failed, "
"please use CLUSTER FAILOVER FORCE");
return;
}
resetManualFailover();
server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT;
if (takeover) {
/* A takeover does not perform any initial check. It just
* generates a new configuration epoch for this node without
* consensus, claims the master's slots, and broadcast the new
* configuration. */
serverLog(LL_WARNING,"Taking over the master (user request).");
clusterBumpConfigEpochWithoutConsensus();
clusterFailoverReplaceYourMaster();
} else if (force) {
/* If this is a forced failover, we don't need to talk with our
* master to agree about the offset. We just failover taking over
* it without coordination. */
serverLog(LL_WARNING,"Forced failover user request accepted.");
server.cluster->mf_can_start = 1;
} else {
serverLog(LL_WARNING,"Manual failover user request accepted.");
clusterSendMFStart(myself->slaveof);
}
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3)
{
/* CLUSTER SET-CONFIG-EPOCH <epoch>
*
* The user is allowed to set the config epoch only when a node is
* totally fresh: no config epoch, no other known node, and so forth.
* This happens at cluster creation time to start with a cluster where
* every node has a different node ID, without to rely on the conflicts
* resolution system which is too slow when a big cluster is created. */
long long epoch;
if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK)
return;
if (epoch < 0) {
addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch);
} else if (dictSize(server.cluster->nodes) > 1) {
addReplyError(c,"The user can assign a config epoch only when the "
"node does not know any other node.");
} else if (myself->configEpoch != 0) {
addReplyError(c,"Node config epoch is already non-zero");
} else {
myself->configEpoch = epoch;
serverLog(LL_WARNING,
"configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH",
(unsigned long long) myself->configEpoch);
if (server.cluster->currentEpoch < (uint64_t)epoch)
server.cluster->currentEpoch = epoch;
/* No need to fsync the config here since in the unlucky event
* of a failure to persist the config, the conflict resolution code
* will assign an unique config to this node. */
clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|
CLUSTER_TODO_SAVE_CONFIG);
addReply(c,shared.ok);
}
} else if (!strcasecmp(c->argv[1]->ptr,"reset") &&
(c->argc == 2 || c->argc == 3))
{
/* CLUSTER RESET [SOFT|HARD] */
int hard = 0;
/* Parse soft/hard argument. Default is soft. */
if (c->argc == 3) {
if (!strcasecmp(c->argv[2]->ptr,"hard")) {
hard = 1;
} else if (!strcasecmp(c->argv[2]->ptr,"soft")) {
hard = 0;
} else {
addReply(c,shared.syntaxerr);
return;
}
}
/* Slaves can be reset while containing data, but not master nodes
* that must be empty. */
if (nodeIsMaster(myself) && dictSize(c->db->dict) != 0) {
addReplyError(c,"CLUSTER RESET can't be called with "
"master nodes containing keys");
return;
}
clusterReset(hard);
addReply(c,shared.ok);
} else {
addReplySubcommandSyntaxError(c);
return;
}
}
服务端上层server.c中有processCommand函数,其中对这些命令的转向和分配有相关部分,这里不再详述。
三、总结
集群模式下的优点在于分布式部署,有安全保障,但复杂也在于分布式,探活、故障的迁移和重分片等,都增加了复杂性。在后面将分别对这些展开进行分析。