选举主节点
选举主节点入口方法在node的start方法中
//node启动开始选举master节点
discovery.startInitialJoin();
// Entry point of master election: triggered from Node#start, kicks off the cluster-join thread.
@Override
public void startInitialJoin() {
// start the join thread from a cluster state update. See {@link JoinThreadControl} for details.
synchronized (stateMutex) {
// do the join on a different thread, the caller of this method waits for 30s anyhow till it is discovered
joinThreadControl.startNewThreadIfNotRunning();
}
}
选举任务在线程池Generic中运行
// Starts a new cluster-join task on the generic thread pool unless one is already active.
// Callers must hold stateMutex.
public void startNewThreadIfNotRunning() {
assert Thread.holdsLock(stateMutex);
// a join thread is already running — nothing to do
if (joinThreadActive()) {
return;
}
// submit the join task to the generic thread pool
threadPool.generic().execute(new Runnable() {
@Override
public void run() {
Thread currentThread = Thread.currentThread();
// CAS guards against two submitted tasks racing to become the join thread
if (!currentJoinThread.compareAndSet(null, currentThread)) {
return;
}
// keep trying to join the cluster until success or until told to stop
while (running.get() && joinThreadActive(currentThread)) {
try {
innerJoinCluster();
return;
} catch (Exception e) {
logger.error("unexpected error while joining cluster, trying again", e);
assert ExceptionsHelper.reThrowIfNotNull(e);
}
}
}
});
}
首先判断是否已经有选举任务正在运行,如果有则直接退出;否则提交一个任务到generic线程池中运行,最终调用innerJoinCluster方法
- 选举临时主节点
//选举临时master节点
masterNode = findMaster();
首先查询当前集群中活跃的主节点或者从主节点候选列表中选择新的主节点,如果选举成功则返回。我们分析一下该方法下的几个主要过程
发送ping请求到所有节点
//ping所有节点,并接受返回结果
List<ZenPing.PingResponse> fullPingResponses = pingAndWait(pingTimeout).toList();
if (fullPingResponses == null) {
logger.trace("No full ping responses");
return null;
}
// Runs one pinging round: resolves all seed addresses plus the master nodes from the last
// known cluster state, pings them several times within scheduleDuration, then finishes the
// round so the collected responses reach resultsConsumer.
protected void ping(final Consumer<PingCollection> resultsConsumer,
final TimeValue scheduleDuration,
final TimeValue requestDuration) {
final List<TransportAddress> seedAddresses = new ArrayList<>();
// addresses of all configured seed hosts
seedAddresses.addAll(hostsProvider.getSeedAddresses(createHostsResolver()));
// nodes from the last known cluster state
final DiscoveryNodes nodes = contextProvider.clusterState().nodes();
// add all possible master nodes that were active in the last known cluster configuration
for (ObjectCursor<DiscoveryNode> masterNode : nodes.getMasterNodes().values()) {
seedAddresses.add(masterNode.value.getAddress());
}
final ConnectionProfile connectionProfile =
ConnectionProfile.buildSingleChannelProfile(TransportRequestOptions.Type.REG, requestDuration, requestDuration,
TimeValue.MINUS_ONE, null);
final PingingRound pingingRound = new PingingRound(pingingRoundIdGenerator.incrementAndGet(), seedAddresses, resultsConsumer,
nodes.getLocalNode(), connectionProfile);
// register the round so incoming ping responses can be collected into it
activePingingRounds.put(pingingRound.id(), pingingRound);
final AbstractRunnable pingSender = new AbstractRunnable() {
@Override
public void onFailure(Exception e) {
if (e instanceof AlreadyClosedException == false) {
logger.warn("unexpected error while pinging", e);
}
}
@Override
protected void doRun() throws Exception {
// fire the ping requests for this round
sendPings(requestDuration, pingingRound);
}
};
// first ping attempt runs immediately on the generic thread pool
threadPool.generic().execute(pingSender);
// two more attempts are scheduled at 1/3 and 2/3 of the round duration
threadPool.schedule(pingSender, TimeValue.timeValueMillis(scheduleDuration.millis() / 3), ThreadPool.Names.GENERIC);
threadPool.schedule(pingSender, TimeValue.timeValueMillis(scheduleDuration.millis() / 3 * 2), ThreadPool.Names.GENERIC);
threadPool.schedule(new AbstractRunnable() {
@Override
protected void doRun() throws Exception {
// when the whole round has elapsed, finish it and release its resources
finishPingingRound(pingingRound);
}
@Override
public void onFailure(Exception e) {
logger.warn("unexpected error while finishing pinging round", e);
}
}, scheduleDuration, ThreadPool.Names.GENERIC);
}
根据配置的单播地址构造需要发送请求的nodes,发送ping请求到node
// Builds the set of nodes to ping for this round — the round's seed addresses plus the
// senders of any recently received pings — and sends a ping request to each of them.
protected void sendPings(final TimeValue timeout, final PingingRound pingingRound) {
final ClusterState lastState = contextProvider.clusterState();
final UnicastPingRequest pingRequest = new UnicastPingRequest(pingingRound.id(), timeout, createPingResponse(lastState));
// addresses of nodes that pinged us recently (same-cluster sanity asserted)
List<TransportAddress> temporalAddresses = temporalResponses.stream().map(pingResponse -> {
assert clusterName.equals(pingResponse.clusterName()) :
"got a ping request from a different cluster. expected " + clusterName + " got " + pingResponse.clusterName();
return pingResponse.node().getAddress();
}).collect(Collectors.toList());
final Stream<TransportAddress> uniqueAddresses = Stream.concat(pingingRound.getSeedAddresses().stream(),
temporalAddresses.stream()).distinct();
// resolve what we can via the latest cluster state
final Set<DiscoveryNode> nodesToPing = uniqueAddresses
.map(address -> {
DiscoveryNode foundNode = lastState.nodes().findByAddress(address);
// reuse the known node when a transport connection to it already exists...
if (foundNode != null && transportService.nodeConnected(foundNode)) {
return foundNode;
} else {
// ...otherwise build a lightweight placeholder node from the raw address
return new DiscoveryNode(
address.toString(),
address,
emptyMap(),
emptySet(),
Version.CURRENT.minimumCompatibilityVersion());
}
}).collect(Collectors.toSet());
// fire a ping request at every resolved node
nodesToPing.forEach(node -> sendPingRequestToNode(node, timeout, pingingRound, pingRequest));
}
首先检查与该node是否已经建立过连接,如果已经建立直接返回连接,如果没有则调用transport模块创建连接
// Sends a single ping to the target node: reuses an existing transport connection when
// possible, otherwise opens a temporary one owned by this pinging round.
protected void doRun() throws Exception {
Connection connection = null;
if (transportService.nodeConnected(node)) {
try {
// concurrency can still cause disconnects
connection = transportService.getConnection(node);
} catch (NodeNotConnectedException e) {
logger.trace("[{}] node [{}] just disconnected, will create a temp connection", pingingRound.id(), node);
}
}
// no usable connection yet: open (or reuse) one scoped to the pinging round
if (connection == null) {
connection = pingingRound.getOrConnect(node);
}
logger.trace("[{}] sending to {}", pingingRound.id(), node);
// action "internal:discovery/zen/unicast": send the ping; the response handler stores
// the result in pingingRound
transportService.sendRequest(connection, ACTION_NAME, pingRequest,
TransportRequestOptions.builder().withTimeout((long) (timeout.millis() * 1.25)).build(),
getPingResponseHandler(pingingRound, node));
}
//创建连接
result = transportService.openConnection(node, connectionProfile);
获取所有返回结果,并将本节点结果添加到fullPingResponses中
//本地节点
final DiscoveryNode localNode = transportService.getLocalNode();
// add our selves
assert fullPingResponses.stream().map(ZenPing.PingResponse::node)
.filter(n -> n.equals(localNode)).findAny().isPresent() == false;
//添加本地节点
fullPingResponses.add(new ZenPing.PingResponse(localNode, null, this.clusterState()));
过滤结果,获取只有master角色的节点返回结果
// filter responses 过滤不具有master节点返回的结果
final List<ZenPing.PingResponse> pingResponses = filterPingResponses(fullPingResponses, masterElectionIgnoreNonMasters, logger);
然后会构建两个列表一个是activeMasters存储集群当前活跃master集合,另一个是masterCandidates具有master资格的候选节点列表
//每个节点认为的master的节点集合
List<DiscoveryNode> activeMasters = new ArrayList<>();
for (ZenPing.PingResponse pingResponse : pingResponses) {
if (pingResponse.master() != null && !localNode.equals(pingResponse.master())) {
activeMasters.add(pingResponse.master());
}
}
activeMasters列表中保存了每个节点认为的当前master节点,正常情况下只有一个,这里不含本节点
List<ElectMasterService.MasterCandidate> masterCandidates = new ArrayList<>();
for (ZenPing.PingResponse pingResponse : pingResponses) {
//记录可以作为master节点的节点
if (pingResponse.node().isMasterNode()) {
masterCandidates.add(new ElectMasterService.MasterCandidate(pingResponse.node(), pingResponse.getClusterStateVersion()));
}
}
过滤掉不具有master资格的节点,构建候选master节点集合,如果上面的activeMasters列表为空则从masterCandidates选举主节点
// no active master was reported by any node: run a full election among the candidates
if (activeMasters.isEmpty()) {
// proceed only with a quorum (minimumMasterNodes) of master-eligible candidates
if (electMaster.hasEnoughCandidates(masterCandidates)) {
// elect the winner: newest cluster state version first, ties broken by smallest node id
final ElectMasterService.MasterCandidate winner = electMaster.electMaster(masterCandidates);
logger.trace("candidate {} won election", winner);
// the winner becomes the (temporary) master
return winner.getNode();
} else {
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
logger.warn("not enough master nodes discovered during pinging (found [{}], but needed [{}]), pinging again",
masterCandidates, electMaster.minimumMasterNodes());
return null;
}
} else {
assert !activeMasters.contains(localNode) :
"local node should never be elected as master when other nodes indicate an active master";
// lets tie break between discovered nodes
// some nodes already see a master: tie-break among the reported active masters
return electMaster.tieBreakActiveMasters(activeMasters);
}
如果当前没有主节点,则从候选节点中选择主节点。首先判断当前法定人数是否达到,正常情况是master节点数/2+1
/**
 * Decides whether the discovered master candidates constitute a quorum,
 * i.e. at least {@code minimumMasterNodes} distinct master-eligible nodes.
 */
public boolean hasEnoughCandidates(Collection<MasterCandidate> candidates) {
    // no candidates at all: an election is impossible
    if (candidates.isEmpty()) {
        return false;
    }
    // quorum not configured (< 1): a single candidate suffices
    if (minimumMasterNodes < 1) {
        return true;
    }
    // sanity check: the same node must not appear twice among the candidates
    assert candidates.stream().map(MasterCandidate::getNode).distinct().count() == candidates.size() :
        "duplicates ahead: " + candidates;
    return candidates.size() >= minimumMasterNodes;
}
如果当前候选节点达到法定人数则从中选择一个作为master
/**
 * Picks the winning master from a quorum of candidates: the candidate carrying the
 * newest cluster state version wins; ties are broken by the smallest node id.
 */
public MasterCandidate electMaster(Collection<MasterCandidate> candidates) {
    // double-check the quorum the caller should already have verified
    assert hasEnoughCandidates(candidates);
    // rank best-first: higher cluster state version first, then ascending node id
    List<MasterCandidate> ranked = new ArrayList<>(candidates);
    ranked.sort(MasterCandidate::compare);
    // the top-ranked candidate is the elected master
    return ranked.get(0);
}
这里可以看出只是将所有候选节点进行排序选择一个最小节点作为master,并且自己实现了compare方法
/**
 * Orders two candidates best-first: the one holding the newer (larger) cluster state
 * version sorts earlier; equal versions fall back to the node ordering.
 */
public static int compare(MasterCandidate c1, MasterCandidate c2) {
    // operands reversed so a larger (newer) version yields a negative (earlier) result
    final int byVersion = Long.compare(c2.clusterStateVersion, c1.clusterStateVersion);
    // identical versions: master-eligible first, then ascending node id
    return byVersion != 0 ? byVersion : compareNodes(c1.getNode(), c2.getNode());
}
首先会比较两个候选节点的集群状态版本信息,如果版本信息比较新则排在前面,如果版本号相同则需要比较两个节点的节点id,然后选择排在最前面的一个节点为master
如果activeMasters列表不为空则从中选择一个作为master
//已经选出了master,从返回的master中选择一个id最小的节点
return electMaster.tieBreakActiveMasters(activeMasters);
// Tie-breaks among the masters reported by other nodes: picks the minimum under
// compareNodes (master-eligible nodes first, then smallest node id).
// NOTE(review): assumes activeMasters is non-empty — .get() would otherwise throw.
public DiscoveryNode tieBreakActiveMasters(Collection<DiscoveryNode> activeMasters) {
return activeMasters.stream().min(ElectMasterService::compareNodes).get();
}
/**
 * Node ordering used for tie breaking: master-eligible nodes sort before non-eligible
 * ones; within the same eligibility the lexicographically smaller node id wins.
 */
private static int compareNodes(DiscoveryNode o1, DiscoveryNode o2) {
    final boolean firstIsMaster = o1.isMasterNode();
    final boolean secondIsMaster = o2.isMasterNode();
    // differing eligibility: the master-eligible node comes first
    if (firstIsMaster != secondIsMaster) {
        return firstIsMaster ? -1 : 1;
    }
    // same eligibility: ascending order of node ids
    return o1.getId().compareTo(o2.getId());
}
activeMasters集合一般都是具有master资格的节点,这里可以看到只是简单的获取id小的节点作为master
- 本节点为临时master
// the elected (temporary) master is the local node itself
if (transportService.getLocalNode().equals(masterNode)) {
// number of additional master-eligible nodes that must join (i.e. acknowledge this
// node as master); the local node counts as one, hence minimumMasterNodes - 1
final int requiredJoins = Math.max(0, electMaster.minimumMasterNodes() - 1); // we count as one
logger.debug("elected as master, waiting for incoming joins ([{}] needed)", requiredJoins);
// wait until a quorum of nodes has joined before finalizing the election
nodeJoinController.waitToBeElectedAsMaster(requiredJoins, masterElectionWaitForJoinsTimeout,
new NodeJoinController.ElectionCallback() {
@Override
public void onElectedAsMaster(ClusterState state) {
// election succeeded: mark the join thread as finished
synchronized (stateMutex) {
joinThreadControl.markThreadAsDone(currentThread);
}
}
@Override
public void onFailure(Throwable t) {
logger.trace("failed while waiting for nodes to join, rejoining", t);
synchronized (stateMutex) {
joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
}
}
}
);
}
获取需要的投票数量,然后等待投票数量超过法定数量则选举成功,如果达到超时时间没有得到足够选票则失败
// Blocks (via a CountDownLatch) until either requiredMasterJoins master-eligible nodes
// have joined and the local node is elected master, or timeValue elapses, in which case
// this election attempt is failed.
public void waitToBeElectedAsMaster(int requiredMasterJoins, TimeValue timeValue, final ElectionCallback callback) {
final CountDownLatch done = new CountDownLatch(1);
// wrapper releases the latch before delegating, so the waiter below unblocks on any outcome
final ElectionCallback wrapperCallback = new ElectionCallback() {
@Override
public void onElectedAsMaster(ClusterState state) {
done.countDown();
callback.onElectedAsMaster(state);
}
@Override
public void onFailure(Throwable t) {
done.countDown();
callback.onFailure(t);
}
};
ElectionContext myElectionContext = null;
try {
// check what we have so far..
// capture the context we add the callback to make sure we fail our own
synchronized (this) {
assert electionContext != null : "waitToBeElectedAsMaster is called we are not accumulating joins";
myElectionContext = electionContext;
// register the required join count and the wrapped callback on the election context
electionContext.onAttemptToBeElected(requiredMasterJoins, wrapperCallback);
// if enough joins have already accumulated, become master immediately and
// publish the new cluster state to the other nodes
checkAndElectIfNeeded();
}
try {
if (done.await(timeValue.millis(), TimeUnit.MILLISECONDS)) {
// callback handles everything
return;
}
} catch (InterruptedException e) {
// NOTE(review): the interrupt is swallowed and the thread is not re-interrupted;
// the timeout path below then fails the context — confirm this is intentional
}
if (logger.isTraceEnabled()) {
final int pendingNodes = myElectionContext.getPendingMasterJoinsCount();
logger.trace("timed out waiting to be elected. waited [{}]. pending master node joins [{}]", timeValue, pendingNodes);
}
// not enough joins arrived in time: fail this election attempt
failContextIfNeeded(myElectionContext, "timed out waiting to be elected");
} catch (Exception e) {
logger.error("unexpected failure while waiting for incoming joins", e);
if (myElectionContext != null) {
failContextIfNeeded(myElectionContext, "unexpected failure while waiting for pending joins [" + e.getMessage() + "]");
}
}
}
这里可以看到是通过CountDownLatch进行控制,阻塞等待一定时间。当选举完成会发布集群状态,然后执行回调,再执行wrapperCallback方法,执行countDown表示选举结束,上面阻塞的程序继续往下走
// Listener invoked once the election's cluster state update has been processed:
// reports success when the local node ended up as master, failure otherwise.
private final ClusterStateTaskListener electionFinishedListener = new ClusterStateTaskListener() {
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
// the published state confirms the local node as master
if (newState.nodes().isLocalNodeElectedMaster()) {
ElectionContext.this.onElectedAsMaster(newState);
} else {
onFailure(source, new NotMasterException("election stopped [" + source + "]"));
}
}
@Override
public void onFailure(String source, Exception e) {
ElectionContext.this.onFailure(e);
}
};
}
- 查看投票结果
投票其实就是加入请求数量,在ZenDiscovery的构造函数中会创建membership对象处理node加入、离开和校验
this.membership = new MembershipAction(transportService, new MembershipListener(), onJoinValidators);
// Wires up the membership transport handlers: node join, join validation and node leave.
public MembershipAction(TransportService transportService, MembershipListener listener,
Collection<BiConsumer<DiscoveryNode,ClusterState>> joinValidators) {
this.transportService = transportService;
this.listener = listener;
// handler for join requests ("internal:discovery/zen/join")
transportService.registerRequestHandler(DISCOVERY_JOIN_ACTION_NAME,
ThreadPool.Names.GENERIC, JoinRequest::new, new JoinRequestRequestHandler());
// handler that validates an incoming join before it is accepted
transportService.registerRequestHandler(DISCOVERY_JOIN_VALIDATE_ACTION_NAME,
ThreadPool.Names.GENERIC, ValidateJoinRequest::new,
new ValidateJoinRequestRequestHandler(transportService::getLocalNode, joinValidators));
// handler for nodes leaving the cluster
transportService.registerRequestHandler(DISCOVERY_LEAVE_ACTION_NAME,
ThreadPool.Names.GENERIC, LeaveRequest::new, new LeaveRequestRequestHandler());
}
当join请求到来时会在JoinRequestRequestHandler进行处理
// Transport handler for incoming join requests: forwards the joining node to the
// listener and acknowledges (or fails) the request over the transport channel.
private class JoinRequestRequestHandler implements TransportRequestHandler<JoinRequest> {
@Override
public void messageReceived(final JoinRequest request, final TransportChannel channel, Task task) throws Exception {
listener.onJoin(request.getNode(), new JoinCallback() {
@Override
public void onSuccess() { // join accepted
try {
// acknowledge with an empty response
channel.sendResponse(TransportResponse.Empty.INSTANCE);
} catch (Exception e) {
onFailure(e);
}
}
@Override
public void onFailure(Exception e) {
try {
channel.sendResponse(e);
} catch (Exception inner) {
inner.addSuppressed(e);
logger.warn("failed to send back failure on join request", inner);
}
}
});
}
}
然后调用handleJoinRequest方法
// Validates and accepts an incoming join request; on success hands it to the
// nodeJoinController where it counts as a vote during an election.
void handleJoinRequest(final DiscoveryNode node, final ClusterState state, final MembershipAction.JoinCallback callback) {
if (nodeJoinController == null) {
throw new IllegalStateException("discovery module is not yet started");
} else {
// we do this in a couple of places including the cluster update thread. This one here is really just best effort
// to ensure we fail as fast as possible.
onJoinValidators.stream().forEach(a -> a.accept(node, state));
if (state.getBlocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
JoinTaskExecutor.ensureMajorVersionBarrier(node.getVersion(), state.getNodes().getMinNodeVersion());
}
// try and connect to the node, if it fails, we can raise an exception back to the client...
transportService.connectToNode(node);
// validate the join request, will throw a failure if it fails, which will get back to the
// node calling the join request
try {
membership.sendValidateJoinRequestBlocking(node, state, joinTimeout);
} catch (Exception e) {
logger.warn(() -> new ParameterizedMessage("failed to validate incoming join request from node [{}]", node),
e);
callback.onFailure(new IllegalStateException("failure when sending a validation request to node", e));
return;
}
// hand the join over to the controller for vote accumulation
nodeJoinController.handleJoinRequest(node, callback);
}
}
// Routes an incoming join: during an active election it is accumulated as a vote,
// otherwise it is submitted directly as a node-join cluster state update.
public synchronized void handleJoinRequest(final DiscoveryNode node, final MembershipAction.JoinCallback callback) {
// an election is in progress: accumulate the join and re-check the quorum
if (electionContext != null) {
electionContext.addIncomingJoin(node, callback);
checkPendingJoinsAndElectIfNeeded();
} else {
// no election running: submit a plain node-join cluster state update
masterService.submitStateUpdateTask("zen-disco-node-join",
new JoinTaskExecutor.Task(node, "no election context"), ClusterStateTaskConfig.build(Priority.URGENT),
joinTaskExecutor, new JoinTaskListener(callback, logger));
}
}
累积加入请求数量及投票数量
// Accumulates a join request (a "vote") from the given node, keeping its callback so the
// node can be answered once the election completes.
public synchronized void addIncomingJoin(DiscoveryNode node, MembershipAction.JoinCallback callback) {
ensureOpen();
joinRequestAccumulator.computeIfAbsent(node, n -> new ArrayList<>()).add(callback);
}
检查是否达到法定数量
// Checks whether enough master-eligible joins have accumulated; if so the local node
// wins the election and becomes master.
private synchronized void checkPendingJoinsAndElectIfNeeded() {
assert electionContext != null : "election check requested but no active context";
// number of pending joins from master-eligible nodes
final int pendingMasterJoins = electionContext.getPendingMasterJoinsCount();
// quorum not yet reached: keep waiting for more joins
if (electionContext.isEnoughPendingJoins(pendingMasterJoins) == false) {
if (logger.isTraceEnabled()) {
logger.trace("not enough joins for election. Got [{}], required [{}]", pendingMasterJoins,
electionContext.requiredMasterJoins);
}
} else {
if (logger.isTraceEnabled()) {
logger.trace("have enough joins for election. Got [{}], required [{}]", pendingMasterJoins,
electionContext.requiredMasterJoins);
}
// election won: become master and publish the new cluster state
electionContext.closeAndBecomeMaster();
electionContext = null; // clear this out so future joins won't be accumulated
}
}
如果达到法定数量,则选举结束向集群提交一个集群状态变更任务,广播自己成为master
// Finalizes a won election: closes the context and submits a batched cluster state update
// that makes the local node master and answers all pending join requests.
public synchronized void closeAndBecomeMaster() {
assert callback != null : "becoming a master but the callback is not yet set";
assert isEnoughPendingJoins(getPendingMasterJoinsCount()) : "becoming a master but pending joins of "
+ getPendingMasterJoinsCount() + " are not enough. needs [" + requiredMasterJoins + "];";
// mark the election as closed
innerClose();
Map<JoinTaskExecutor.Task, ClusterStateTaskListener> tasks = getPendingAsTasks("become master");
// source string of the cluster state update announcing the new master
final String source = "zen-disco-elected-as-master ([" + tasks.size() + "] nodes joined)";
// noop listener, the election finished listener determines result
tasks.put(JoinTaskExecutor.newBecomeMasterTask(), (source1, e) -> {});
// the finish-election task carries the listener that reports the final outcome
tasks.put(JoinTaskExecutor.newFinishElectionTask(), electionFinishedListener);
// submit the batched update that publishes the local node as master
masterService.submitStateUpdateTasks(source, tasks, ClusterStateTaskConfig.build(Priority.URGENT), joinTaskExecutor);
}
- 其他节点为临时master
//选举出来的master节点不是自己
//停止选主服务,改为加入主节点
nodeJoinController.stopElectionContext(masterNode + " elected");
//发送加入master节点的请求
final boolean success = joinElectedMaster(masterNode);
首先停止选master服务,改为发送加入集群请求
// Connects to the elected master and sends it a blocking join request, retrying while the
// master is not ready yet; returns false when joining failed. (excerpt — body abbreviated
// with "..." in the original article)
private boolean joinElectedMaster(DiscoveryNode masterNode) {
try {
// first, make sure we can connect to the master
transportService.connectToNode(masterNode);
} catch (Exception e) {
logger.warn(() -> new ParameterizedMessage("failed to connect to master [{}], retrying...", masterNode), e);
return false;
}
int joinAttempt = 0; // we retry on illegal state if the master is not yet ready
while (true) {
try {
logger.trace("joining master {}", masterNode);
// send the blocking join request to the master
membership.sendJoinRequestBlocking(masterNode, transportService.getLocalNode(), joinTimeout);
return true;
} catch (Exception e) {
final Throwable unwrap = ExceptionsHelper.unwrapCause(e);
// NotMasterException: the master may not be ready yet, retry up to joinRetryAttempts
if (unwrap instanceof NotMasterException) {
if (++joinAttempt == this.joinRetryAttempts) {
...
} else {
...
return false;
}
}
try {
Thread.sleep(this.joinRetryDelay.millis());
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
}
发送加入集群请求
// Sends the join request ("internal:discovery/zen/join") to the master and blocks until
// it is acknowledged or the timeout expires.
public void sendJoinRequestBlocking(DiscoveryNode masterNode, DiscoveryNode node, TimeValue timeout) {
transportService.submitRequest(masterNode, DISCOVERY_JOIN_ACTION_NAME, new JoinRequest(node),
EmptyTransportResponseHandler.INSTANCE_SAME).txGet(timeout.millis(), TimeUnit.MILLISECONDS);
}
加入请求处理就在JoinRequestRequestHandler中
最后检查加入集群是否成功,如果成功则检查集群状态中的master节点是否存在,不存在需要重新选举;然后比较选择的master节点和自己加入的节点是否相同,不相同则加入新发布的master;如果选择的master节点是本节点加入的节点则设置选举结束。如果加入集群失败则需要重新选举。
到这里集群已经选择出来了master节点,其他节点已经加入集群。那如何保证master到非master节点的连接是否存活,相反非master到master的连接是否存活需要用到两个探测器MasterFaultDetection和NodesFaultDetection
//检查主节点是否失效
this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this::clusterState, masterService, clusterName);
//添加监听器
this.masterFD.addListener(new MasterNodeFailureListener());
this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, this::clusterState, clusterName);
//添加监听器
this.nodesFD.addListener(new NodeFaultDetectionListener());
MasterFaultDetection会在非master节点运行定时任务,默认每隔1s发送一次"internal:discovery/zen/fd/master_ping"请求,如果失败则需要重新加入集群,执行一次选主流程
// Bridges master fault detection to the discovery layer: a failed master triggers
// handleMasterGone (and with it a re-election).
private class MasterNodeFailureListener implements MasterFaultDetection.Listener {
@Override
public void onMasterFailure(DiscoveryNode masterNode, Throwable cause, String reason) {
handleMasterGone(masterNode, cause, reason);
}
}
// Called when pinging the master failed: discard the old master's pending cluster states
// and rejoin, which starts a fresh election. (excerpt — body abbreviated with "...")
private void handleMasterGone(final DiscoveryNode masterNode, final Throwable cause, final String reason) {
...
synchronized (stateMutex) {
if (localNodeMaster() == false && masterNode.equals(committedState.get().nodes().getMasterNode())) {
// flush any pending cluster states from old master, so it will not be set as master again
pendingStatesQueue.failAllStatesAndClear(new ElasticsearchException("master left [{}]", reason));
// rejoin the cluster, triggering a new master election
rejoin("master left (reason = " + reason + ")");
}
}
}
NodesFaultDetection运行在master节点,master也是定时发送心跳ping请求给非master节点,如果发送失败则先重试,重试达到一定次数调用notifyNodeFailure,如果返回连接错误则调用handleTransportDisconnect(node)
// Master-side heartbeat: pings a node, reschedules itself on success, retries on failure
// and removes the node after pingRetryCount failed attempts (or immediately on disconnect).
transportService.sendRequest(node, PING_ACTION_NAME, newPingRequest(), options, new TransportResponseHandler<PingResponse>() {
@Override
public PingResponse read(StreamInput in) throws IOException {
return new PingResponse(in);
}
@Override
public void handleResponse(PingResponse response) {
if (!running()) {
return;
}
// successful ping: reset the retry counter and schedule the next heartbeat
retryCount = 0;
threadPool.schedule(NodeFD.this, pingInterval, ThreadPool.Names.SAME);
}
@Override
public void handleException(TransportException exp) {
if (!running()) {
return;
}
// connection-level failure: handle the disconnect right away, no retries
if (exp instanceof ConnectTransportException || exp.getCause() instanceof ConnectTransportException) {
handleTransportDisconnect(node);
return;
}
retryCount++;
logger.trace( () -> new ParameterizedMessage(
"[node ] failed to ping [{}], retry [{}] out of [{}]", node, retryCount, pingRetryCount), exp);
if (retryCount >= pingRetryCount) {
logger.debug("[node ] failed to ping [{}], tried [{}] times, each with maximum [{}] timeout", node,
pingRetryCount, pingRetryTimeout);
// not good, failure: retry budget exhausted, remove the node and notify listeners
if (nodesFD.remove(node, NodeFD.this)) {
notifyNodeFailure(node, "failed to ping, tried [" + pingRetryCount + "] times, each with maximum ["
+ pingRetryTimeout + "] timeout");
}
} else {
// resend the request, not reschedule, rely on send timeout
transportService.sendRequest(node, PING_ACTION_NAME, newPingRequest(), options, this);
}
}
@Override
public String executor() {
return ThreadPool.Names.SAME;
}
});
最终都会调用NodeFaultDetectionListener的onNodeFailure方法
// Invoked (on the master) when a node is deemed failed: removes it from the cluster state.
private void handleNodeFailure(final DiscoveryNode node, final String reason) {
if (lifecycleState() != Lifecycle.State.STARTED) {
// not started, ignore a node failure
return;
}
if (!localNodeMaster()) {
// only the master is responsible for removing failed nodes
return;
}
// pings to the node failed: remove it via a cluster state update
removeNode(node, "zen-disco-node-failed", reason);
}
// Submits an IMMEDIATE-priority cluster state update task that removes the given node.
private void removeNode(final DiscoveryNode node, final String source, final String reason) {
masterService.submitStateUpdateTask(
source + "(" + node + "), reason(" + reason + ")",
new NodeRemovalClusterStateTaskExecutor.Task(node, reason),
ClusterStateTaskConfig.build(Priority.IMMEDIATE),
nodeRemovalExecutor,
nodeRemovalExecutor);
}
// Executes batched node-removal tasks: drops every node that still exists in the current
// state and builds the resulting cluster state.
public ClusterTasksResult<Task> execute(final ClusterState currentState, final List<Task> tasks) throws Exception {
final DiscoveryNodes.Builder remainingNodesBuilder = DiscoveryNodes.builder(currentState.nodes());
boolean removed = false;
for (final Task task : tasks) {
if (currentState.nodes().nodeExists(task.node())) {
remainingNodesBuilder.remove(task.node());
removed = true;
} else {
logger.debug("node [{}] does not exist in cluster state, ignoring", task);
}
}
if (!removed) {
// no nodes to remove, keep the current cluster state
return ClusterTasksResult.<Task>builder().successes(tasks).build(currentState);
}
final ClusterState remainingNodesClusterState = remainingNodesClusterState(currentState, remainingNodesBuilder);
return getTaskClusterTasksResult(currentState, tasks, remainingNodesClusterState);
}
// After removals, verifies the remaining master-eligible nodes still form a quorum; if
// not, the master steps down and rejoins the cluster to avoid a split brain.
@Override
protected ClusterTasksResult<Task> getTaskClusterTasksResult(ClusterState currentState, List<Task> tasks,
ClusterState remainingNodesClusterState) {
// quorum lost: abandon mastership and trigger a rejoin instead of publishing the state
if (electMasterService.hasEnoughMasterNodes(remainingNodesClusterState.nodes()) == false) {
final ClusterTasksResult.Builder<Task> resultBuilder = ClusterTasksResult.<Task>builder().successes(tasks);
final int masterNodes = electMasterService.countMasterNodes(remainingNodesClusterState.nodes());
rejoin.accept(LoggerMessageFormat.format("not enough master nodes (has [{}], but needed [{}])",
masterNodes, electMasterService.minimumMasterNodes()));
return resultBuilder.build(currentState);
} else {
return super.getTaskClusterTasksResult(currentState, tasks, remainingNodesClusterState);
}
}
检查当前剩余节点数量是否达到法定数量,如果不足需要放弃master身份,重新加入集群,避免脑裂发生