一、ZK集群间通信类型
zookeeper集群间通信大体可分为四类:
数据同步型,服务器初始化型,请求处理型,会话管理型。
数据同步型(leader和follower通信的端口号: 2888)
服务器初始化型(leader和follower通信的端口号: 2888)
请求处理型(client的端口号: 2181,NIO/Netty)
会话管理型(leader和follower通信的端口号: 2888)
二、ZK常用的几个端口号:
接收客户请求的端口号: 2181(NIO/Netty)
leader和follower通信的端口号: 2888
选举leader时通信的端口号: 3888(BIO)
其他服务与监控中心通信端口: 7070
三、ZK服务端启动流程
ZooKeeper服务端通过启动脚本 ./zkServer.sh start 启动(注意:-server ip:port 是客户端脚本 zkCli.sh 的参数,而不是 zkServer.sh 的参数),打开脚本可以看到服务端启动入口:org.apache.zookeeper.server.quorum.QuorumPeerMain
服务启动的调用流程如下:
QuorumPeerMain#main
->QuorumPeerMain#initializeAndRun
->QuorumPeerMain#runFromConfig
->QuorumPeer#run
QuorumPeer#run的核心代码如下:
// Excerpt of the QuorumPeer#run main loop. Each pass dispatches on the peer's
// current state. The OBSERVING, FOLLOWING and LEADING branches always reset
// the state to LOOKING in their finally blocks, so the next pass goes back to
// leader election.
while (running) {
switch (getPeerState()) {
case LOOKING:
// This node is still looking for a leader, so run leader election.
LOG.info("LOOKING");
try {
setBCVote(null);
// The vote returned by the election strategy becomes this node's current vote.
setCurrentVote(makeLEStrategy().lookForLeader());
} catch (Exception e) {
// The election failed on this node: reset the state to LOOKING so the next
// loop iteration starts a new election round.
LOG.warn("Unexpected exception", e);
setPeerState(ServerState.LOOKING);
}
break;
case OBSERVING:
// This node is an observer: connect to the leader and stay in sync with it.
try {
LOG.info("OBSERVING");
setObserver(makeObserver(logFactory));
observer.observeLeader();
} catch (Exception e) {
LOG.warn("Unexpected exception",e );
} finally {
// observeLeader() returned or threw: tear the observer down and reset the
// state to LOOKING so the next loop iteration starts a new election round.
observer.shutdown();
setObserver(null);
setPeerState(ServerState.LOOKING);
}
break;
case FOLLOWING:
// This node is a follower: connect to the leader and stay in sync with it.
try {
LOG.info("FOLLOWING");
setFollower(makeFollower(logFactory));
follower.followLeader();
} catch (Exception e) {
LOG.warn("Unexpected exception",e);
} finally {
// Either this node hit a problem or the leader went away: tear the follower
// down and reset the state to LOOKING so the next loop iteration starts a
// new election round.
follower.shutdown();
setFollower(null);
setPeerState(ServerState.LOOKING);
}
break;
case LEADING:
// This node is the leader: build the Leader object and run leader.lead().
LOG.info("LEADING");
try {
setLeader(makeLeader(logFactory));
// Per the article, while the leader is alive lead() periodically sends ping
// requests to the followers as a heartbeat.
leader.lead();
setLeader(null);
} catch (Exception e) {
LOG.warn("Unexpected exception",e);
} finally {
// Presumably leader is still non-null only when the try block did not reach
// setLeader(null), i.e. lead() threw - verify against setLeader.
if (leader != null) {
leader.shutdown("Forcing shutdown");
setLeader(null);
}
// Leadership ended: reset the state to LOOKING so the next loop iteration
// starts a new election round.
setPeerState(ServerState.LOOKING);
}
break;
}
}
四、选主流程
FastLeaderElection#lookForLeader核心代码如下:
/**
 * Abridged excerpt of FastLeaderElection#lookForLeader: proposes an initial vote,
 * broadcasts it, then processes incoming notifications until a leader can be
 * declared.
 *
 * NOTE(review): this excerpt is shortened and does not compile as shown - the
 * try block has no visible catch/finally and the method's closing brace is
 * missing. Also, n is dereferenced right after poll() without a visible null
 * check; if recvqueue is a BlockingQueue, poll() returns null on timeout -
 * confirm against the full source.
 *
 * @return the final vote when the election ends, or null when the loop exits
 *         because the peer is no longer LOOKING or stop is set
 * @throws InterruptedException presumably when interrupted while waiting on
 *         recvqueue - confirm against the full source
 */
public Vote lookForLeader() throws InterruptedException {
try {
// Votes received from peers in the current election round, keyed by sender sid.
HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
// Votes reported by peers that are already FOLLOWING or LEADING, keyed by sender sid.
HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
int notTimeout = finalizeWait;
// 1. Initialize our own vote: bump the election round and propose this
// node's initial id, last logged zxid and peer epoch.
synchronized(this){
logicalclock++;
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
// 2. Broadcast the vote.
sendNotifications();
/*
* Loop in which we exchange notifications until we find a leader
*/
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){
// 3. Receive a vote from another peer and process it.
/*
* Remove next notification from queue, times out after 2 times
* the termination time
*/
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Only proceed if the vote comes from a replica in the
* voting view.
*/
switch (n.state) {
case LOOKING:
// If the incoming vote's election round is newer than ours
// (n.electionEpoch > logicalclock), adopt that round, discard every vote
// collected so far (recvset.clear()), compare the incoming vote against
// this node's initial vote, and broadcast whichever proposal wins.
// If notification > current, replace and send messages out
if (n.electionEpoch > logicalclock) {
logicalclock = n.electionEpoch;
recvset.clear();
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
sendNotifications();
// The incoming vote belongs to an older election round than ours: ignore it.
} else if (n.electionEpoch < logicalclock) {
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock));
}
break;
// Same election round: compare the incoming vote with our current proposal
// and adopt and rebroadcast it if it wins.
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
// Record the sender's vote, then check whether our current proposal can end
// the election (termPredicate presumably tests for quorum support - verify).
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock, proposedEpoch))) {
// Verify if there is any change in the proposed leader
// Keep polling for up to finalizeWait ms per notification; if one beats the
// current proposal, put it back on the queue and return to the main loop.
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
if (n == null) {
// Become LEADING if the proposed leader is this node, otherwise take the
// state returned by learningState().
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(proposedLeader,
proposedZxid,
logicalclock,
proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
break;
case OBSERVING:
// Notifications from observers are only logged.
LOG.debug("Notification from observer: " + n.sid);
break;
case FOLLOWING:
case LEADING:
// The sender is not electing (it is FOLLOWING or LEADING). Two checks follow:
// a. If it is in the same election round, store its vote in recvset and let
// ooePredicate decide whether the election can end; per the article this
// checks that more than half of the servers back the claimed leader. If so,
// set the peer state and leave the election.
/*
* Consider all notifications from the same epoch
* together.
*/
if(n.electionEpoch == logicalclock){
recvset.put(n.sid, new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch));
if(ooePredicate(recvset, outofelection, n)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
// b. Otherwise the message comes from a different election round, which means
// another election has already produced a result. Add it to outofelection and
// use that set to decide whether the election can end; if so, adopt the
// sender's election round, set the peer state and leave the election.
// Note: as written, this step also runs when the rounds match but the check
// in (a) did not end the election.
/*
* Before joining an established ensemble, verify
* a majority is following the same leader.
*/
outofelection.put(n.sid, new Vote(n.version,
n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch,
n.state));
if(ooePredicate(outofelection, outofelection, n)) {
synchronized(this){
logicalclock = n.electionEpoch;
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
n.state, n.sid);
break;
}
}
// The loop exited without a decision: the peer left LOOKING or stop was set.
return null;
}
五、Leader节点挂掉之后,自动重新选主
1、Leader存活时,周期性的向Follower节点发送ping命令
// Excerpt of the leader's heartbeat loop; the ******* lines mark code the
// article left out.
while (true) {
*******
// Heartbeat interval: sleep for half a tick between rounds.
Thread.sleep(self.tickTime / 2);
if (!tickSkip) {
self.tick++;
}
*******
for (LearnerHandler f : getLearners()) {
// Synced set is used to check we have a supporting quorum, so only
// PARTICIPANT, not OBSERVER, learners should be used
if (f.synced() && f.getLearnerType() == LearnerType.PARTICIPANT) {
syncedSet.add(f.getSid());
}
// Send the heartbeat (ping) to this learner; it is sent to every learner,
// whether or not it was added to syncedSet.
f.ping();
}
*******
}
2、Follower节点会通过socket在while循环中不断接收leader传来的心跳信息
// Follower side: while running, repeatedly read the next packet into qp and
// process it. Per the article these packets include the leader's heartbeats.
// The same QuorumPacket instance is reused for every read.
QuorumPacket qp = new QuorumPacket();
while (this.isRunning()) {
readPacket(qp);
processPacket(qp);
}
3、如果Leader挂掉,Follower节点接收不到消息,会抛异常,然后在finally中更新Follower状态为 LOOKING,由于Leader向所有的Follower都发送ping命令,所以如果Leader节点挂掉,所有节点都会变为LOOKING,会重新进行选举!
// Excerpt of the FOLLOWING and LEADING branches of the QuorumPeer#run switch.
case FOLLOWING:
try {
LOG.info("FOLLOWING");
setFollower(makeFollower(logFactory));
follower.followLeader();
} catch (Exception e) {
LOG.warn("Unexpected exception",e);
} finally {
follower.shutdown();
setFollower(null);
// If the leader goes down, every node ends up back in LOOKING and a new
// election takes place.
setPeerState(ServerState.LOOKING);
}
break;
case LEADING:
LOG.info("LEADING");
try {
setLeader(makeLeader(logFactory));
leader.lead();
setLeader(null);
} catch (Exception e) {
LOG.warn("Unexpected exception",e);
} finally {
// Presumably leader is still non-null only when the try block did not reach
// setLeader(null) - verify against setLeader.
if (leader != null) {
leader.shutdown("Forcing shutdown");
setLeader(null);
}
// If the leader goes down, every node ends up back in LOOKING and a new
// election takes place.
setPeerState(ServerState.LOOKING);
}
break;
六、ZK各节点之间选票收发通信逻辑
ZK在选票收发时,采用多线程、多级队列的方式进行处理。作为基础架构,这样的设计充分考虑了性能和解耦。之所以使用多个队列/线程,并让每个队列/线程绑定一个远程节点sid,是为了保证各线程从队列中取数据时互不影响:每个节点对应的线程只从该节点的队列中获取选票信息,从而防止某个节点出了问题导致所有节点跟着阻塞的情况发生。
如下图: