Zookeeper源码分析

一、ZK集群间通信类型

zookeeper集群间通信大体可分为四类:
数据同步型,服务器初始化型,请求处理型,会话管理型。

数据同步型:(leader和follower通信的端口号: 2888)
在这里插入图片描述
服务器初始化型(leader和follower通信的端口号: 2888)
在这里插入图片描述
请求处理型(client的端口号: 2181,NIO/Netty)
在这里插入图片描述
会话管理型(leader和follower通信的端口号: 2888)
在这里插入图片描述

二、ZK常用的几个端口号:

接收客户请求的端口号: 2181(NIO/Netty)
leader和follower通信的端口号: 2888
选举leader时通信的端口号: 3888(BIO)
其他服务与监控中心通信端口: 7070

三、ZK服务端启动流程

ZooKeeper服务端通过启动脚本 ./zkServer.sh start 启动(-server ip:port 是客户端脚本 zkCli.sh 的参数,不属于服务端启动命令),打开脚本可以看到服务端启动入口:org.apache.zookeeper.server.quorum.QuorumPeerMain

服务启动的调用流程如下:

QuorumPeerMain#main
	->QuorumPeerMain#initializeAndRun
		->QuorumPeerMain#runFromConfig
			->QuorumPeer#run

QuorumPeer#run的核心代码如下:

// Core loop of QuorumPeer#run: while `running` is set, dispatch on the peer's
// current state. Every branch ends by putting the peer back into LOOKING, so
// the next iteration starts a new election.
while (running) {
	switch (getPeerState()) {
	case LOOKING:
		// This peer is still electing: run leader election next.
		LOG.info("LOOKING");
		try {
			setBCVote(null);
			setCurrentVote(makeLEStrategy().lookForLeader());
		} catch (Exception e) {
		// Something went wrong on this peer: reset the state to LOOKING so the
		// next loop iteration starts a new election round.
			LOG.warn("Unexpected exception", e);
			setPeerState(ServerState.LOOKING);
		}
		break;
	case OBSERVING:
		// This peer is an observer: connect to the leader and stay in sync with it.
		try {
			LOG.info("OBSERVING");
			setObserver(makeObserver(logFactory));
			observer.observeLeader();
		} catch (Exception e) {
			LOG.warn("Unexpected exception",e );                        
		} finally {
		// Runs whether observeLeader() returned or threw: tear down the observer
		// and reset the state to LOOKING so the next iteration runs a new election.
			observer.shutdown();
			setObserver(null);
			setPeerState(ServerState.LOOKING);
		}
		break;
	case FOLLOWING:
		// This peer is a follower: connect to the leader and stay in sync with it.
		try {
			LOG.info("FOLLOWING");
			setFollower(makeFollower(logFactory));
			follower.followLeader();
		} catch (Exception e) {
			LOG.warn("Unexpected exception",e);
		} finally {
		// This peer hit a problem or the leader went down: tear down the follower
		// and reset the state to LOOKING so the next iteration runs a new election.
			follower.shutdown();
			setFollower(null);
			setPeerState(ServerState.LOOKING);
		}
		break;
	case LEADING:
		// This peer is the leader: initialize the leader and run leader.lead().
		LOG.info("LEADING");
		try {
			setLeader(makeLeader(logFactory));
			// While the leader is alive, lead() periodically pings the followers
			// to keep the heartbeat going.
			leader.lead();
			setLeader(null);
		} catch (Exception e) {
			LOG.warn("Unexpected exception",e);
		} finally {
			// leader is still non-null only when lead() did not return normally.
			if (leader != null) {
				leader.shutdown("Forcing shutdown");
				setLeader(null);
			}
			// Reset the state to LOOKING so the next iteration runs a new election.
			setPeerState(ServerState.LOOKING);
		}
		break;
	}
}

四、选主流程

FastLeaderElection#lookForLeader核心代码如下:

/**
 * Runs one leader-election round (excerpt of FastLeaderElection#lookForLeader).
 *
 * Bumps the logical clock, proposes this peer's own vote, broadcasts it, and
 * then keeps processing notifications from other peers until a leader is
 * agreed on.
 *
 * @return the final vote once the election concludes, or null when the loop
 *         exits because the peer is no longer LOOKING or `stop` was set
 * @throws InterruptedException declared by the method; presumably raised by
 *         the blocking queue operations - confirm against the full source
 */
public Vote lookForLeader() throws InterruptedException {
	try {
		// Votes received during the current election round, keyed by sender sid.
		HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();

		// Votes reported by peers that are already FOLLOWING or LEADING, keyed by sender sid.
		HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();

		int notTimeout = finalizeWait;
		// 1. Initialize this peer's own vote
		synchronized(this){
			logicalclock++;
			updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
		}
		// 2. Send the vote out
		sendNotifications();

		/*
		 * Loop in which we exchange notifications until we find a leader
		 */

		while ((self.getPeerState() == ServerState.LOOKING) &&
				(!stop)){
			
			// 3. Receive votes from other peers and process them
			/*
			 * Remove next notification from queue, times out after 2 times
			 * the termination time
			 */
			Notification n = recvqueue.poll(notTimeout,
					TimeUnit.MILLISECONDS);

			// NOTE(review): n is dereferenced below without a null check. If recvqueue
			// is a java.util.concurrent.BlockingQueue, poll() returns null on timeout
			// and n.state would throw a NullPointerException; the null/validity
			// handling appears to be elided from this excerpt - confirm against the
			// full source.
			/*
			 * Only proceed if the vote comes from a replica in the
			 * voting view.
			 */
			switch (n.state) {
			case LOOKING:
				// If the incoming vote's election epoch is greater than ours
				// (n.electionEpoch > logicalclock), adopt it as our own epoch at once,
				// clear every vote received so far (recvset.clear()), compare the
				// incoming vote against our initial vote, and send the resulting
				// proposal out.
				// If notification > current, replace and send messages out
				if (n.electionEpoch > logicalclock) {
					logicalclock = n.electionEpoch;
					recvset.clear();
					if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
							getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
						updateProposal(n.leader, n.zxid, n.peerEpoch);
					} else {
						updateProposal(getInitId(),
								getInitLastLoggedZxid(),
								getPeerEpoch());
					}
					sendNotifications();
				// The incoming vote's election epoch is smaller than ours: ignore it
				} else if (n.electionEpoch < logicalclock) {
					if(LOG.isDebugEnabled()){
						LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
								+ Long.toHexString(n.electionEpoch)
								+ ", logicalclock=0x" + Long.toHexString(logicalclock));
					}
					break;
				// Same election epoch on both sides: compare the incoming vote with our current proposal
				} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
						proposedLeader, proposedZxid, proposedEpoch)) {
					updateProposal(n.leader, n.zxid, n.peerEpoch);
					sendNotifications();
				}

				// Record the sender's vote for this round.
				recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));

				if (termPredicate(recvset,
						new Vote(proposedLeader, proposedZxid,
								logicalclock, proposedEpoch))) {

					// Verify if there is any change in the proposed leader
					while((n = recvqueue.poll(finalizeWait,
							TimeUnit.MILLISECONDS)) != null){
						if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
								proposedLeader, proposedZxid, proposedEpoch)){
							// A vote that beats our proposal arrived: put it back on the
							// queue so the outer loop processes it, and keep electing.
							recvqueue.put(n);
							break;
						}
					}

					/*
					 * This predicate is true once we don't read any new
					 * relevant message from the reception queue
					 */
					if (n == null) {
						self.setPeerState((proposedLeader == self.getId()) ?
								ServerState.LEADING: learningState());

						Vote endVote = new Vote(proposedLeader,
												proposedZxid,
												logicalclock,
												proposedEpoch);
						leaveInstance(endVote);
						return endVote;
					}
				}
				break;
			case OBSERVING:
				LOG.debug("Notification from observer: " + n.sid);
				break;
			case FOLLOWING:
			case LEADING:
			// The sender is not electing (it is FOLLOWING or LEADING), so two checks follow:
			// a. If the logical clocks match, store the vote in recvset; if the sender claims
			//    to be the leader, check whether more than half of the servers voted for it,
			//    and if so set the peer state and leave the election.
				/*
				 * Consider all notifications from the same epoch
				 * together.
				 */
				if(n.electionEpoch == logicalclock){
					recvset.put(n.sid, new Vote(n.leader,
												  n.zxid,
												  n.electionEpoch,
												  n.peerEpoch));
				   
					if(ooePredicate(recvset, outofelection, n)) {
						self.setPeerState((n.leader == self.getId()) ?
								ServerState.LEADING: learningState());

						Vote endVote = new Vote(n.leader, 
								n.zxid, 
								n.electionEpoch, 
								n.peerEpoch);
						leaveInstance(endVote);
						return endVote;
					}
				}
			// b. Otherwise the message does not match the current logical clock, which means
			//    another election round has already produced a result. Add that result to
			//    outofelection and use outofelection to decide whether the election can end;
			//    if it can, adopt that logical clock, set the peer state and leave the election.
			//    Note there is no `else` here: this part also runs when the clocks matched
			//    but check (a) did not conclude the election.
				/*
				 * Before joining an established ensemble, verify
				 * a majority is following the same leader.
				 */
				outofelection.put(n.sid, new Vote(n.version,
													n.leader,
													n.zxid,
													n.electionEpoch,
													n.peerEpoch,
													n.state));
   
				if(ooePredicate(outofelection, outofelection, n)) {
					synchronized(this){
						logicalclock = n.electionEpoch;
						self.setPeerState((n.leader == self.getId()) ?
								ServerState.LEADING: learningState());
					}
					Vote endVote = new Vote(n.leader,
											n.zxid,
											n.electionEpoch,
											n.peerEpoch);
					leaveInstance(endVote);
					return endVote;
				}
				break;
			default:
				LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
						n.state, n.sid);
				break;
			}
		}
		// The loop ended without an agreed leader (state changed or stop was set).
		return null;
// NOTE(review): by brace count the brace below closes the `try` opened at the top;
// its catch/finally and the method's own closing brace are not part of this excerpt.
}

五、Leader节点挂掉之后,自动重新选主

1、Leader存活时,周期性的向Follower节点发送ping命令

// Excerpt of the leader's heartbeat loop; the ******* lines mark code the
// article leaves out.
while (true) {

*******

	// Sleep for half a tick between heartbeat rounds
	Thread.sleep(self.tickTime / 2);
	if (!tickSkip) {
		self.tick++;
	}
	
*******

	for (LearnerHandler f : getLearners()) {
		// Synced set is used to check we have a supporting quorum, so only
		// PARTICIPANT, not OBSERVER, learners should be used
		if (f.synced() && f.getLearnerType() == LearnerType.PARTICIPANT) {
			syncedSet.add(f.getSid());
		}
		
		// Send the heartbeat (ping) to this learner
		f.ping();
	}

*******
}

2、Follower节点会通过socket在while循环中不断接收leader传来的心跳信息

// Follower-side receive loop: one QuorumPacket instance is reused; while the
// follower is running, read the next packet into it and process it.
QuorumPacket qp = new QuorumPacket();
while (this.isRunning()) {
    readPacket(qp);
    processPacket(qp);
}

3、如果Leader挂掉,Follower节点接收不到消息,会抛异常,然后在finally中把Follower状态更新为LOOKING。由于所有Follower都依赖Leader发来的ping命令保持心跳,所以如果Leader节点挂掉,所有Follower节点都会变为LOOKING,并重新进行选举!

// Excerpt of the FOLLOWING and LEADING branches of the state switch in QuorumPeer#run.
case FOLLOWING:
	try {
		LOG.info("FOLLOWING");
		setFollower(makeFollower(logFactory));
		follower.followLeader();
	} catch (Exception e) {
		LOG.warn("Unexpected exception",e);
	} finally {
		follower.shutdown();
		setFollower(null);
		// If the leader goes down, every node ends up in LOOKING and a new election is held
		setPeerState(ServerState.LOOKING);
	}
	break;
case LEADING:
	LOG.info("LEADING");
	try {
		setLeader(makeLeader(logFactory));
		leader.lead();
		setLeader(null);
	} catch (Exception e) {
		LOG.warn("Unexpected exception",e);
	} finally {
		// leader is still non-null only when lead() did not return normally.
		if (leader != null) {
			leader.shutdown("Forcing shutdown");
			setLeader(null);
		}
		// If the leader goes down, every node ends up in LOOKING and a new election is held
		setPeerState(ServerState.LOOKING);
	}
	break;

六、ZK各节点之间选票收发通信逻辑

ZK在选票收发时,采用多线程、多级队列的方式进行处理。作为基础架构,它充分考虑了性能和解耦。之所以使用多个队列/线程,并让每个队列/线程绑定一个远程节点的sid,是为了保证各线程从队列中取数据时互不影响:每个节点对应的线程只从该节点自己的队列中获取选票信息,这样可以防止某个节点出了问题导致所有节点跟着阻塞的情况发生。

如下图:
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值