ZooKeeper启动与Leader选举源码阅读
选举流程图解
以三个节点的集群为例,节点1和节点2选举完成后,加入节点3
ZooKeeper选举线程模型
ZooKeeper服务的启动(branch-3.8.0)
ZooKeeper服务端启动类QuorumPeerMain
从bin目录下的zkServer.sh或zkServer.cmd里找到启动主类为QuorumPeerMain
QuorumPeerMain main = new QuorumPeerMain();
try {
main.initializeAndRun(args);
}catch{...}
protected void initializeAndRun(String[] args),加载zookeeper配置,启动服务
// 加载配置
QuorumPeerConfig config = new QuorumPeerConfig();
if (args.length == 1) {
config.parse(args[0]);
}
......
if (args.length == 1 && config.isDistributed()) {
// 集群入口
runFromConfig(config);
} else {
// 单机入口
LOG.warn("Either no config or no quorum defined in config, running in standalone mode");
// there is only server in the quorum -- run as standalone
ZooKeeperServerMain.main(args);
}
public void runFromConfig(QuorumPeerConfig config) zookeeper集群启动
//设置ServerCnxnFactory,默认为NIOServerCnxnFactory,可以通过zookeeper.serverCnxnFactory修改
if (config.getClientPortAddress() != null) {
// 初始化网络通信的相关配置;默认实现为NIOServerCnxnFactory,若配置为NettyServerCnxnFactory,其默认handler为CnxnChannelHandler
cnxnFactory = ServerCnxnFactory.createFactory();
cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), false);
}
......
// 创建QuorumPeer(继承自ZooKeeperThread),随后用配置文件中的值或默认值初始化其各项配置
quorumPeer = getQuorumPeer();
......
// 设置选举算法类型,默认为3
quorumPeer.setElectionType(config.getElectionAlg());
......
// zookeeper内存数据库
quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
......
quorumPeer.start();
......
QuorumPeer
public synchronized void start(),QuorumPeer的启动方法
// 从本地文件中加载历史数据
loadDataBase();
// 绑定ServerCnxnFactory(默认为NIOServerCnxnFactory)的端口并启动
startServerCnxnFactory();
try {
adminServer.start();
} catch (AdminServerException e) {
LOG.warn("Problem starting AdminServer", e);
}
// 启动Leader选举
startLeaderElection();
startJvmPauseMonitor();
super.start();
public synchronized void startLeaderElection(),集群leader选举;
// 服务端有四种状态LOOKING(寻找 Leader 状态,认为当前服务器没有 Leader,需要进行 Leader 选举)
// FOLLOWING(跟随者,当前服务器角色是 Follower)、LEADING(领导者,当前服务器角色是 Leader)
// OBSERVING(观察者,当前服务器角色是 Observer);启动后服务端默认状态为LOOKING
if (getPeerState() == ServerState.LOOKING) {
// 初始化选票,服务ID,最大的事务ID,当前选举周期
currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch());
}
// electionType在runFromConfig中设置,默认为3
this.electionAlg = createElectionAlgorithm(electionType);
protected Election createElectionAlgorithm(int electionAlgorithm),创建选举算法并执行选举
QuorumCnxManager qcm = createCnxnManager();
......
QuorumCnxManager.Listener listener = qcm.listener;
if (listener != null) {
listener.start();
FastLeaderElection fle = new FastLeaderElection(this, qcm);
fle.start();
le = fle;
} else {
LOG.error("Null listener when initializing cnx manager");
}
super.start()即QuorumPeer的run方法
......
while (running) {
......
switch (getPeerState()) {
case LOOKING:
......
// 执行FastLeaderElection.lookForLeader方法,执行完成后会返回Leader的选票
setCurrentVote(makeLEStrategy().lookForLeader());
......
case OBSERVING:
......
setObserver(makeObserver(logFactory));
observer.observeLeader();
......
case FOLLOWING:
......
setFollower(makeFollower(logFactory));
// 建立与Leader的Socket,与Leader同步数据,接收Leader的数据
follower.followLeader();
......
case LEADING:
......
setLeader(makeLeader(logFactory));
// 建立数据通信的ServerSocket,与Follower建立通信,给Follower发送Ping消息
leader.lead();
setLeader(null);
......
}
}
......
FastLeaderElection leader选举类
关键属性
// 存放需要发送的选票信息
LinkedBlockingQueue<ToSend> sendqueue;
// 存放收到的选票信息
LinkedBlockingQueue<Notification> recvqueue;
Messenger messenger;
Messenger的初始化方法
this.ws = new WorkerSender(manager);
this.wsThread = new Thread(this.ws, "WorkerSender[myid=" + self.getId() + "]");
this.wsThread.setDaemon(true);
this.wr = new WorkerReceiver(manager);
this.wrThread = new Thread(this.wr, "WorkerReceiver[myid=" + self.getId() + "]");
this.wrThread.setDaemon(true);
FastLeaderElection.start()调用Messenger.start(),Messenger.start()方法如下
// 运行发送选票线程
this.wsThread.start();
// 运行接收选票线程
this.wrThread.start();
FastLeaderElection.lookForLeader方法
Map<Long, Vote> recvset = new HashMap<Long, Vote>();
Map<Long, Vote> outofelection = new HashMap<Long, Vote>();
int notTimeout = minNotificationInterval;
synchronized (this) {
// 选举周期+1
logicalclock.incrementAndGet();
// 更新当前节点的选票
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
// 给其他节点发送选票(内容为当前节点的ID,最大事务ID,选举周期)
sendNotifications();
// 一直循环直到找到Leader
while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);
if (n == null) {
......
// 与其他节点建立连接
manager.connectAll();
......
}else if (validVoter(n.sid) && validVoter(n.leader)) {
switch (n.state) {
case LOOKING:
if (n.electionEpoch > logicalclock.get()) {
// 收到选票的周期 > 当前节点的周期
// 设置当前节点的选票周期为收到的选票的周期
logicalclock.set(n.electionEpoch);
// 清理已经收到的选票
recvset.clear();
// 如果新的选票的事务ID+节点id的优先级大于当前节点的优先级,更新当前节点的选票为收到的选票
// 否则只更新当前选票的周期
if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
// 重新发送选票给其他节点
sendNotifications();
}else if (n.electionEpoch < logicalclock.get()) {
//选票周期 < 当前节点的周期,不处理
}else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
// 周期相同,且选票优先级高于当前节点时,更新当前节点的选票为收到的选票
updateProposal(n.leader, n.zxid, n.peerEpoch);
// 重新发送选票给其他节点
sendNotifications();
}
// 更新sid的选票
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
// 统计recvset中与当前节点提议(proposedLeader、proposedZxid、proposedEpoch)一致的选票,把投出这些选票的节点加入voteSet
voteSet = getVoteTracker(recvset, new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch));
// 选票过半,voteSet中包含过半节点
if (voteSet.hasAllQuorums()) {
// 等待一段时间看是否还有新的选票
while ((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null) {
if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
recvqueue.put(n);
break;
}
}
// 选举结束
if (n == null) {
setPeerState(proposedLeader, voteSet);
Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
}
}
}
// 选票的pick逻辑
// 先比较peerEpoch(注意不是logicalclock表示的选举轮次),再比较事务ID,最后比较节点ID,大的优先
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
return ((newEpoch > curEpoch)
|| ((newEpoch == curEpoch)
&& ((newZxid > curZxid)
|| ((newZxid == curZxid)
&& (newId > curId)))));
}
// 判断选票是否过半
public boolean hasAllQuorums() {
// qvAcksetPairs的size应该为1
for (QuorumVerifierAcksetPair qvAckset : qvAcksetPairs) {
if (!qvAckset.getQuorumVerifier().containsQuorum(qvAckset.getAckset())) {
return false;
}
}
return true;
}
private void sendNotifications(),生成需要发送给具有选举权的节点(包括自己)的选举信息并放入sendqueue队列
// self.getCurrentAndNextConfigVoters()获取所有具有投票权的节点的ID集合
for (long sid : self.getCurrentAndNextConfigVoters()) {
QuorumVerifier qv = self.getQuorumVerifier();
// 生成投票信息,目标为sid
ToSend notmsg = new ToSend(
ToSend.mType.notification,
proposedLeader,
proposedZxid,
logicalclock.get(),
QuorumPeer.ServerState.LOOKING,
sid,
proposedEpoch,
qv.toString().getBytes(UTF_8));
sendqueue.offer(notmsg);
}
FastLeaderElection.WorkerSender
继承自ZooKeeperThread,发送选举消息线程
public void run() {
//获取需要发送的选票
ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
process(m);
}
void process(ToSend m) {
ByteBuffer requestBuffer = buildMsg(m.state.ordinal(), m.leader, m.zxid, m.electionEpoch, m.peerEpoch, m.configData);
manager.toSend(m.sid, requestBuffer);
}
public void toSend(Long sid, ByteBuffer b) {
// 发送给自己的选票,直接放到接收消息的队列里
if (this.mySid == sid) {
b.position(0);
addToRecvQueue(new Message(b.duplicate(), sid));
} else {
// 将发送给其他节点的消息放到对应的阻塞队列中
BlockingQueue<ByteBuffer> bq = queueSendMap.computeIfAbsent(sid, serverId -> new CircularBlockingQueue<>(SEND_CAPACITY));
addToSendQueue(bq, b);
// 如果不存在连接,新建连接
connectOne(sid);
}
FastLeaderElection.WorkerReceiver
继承自ZooKeeperThread,接收选举消息线程
while (!stop) {
}
QuorumCnxManager
QuorumCnxManager.connectAll,与其他节点建立连接
// 将节点的sid和对应的发送信息的线程绑定
final ConcurrentHashMap<Long, SendWorker> senderWorkerMap;
public void connectAll(){
for (Enumeration<Long> en = queueSendMap.keys(); en.hasMoreElements(); ) {
sid = en.nextElement();
connectOne(sid);
}
}
// 与其他节点建立通信
synchronized void connectOne(long sid) {
// 与sid存在连接时,不再重复建立连接
if (senderWorkerMap.get(sid) != null) {
......
return;
}
synchronized (self.QV_LOCK) {
......
if (connectOne(sid, lastProposedView.get(sid).electionAddr)) {
return;
}
......
}
}
synchronized boolean connectOne(long sid, MultipleAddresses electionAddr) {
......
return initiateConnectionAsync(electionAddr, sid);
}
// 通过QuorumConnectionReqThread线程同其他节点异步的建立连接
public boolean initiateConnectionAsync(final MultipleAddresses electionAddr, final Long sid){
connectionExecutor.execute(new QuorumConnectionReqThread(electionAddr, sid));
}
QuorumCnxManager.Listener
QuorumCnxManager.Listener继承自ZooKeeperThread,在本节点的选举地址上监听(BIO),接受集群内其他节点发来的连接并处理选举请求
Set<InetSocketAddress> addresses;
if (self.getQuorumListenOnAllIPs()) {
addresses = self.getElectionAddress().getWildcardAddresses();
} else {
addresses = self.getElectionAddress().getAllAddresses();
}
// 对addresses中的每一个都生成一个ListenerHandler,ListenerHandler实现自Runnable
// ListenerHandler因为异常达到最大的重试次数后,会调用自己的close方法和latch.countDown方法
CountDownLatch latch = new CountDownLatch(addresses.size());
listenerHandlers = addresses.stream().map(address ->
new ListenerHandler(address, self.shouldUsePortUnification(), self.isSslQuorum(), latch))
.collect(Collectors.toList());
// 新建一个线程数等于监听地址数(addresses.size())的线程池,并将ListenerHandler集合提交到线程池中执行
final ExecutorService executor = Executors.newFixedThreadPool(addresses.size());
try {
listenerHandlers.forEach(executor::submit);
} finally {
// shutdown()之后线程池不再接收新任务,已提交的ListenerHandler会继续执行直到结束
executor.shutdown();
}
......
// 等待所有的ListenerHandler都结束
latch.await();
......
ListenerHandler.run
public void run() {
try {
acceptConnections();
......
} finally {
latch.countDown();
}
}
private void acceptConnections() {
int numRetries = 0;
while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) {
try {
// 创建ServerSocket并在选举地址上监听
serverSocket = createNewServerSocket();
while (!shutdown) {
......
client = serverSocket.accept();
......
// 处理连接请求
receiveConnection(client);
numRetries = 0;
......
}
} catch (IOException e) {
......
numRetries++;
......
}
}
}
public void receiveConnection(final Socket sock){
din = new DataInputStream(new BufferedInputStream(sock.getInputStream()));
handleConnection(sock, din);
}
private void handleConnection(Socket sock, DataInputStream din){
// 请求连接的节点ID小于当前节点ID,如果存在连接,关闭连接;新建从当前节点到请求节点的连接
if (sid < self.getId()){
if (electionAddr != null) {
connectOne(sid, electionAddr);
} else {
// 与sid建立连接,并新起发送和接收消息的线程
connectOne(sid);
}
}else if (sid == self.getId()) {
// 不存在这种情况,出现了说明有问题
} else { // Otherwise start worker threads to receive data.
// 建立与请求节点关联的发送和接收消息的线程
SendWorker sw = new SendWorker(sock, sid);
RecvWorker rw = new RecvWorker(sock, din, sid, sw);
sw.setRecv(rw);
SendWorker vsw = senderWorkerMap.get(sid);
if (vsw != null) {
vsw.finish();
}
senderWorkerMap.put(sid, sw);
queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));
sw.start();
rw.start();
}
}
QuorumCnxManager.QuorumConnectionReqThread,继承自ZooKeeperThread
public void run() {
initiateConnection(electionAddr, sid);
}
//
public void initiateConnection(final MultipleAddresses electionAddr, final Long sid) {
......
// 与目标节点的选举地址建立socket连接
sock.connect(electionAddr.getReachableOrOne(), cnxTO);
......
// 通过sock建立接收和发送消息的线程
startConnection(sock, sid);
......
}
private boolean startConnection(Socket sock, Long sid){
......
// 只允许节点id大的向节点id小的建立连接
if (sid > self.getId()) {
LOG.info("Have smaller server identifier, so dropping the connection: (myId:{} --> sid:{})", self.getId(), sid);
closeSocket(sock);
// Otherwise proceed with the connection
} else {
// 发送消息
SendWorker sw = new SendWorker(sock, sid);
// 接收消息
RecvWorker rw = new RecvWorker(sock, din, sid, sw);
sw.setRecv(rw);
senderWorkerMap.put(sid, sw);
// 发送的线程会绑定阻塞队列
queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));
sw.start();
rw.start();
}
}
QuorumCnxManager.SendWorker
给其他节点发送消息
while (running && !shutdown && sock != null) {
......
BlockingQueue<ByteBuffer> bq = queueSendMap.get(sid);
......
// 从sid对应的阻塞队列中获取消息
b = pollSendQueue(bq, 1000, TimeUnit.MILLISECONDS);
......
lastMessageSent.put(sid, b);
send(b);
......
}
QuorumCnxManager.RecvWorker
接收其他节点发送的消息
while (running && !shutdown && sock != null) {
......
int length = din.readInt();
......
// 按上面读到的长度,从Socket输入流中读取完整的消息体
final byte[] msgArray = new byte[length];
din.readFully(msgArray, 0, length);
// 将从Socket中得到的消息放到QuorumCnxManager的接收队列中
addToRecvQueue(new Message(ByteBuffer.wrap(msgArray), sid));
......
}