Zookeeper 选举机制思路分享 从启动讲起
ZK 启动
shell 启动 寻找ZK启动引导类
以下是本机启动ZK的shell 命令
➜ bin pwd
/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin
➜ bin ./zkServer.sh start /home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/conf/zoo1.cfg
ZooKeeper JMX enabled by default
Using config: /home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/conf/zoo1.cfg
➜ bin ps -ef|grep java
cmm 3154 3085 0 2月23 tty2 00:17:13 /home/cmm/opt/application/idea-IU-183.5429.30/jre64/bin/java -classpath /home/cmm/opt/application/idea-IU-183.5429.30/lib/bootstrap.jar:/home/cmm/opt/application/idea-IU-183.5429.30/lib/extensions.jar:/home/cmm/opt/application/idea-IU-183.5429.30/lib/util.jar:/home/cmm/opt/application/idea-IU-183.5429.30/lib/jdom.jar:/home/cmm/opt/application/idea-IU-183.5429.30/lib/log4j.jar:/home/cmm/opt/application/idea-IU-183.5429.30/lib/trove4j.jar:/home/cmm/opt/application/idea-IU-183.5429.30/lib/jna.jar:/home/cmm/opt/application/idea-IU-183.5429.30/jre64/lib/tools.jar -Xms128m -Xmx750m -XX:ReservedCodeCacheSize=240m -XX:+UseConcMarkSweepGC -XX:SoftRefLRUPolicyMSPerMB=50 -ea -Dsun.io.useCanonCaches=false -Djava.net.preferIPv4Stack=true -Djdk.http.auth.tunneling.disabledSchemes="" -XX:+HeapDumpOnOutOfMemoryError -XX:-OmitStackTraceInFastThrow -Dawt.useSystemAAFontSettings=lcd -Dsun.java2d.renderer=sun.java2d.marlin.MarlinRenderingEngine -XX:ErrorFile=/home/cmm/java_error_in_IDEA_%p.log -XX:HeapDumpPath=/home/cmm/java_error_in_IDEA.hprof -Didea.paths.selector=IntelliJIdea2018.3 -Djb.vmOptionsFile=/home/cmm/opt/application/idea-IU-183.5429.30/bin/idea64.vmoptions -Didea.jre.check=true com.intellij.idea.Main
cmm 6841 1391 12 09:22 pts/0 00:00:01 /home/cmm/opt/common/jdk-12/bin/java -Dzookeeper.log.dir=. -Dzookeeper.root.logger=INFO,CONSOLE -cp /home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../build/classes:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../build/lib/*.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../lib/slf4j-log4j12-1.7.25.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../lib/slf4j-api-1.7.25.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../lib/netty-3.10.6.Final.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../lib/log4j-1.2.17.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../lib/jline-0.9.94.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../lib/audience-annotations-0.5.0.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../zookeeper-3.4.13.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../src/java/lib/*.jar:/home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../conf:/home/cmm/opt/common/jdk-12/lib/ -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=false org.apache.zookeeper.server.quorum.QuorumPeerMain /home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../conf/zoo.cfg
cmm 6874 6186 0 09:22 pts/0 00:00:00 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn java
可以看到当前有两个 java 进程:com.intellij.idea.Main 是 IDEA 自身的进程(与本文无关),而 ZK 的启动引导类为 org.apache.zookeeper.server.quorum.QuorumPeerMain
ZK 启动逻辑
PS: 本文使用的是 3.6.0-SNAPSHOT 版本的zookeeper代码 可能与其他版本存在差异
org.apache.zookeeper.server.quorum.QuorumPeerMain#main(String[])
org.apache.zookeeper.server.quorum.QuorumPeerMain#main(String[])
/**
 * Bootstrap entry point of a ZooKeeper quorum peer (excerpt; the catch clauses
 * are elided in this article).
 *
 * Creates a QuorumPeerMain, delegates all work to initializeAndRun(args), then
 * logs "Exiting normally" and terminates the JVM with
 * ExitCode.EXECUTION_FINISHED.
 *
 * @param args command-line arguments, forwarded unchanged to initializeAndRun
 */
public static void main(String[] args) {
QuorumPeerMain main = new QuorumPeerMain();
try {
// The main logic is executed here.
main.initializeAndRun(args);
} catch (...);
...
}
LOG.info("Exiting normally");
System.exit(ExitCode.EXECUTION_FINISHED.getValue());
}
/**
 * Parses the configuration, starts the data-directory purge task, and then
 * runs either as a quorum member or as a standalone server.
 *
 * @param args command-line arguments; when exactly one argument is given it is
 *             passed to QuorumPeerConfig#parse as the configuration location
 * @throws ConfigException declared by this method (source not visible in this excerpt)
 * @throws IOException declared by this method (source not visible in this excerpt)
 * @throws AdminServerException declared by this method (source not visible in this excerpt)
 */
protected void initializeAndRun(String[] args)
throws ConfigException, IOException, AdminServerException
{
// Load the configuration, i.e. the config file location. Per the shell command
// shown earlier this is /home/cmm/opt/common/zookeeeper/zookeeper-3.4.13/bin/../conf/zoo.cfg
QuorumPeerConfig config = new QuorumPeerConfig();
if (args.length == 1) {
config.parse(args[0]);
}
// Start and schedule the purge task
DatadirCleanupManager purgeMgr = new DatadirCleanupManager(config
.getDataDir(), config.getDataLogDir(), config
.getSnapRetainCount(), config.getPurgeInterval());
purgeMgr.start();
if (args.length == 1 && config.isDistributed()) {
// Distributed (quorum) mode: start from the parsed configuration.
runFromConfig(config);
} else {
LOG.warn("Either no config or no quorum defined in config, running "
+ " in standalone mode");
// there is only server in the quorum -- run as standalone
ZooKeeperServerMain.main(args);
}
}
/**
 * Builds the QuorumPeer from the given configuration, starts it, and blocks
 * until it finishes (excerpt; most of the property wiring is elided in this
 * article).
 *
 * @param config the parsed quorum configuration
 * @throws IOException declared by this method (source not visible in this excerpt)
 * @throws AdminServerException declared by this method (source not visible in this excerpt)
 */
public void runFromConfig(QuorumPeerConfig config) throws IOException, AdminServerException{
...
// The object that drives the startup logic.
// The elided code reads the configuration and sets the corresponding properties on it.
quorumPeer = getQuorumPeer();
...
quorumPeer.initialize();
// Startup logic
quorumPeer.start();
// Wait for the quorum peer to finish.
quorumPeer.join();
}
//org/apache/zookeeper/server/quorum/QuorumPeer.java:907
/**
 * Starts this quorum peer: verifies that its own id is part of the current
 * view, loads the database, starts the connection factory and the admin
 * server, prepares leader election, and finally calls super.start().
 *
 * @throws RuntimeException if myid is not contained in the peer list
 */
@Override
public synchronized void start() {
if (!getView().containsKey(myid)) {
throw new RuntimeException("My id " + myid + " not in the peer list");
}
// Load the data
loadDataBase();
startServerCnxnFactory();
try {
adminServer.start();
} catch (AdminServerException e) {
// A failure to start the AdminServer is only logged; startup continues.
LOG.warn("Problem starting AdminServer", e);
System.out.println(e);
}
// Begin leader election. What this really does is set up the election strategy;
// per this article the default is FastLeaderElection.
startLeaderElection();
// Next question: what does run() do? (super.start() presumably starts the
// thread whose body is run() -- see the run() excerpt that follows.)
super.start();
}
// org/apache/zookeeper/server/quorum/QuorumPeer.java:1141
/**
 * Main loop of the quorum peer (excerpt; setup, exception handling and the
 * per-role bodies are elided in this article).
 *
 * While {@code running} is true, the loop dispatches on the current peer state.
 * In the LOOKING state it runs leader election via
 * makeLEStrategy().lookForLeader() and stores the result with setCurrentVote.
 */
@Override
public void run() {
updateThreadName();
...
try {
/*
* Main loop
*/
while (running) {
switch (getPeerState()) {
// As is well known, a peer is in the LOOKING state by default at startup.
case LOOKING:
LOG.info("LOOKING");
ServerMetrics.LOOKING_COUNT.add(1);
// Branch taken when the system property "readonlymode.enabled" is true;
// the extra work done around the election in this branch is elided here.
if (Boolean.getBoolean("readonlymode.enabled")) {
try {
// Core operation
setCurrentVote(makeLEStrategy().lookForLeader());
} catch (Exception e) {
...
} finally {
...
}
} else {
try {
// Core operation -- so the method to focus on is lookForLeader()
setCurrentVote(makeLEStrategy().lookForLeader());
} catch (Exception e) {
...
}
}
break;
case OBSERVING:
//... do observer
break;
case FOLLOWING:
//... do follower
break;
case LEADING:
//... do leader
break;
}
start_fle = Time.currentElapsedTime();
}
} finally {
...
}
}
选举机制核心逻辑 FastLeaderElection#lookForLeader()
// org/apache/zookeeper/server/quorum/FastLeaderElection.java:914
/**
 * Runs one round of fast leader election (excerpt; logging, back-off and
 * cleanup code are elided in this article).
 *
 * The peer first proposes itself, then keeps polling notifications from
 * recvqueue while it is in the LOOKING state and not stopped. Notifications
 * from LOOKING peers are compared against the current proposal with
 * totalOrderPredicate and recorded in recvset; notifications from FOLLOWING or
 * LEADING peers are additionally recorded in outofelection. The method returns
 * as soon as one of the quorum checks (voteSet.hasAllQuorums()) succeeds.
 *
 * @return the final vote, or null if the loop ends without a decision
 * @throws InterruptedException declared by this method (presumably raised by the
 *         blocking queue operations -- confirm against the full source)
 */
public Vote lookForLeader() throws InterruptedException {
...
try {
/**
* 1. Vote for ourselves
* 2. Receive the votes of the other servers
* 3. Compare the votes ("PK")
* 4. Cast our vote
* 5. Tally whether the servers voting the same as us form more than half
*
*/
Map<Long, Vote> recvset = new HashMap<Long, Vote>(); // Ballot box: key = sid of the sending server, value = its vote
Map<Long, Vote> outofelection = new HashMap<Long, Vote>();
int notTimeout = minNotificationInterval;
synchronized(this){
logicalclock.incrementAndGet();
// Update the proposal (myid, lastZxid, epoch) to our own current values, i.e. vote for ourselves.
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
sendNotifications(); // At startup, first send out the vote for ourselves
SyncedLearnerTracker voteSet;
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){ // This server has not been stopped and is still in the LOOKING state
/*
* Remove next notification from queue, times out after 2 times
* the termination time
* Fetch a vote from another server; recvqueue is the queue into which
* votes received from other servers are placed.
*/
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
if(n == null){
// No vote was received; check whether there is anything still waiting to be sent.
// sendNotifications() was already called once (the startup vote for ourselves), so if
// something is still undelivered here, the likely cause is that connections are not yet established.
if(manager.haveDelivered()){ // Check whether the messages have been delivered (connections established)
sendNotifications(); // Connections exist but no vote came back from other servers: resend our vote
} else {
// Connect to all servers that are allowed to take part in the vote
manager.connectAll();
}
...
}
else if (validVoter(n.sid) && validVoter(n.leader)) {
// At this point we have received a vote from another server
/*
* Only proceed if the vote comes from a replica in the current or next
* voting view for a replica in the current or next voting view.
*/
switch (n.state) {
case LOOKING:
...
// If notification > current, replace and send messages out
if (n.electionEpoch > logicalclock.get()) { // The received vote belongs to a later election round than ours
logicalclock.set(n.electionEpoch); // Advance our logical clock (election round) to the round of the vote
recvset.clear(); // Empty our ballot box
// Compare the candidate in the vote against ourselves; per this article the comparison
// order is leader epoch, then zxid, then server id (sid).
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
// Adopt the leader, zxid and peerEpoch carried by the received vote
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
// Our own data is "newer", so reset the proposal to our own values
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
// Send out our current proposal
sendNotifications();
} else if (n.electionEpoch < logicalclock.get()) {
// The received vote belongs to an earlier election round than ours: ignore it
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
}
break;
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) { // Same election round: compare ("PK") against our current proposal
// Adopt the leader, zxid and peerEpoch carried by the received vote
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
if(LOG.isDebugEnabled()){
LOG.debug("Adding vote: from=" + n.sid +
", proposed leader=" + n.leader +
", proposed zxid=0x" + Long.toHexString(n.zxid) +
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
// don't care about the version if it's in LOOKING state
// Record the received vote. The version is not considered here; every vote goes into the ballot box.
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
// Using the current ballot box (the received votes) and our own vote, decide whether a leader can be elected
voteSet = getVoteTracker(
recvset, new Vote(proposedLeader, proposedZxid,
logicalclock.get(), proposedEpoch));
// Roughly: check whether the servers that voted the same as us form more than half
if (voteSet.hasAllQuorums()) {
// The majority check passed, so this server considers a leader electable
// Verify if there is any change in the proposed leader
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
// Another vote arrived after the majority check passed.
// If the new vote is not better than our proposal, ignore it and keep polling for further votes
// until none arrive.
// If the new vote is better than our proposal, put it back into the receive queue, leave this
// inner loop and return to the outer loop so the election continues with that vote.
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
if (n == null) {
// No further vote arrived, so the leader has been elected.
// Record the outcome via setPeerState; per this article the peer becomes LEADING if it is
// itself the leader, otherwise it takes the corresponding non-leader state.
setPeerState(proposedLeader, voteSet);
Vote endVote = new Vote(proposedLeader,
proposedZxid, logicalclock.get(),
proposedEpoch);
// Return the final vote of this election
leaveInstance(endVote);
return endVote;
}
}
break;
case OBSERVING:
LOG.debug("Notification from observer: " + n.sid);
break;
case FOLLOWING:
case LEADING:
// Roughly: put the received vote into the ballot box and check whether a majority
// has chosen the same leader; once the majority check passes, we are done.
/*
* Consider all notifications from the same epoch
* together.
*/
if(n.electionEpoch == logicalclock.get()){
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
voteSet = getVoteTracker(recvset, new Vote(n.version,
n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
if (voteSet.hasAllQuorums() &&
checkLeader(outofelection, n.leader, n.electionEpoch)) {
setPeerState(n.leader, voteSet);
Vote endVote = new Vote(n.leader,
n.zxid, n.electionEpoch, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify that
* a majority are following the same leader.
*/
outofelection.put(n.sid, new Vote(n.version, n.leader,
n.zxid, n.electionEpoch, n.peerEpoch, n.state));
voteSet = getVoteTracker(outofelection, new Vote(n.version,
n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
if (voteSet.hasAllQuorums() &&
checkLeader(outofelection, n.leader, n.electionEpoch)) {
synchronized(this){
// Adopt the election round of the established ensemble before setting the state.
logicalclock.set(n.electionEpoch);
setPeerState(n.leader, voteSet);
}
Vote endVote = new Vote(n.leader, n.zxid,
n.electionEpoch, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecoginized: " + n.state
+ " (n.state), " + n.sid + " (n.sid)");
break;
}
} else {
...
}
}
// The loop ended (no longer LOOKING, or stopped) without reaching a decision.
return null;
} finally {
...
}
}
选举机制方法小结
如果收到的选票来自 LOOKING 状态的节点,说明大家都还在投票。此时按 leader 任期(peerEpoch)、zxid、sid(myid) 的顺序依次比较,选出最优秀的一个节点。
其中有个细节:如果已经通过过半判断,即大多数节点认为最优秀的节点与你认为最优秀的节点是同一个,正常来说选举就可以结束了。
但是 ZK 多做了一步判断:在 finalizeWait 等待时间内如果又收到一张选票,且该选票的 leader 比自己当前选择的 leader 更优秀,就会把这张选票放回接收队列并回到主循环重新处理,从而变票(更改自己的投票)。
也就是说,只要选举尚未最终完成,一旦发现更加优秀的 leader 节点就会选择变票。
如果收到的是 follower 或 leader 节点发来的选票,就直接加入选票箱,直到超过半数的选票指向同一个 leader(并通过 checkLeader 校验)就完成了。