在下载好zookeeper源码的前提下,首先找到QuorumPeerMain类,从它的main方法开始
进入initializeAndRun方法
这段判断逻辑是用来判断是单机模式还是集群模式:集群模式下会调用runFromConfig方法,单机模式则交给ZooKeeperServerMain处理。我们以集群模式为例,继续查看runFromConfig方法
ServerCnxnFactory主要是用来为客户端提供读写功能的server,类似于ServerSocket。
接下去就是zookeeper的逻辑主线程,在设置完相关参数后,就启动主线程。
quorumPeer重写了线程的start()方法
先跟进去看看loadDataBase
loadDataBase方法主要是从本地文件中恢复数据,以及获取最新的 zxid。
接着看startServerCnxnFactory(),最终我们来到了NettyServerCnxnFactory中
通过netty的bootstrap绑定IP地址来为后续的zkClient服务。
然后我们看startLeaderElection()
我们跟进去看看createElectionAlgorithm()
默认是fast选举,接着看FastLeaderElection的构造函数,主要初始化了接收队列和发送队列
初始化完成后,接着看fle.start()方法,主要是启动发送线程和接收线程
再回过头来看最终调用的super.start(),这个会调用QuorumPeer的run()方法。
前面这部分主要通过jmx监听来监听一些属性,并不是核心逻辑,主要的逻辑看下面
/*
 * Main loop of the peer thread.
 *
 * While `running` is true, each pass dispatches on getPeerState():
 *   LOOKING   - run a leader election and store the resulting vote;
 *   OBSERVING - create an observer and observe the leader;
 *   FOLLOWING - create a follower and follow the leader;
 *   LEADING   - create a leader and lead.
 * The three role branches clean up their role object and call
 * updateServerState() in a finally block, so the loop re-evaluates the
 * peer state on the next pass.
 */
while (running) {
    switch (getPeerState()) { // dispatch on this peer's current state
    case LOOKING: // still LOOKING: go through the election path
        LOG.info("LOOKING");

        if (Boolean.getBoolean("readonlymode.enabled")) {
            LOG.info("Attempting to start ReadOnlyZooKeeperServer");

            // Create read-only server but don't start it immediately
            final ReadOnlyZooKeeperServer roZk =
                new ReadOnlyZooKeeperServer(logFactory, this, this.zkDb);

            // Instead of starting roZk immediately, wait some grace
            // period before we decide we're partitioned.
            //
            // Thread is used here because otherwise it would require
            // changes in each of election strategy classes which is
            // unnecessary code coupling.
            Thread roZkMgr = new Thread() {
                public void run() {
                    try {
                        // lower-bound grace period to 2 secs
                        sleep(Math.max(2000, tickTime));
                        // Only start the read-only server if we are still
                        // LOOKING once the grace period has elapsed.
                        if (ServerState.LOOKING.equals(getPeerState())) {
                            roZk.startup();
                        }
                    } catch (InterruptedException e) {
                        // Interrupted by the finally block below once the
                        // election attempt has finished. The message must be
                        // a single string literal: Java does not allow a line
                        // break inside a "..." literal.
                        LOG.info("Interrupted while attempting to start ReadOnlyZooKeeperServer, not started");
                    } catch (Exception e) {
                        LOG.error("FAILED to start ReadOnlyZooKeeperServer", e);
                    }
                }
            };
            try {
                roZkMgr.start();
                reconfigFlagClear();
                if (shuttingDownLE) {
                    shuttingDownLE = false;
                    startLeaderElection();
                }
                // makeLEStrategy() supplies the election implementation
                // (strategy style); the vote returned by lookForLeader()
                // becomes this peer's current vote.
                setCurrentVote(makeLEStrategy().lookForLeader());
            } catch (Exception e) {
                LOG.warn("Unexpected exception", e);
                setPeerState(ServerState.LOOKING);
            } finally {
                // If the thread is in the grace period, interrupt
                // to come out of waiting.
                roZkMgr.interrupt();
                roZk.shutdown();
            }
        } else {
            // Read-only mode disabled: same election steps, without the
            // read-only server bookkeeping.
            try {
                reconfigFlagClear();
                if (shuttingDownLE) {
                    shuttingDownLE = false;
                    startLeaderElection();
                }
                setCurrentVote(makeLEStrategy().lookForLeader());
            } catch (Exception e) {
                LOG.warn("Unexpected exception", e);
                setPeerState(ServerState.LOOKING);
            }
        }
        break;
    case OBSERVING:
        try {
            LOG.info("OBSERVING");
            setObserver(makeObserver(logFactory));
            observer.observeLeader();
        } catch (Exception e) {
            LOG.warn("Unexpected exception", e);
        } finally {
            // Always tear the observer down and refresh the peer state.
            observer.shutdown();
            setObserver(null);
            updateServerState();
        }
        break;
    case FOLLOWING:
        try {
            LOG.info("FOLLOWING");
            setFollower(makeFollower(logFactory));
            follower.followLeader();
        } catch (Exception e) {
            LOG.warn("Unexpected exception", e);
        } finally {
            // Always tear the follower down and refresh the peer state.
            follower.shutdown();
            setFollower(null);
            updateServerState();
        }
        break;
    case LEADING:
        LOG.info("LEADING");
        try {
            setLeader(makeLeader(logFactory));
            leader.lead();
            setLeader(null);
        } catch (Exception e) {
            LOG.warn("Unexpected exception", e);
        } finally {
            // leader is still set only if lead() threw before setLeader(null)
            // ran; in that case force a shutdown.
            if (leader != null) {
                leader.shutdown("Forcing shutdown");
                setLeader(null);
            }
            updateServerState();
        }
        break;
    }
    // Record the elapsed-time stamp after every pass.
    // NOTE(review): presumably the start time of the next election - confirm
    // where start_fle is read.
    start_fle = Time.currentElapsedTime();
}
先是调用 setCurrentVote(makeLEStrategy().lookForLeader());根据策略,最终运行的是 FastLeaderElection 中的选举算法,也就是它的lookForLeader()方法:
// Excerpt of the body of lookForLeader(): exchanges vote notifications with
// the other peers until a leader is settled, then returns the final Vote.
// Returns null if the loop exits without a decision (peer no longer LOOKING,
// or stop was set). The enclosing method header and the end of this try
// block are not part of the excerpt.
try {
// Votes received in the current election round, keyed by the sender's sid.
HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
// Votes reported by peers that are already FOLLOWING or LEADING, keyed by sid.
HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
// Poll timeout for recvqueue; starts at finalizeWait and is doubled
// (capped at maxNotificationInterval) each time a poll comes back empty.
int notTimeout = finalizeWait;
synchronized(this){
logicalclock.incrementAndGet(); // advance the logical clock for this new election
// Seed the proposal from getInitId(), getInitLastLoggedZxid() and getPeerEpoch()
// (presumably this peer's own id, zxid and epoch - i.e. we first vote for ourselves).
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
LOG.info("New election. My id = " + self.getId() +
", proposed zxid=0x" + Long.toHexString(proposedZxid));
sendNotifications(); // broadcast our vote (per the author's note, it is also delivered to ourselves)
/*
* Loop in which we exchange notifications until we find a leader
*/
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){ // main loop: runs while this peer is LOOKING and stop is not set
/*
* Remove next notification from queue, times out after 2 times
* the termination time
*/
// Take the next notification from the receive queue, waiting at most notTimeout ms
// (per the author's note, our own vote is processed here as well).
// NOTE(review): recvqueue is presumably a LinkedBlockingQueue - confirm at its declaration.
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
if(n == null){
// Nothing arrived within the timeout: if manager.haveDelivered() is true,
// send our vote again; this repeats until the loop ends.
if(manager.haveDelivered()){
sendNotifications();
} else {
// Otherwise (per the author: messages not delivered yet, e.g. other servers
// have not started) try to connect to all peers again.
manager.connectAll();
}
/*
* Exponential backoff
*/
// Double the poll timeout, capped at maxNotificationInterval.
int tmpTimeOut = notTimeout*2;
notTimeout = (tmpTimeOut < maxNotificationInterval?
tmpTimeOut : maxNotificationInterval);
LOG.info("Notification time out: " + notTimeout);
}
// A notification arrived: only process it if the sender's sid is among the
// voters of the current or next configuration.
else if (self.getCurrentAndNextConfigVoters().contains(n.sid)) {
/*
* Only proceed if the vote comes from a replica in the current or next
* voting view.
*/
switch (n.state) { // branch on the state reported by the sender
case LOOKING:
if (getInitLastLoggedZxid() == -1) {
LOG.debug("Ignoring notification as our zxid is -1");
break;
}
if (n.zxid == -1) {
LOG.debug("Ignoring notification from member with -1 zxid" + n.sid);
break;
}
// If notification > current, replace and send messages out
// The sender's election epoch is ahead of our logical clock, so the sender
// is in a newer election round: adopt that round.
if (n.electionEpoch > logicalclock.get()) {
logicalclock.set(n.electionEpoch); // catch up to the sender's round
recvset.clear(); // discard the votes collected for the older round
// Compare the received vote with our initial vote via totalOrderPredicate
// (per the author's note it compares epoch, then zxid, then myid).
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
// The received vote wins: adopt it as our proposal.
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else { // otherwise propose our own initial vote
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
sendNotifications(); // broadcast our current proposal to the other peers
// The sender's election epoch is behind ours: log it (debug) and ignore the notification.
} else if (n.electionEpoch < logicalclock.get()) {
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
}
break;
// Same election epoch: if the received vote beats our current proposal,
// adopt it and broadcast the new proposal.
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
if(LOG.isDebugEnabled()){
LOG.debug("Adding vote: from=" + n.sid +
", proposed leader=" + n.leader +
", proposed zxid=0x" + Long.toHexString(n.zxid) +
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
// Record the sender's vote; recvset is the input of the termination check below.
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
// Check whether the election can terminate for our current proposal
// (per the author's note, the default rule is agreement from more than half of the servers).
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock.get(), proposedEpoch))) {
// Verify if there is any change in the proposed leader
// Keep draining notifications until none arrives within finalizeWait ms.
// A notification that beats our proposal is put back on the queue and
// the wait is abandoned (n stays non-null, so no decision is made yet).
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
// The queue stayed empty for finalizeWait ms: the leader is decided.
if (n == null) {
// Switch state: LEADING if we are the proposed leader, otherwise learningState().
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
// Leave the election instance and return the final vote.
Vote endVote = new Vote(proposedLeader,
proposedZxid, proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
break;
// The remaining cases handle senders that are not LOOKING (per the author's note,
// e.g. when this server has just joined an ensemble that is already running).
// Notifications from observers are only logged.
case OBSERVING:
LOG.debug("Notification from observer: " + n.sid);
break;
// FOLLOWING and LEADING senders share the handling below.
case FOLLOWING:
case LEADING:
/*
* Consider all notifications from the same epoch
* together.
*/
if(n.electionEpoch == logicalclock.get()){ // sender is in the same election epoch as us
// Record the sender's vote in recvset.
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
// If termPredicate holds for the sender's vote and checkLeader accepts n.leader,
// update our own state and return the final vote.
if(termPredicate(recvset, new Vote(n.leader,
n.zxid, n.electionEpoch, n.peerEpoch, n.state))
&& checkLeader(outofelection, n.leader, n.electionEpoch)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify that
* a majority are following the same leader.
* Only peer epoch is used to check that the votes come
* from the same ensemble. This is because there is at
* least one corner case in which the ensemble can be
* created with inconsistent zxid and election epoch
* info. However, given that only one ensemble can be
* running at a single point in time and that each
* epoch is used only once, using only the epoch to
* compare the votes is sufficient.
*
* @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
*/
// Track the vote with zxid and election epoch replaced by IGNOREVALUE, so only
// leader, peer epoch and state are kept for this check.
outofelection.put(n.sid, new Vote(n.leader,
IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
if (termPredicate(outofelection, new Vote(n.leader,
IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state))
&& checkLeader(outofelection, n.leader, IGNOREVALUE)) {
synchronized(this){
// Adopt the sender's election epoch and switch to LEADING or learningState().
logicalclock.set(n.electionEpoch);
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecoginized: " + n.state
+ " (n.state), " + n.sid + " (n.sid)");
break;
}
} else {
LOG.warn("Ignoring notification from non-cluster member " + n.sid);
}
}
// The loop ended without a decision (peer no longer LOOKING, or stop was set).
return null;
消息是如何发送的呢?sendNotifications点进去看下
然后我们就去看跟sendqueue关联的WorkerSender的run方法
拉取信息,并且通过process()处理。最终会走到manager的toSend方法