ZK服务端启动过程相关的简单类图如下:
从zkServer.sh中发现ZK启动类为:org.apache.zookeeper.server.quorum.QuorumPeerMain,下面分析QuorumPeerMain启动代码。
/**
 * Entry point of the ZooKeeper server.
 *
 * To start the replicated server specify the configuration file name on
 * the command line.
 *
 * @param args path to the configfile
 */
public static void main(String[] args) {
    QuorumPeerMain quorumPeerMain = new QuorumPeerMain();
    try {
        // Parse the configuration and start the peer (quorum or standalone).
        quorumPeerMain.initializeAndRun(args);
    } catch (IllegalArgumentException e) {
        // Bad command-line arguments: print usage and exit with code 2.
        LOG.error("Invalid arguments, exiting abnormally", e);
        LOG.info(USAGE);
        System.err.println(USAGE);
        System.exit(2);
    } catch (ConfigException e) {
        // Malformed configuration file: exit with code 2.
        LOG.error("Invalid config, exiting abnormally", e);
        System.err.println("Invalid config, exiting abnormally");
        System.exit(2);
    } catch (DatadirException e) {
        // Data directory unreadable/unwritable: exit with code 3.
        LOG.error("Unable to access datadir, exiting abnormally", e);
        System.err.println("Unable to access datadir, exiting abnormally");
        System.exit(3);
    } catch (AdminServerException e) {
        // Embedded AdminServer failed to come up: exit with code 4.
        LOG.error("Unable to start AdminServer, exiting abnormally", e);
        System.err.println("Unable to start AdminServer, exiting abnormally");
        System.exit(4);
    } catch (Exception e) {
        // Anything else is unexpected: exit with the generic code 1.
        LOG.error("Unexpected exception, exiting abnormally", e);
        System.exit(1);
    }
    LOG.info("Exiting normally");
    System.exit(0);
}
/**
 * Parses the (optional) configuration file, schedules the datadir purge
 * task, and then runs either in replicated (quorum) or standalone mode.
 *
 * @param args optional single element: path to the config file
 * @throws ConfigException      if the config file cannot be parsed
 * @throws IOException          on I/O failures during startup
 * @throws AdminServerException if the AdminServer fails to start
 */
protected void initializeAndRun(String[] args)
    throws ConfigException, IOException, AdminServerException
{
    QuorumPeerConfig config = new QuorumPeerConfig();
    // A single argument is interpreted as the config file path.
    if (args.length == 1) {
        config.parse(args[0]);
    }

    // Start and schedule the purge task: a timer that periodically deletes
    // old snapshots and transaction logs from disk.
    DatadirCleanupManager purgeMgr = new DatadirCleanupManager(
            config.getDataDir(),
            config.getDataLogDir(),
            config.getSnapRetainCount(),
            config.getPurgeInterval());
    purgeMgr.start();

    if (args.length == 1 && config.isDistributed()) {
        // Replicated (cluster) mode.
        runFromConfig(config);
    } else {
        LOG.warn("Either no config or no quorum defined in config, running "
                + " in standalone mode");
        // there is only server in the quorum -- run as standalone
        ZooKeeperServerMain.main(args);
    }
}
PurgeTask继承了TimerTask(即实现了Runnable接口),最终调用PurgeTxnLog.purge方法。
/**
 * Validates the purge configuration and schedules the purge task. Purge
 * task keeps the most recent <code>snapRetainCount</code> number of
 * snapshots and deletes the remaining for every <code>purgeInterval</code>
 * hour(s).
 * <p>
 * <code>purgeInterval</code> of <code>0</code> or
 * <code>negative integer</code> will not schedule the purge task.
 * </p>
 *
 * @see PurgeTxnLog#purge(File, File, int)
 */
public void start() {
    // Idempotent: a second call while the task is running is a no-op.
    if (PurgeTaskStatus.STARTED == purgeTaskStatus) {
        LOG.warn("Purge task is already running.");
        return;
    }
    // Don't schedule the purge task with zero or negative purge interval.
    if (purgeInterval <= 0) {
        LOG.info("Purge task is not scheduled.");
        return;
    }

    // Daemon timer so it never keeps the JVM alive on its own; runs the
    // snapshot/txn-log cleanup every purgeInterval hours, starting now.
    timer = new Timer("PurgeTask", true);
    timer.scheduleAtFixedRate(
            new PurgeTask(dataLogDir, snapDir, snapRetainCount),
            0,
            TimeUnit.HOURS.toMillis(purgeInterval));

    purgeTaskStatus = PurgeTaskStatus.STARTED;
}
/**
 * Purges the snapshot and logs keeping the last num snapshots and the
 * corresponding logs. If logs are rolling or a new snapshot is created
 * during this process, these newest N snapshots or any data logs will be
 * excluded from current purging cycle.
 *
 * @param dataDir the dir that has the logs (transaction logs)
 * @param snapDir the dir that has the snapshots
 * @param num     the number of snapshots to keep
 * @throws IOException if the txn/snap log cannot be opened
 * @throws IllegalArgumentException if fewer than 3 snapshots would be kept
 */
public static void purge(File dataDir, File snapDir, int num) throws IOException {
    // At least 3 snapshots must be retained for safe recovery.
    if (num < 3) {
        throw new IllegalArgumentException(COUNT_ERR_MSG);
    }

    FileTxnSnapLog txnLog = new FileTxnSnapLog(dataDir, snapDir);
    List<File> recentSnapshots = txnLog.findNRecentSnapshots(num);
    if (!recentSnapshots.isEmpty()) {
        // The oldest of the retained snapshots defines the purge threshold.
        purgeOlderSnapshots(txnLog, recentSnapshots.get(recentSnapshots.size() - 1));
    }
}
// VisibleForTesting
static void purgeOlderSnapshots(FileTxnSnapLog txnLog, File snapShot) {
    // Files whose embedded zxid is below this threshold are purge candidates.
    final long leastZxidToBeRetain = Util.getZxidFromName(
            snapShot.getName(), PREFIX_SNAPSHOT);

    /*
     * We delete all files with a zxid in their name that is less than
     * leastZxidToBeRetain. This applies to snapshot files and log files
     * alike, with one exception for log files:
     *
     * A log file named log.(X-a) may contain transactions newer than
     * snapshot.X when no other log file starts with a zxid in (X-a, X].
     * In that case log.(X-a) must be retained or snapshot.X would not be
     * recoverable; it may even extend past newer snapshots if those were
     * not accompanied by a log rollover. A more precise determination of
     * whether that file is truly needed is possible but adds complexity
     * for rare scenarios, so we simply preserve log.(leastZxidToBeRetain-a)
     * for the smallest 'a'. txnLog.getSnapshotLogs() computes that set.
     */
    final Set<File> retainedTxnLogs = new HashSet<File>(
            Arrays.asList(txnLog.getSnapshotLogs(leastZxidToBeRetain)));

    /*
     * Accepts files named <prefix>.<zxid> whose zxid is older than the
     * retention threshold, excluding the explicitly retained log files.
     */
    class RetentionFileFilter implements FileFilter {
        private final String prefix;

        RetentionFileFilter(String prefix) {
            this.prefix = prefix;
        }

        public boolean accept(File f) {
            return f.getName().startsWith(prefix + ".")
                    && !retainedTxnLogs.contains(f)
                    && Util.getZxidFromName(f.getName(), prefix) < leastZxidToBeRetain;
        }
    }

    List<File> deletable = new ArrayList<>();
    // add all non-excluded log files
    File[] logs = txnLog.getDataDir().listFiles(new RetentionFileFilter(PREFIX_LOG));
    if (logs != null) {
        deletable.addAll(Arrays.asList(logs));
    }
    // add all non-excluded snapshot files to the deletion list
    File[] snapshots = txnLog.getSnapDir().listFiles(new RetentionFileFilter(PREFIX_SNAPSHOT));
    if (snapshots != null) {
        deletable.addAll(Arrays.asList(snapshots));
    }

    // remove the old files
    for (File f : deletable) {
        final String msg = "Removing file: "
                + DateFormat.getDateTimeInstance().format(f.lastModified())
                + "\t" + f.getPath();
        LOG.info(msg);
        System.out.println(msg);
        if (!f.delete()) {
            System.err.println("Failed to remove " + f.getPath());
        }
    }
}
/**
 * Runs the server in replicated (quorum) mode: wires up the client
 * connection factories, configures the QuorumPeer from the parsed config,
 * starts it, and blocks until the peer thread exits.
 *
 * NOTE(article): the body below elides the long run of setter calls
 * ("......") present in the real source.
 *
 * @param config parsed quorum configuration
 * @throws IOException          on I/O failures during startup
 * @throws AdminServerException if the AdminServer fails to start
 */
public void runFromConfig(QuorumPeerConfig config)
    throws IOException, AdminServerException
{
    try {
        // Expose log4j configuration via JMX; failure here is non-fatal.
        ManagedUtil.registerLog4jMBeans();
    } catch (JMException e) {
        LOG.warn("Unable to register log4j JMX control", e);
    }

    LOG.info("Starting quorum peer, myid=" + config.getServerId());
    try {
        // Factories for the plain and TLS client ports; each is created
        // only when the corresponding address is configured.
        ServerCnxnFactory cnxnFactory = null;
        ServerCnxnFactory secureCnxnFactory = null;

        if (config.getClientPortAddress() != null) {
            cnxnFactory = ServerCnxnFactory.createFactory();
            cnxnFactory.configure(config.getClientPortAddress(),
                    config.getMaxClientCnxns(),
                    false);
        }

        if (config.getSecureClientPortAddress() != null) {
            secureCnxnFactory = ServerCnxnFactory.createFactory();
            secureCnxnFactory.configure(config.getSecureClientPortAddress(),
                    config.getMaxClientCnxns(),
                    true);
        }

        quorumPeer = getQuorumPeer();
        quorumPeer.setTxnFactory(new FileTxnSnapLog(
                config.getDataLogDir(),
                config.getDataDir()));
        ...... // (the article omits the remaining initialization setters here)
        quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize);
        quorumPeer.initialize();

        // QuorumPeer extends Thread; start() launches the peer's lifecycle
        // and join() blocks this thread until the peer terminates.
        quorumPeer.start();
        quorumPeer.join();
    } catch (InterruptedException e) {
        // warn, but generally this is ok
        LOG.warn("Quorum Peer interrupted", e);
    }
}
/**
 * Starts this quorum peer: loads the on-disk database, opens the client
 * connection factory, starts the AdminServer, prepares leader election,
 * and finally launches the peer thread itself. The call order here is
 * significant — election setup requires the database to be loaded first.
 */
@Override
public synchronized void start() {
    // Refuse to start if this server's id is not in the configured view.
    if (!getView().containsKey(myid)) {
        throw new RuntimeException("My id " + myid + " not in the peer list");
    }
    // Restore the in-memory database from snapshots/txn logs on disk.
    loadDataBase();
    startServerCnxnFactory();
    try {
        // Start the AdminServer; by default it is browsable at
        // http://localhost:8080/commands/
        adminServer.start();
    } catch (AdminServerException e) {
        // Non-fatal: the peer can run without the AdminServer.
        LOG.warn("Problem starting AdminServer", e);
        System.out.println(e);
    }
    // Prepare leader election (initial vote + election algorithm).
    startLeaderElection();
    // QuorumPeer extends Thread, so this launches the peer's main loop.
    super.start();
}
/**
 * Loads the database from disk and reconciles the persisted epoch files
 * (currentEpoch / acceptedEpoch) with the epoch implied by the newest
 * zxid in the restored data tree. Missing epoch files are created with a
 * default derived from that zxid (expected only on version upgrades);
 * inconsistent epochs abort startup.
 */
private void loadDataBase() {
    try {
        // Load snapshots and transaction logs from disk into memory.
        zkDb.loadDataBase();

        // load the epochs
        // Highest zxid applied to the restored in-memory data tree.
        long lastProcessedZxid = zkDb.getDataTree().lastProcessedZxid;
        // Epoch encoded in the high bits of that zxid.
        long epochOfZxid = ZxidUtils.getEpochFromZxid(lastProcessedZxid);
        try {
            // Epoch persisted by the last run of this server.
            currentEpoch = readLongFromFile(CURRENT_EPOCH_FILENAME);
        } catch(FileNotFoundException e) {
            // pick a reasonable epoch number
            // this should only happen once when moving to a
            // new code version
            currentEpoch = epochOfZxid;
            LOG.info(CURRENT_EPOCH_FILENAME
                    + " not found! Creating with a reasonable default of {}. This should only happen when you are upgrading your installation",
                    currentEpoch);
            writeLongToFile(CURRENT_EPOCH_FILENAME, currentEpoch);
        }
        // A zxid from an epoch newer than currentEpoch means the epoch file
        // is inconsistent with the log — refuse to start.
        if (epochOfZxid > currentEpoch) {
            throw new IOException("The current epoch, " + ZxidUtils.zxidToString(currentEpoch) + ", is older than the last zxid, " + lastProcessedZxid);
        }
        try {
            acceptedEpoch = readLongFromFile(ACCEPTED_EPOCH_FILENAME);
        } catch(FileNotFoundException e) {
            // pick a reasonable epoch number
            // this should only happen once when moving to a
            // new code version
            acceptedEpoch = epochOfZxid;
            LOG.info(ACCEPTED_EPOCH_FILENAME
                    + " not found! Creating with a reasonable default of {}. This should only happen when you are upgrading your installation",
                    acceptedEpoch);
            writeLongToFile(ACCEPTED_EPOCH_FILENAME, acceptedEpoch);
        }
        // acceptedEpoch may never lag behind currentEpoch.
        if (acceptedEpoch < currentEpoch) {
            throw new IOException("The accepted epoch, " + ZxidUtils.zxidToString(acceptedEpoch) + " is less than the current epoch, " + ZxidUtils.zxidToString(currentEpoch));
        }
    } catch(IOException ie) {
        LOG.error("Unable to load database on disk", ie);
        throw new RuntimeException("Unable to run quorum server ", ie);
    }
}
/**
 * Prepares leader election: while in the LOOKING state, seeds the election
 * with a vote for this server, then instantiates the configured election
 * algorithm. Only the legacy UDP-based algorithm (electionType 0) needs
 * the responder thread; the default (3, FastLeaderElection) does not.
 *
 * @throws RuntimeException if the initial vote cannot be built (wrapping
 *         the underlying IOException) or the UDP socket cannot be opened
 */
synchronized public void startLeaderElection() {
    try {
        // While LOOKING, vote for ourselves with our latest zxid/epoch.
        if (getPeerState() == ServerState.LOOKING) {
            currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch());
        }
    } catch(IOException e) {
        // FIX: chain the cause so the original failure is not lost;
        // the original code only propagated e.getMessage().
        RuntimeException re = new RuntimeException(e.getMessage(), e);
        re.setStackTrace(e.getStackTrace());
        throw re;
    }

    // electionType 0 is the deprecated UDP-based LeaderElection; the
    // default configuration (3) skips this branch entirely.
    if (electionType == 0) {
        try {
            // Open the UDP socket used to answer "who is the leader" queries.
            udpSocket = new DatagramSocket(getQuorumAddress().getPort());
            // The responder only answers requests for this node's view of
            // the current leader.
            responder = new ResponderThread();
            responder.start();
        } catch (SocketException e) {
            throw new RuntimeException(e);
        }
    }
    // Instantiate (and for type 3, start) the election algorithm.
    this.electionAlg = createElectionAlgorithm(electionType);
}
/**
 * @deprecated As of release 3.4.0, this class has been deprecated, since
 * it is used with one of the udp-based versions of leader election, which
 * we are also deprecating.
 *
 * This class simply responds to requests for the current leader of this
 * node.
 * <p>
 * The request contains just an xid generated by the requestor.
 * <p>
 * The response has the xid, the id of this server, the id of the leader,
 * and the zxid of the leader.
 *
 *
 */
@Deprecated
class ResponderThread extends ZooKeeperThread {
    ResponderThread() {
        super("ResponderThread");
    }

    // Loop flag; presumably cleared by the enclosing peer on shutdown —
    // the writer is not visible in this excerpt.
    volatile boolean running = true;

    @Override
    public void run() {
        try {
            // Backing array shared by the receive packet and the response
            // buffer, so the reply is built in place over the request.
            byte b[] = new byte[36];
            ByteBuffer responseBuffer = ByteBuffer.wrap(b);
            DatagramPacket packet = new DatagramPacket(b, b.length);
            while (running) {
                // Block until a UDP request arrives.
                udpSocket.receive(packet);
                if (packet.getLength() != 4) {
                    // A valid request is exactly one 4-byte xid.
                    LOG.warn("Got more than just an xid! Len = "
                            + packet.getLength());
                } else {
                    responseBuffer.clear();
                    responseBuffer.getInt(); // Skip the xid
                    responseBuffer.putLong(myid);
                    Vote current = getCurrentVote();
                    // Fill the response according to this peer's role.
                    switch (getPeerState()) {
                    case LOOKING:
                        // Report our current vote (candidate id + zxid).
                        responseBuffer.putLong(current.getId());
                        responseBuffer.putLong(current.getZxid());
                        break;
                    case LEADING:
                        // We are the leader: report our id and the zxid of
                        // the last proposal we issued.
                        responseBuffer.putLong(myid);
                        try {
                            long proposed;
                            synchronized(leader) {
                                proposed = leader.lastProposed;
                            }
                            responseBuffer.putLong(proposed);
                        } catch (NullPointerException npe) {
                            // This can happen in state transitions,
                            // just ignore the request
                        }
                        break;
                    case FOLLOWING:
                        // Report the leader we follow and our follower zxid.
                        responseBuffer.putLong(current.getId());
                        try {
                            responseBuffer.putLong(follower.getZxid());
                        } catch (NullPointerException npe) {
                            // This can happen in state transitions,
                            // just ignore the request
                        }
                        break;
                    case OBSERVING:
                        // Do nothing, Observers keep themselves to
                        // themselves.
                        break;
                    }
                    packet.setData(b);
                    // Send the response back to the requestor.
                    udpSocket.send(packet);
                }
                // Reset so the next receive can use the full buffer.
                packet.setLength(b.length);
            }
        } catch (RuntimeException e) {
            LOG.warn("Unexpected runtime exception in ResponderThread",e);
        } catch (IOException e) {
            LOG.warn("Unexpected IO exception in ResponderThread",e);
        } finally {
            LOG.warn("QuorumPeer responder thread exited");
        }
    }
}
核心leader选举逻辑:
/**
 * Builds the leader election implementation selected by the config.
 * Types 0-2 are deprecated legacy algorithms; type 3 (the default) is
 * FastLeaderElection backed by a QuorumCnxManager.
 *
 * @param electionAlgorithm configured election type (0-3)
 * @return the election instance, or {@code null} if the connection
 *         manager's listener was unexpectedly absent (type 3 only)
 */
@SuppressWarnings("deprecation")
protected Election createElectionAlgorithm(int electionAlgorithm){
    Election le = null;

    //TODO: use a factory rather than a switch
    switch (electionAlgorithm) {
    case 0:
        le = new LeaderElection(this);
        break;
    case 1:
        le = new AuthFastLeaderElection(this);
        break;
    case 2:
        le = new AuthFastLeaderElection(this, true);
        break;
    case 3:
        QuorumCnxManager qcm = createCnxnManager();
        // Swap in the fresh manager; halt any previous one (can exist when
        // leader election is restarted).
        QuorumCnxManager oldQcm = qcmRef.getAndSet(qcm);
        if (oldQcm != null) {
            LOG.warn("Clobbering already-set QuorumCnxManager (restarting leader election?)");
            oldQcm.halt();
        }
        QuorumCnxManager.Listener listener = qcm.listener;
        if (listener == null) {
            LOG.error("Null listener when initializing cnx manager");
        } else {
            listener.start();
            FastLeaderElection fle = new FastLeaderElection(this, qcm);
            // Launch the fast-leader-election threads.
            fle.start();
            le = fle;
        }
        break;
    default:
        assert false;
    }
    return le;
}
fle.start()最终会调用Messenger的start方法,启动发送工作线程和接收工作线程。
/**
 * This method starts the sender and receiver threads.
 */
public void start() {
    // Delegates to Messenger.start(), which launches the WorkerSender
    // and WorkerReceiver threads.
    this.messenger.start();
}
/**
 * Starts instances of WorkerSender and WorkerReceiver
 * (each wrapped in its own dedicated thread).
 */
void start(){
    // Thread wrapping the WorkerSender runnable.
    this.wsThread.start();
    // Thread wrapping the WorkerReceiver runnable.
    this.wrThread.start();
}
// WorkerSender drains the election send queue and hands each pending
// notification to process() for delivery to the other peers.
class WorkerSender extends ZooKeeperThread {
    volatile boolean stop;
    QuorumCnxManager manager;

    WorkerSender(QuorumCnxManager manager){
        super("WorkerSender");
        this.stop = false;
        this.manager = manager;
    }

    public void run() {
        while (!stop) {
            try {
                // Wait up to 3s for an outbound message; time out so the
                // stop flag is re-checked periodically.
                ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
                if (m != null) {
                    process(m);
                }
            } catch (InterruptedException e) {
                // Interrupt means shutdown: leave the loop.
                break;
            }
        }
        LOG.info("WorkerSender is down");
    }
}
// WorkerReceiver不停从recvQueue中拉取任务处理
class WorkerReceiver extends ZooKeeperThread {
volatile boolean stop;
QuorumCnxManager manager;
WorkerReceiver(QuorumCnxManager manager) {
super("WorkerReceiver");
this.stop = false;
this.manager = manager;
}
public void run() {
Message response;
while (!stop) {
// Sleeps on receive
try {
// 从ArrayBlockingQueue<Message> recvQueue中拉取Message
response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);
if(response == null) continue;
final int capacity = response.buffer.capacity();
// The current protocol and two previous generations all send at least 28 bytes
if (capacity < 28) {
LOG.error("Got a short response from server {}: {}", response.sid, capacity);
continue;
}
......
快速选举算法具体实现逻辑下一篇文章再介绍吧!感谢您的阅读!