查看zookeeper服务端启动脚本
或者直接查看项目下启动脚本
ZooKeeper服务器的启动,大体可以分为以下五个步骤:配置文件解析、初始化数据管理器、初始化网络I/O管理器、数据恢复、对外服务。
预启动
统一由QuorumPeerMain作为启动类
/**
 * Entry point for the replicated (quorum) server. The configuration file name
 * is expected on the command line.
 *
 * <p>The process always terminates through {@link System#exit(int)}:
 * 0 on a normal return, 1 on an unexpected exception, 2 on invalid arguments
 * or invalid configuration, 3 when the data directory cannot be accessed and
 * 4 when the AdminServer cannot be started.
 *
 * @param args path to the configfile
 */
public static void main(String[] args) {
    final QuorumPeerMain peerMain = new QuorumPeerMain();
    // Exit status reported to the OS; stays 0 unless a handler below overrides it.
    int exitCode = 0;
    try {
        peerMain.initializeAndRun(args);
    } catch (IllegalArgumentException e) {
        LOG.error("Invalid arguments, exiting abnormally", e);
        LOG.info(USAGE);
        System.err.println(USAGE);
        exitCode = 2;
    } catch (ConfigException e) {
        LOG.error("Invalid config, exiting abnormally", e);
        System.err.println("Invalid config, exiting abnormally");
        exitCode = 2;
    } catch (DatadirException e) {
        LOG.error("Unable to access datadir, exiting abnormally", e);
        System.err.println("Unable to access datadir, exiting abnormally");
        exitCode = 3;
    } catch (AdminServerException e) {
        LOG.error("Unable to start AdminServer, exiting abnormally", e);
        System.err.println("Unable to start AdminServer, exiting abnormally");
        exitCode = 4;
    } catch (Exception e) {
        LOG.error("Unexpected exception, exiting abnormally", e);
        exitCode = 1;
    }
    if (exitCode == 0) {
        LOG.info("Exiting normally");
    }
    System.exit(exitCode);
}
QuorumPeerMain默认构造没有啥逻辑,只是构造了一个对象
调用initializeAndRun方法
解析配置文件zoo.cfg
/**
 * Parses the configuration, starts the data-directory purge task and then
 * launches the server either in quorum (cluster) mode or in standalone mode.
 *
 * @param args command-line arguments; when exactly one argument is given it
 *             is treated as the path of the configuration file (zoo.cfg)
 * @throws ConfigException declared by the configuration parsing / startup calls
 * @throws IOException declared by the configuration parsing / startup calls
 * @throws AdminServerException declared by the startup calls
 */
protected void initializeAndRun(String[] args) throws ConfigException, IOException, AdminServerException {
    // Parse zoo.cfg; args[0] is used as the path of the configuration file.
    QuorumPeerConfig config = new QuorumPeerConfig();
    final boolean hasConfigFile = args.length == 1;
    if (hasConfigFile) {
        config.parse(args[0]);
    }

    // Start and schedule the purge task: the DatadirCleanupManager cleans up
    // old transaction logs and snapshot files on a timer.
    DatadirCleanupManager cleanupManager = new DatadirCleanupManager(
            config.getDataDir(),
            config.getDataLogDir(),
            config.getSnapRetainCount(),
            config.getPurgeInterval());
    cleanupManager.start();

    // Decide between cluster mode and standalone mode.
    final boolean distributed = hasConfigFile && config.isDistributed();
    if (!distributed) {
        // Standalone mode: there is only one server in the quorum.
        LOG.warn("Either no config or no quorum defined in config, running " + " in standalone mode");
        ZooKeeperServerMain.main(args);
        return;
    }

    // Cluster mode.
    runFromConfig(config);
}
根据isDistributed判断是单机模式还是集群模式
启动单机模式
ZooKeeperServerMain是单机模式下的ZooKeeper服务器最主要的类
// org.apache.zookeeper.server.ZooKeeperServerMain
/**
 * Starts up the standalone ZooKeeper server.
 *
 * <p>The process always terminates through {@link System#exit(int)}:
 * 0 on a normal return, 1 on an unexpected exception, 2 on invalid arguments
 * or invalid configuration, 3 when the data directory cannot be accessed and
 * 4 when the AdminServer cannot be started.
 *
 * @param args the configfile or the port datadir [ticktime]
 */
public static void main(String[] args) {
    final ZooKeeperServerMain serverMain = new ZooKeeperServerMain();
    // Exit status reported to the OS; stays 0 unless a handler below overrides it.
    int exitCode = 0;
    try {
        serverMain.initializeAndRun(args);
    } catch (IllegalArgumentException e) {
        LOG.error("Invalid arguments, exiting abnormally", e);
        LOG.info(USAGE);
        System.err.println(USAGE);
        exitCode = 2;
    } catch (ConfigException e) {
        LOG.error("Invalid config, exiting abnormally", e);
        System.err.println("Invalid config, exiting abnormally");
        exitCode = 2;
    } catch (DatadirException e) {
        LOG.error("Unable to access datadir, exiting abnormally", e);
        System.err.println("Unable to access datadir, exiting abnormally");
        exitCode = 3;
    } catch (AdminServerException e) {
        LOG.error("Unable to start AdminServer, exiting abnormally", e);
        System.err.println("Unable to start AdminServer, exiting abnormally");
        exitCode = 4;
    } catch (Exception e) {
        LOG.error("Unexpected exception, exiting abnormally", e);
        exitCode = 1;
    }
    if (exitCode == 0) {
        LOG.info("Exiting normally");
    }
    System.exit(exitCode);
}
再次进行配置参数的解析并执行服务器启动
/**
 * Registers the log4j MBeans (best effort), parses the server configuration
 * from the command-line arguments and starts the server from it.
 *
 * @param args either a single configuration file path, or the positional
 *             arguments accepted by {@code ServerConfig.parse(String[])}
 * @throws ConfigException declared by the configuration parsing calls
 * @throws IOException declared by the startup call
 * @throws AdminServerException declared by the startup call
 */
protected void initializeAndRun(String[] args)
        throws ConfigException, IOException, AdminServerException {
    // A JMX registration failure is only logged; startup continues.
    try {
        ManagedUtil.registerLog4jMBeans();
    } catch (JMException e) {
        LOG.warn("Unable to register log4j JMX control", e);
    }

    // Parse the configuration again, this time into a ServerConfig.
    final ServerConfig config = new ServerConfig();
    final boolean singleConfigFile = (args.length == 1);
    if (singleConfigFile) {
        config.parse(args[0]);
    } else {
        config.parse(args);
    }

    // Create and run the server.
    runFromConfig(config);
}
创建服务器实例ZooKeeperServer
/**
 * Creates the ZooKeeperServer instance and runs it from a ServerConfig.
 *
 * <p>ZooKeeperServer is the core entity class of the standalone ZooKeeper
 * server. The server instance is created first; the following steps then
 * initialize it, including the connection factories, the in-memory database
 * and the request processors.
 *
 * <p>This method blocks on a shutdown latch until the registered shutdown
 * handler releases it, then shuts everything down. The transaction/snapshot
 * log is closed in a {@code finally} block.
 *
 * @param config ServerConfig to use.
 * @throws IOException declared by the factory/log calls made here
 * @throws AdminServerException declared by the AdminServer calls made here
 */
public void runFromConfig(ServerConfig config)
throws IOException, AdminServerException {
LOG.info("Starting server");
FileTxnSnapLog txnLog = null;
try {
// Note that this thread isn't going to be doing anything else,
// so rather than spawning another thread, we will just call
// run() in this thread.
// create a file logger url from the command line args
// Create ZooKeeper's data manager, FileTxnSnapLog.
/*
 * FileTxnSnapLog is the layer between the upper ZooKeeper server and the
 * underlying data storage. It offers the operations on data files, covering
 * both transaction log files and snapshot files.
 * It is created from the snapshot directory (dataDir) and the transaction
 * log directory (dataLogDir) parsed from zoo.cfg.
 */
txnLog = new FileTxnSnapLog(config.dataLogDir, config.dataDir);
// Create the ZooKeeperServer object
final ZooKeeperServer zkServer = new ZooKeeperServer(txnLog,
config.tickTime, config.minSessionTimeout, config.maxSessionTimeout, null);
// Hand the server's ServerStats (statistics holder) to the txn/snapshot log
txnLog.setServerStats(zkServer.serverStats());
// Registers shutdown handler which will be used to know the
// server error or shutdown state changes.
final CountDownLatch shutdownLatch = new CountDownLatch(1);
zkServer.registerServerShutdownHandler(
new ZooKeeperServerShutdownHandler(shutdownLatch));
// Start Admin server
adminServer = AdminServerFactory.createAdminServer();
adminServer.setZooKeeperServer(zkServer);
adminServer.start();
boolean needStartZKServer = true;
if (config.getClientPortAddress() != null) {
// Create the server-side network connection factory (ServerCnxnFactory)
cnxnFactory = ServerCnxnFactory.createFactory();
// Configure the connection factory (non-secure)
cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), false);
// Start the connection factory's main thread
cnxnFactory.startup(zkServer);
// zkServer has been started. So we don't need to start it again in secureCnxnFactory.
needStartZKServer = false;
}
if (config.getSecureClientPortAddress() != null) {
// Same steps for the secure client port; the server itself is started here
// only if the plain factory above did not already start it.
secureCnxnFactory = ServerCnxnFactory.createFactory();
secureCnxnFactory.configure(config.getSecureClientPortAddress(), config.getMaxClientCnxns(), true);
secureCnxnFactory.startup(zkServer, needStartZKServer);
}
// Container manager; its check interval and per-minute limit can be overridden
// through the two system properties below (defaults: 1 minute and 10000).
containerManager = new ContainerManager(zkServer.getZKDatabase(), zkServer.firstProcessor,
Integer.getInteger("znode.container.checkIntervalMs", (int) TimeUnit.MINUTES.toMillis(1)),
Integer.getInteger("znode.container.maxPerMinute", 10000)
);
containerManager.start();
// Watch status of ZooKeeper server. It will do a graceful shutdown
// if the server is not running or hits an internal error.
shutdownLatch.await();
shutdown();
// Wait for the connection factories to finish before shutting the server down.
if (cnxnFactory != null) {
cnxnFactory.join();
}
if (secureCnxnFactory != null) {
secureCnxnFactory.join();
}
if (zkServer.canShutdown()) {
zkServer.shutdown(true);
}
} catch (InterruptedException e) {
// warn, but generally this is ok
// NOTE(review): the interrupt status is not restored here — confirm that no
// caller relies on Thread.interrupted() after this method returns.
LOG.warn("Server interrupted", e);
} finally {
if (txnLog != null) {
txnLog.close();
}
}
}
创建ServerCnxnFactory
ServerCnxnFactory 服务端网络连接器工厂
// org.apache.zookeeper.server.ServerCnxnFactory
/**
 * Instantiates the server connection factory implementation.
 *
 * <p>The implementation class is taken from the system property named by
 * {@code ZOOKEEPER_SERVER_CNXN_FACTORY}; when the property is not set,
 * {@code NIOServerCnxnFactory} is used. The class is instantiated reflectively
 * through its no-argument constructor.
 *
 * @return a new connection factory instance
 * @throws IOException if the class cannot be loaded or instantiated; the
 *                     underlying failure is attached as the cause
 */
public static ServerCnxnFactory createFactory() throws IOException {
    // In early versions ZooKeeper used only its own NIO implementation; Netty
    // was introduced starting with 3.4.0. The system property selects whether
    // ZooKeeper's own NIO factory or the Netty-based one is used.
    final String serverCnxnFactoryName = System.getProperty(
            ZOOKEEPER_SERVER_CNXN_FACTORY, NIOServerCnxnFactory.class.getName());
    try {
        ServerCnxnFactory serverCnxnFactory = (ServerCnxnFactory) Class.forName(serverCnxnFactoryName)
                .getDeclaredConstructor().newInstance();
        LOG.info("Using {} as server connection factory", serverCnxnFactoryName);
        return serverCnxnFactory;
    } catch (Exception e) {
        // Chain the cause through the constructor instead of a separate initCause() call.
        throw new IOException("Couldn't instantiate " + serverCnxnFactoryName, e);
    }
}
NIOServerCnxnFactory implements a multi-threaded ServerCnxnFactory using NIO non-blocking socket calls.
Communication between threads is handled via queues.
- 1 accept thread, which accepts new connections and assigns to a selector thread
- 1-N selector threads, each of which selects on 1/N of the connections. The reason the factory supports more than one selector thread is that with large numbers of connections, select() itself can become a performance bottleneck.
- 0-M socket I/O worker threads, which perform basic socket reads and writes. If configured with 0 worker threads, the selector threads do the socket I/O directly.
- 1 connection expiration thread, which closes idle connections; this is necessary to expire connections on which no session is established.
Typical (default) thread counts are: on a 32 core machine, 1 accept thread, 1 connection expiration thread, 4 selector threads, and 64 worker threads.
NIOServerCnxnFactory中会定义多个线程来处理连接和执行读写操作:
1个accept thread,用于接收新的连接,然后赋值给一个selector thread
1到N个selector thread,每个用于选择所有连接中的一部分(1/N),用超过1个的selector thread主要是因为可能存在大量的连接,select()方法可能会成为整个性能的一个瓶颈
0到多个工作线程,主要执行基本的socket读写操作。如果配置为0,那么selector thread会直接执行socket的读写操作
1个连接过期清理线程,用于关闭空闲连接;对于那些尚未建立会话的连接,必须依靠它才能使其过期,从而减轻服务器的压力
/**
 * Configures this NIO connection factory: reads the tuning system properties,
 * creates the expiry queue, the expirer thread and the selector threads, then
 * opens and binds the non-blocking server socket and creates the accept thread.
 * No thread is started here.
 *
 * @param addr   address the server socket is bound to
 * @param maxcc  value stored as the maximum number of client connections
 * @param secure must be {@code false}; SSL is not supported by this factory
 * @throws IOException if fewer than one selector thread is configured, or as
 *                     declared by the socket/channel calls
 * @throws UnsupportedOperationException if {@code secure} is {@code true}
 */
@Override
public void configure(InetSocketAddress addr, int maxcc, boolean secure) throws IOException {
    if (secure) {
        throw new UnsupportedOperationException("SSL isn't supported in NIOServerCnxn");
    }
    configureSaslLogin();

    maxClientCnxns = maxcc;
    sessionlessCnxnTimeout = Integer.getInteger(ZOOKEEPER_NIO_SESSIONLESS_CNXN_TIMEOUT, 10000);
    // The sessionless connection timeout doubles as the expiring interval of
    // cnxnExpiryQueue. The two need not be equal, but the interval handed to
    // the ExpiryQueue constructor should not exceed the timeout.
    cnxnExpiryQueue = new ExpiryQueue<NIOServerCnxn>(sessionlessCnxnTimeout);
    expirerThread = new ConnectionExpirerThread();

    final int availableCores = Runtime.getRuntime().availableProcessors();
    // 32 cores sweet spot seems to be 4 selector threads
    final int defaultSelectorThreads = Math.max((int) Math.sqrt((float) availableCores / 2), 1);
    numSelectorThreads = Integer.getInteger(ZOOKEEPER_NIO_NUM_SELECTOR_THREADS, defaultSelectorThreads);
    if (numSelectorThreads < 1) {
        throw new IOException("numSelectorThreads must be at least 1");
    }
    numWorkerThreads = Integer.getInteger(ZOOKEEPER_NIO_NUM_WORKER_THREADS, 2 * availableCores);
    workerShutdownTimeoutMS = Long.getLong(ZOOKEEPER_NIO_SHUTDOWN_TIMEOUT, 5000);

    // Build the two variable fragments of the summary line up front.
    final String workerDescription = numWorkerThreads > 0 ? String.valueOf(numWorkerThreads) : "no";
    final String writeMode;
    if (directBufferBytes == 0) {
        writeMode = "gathered writes.";
    } else {
        writeMode = "" + (directBufferBytes / 1024) + " kB direct buffers.";
    }
    LOG.info("Configuring NIO connection handler with " + (sessionlessCnxnTimeout / 1000)
            + "s sessionless connection" + " timeout, " + numSelectorThreads + " selector thread(s), "
            + workerDescription + " worker threads, and "
            + writeMode);

    for (int threadId = 0; threadId < numSelectorThreads; ++threadId) {
        selectorThreads.add(new SelectorThread(threadId));
    }

    // Open the listening port.
    this.ss = ServerSocketChannel.open();
    ss.socket().setReuseAddress(true);
    LOG.info("binding to port " + addr);
    ss.socket().bind(addr);
    ss.configureBlocking(false);

    // Create the AcceptThread, which serves as the main thread of this factory.
    acceptThread = new AcceptThread(ss, addr, selectorThreads);
}
AcceptThread用于接收客户端的连接并交给SelectorThread进行处理
/**
 * There is a single AcceptThread which accepts new connections and assigns them
 * to a SelectorThread using a simple round-robin scheme to spread them across
 * the SelectorThreads. It enforces maximum number of connections per IP and
 * attempts to cope with running out of file descriptors by briefly sleeping
 * before retrying.
 */
private class AcceptThread extends AbstractSelectThread {
// Listening channel on which new client connections arrive.
private final ServerSocketChannel acceptSocket;
// Registration of acceptSocket with this thread's selector for OP_ACCEPT.
private final SelectionKey acceptKey;
// Logger wrapper for accept errors (presumably rate-limits repeated messages — confirm in RateLogger).
private final RateLogger acceptErrorLogger = new RateLogger(LOG);
// Unmodifiable snapshot of the selector threads taken at construction time.
private final Collection<SelectorThread> selectorThreads;
// Current position in selectorThreads; starts at the beginning of the snapshot.
private Iterator<SelectorThread> selectorIterator;
// Volatile flag, initially false; its users are in the elided part of the class.
private volatile boolean reconfiguring = false;
/**
 * Creates the accept thread and registers the server channel for OP_ACCEPT.
 *
 * @param ss              listening server channel
 * @param addr            bound address; only used here to build the thread name
 * @param selectorThreads selector threads to hand connections to; copied into
 *                        an unmodifiable list
 * @throws IOException declared by the super constructor / channel registration
 */
public AcceptThread(ServerSocketChannel ss, InetSocketAddress addr,
Set<SelectorThread> selectorThreads) throws IOException {
super("NIOServerCxnFactory.AcceptThread:" + addr);
this.acceptSocket = ss;
// "selector" is not declared in this class; presumably inherited from AbstractSelectThread.
this.acceptKey = acceptSocket.register(selector, SelectionKey.OP_ACCEPT);
this.selectorThreads = Collections
.unmodifiableList(new ArrayList<SelectorThread>(selectorThreads));
selectorIterator = this.selectorThreads.iterator();
}
// ... (remaining members elided in this excerpt)
}
/**
 * A SelectorThread takes over connections newly accepted by the AcceptThread
 * and selects for I/O readiness across them. It is the only thread that makes
 * non-threadsafe or potentially blocking calls on its selector (registering
 * new connections and reading/writing interest ops).
 *
 * A connection is assigned to one SelectorThread permanently, and no other
 * SelectorThread ever interacts with it. There are 1-N SelectorThreads, with
 * connections evenly apportioned between them.
 *
 * With a worker thread pool: when a connection has I/O to perform, the
 * SelectorThread clears its interest ops to take it out of selection and
 * schedules the I/O on a worker thread. Once the work is complete, the
 * connection is placed on the ready queue so its interest ops can be restored
 * and selection resumes.
 *
 * Without a worker thread pool: the SelectorThread performs the I/O directly.
 */
class SelectorThread extends AbstractSelectThread {
    /** Identifier of this selector thread; also part of the thread name. */
    private final int id;

    /** Queue of socket channels (presumably those handed over by the AcceptThread — confirm). */
    private final Queue<SocketChannel> acceptedQueue = new LinkedBlockingQueue<SocketChannel>();

    /** Queue of selection keys (presumably awaiting an interest-ops update — confirm). */
    private final Queue<SelectionKey> updateQueue = new LinkedBlockingQueue<SelectionKey>();

    /**
     * @param id identifier of this selector thread
     * @throws IOException declared by the super constructor
     */
    public SelectorThread(int id) throws IOException {
        super("NIOServerCxnFactory.SelectorThread-" + id);
        this.id = id;
    }
}
启动ServerCnxnFactory
// org.apache.zookeeper.server.ServerCnxnFactory
/**
 * Starts this connection factory for the given server and also starts the
 * server itself: delegates to the two-argument startup with startServer
 * set to true.
 *
 * @param zkServer the server this factory serves
 * @throws IOException declared by the delegated call
 * @throws InterruptedException declared by the delegated call
 */
public void startup(ZooKeeperServer zkServer) throws IOException, InterruptedException {
startup(zkServer, true);
}
// org.apache.zookeeper.server.NIOServerCnxnFactory
/**
 * Starts the threads of this connection factory and attaches the given server
 * to it; optionally also starts the server.
 *
 * @param zks         the server this factory serves
 * @param startServer when {@code true}, the server's local data is restored
 *                    and the server is started after the factory threads
 * @throws IOException declared by the server start calls
 * @throws InterruptedException declared by the server start calls
 */
@Override
public void startup(ZooKeeperServer zks, boolean startServer) throws IOException, InterruptedException {
    // Start the SelectorThreads, the AcceptThread and the expirer thread.
    start();
    setZooKeeperServer(zks);

    if (!startServer) {
        return;
    }
    // Restore local data.
    zks.startdata();
    // Create and start the session manager (and the rest of the server).
    zks.startup();
}
/**
 * Starts the factory's threads. The worker pool is created on first use, and
 * each selector thread, the accept thread and the expirer thread is started
 * only if it is still in the {@code NEW} state, so repeated calls do not
 * start a thread twice.
 */
@Override
public void start() {
    stopped = false;
    if (null == workerPool) {
        workerPool = new WorkerService("NIOWorker", numWorkerThreads, false);
    }

    for (SelectorThread selector : selectorThreads) {
        final boolean notStartedYet = Thread.State.NEW == selector.getState();
        if (notStartedYet) {
            selector.start();
        }
    }

    // ensure thread is started once and only once
    final boolean acceptorIsNew = Thread.State.NEW == acceptThread.getState();
    if (acceptorIsNew) {
        acceptThread.start();
    }

    final boolean expirerIsNew = Thread.State.NEW == expirerThread.getState();
    if (expirerIsNew) {
        expirerThread.start();
    }
}
/**
 * Attaches the given server to this factory and, when the server is not
 * {@code null}, registers this factory on it as either its secure or its
 * plain connection factory, depending on this factory's {@code secure} flag.
 *
 * @param zks the server to attach; may be {@code null}, in which case only
 *            the local reference is updated
 */
public final void setZooKeeperServer(ZooKeeperServer zks) {
    this.zkServer = zks;
    if (zks == null) {
        return;
    }

    if (secure) {
        zks.setSecureServerCnxnFactory(this);
    } else {
        zks.setServerCnxnFactory(this);
    }
}
恢复本地数据
/**
 * Restores the server's local data. Every time ZooKeeper starts, its data has
 * to be recovered from the local snapshot files and transaction log files.
 *
 * @throws IOException declared by the data loading call
 * @throws InterruptedException declared by the data loading call
 */
public void startdata() throws IOException, InterruptedException {
    // Create the database lazily if none has been supplied yet.
    if (zkDb == null) {
        zkDb = new ZKDatabase(this.txnLogFactory);
    }

    // Nothing to load when the database has already been initialized.
    if (zkDb.isInitialized()) {
        return;
    }
    // Process the snapshot files.
    loadData();
}
创建并启动会话管理器
/**
 * Brings the server into the running state: ensures a session tracker exists
 * and starts it, sets up the request processor chain, registers with JMX,
 * switches the state to RUNNING and finally notifies all threads waiting on
 * this server's monitor.
 */
public synchronized void startup() {
if (sessionTracker == null) {
// Create a session manager (SessionTracker)
createSessionTracker();
}
// Start the session manager
startSessionTracker();
// Initialize ZooKeeper's request processor chain
setupRequestProcessors();
// Register the JMX service; ZooKeeper exposes some server runtime information externally via JMX
registerJMX();
setState(State.RUNNING);
// Wake up every thread waiting on this object's monitor (this method is synchronized).
notifyAll();
}
初始化ZooKeeper的请求处理链
PrepRequestProcessor->SyncRequestProcessor->FinalRequestProcessor
// org.apache.zookeeper.server.ZooKeeperServer
/**
 * Builds and starts the request processor chain
 * PrepRequestProcessor -> SyncRequestProcessor -> FinalRequestProcessor.
 * The chain is wired back to front (a typical chain-of-responsibility setup);
 * the sync processor is started before the prep processor, and the prep
 * processor is published as {@code firstProcessor} before it is started.
 */
protected void setupRequestProcessors() {
    final FinalRequestProcessor tail = new FinalRequestProcessor(this);

    final SyncRequestProcessor sync = new SyncRequestProcessor(this, tail);
    sync.start();

    final PrepRequestProcessor head = new PrepRequestProcessor(this, sync);
    firstProcessor = head;
    head.start();
}
/**
 * This request processor is generally at the start of a RequestProcessor
 * chain. It sets up any transactions associated with requests that change the
 * state of the system. It counts on ZooKeeperServer to update
 * outstandingRequests, so that it can take into account transactions that are
 * in the queue to be applied when generating a transaction.
 *
 * <li>Request pre-processor of the Leader server, and also the Leader's first
 * request processor. In ZooKeeper, requests that change the server state are
 * called "transaction requests" -- typically creating nodes, updating data,
 * deleting nodes and creating sessions. {@link PrepRequestProcessor} can tell
 * whether the current client request is a transaction request. For
 * transaction requests it performs a series of pre-processing steps, such as
 * creating the request's transaction header and transaction body, session
 * checks, ACL checks and version checks.</li>
 */
public class PrepRequestProcessor extends ZooKeeperCriticalThread implements RequestProcessor {
private static final Logger LOG = LoggerFactory.getLogger(PrepRequestProcessor.class);
// True when the system property zookeeper.skipACL equals "yes" (default "no").
static boolean skipACL;
static {
skipACL = System.getProperty("zookeeper.skipACL", "no").equals("yes");
if (skipACL) {
LOG.info("zookeeper.skipACL==\"yes\", ACL checks will be skipped");
}
}
/**
 * this is only for testing purposes. should never be used otherwise
 */
private static boolean failCreate = false;
// Requests waiting to be handled; consumed by run() via a blocking take().
LinkedBlockingQueue<Request> submittedRequests = new LinkedBlockingQueue<Request>();
// Next processor in the chain; pRequest() forwards each request to it.
private final RequestProcessor nextProcessor;
ZooKeeperServer zks;
/**
 * Creates the processor thread; the thread name embeds the server id and the
 * client port of the given server.
 *
 * @param zks           the owning server
 * @param nextProcessor processor that requests are forwarded to
 */
public PrepRequestProcessor(ZooKeeperServer zks, RequestProcessor nextProcessor) {
super("ProcessThread(sid:" + zks.getServerId() + " cport:" + zks.getClientPort() + "):",
zks.getZooKeeperServerListener());
this.nextProcessor = nextProcessor;
this.zks = zks;
}
/**
 * method for tests to set failCreate
 *
 * @param b new value of failCreate
 */
public static void setFailCreate(boolean b) {
failCreate = b;
}
/**
 * Thread body: takes requests from submittedRequests one at a time and hands
 * each to pRequest(), until the requestOfDeath marker is taken. Any exception
 * ends the loop and is passed to handleException().
 */
@Override
public void run() {
try {
while (true) {
// Blocks until a request is available.
Request request = submittedRequests.take();
long traceMask = ZooTrace.CLIENT_REQUEST_TRACE_MASK;
if (request.type == OpCode.ping) {
traceMask = ZooTrace.CLIENT_PING_TRACE_MASK;
}
if (LOG.isTraceEnabled()) {
ZooTrace.logRequest(LOG, traceMask, 'P', request, "");
}
// The shared requestOfDeath instance is the signal to stop this thread.
if (Request.requestOfDeath == request) {
break;
}
// Process the request
pRequest(request);
}
} catch (RequestProcessorException e) {
if (e.getCause() instanceof XidRolloverException) {
LOG.info(e.getCause().getMessage());
}
handleException(this.getName(), e);
} catch (Exception e) {
handleException(this.getName(), e);
}
LOG.info("PrepRequestProcessor exited loop!");
}
/**
 * This method will be called inside the ProcessRequestThread, which is a
 * singleton, so there will be a single thread calling this code.
 *
 * @param request the request to pre-process and forward
 */
protected void pRequest(Request request) throws RequestProcessorException {
// LOG.info("Prep>>> cxid = " + request.cxid + " type = " +
// request.type + " id = 0x" + Long.toHexString(request.sessionId));
// Clear any header/txn left on the request before pre-processing it.
request.setHdr(null);
request.setTxn(null);
try {
switch (request.type) {
case OpCode.createContainer:
case OpCode.create:
case OpCode.create2:
CreateRequest create2Request = new CreateRequest();
pRequest2Txn(request.type, zks.getNextZxid(), request, create2Request, true);
break;
case OpCode.createTTL:
CreateTTLRequest createTtlRequest = new CreateTTLRequest();
pRequest2Txn(request.type, zks.getNextZxid(), request, createTtlRequest, true);
break;
case OpCode.deleteContainer:
case OpCode.delete:
DeleteRequest deleteRequest = new DeleteRequest();
pRequest2Txn(request.type, zks.getNextZxid(), request, deleteRequest, true);
break;
case OpCode.setData:
SetDataRequest setDataRequest = new SetDataRequest();
pRequest2Txn(request.type, zks.getNextZxid(), request, setDataRequest, true);
break;
// ... (remaining cases and handling logic elided in this excerpt)
request.zxid = zks.getZxid();
// Invoke the next processor in the chain
nextProcessor.processRequest(request);
}
}
/**
 * Transaction log processor -- records transaction requests in the
 * transaction log file and also triggers ZooKeeper's data snapshots.
 * This RequestProcessor logs requests to disk. It batches the requests to do
 * the io efficiently. The request is not passed to the next RequestProcessor
 * until its log has been synced to disk.
 *
 * SyncRequestProcessor is used in 3 different cases
 * <li>1. Leader</li>-
 * <li>Sync request to disk and forward it to AckRequestProcessor which send ack
 * back to itself.</li>
 * <li>2. Follower</li>-
 * <li>Sync request to disk and forward request to SendAckRequestProcessor which
 * send the packets to leader. SendAckRequestProcessor is flushable which allow
 * us to force push packets to leader.</li>
 * <li>3. Observer</li>-
 * <li>Sync committed request to disk (received as INFORM packet). It never send
 * ack back to the leader, so the nextProcessor will be null. This change the
 * semantic of txnlog on the observer since it only contains committed
 * txns.</li>
 */
public class SyncRequestProcessor extends ZooKeeperCriticalThread implements RequestProcessor {
private static final Logger LOG = LoggerFactory.getLogger(SyncRequestProcessor.class);
private final ZooKeeperServer zks;
// Incoming requests; consumed by run().
private final LinkedBlockingQueue<Request> queuedRequests = new LinkedBlockingQueue<Request>();
// Next processor in the chain; may be null (see the null check in run()).
private final RequestProcessor nextProcessor;
// The most recently started snapshot thread, or null if none was started yet.
private Thread snapInProcess = null;
// Set to true in the constructor and to false when run() leaves its loop.
volatile private boolean running;
/**
 * Transactions that have been written and are waiting to be flushed to disk.
 * Basically this is the list of SyncItems whose callbacks will be invoked after
 * flush returns successfully.
 */
private final LinkedList<Request> toFlush = new LinkedList<Request>();
// Source of the random offset that staggers snapshots (see run()).
private final Random r = new Random();
/**
 * The number of log entries to log before starting a snapshot
 */
private static int snapCount = ZooKeeperServer.getSnapCount();
// Marker request; taking it from the queue ends the run() loop.
private final Request requestOfDeath = Request.requestOfDeath;
/**
 * Creates the sync thread; the thread name embeds the server id.
 *
 * @param zks           the owning server
 * @param nextProcessor processor that requests are handed to; may be null
 */
public SyncRequestProcessor(ZooKeeperServer zks, RequestProcessor nextProcessor) {
super("SyncThread:" + zks.getServerId(), zks.getZooKeeperServerListener());
this.zks = zks;
this.nextProcessor = nextProcessor;
running = true;
}
/**
 * Thread body: appends queued requests to the transaction log, batches them
 * in toFlush, flushes when the queue runs dry or the batch exceeds 1000
 * entries, and periodically rolls the log and starts a snapshot thread.
 * Ends when requestOfDeath is taken or a Throwable is caught.
 */
@Override
public void run() {
try {
int logCount = 0;
// we do this in an attempt to ensure that not all of the servers
// in the ensemble take a snapshot at the same time
// NOTE(review): Random.nextInt requires a positive bound, so this assumes
// snapCount >= 2 — confirm that getSnapCount() enforces it.
int randRoll = r.nextInt(snapCount / 2);
while (true) {
Request si = null;
if (toFlush.isEmpty()) {
// Nothing is waiting for a flush: block until a request arrives.
si = queuedRequests.take();
} else {
// Something is waiting for a flush: do not block; if the queue is
// empty right now, flush the pending batch instead.
si = queuedRequests.poll();
if (si == null) {
flush(toFlush);
continue;
}
}
if (si == requestOfDeath) {
break;
}
if (si != null) {
// track the number of records written to the log
// Transaction log recording
/*
 * Every transaction request is recorded in the form of a transaction log
 * entry. This processor sits on the request processing chain of both the
 * Leader and the Follower servers, and its logging behavior is identical
 * on both.
 * After the transaction has been logged, each Follower sends an ACK to the
 * Leader to indicate that it has finished logging the transaction, so that
 * the Leader can tally the votes for each transaction request.
 */
if (zks.getZKDatabase().append(si)) {
logCount++;
/*
 * Decide whether a data snapshot is needed.
 * After every transaction log write, ZooKeeper checks whether a snapshot
 * should be taken now. In theory a snapshot would start after snapCount
 * transactions, but because snapshots affect overall performance, all
 * machines of an ensemble should not snapshot at the same moment. So the
 * implementation does not follow that rule strictly; it uses a
 * "half plus random" strategy and snapshots when:
 * logCount > (snapCount / 2 + randRoll)
 * where logCount is the number of transaction log entries recorded since
 * the last roll and randRoll is a random number in [0, snapCount / 2).
 */
if (logCount > (snapCount / 2 + randRoll)) {
randRoll = r.nextInt(snapCount / 2);
// roll the log (switch to a new transaction log file)
zks.getZKDatabase().rollLog();
// take a snapshot
if (snapInProcess != null && snapInProcess.isAlive()) {
LOG.warn("Too busy to snap, skipping");
} else {
/*
 * Create the asynchronous snapshot thread.
 * A separate thread is used so that taking the snapshot does not hold up
 * ZooKeeper's main processing flow.
 */
snapInProcess = new ZooKeeperThread("Snapshot Thread") {
public void run() {
try {
zks.takeSnapshot();
} catch (Exception e) {
LOG.warn("Unexpected exception", e);
}
}
};
snapInProcess.start();
}
logCount = 0;
}
} else if (toFlush.isEmpty()) {
// optimization for read heavy workloads
// iff this is a read, and there are no pending
// flushes (writes), then just pass this to the next
// processor
if (nextProcessor != null) {
nextProcessor.processRequest(si);
if (nextProcessor instanceof Flushable) {
((Flushable) nextProcessor).flush();
}
}
continue;
}
// Queue the request for the next flush; flush early once the batch exceeds 1000.
toFlush.add(si);
if (toFlush.size() > 1000) {
flush(toFlush);
}
}
}
} catch (Throwable t) {
handleException(this.getName(), t);
} finally {
running = false;
}
LOG.info("SyncRequestProcessor exited!");
}
}
package org.apache.zookeeper.server;
/**
* 最后请求处理器 -- 该处理器主要用来进行客户端请求返回之前的收尾工作,包括创建客户端请求的响应;针对事务请求,
* 该处理器还会负责将事务应用到内存数据库中去
* This Request processor actually applies any transaction associated with a
* request and services any queries. It is always at the end of a
* RequestProcessor chain (hence the name), so it does not have a nextProcessor
* member.
*
* This RequestProcessor counts on ZooKeeperServer to populate the
* outstandingRequests member of ZooKeeperServer.
*/
public class FinalRequestProcessor implements RequestProcessor {
private static final Logger LOG = LoggerFactory.getLogger(FinalRequestProcessor.class);
ZooKeeperServer zks;
public FinalRequestProcessor(ZooKeeperServer zks) {
this.zks = zks;
}
public void processRequest(Request request) {
if (LOG.isDebugEnabled()) {
LOG.debug("Processing request:: " + request);
}
// request.addRQRec(">final");
long traceMask = ZooTrace.CLIENT_REQUEST_TRACE_MASK;
if (request.type == OpCode.ping) {
traceMask = ZooTrace.SERVER_PING_TRACE_MASK;
}
if (LOG.isTraceEnabled()) {
ZooTrace.logRequest(LOG, traceMask, 'E', request, "");
}
ProcessTxnResult rc = null;
/*
* 请求流转到FinalRequestProcessor处理器后,也就接近请求处理的尾声了.处理器会首先检查outstandingChanges
* 队列中请求的有效性,如果发现这些请求已经落后于正在处理的请求,那么就直接从outstandingChanges队列中移除
*/
synchronized (zks.outstandingChanges) {
// Need to process local session requests
rc = zks.processTxn(request);
// request.hdr is set for write requests, which are the only ones
// that add to outstandingChanges.
if (request.getHdr() != null) {
TxnHeader hdr = request.getHdr();
Record txn = request.getTxn();
long zxid = hdr.getZxid();
while (!zks.outstandingChanges.isEmpty() && zks.outstandingChanges.peek().zxid <= zxid) {
ChangeRecord cr = zks.outstandingChanges.remove();
if (cr.zxid < zxid) {
LOG.warn("Zxid outstanding " + cr.zxid + " is less than current " + zxid);
}
if (zks.outstandingChangesForPath.get(cr.path) == cr) {
zks.outstandingChangesForPath.remove(cr.path);
}
}
}
// do not add non quorum packets to the queue.
if (request.isQuorum()) {
/*
* 事务应用
* 在之前的请求处理逻辑中,我们仅仅是将该事务请求记录到了事务日志中去,而内存数据库中的状态尚未变更
* 因此,在这个环节,我们需要将事务变更应用到内存数据库中.但是需要注意的一点是,对于“会话创建”这类事务请求,
* ZooKeeper做了特殊处理——因为在ZooKeeper内存中,会话的管理都是由SessionTracker负责的,而在
* 会话创建的步骤9中,ZooKeeper已经将会话信息注册到了SessionTracker中,因此此处无须对内存数据库
* 做任何处理,只需要再次向SessionTracker进行会话注册即可.
*/
zks.getZKDatabase().addCommittedProposal(request);
}
}
// ZOOKEEPER-558:
// In some cases the server does not close the connection (e.g., closeconn
// buffer
// was not being queued — ZOOKEEPER-558) properly. This happens, for example,
// when the client closes the connection. The server should still close the
// session, though.
// Calling closeSession() after losing the cnxn, results in the client close
// session response being dropped.
if (request.type == OpCode.closeSession && connClosedByClient(request)) {
// We need to check if we can close the session id.
// Sometimes the corresponding ServerCnxnFactory could be null because
// we are just playing diffs from the leader.
if (closeSession(zks.serverCnxnFactory, request.sessionId)
|| closeSession(zks.secureServerCnxnFactory, request.sessionId)) {
return;
}
}
if (request.cnxn == null) {
return;
}
ServerCnxn cnxn = request.cnxn;
String lastOp = "NA";
zks.decInProcess();
Code err = Code.OK;
Record rsp = null;
try {
if (request.getHdr() != null && request.getHdr().getType() == OpCode.error) {
/*
* When local session upgrading is disabled, leader will reject the ephemeral
* node creation due to session expire. However, if this is the follower that
* issue the request, it will have the correct error code, so we should use that
* and report to user
*/
if (request.getException() != null) {
throw request.getException();
} else {
throw KeeperException
.create(KeeperException.Code.get(((ErrorTxn) request.getTxn()).getErr()));
}
}
KeeperException ke = request.getException();
if (ke != null && request.type != OpCode.multi) {
throw ke;
}
if (LOG.isDebugEnabled()) {
LOG.debug("{}", request);
}
switch (request.type) {
case OpCode.ping: {
zks.serverStats().updateLatency(request.createTime);
lastOp = "PING";
cnxn.updateStatsForResponse(request.cxid, request.zxid, lastOp, request.createTime,
Time.currentElapsedTime());
cnxn.sendResponse(new ReplyHeader(-2, zks.getZKDatabase().getDataTreeLastProcessedZxid(), 0),
null, "response");
return;
}
case OpCode.createSession: {
zks.serverStats().updateLatency(request.createTime);
lastOp = "SESS";
// 20 统计处理
cnxn.updateStatsForResponse(request.cxid, request.zxid, lastOp, request.createTime,
Time.currentElapsedTime());
zks.finishSessionInit(request.cnxn, true);
return;
}
case OpCode.multi: {
lastOp = "MULT";
rsp = new MultiResponse();
for (ProcessTxnResult subTxnResult : rc.multiResult) {
OpResult subResult;
switch (subTxnResult.type) {
case OpCode.check:
subResult = new CheckResult();
break;
case OpCode.create:
subResult = new CreateResult(subTxnResult.path);
break;
case OpCode.create2:
case OpCode.createTTL:
case OpCode.createContainer:
subResult = new CreateResult(subTxnResult.path, subTxnResult.stat);
break;
case OpCode.delete:
case OpCode.deleteContainer:
subResult = new DeleteResult();
break;
case OpCode.setData:
subResult = new SetDataResult(subTxnResult.stat);
break;
case OpCode.error:
subResult = new ErrorResult(subTxnResult.err);
break;
default:
throw new IOException("Invalid type of op");
}
((MultiResponse) rsp).add(subResult);
}
break;
}
case OpCode.create: {
lastOp = "CREA";
rsp = new CreateResponse(rc.path);
err = Code.get(rc.err);
break;
}
case OpCode.create2:
case OpCode.createTTL:
case OpCode.createContainer: {
lastOp = "CREA";
rsp = new Create2Response(rc.path, rc.stat);
err = Code.get(rc.err);
break;
}
case OpCode.delete:
case OpCode.deleteContainer: {
lastOp = "DELE";
err = Code.get(rc.err);
break;
}
case OpCode.setData: {
lastOp = "SETD";
rsp = new SetDataResponse(rc.stat);
err = Code.get(rc.err);
break;
}
case OpCode.reconfig: {
lastOp = "RECO";
rsp = new GetDataResponse(
((QuorumZooKeeperServer) zks).self.getQuorumVerifier().toString().getBytes(),
rc.stat);
err = Code.get(rc.err);
break;
}
case OpCode.setACL: {
lastOp = "SETA";
rsp = new SetACLResponse(rc.stat);
err = Code.get(rc.err);
break;
}
case OpCode.closeSession: {
lastOp = "CLOS";
err = Code.get(rc.err);
break;
}
case OpCode.sync: {
lastOp = "SYNC";
SyncRequest syncRequest = new SyncRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, syncRequest);
rsp = new SyncResponse(syncRequest.getPath());
break;
}
case OpCode.check: {
lastOp = "CHEC";
rsp = new SetDataResponse(rc.stat);
err = Code.get(rc.err);
break;
}
case OpCode.exists: {
lastOp = "EXIS";
// TODO we need to figure out the security requirement for this!
ExistsRequest existsRequest = new ExistsRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, existsRequest);
String path = existsRequest.getPath();
if (path.indexOf('\0') != -1) {
throw new KeeperException.BadArgumentsException();
}
Stat stat = zks.getZKDatabase().statNode(path, existsRequest.getWatch() ? cnxn : null);
rsp = new ExistsResponse(stat);
break;
}
case OpCode.getData: {
lastOp = "GETD";
GetDataRequest getDataRequest = new GetDataRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, getDataRequest);
DataNode n = zks.getZKDatabase().getNode(getDataRequest.getPath());
if (n == null) {
throw new KeeperException.NoNodeException();
}
PrepRequestProcessor.checkACL(zks, zks.getZKDatabase().aclForNode(n), ZooDefs.Perms.READ,
request.authInfo);
Stat stat = new Stat();
byte b[] = zks.getZKDatabase().getData(getDataRequest.getPath(), stat,
getDataRequest.getWatch() ? cnxn : null);
rsp = new GetDataResponse(b, stat);
break;
}
case OpCode.setWatches: {
lastOp = "SETW";
SetWatches setWatches = new SetWatches();
// XXX We really should NOT need this!!!!
request.request.rewind();
ByteBufferInputStream.byteBuffer2Record(request.request, setWatches);
long relativeZxid = setWatches.getRelativeZxid();
zks.getZKDatabase().setWatches(relativeZxid, setWatches.getDataWatches(),
setWatches.getExistWatches(), setWatches.getChildWatches(), cnxn);
break;
}
case OpCode.getACL: {
lastOp = "GETA";
GetACLRequest getACLRequest = new GetACLRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, getACLRequest);
DataNode n = zks.getZKDatabase().getNode(getACLRequest.getPath());
if (n == null) {
throw new KeeperException.NoNodeException();
}
PrepRequestProcessor.checkACL(zks, zks.getZKDatabase().aclForNode(n),
ZooDefs.Perms.READ | ZooDefs.Perms.ADMIN, request.authInfo);
Stat stat = new Stat();
List<ACL> acl = zks.getZKDatabase().getACL(getACLRequest.getPath(), stat);
try {
PrepRequestProcessor.checkACL(zks, zks.getZKDatabase().aclForNode(n), ZooDefs.Perms.ADMIN,
request.authInfo);
rsp = new GetACLResponse(acl, stat);
} catch (KeeperException.NoAuthException e) {
List<ACL> acl1 = new ArrayList<ACL>(acl.size());
for (ACL a : acl) {
if ("digest".equals(a.getId().getScheme())) {
Id id = a.getId();
Id id1 = new Id(id.getScheme(), id.getId().replaceAll(":.*", ":x"));
acl1.add(new ACL(a.getPerms(), id1));
} else {
acl1.add(a);
}
}
rsp = new GetACLResponse(acl1, stat);
}
break;
}
case OpCode.getChildren: {
lastOp = "GETC";
GetChildrenRequest getChildrenRequest = new GetChildrenRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, getChildrenRequest);
DataNode n = zks.getZKDatabase().getNode(getChildrenRequest.getPath());
if (n == null) {
throw new KeeperException.NoNodeException();
}
PrepRequestProcessor.checkACL(zks, zks.getZKDatabase().aclForNode(n), ZooDefs.Perms.READ,
request.authInfo);
List<String> children = zks.getZKDatabase().getChildren(getChildrenRequest.getPath(), null,
getChildrenRequest.getWatch() ? cnxn : null);
rsp = new GetChildrenResponse(children);
break;
}
case OpCode.getChildren2: {
lastOp = "GETC";
GetChildren2Request getChildren2Request = new GetChildren2Request();
ByteBufferInputStream.byteBuffer2Record(request.request, getChildren2Request);
Stat stat = new Stat();
DataNode n = zks.getZKDatabase().getNode(getChildren2Request.getPath());
if (n == null) {
throw new KeeperException.NoNodeException();
}
PrepRequestProcessor.checkACL(zks, zks.getZKDatabase().aclForNode(n), ZooDefs.Perms.READ,
request.authInfo);
List<String> children = zks.getZKDatabase().getChildren(getChildren2Request.getPath(), stat,
getChildren2Request.getWatch() ? cnxn : null);
rsp = new GetChildren2Response(children, stat);
break;
}
case OpCode.checkWatches: {
lastOp = "CHKW";
CheckWatchesRequest checkWatches = new CheckWatchesRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, checkWatches);
WatcherType type = WatcherType.fromInt(checkWatches.getType());
boolean containsWatcher = zks.getZKDatabase().containsWatcher(checkWatches.getPath(), type,
cnxn);
if (!containsWatcher) {
String msg = String.format(Locale.ENGLISH, "%s (type: %s)", checkWatches.getPath(), type);
throw new KeeperException.NoWatcherException(msg);
}
break;
}
case OpCode.removeWatches: {
lastOp = "REMW";
RemoveWatchesRequest removeWatches = new RemoveWatchesRequest();
ByteBufferInputStream.byteBuffer2Record(request.request, removeWatches);
WatcherType type = WatcherType.fromInt(removeWatches.getType());
boolean removed = zks.getZKDatabase().removeWatch(removeWatches.getPath(), type, cnxn);
if (!removed) {
String msg = String.format(Locale.ENGLISH, "%s (type: %s)", removeWatches.getPath(),
type);
throw new KeeperException.NoWatcherException(msg);
}
break;
}
}
} catch (SessionMovedException e) {
// session moved is a connection level error, we need to tear
// down the connection otw ZOOKEEPER-710 might happen
// ie client on slow follower starts to renew session, fails
// before this completes, then tries the fast follower (leader)
// and is successful, however the initial renew is then
// successfully fwd/processed by the leader and as a result
// the client and leader disagree on where the client is most
// recently attached (and therefore invalid SESSION MOVED generated)
cnxn.sendCloseSession();
return;
} catch (KeeperException e) {
err = e.code();
} catch (Exception e) {
// log at error level as we are returning a marshalling
// error to the user
LOG.error("Failed to process " + request, e);
StringBuilder sb = new StringBuilder();
ByteBuffer bb = request.request;
bb.rewind();
while (bb.hasRemaining()) {
sb.append(Integer.toHexString(bb.get() & 0xff));
}
LOG.error("Dumping request buffer: 0x" + sb.toString());
err = Code.MARSHALLINGERROR;
}
long lastZxid = zks.getZKDatabase().getDataTreeLastProcessedZxid();
ReplyHeader hdr = new ReplyHeader(request.cxid, lastZxid, err.intValue());
zks.serverStats().updateLatency(request.createTime);
cnxn.updateStatsForResponse(request.cxid, lastZxid, lastOp, request.createTime,
Time.currentElapsedTime());
try {
cnxn.sendResponse(hdr, rsp, "response");
if (request.type == OpCode.closeSession) {
cnxn.sendCloseSession();
}
} catch (IOException e) {
LOG.error("FIXMSG", e);
}
}
/**
 * Closes the session with the given id through the supplied connection factory.
 *
 * @param serverCnxnFactory factory to delegate to; may be {@code null}
 * @param sessionId id of the session to close
 * @return {@code false} when no factory is supplied, otherwise the result of the
 *         factory's own {@code closeSession} call
 */
private boolean closeSession(ServerCnxnFactory serverCnxnFactory, long sessionId) {
    // Short-circuit keeps the null factory from ever being dereferenced.
    return serverCnxnFactory != null && serverCnxnFactory.closeSession(sessionId);
}
/**
 * Reports whether the given request no longer has a connection attached.
 *
 * @param request the request to inspect
 * @return {@code true} when {@code request.cnxn} is {@code null}
 */
private boolean connClosedByClient(Request request) {
return request.cnxn == null;
}
/**
 * Shuts this processor down. The body only writes a log line; there is no
 * further processor to forward the shutdown to.
 */
public void shutdown() {
// we are the final link in the chain
LOG.info("shutdown of request processor complete");
}
}
请求处理逻辑
AcceptThread等待连接
// Selector threads that newly accepted connections are handed to (see doAccept()).
private final Collection<SelectorThread> selectorThreads;
// Cursor over selectorThreads; doAccept() re-creates it once exhausted so that
// connections are assigned round-robin.
private Iterator<SelectorThread> selectorIterator;
/**
 * Accept loop. Calls select() repeatedly until the thread is stopped or the
 * accept socket is closed. An exception thrown by a single iteration is logged
 * and ignored so that the loop keeps running. On exit the selector is closed
 * and, unless a reconfiguration is in progress, the whole factory is stopped.
 */
public void run() {
try {
while (!stopped && !acceptSocket.socket().isClosed()) {
try {
select();
} catch (RuntimeException e) {
LOG.warn("Ignoring unexpected runtime exception", e);
} catch (Exception e) {
LOG.warn("Ignoring unexpected exception", e);
}
}
} finally {
closeSelector();
// This will wake up the selector threads, and tell the
// worker thread pool to begin shutdown.
// Skipped while reconfiguring, so the factory itself stays up.
if (!reconfiguring) {
NIOServerCnxnFactory.this.stop();
}
LOG.info("accept thread exitted run method");
}
}
/**
 * Blocks on the selector and then walks the ready keys. An acceptable key
 * triggers {@code doAccept()}; any other ready operation is only logged.
 * An IOException raised while selecting is logged and swallowed.
 */
private void select() {
    try {
        selector.select();
        Iterator<SelectionKey> ready = selector.selectedKeys().iterator();
        while (!stopped && ready.hasNext()) {
            SelectionKey current = ready.next();
            // Take the key out of the selected set so it is not seen again.
            ready.remove();
            if (!current.isValid()) {
                continue;
            }
            if (!current.isAcceptable()) {
                LOG.warn("Unexpected ops in accept select " + current.readyOps());
                continue;
            }
            // doAccept assigns the new connection to a SelectorThread for I/O.
            if (!doAccept()) {
                // Nothing could be pulled off the accept queue: pause briefly
                // so file descriptors can be freed and this thread does not
                // spin in a tight loop.
                pauseAccept(10);
            }
        }
    } catch (IOException e) {
        LOG.warn("Ignoring IOException while selecting", e);
    }
}
/**
 * Accept new socket connections. Enforces maximum number of connections per
 * client IP address. Round-robin assigns to selector thread for handling.
 * Returns whether pulled a connection off the accept queue or not. If
 * encounters an error attempts to fast close the socket.
 *
 * <p>If {@code accept()} yields no channel at all, nothing was pulled off the
 * accept queue, so {@code false} is returned and the caller can back off
 * instead of hitting a NullPointerException.
 *
 * @return whether was able to accept a connection or not
 */
private boolean doAccept() {
    boolean accepted = false;
    SocketChannel sc = null;
    try {
        sc = acceptSocket.accept();
        if (sc == null) {
            // NOTE(review): acceptSocket is used with a selector, so it is
            // presumably in non-blocking mode, where accept() returns null when
            // no connection is pending. Report "not accepted" rather than
            // dereferencing null below.
            return false;
        }
        accepted = true;
        InetAddress ia = sc.socket().getInetAddress();
        int cnxncount = getClientCnxnCount(ia);
        // A non-positive maxClientCnxns disables the per-address limit.
        if (maxClientCnxns > 0 && cnxncount >= maxClientCnxns) {
            throw new IOException("Too many connections from " + ia + " - max is " + maxClientCnxns);
        }
        LOG.debug("Accepted socket connection from " + sc.socket().getRemoteSocketAddress());
        sc.configureBlocking(false);
        // Round-robin assign this connection to a selector thread
        if (!selectorIterator.hasNext()) {
            selectorIterator = selectorThreads.iterator();
        }
        SelectorThread selectorThread = selectorIterator.next();
        if (!selectorThread.addAcceptedConnection(sc)) {
            throw new IOException("Unable to add connection to selector queue"
                    + (stopped ? " (shutdown in progress)" : ""));
        }
        acceptErrorLogger.flush();
    } catch (IOException e) {
        // accept, maxClientCnxns, configureBlocking
        acceptErrorLogger.rateLimitLog("Error accepting new connection: " + e.getMessage());
        fastCloseSock(sc);
    }
    return accepted;
}
SelectorThread处理连接
/**
 * Queues a freshly accepted connection so that the selector thread itself
 * registers it; only that thread modifies the keys registered with its selector.
 *
 * @param accepted the newly accepted channel
 * @return {@code true} if the channel was queued and the selector woken up,
 *         {@code false} if this thread is stopped or the queue refused it
 */
public boolean addAcceptedConnection(SocketChannel accepted) {
    // offer() is only attempted while the thread is still running.
    if (!stopped && acceptedQueue.offer(accepted)) {
        wakeupSelector();
        return true;
    }
    return false;
}
/**
 * The main loop for the thread selects() on the connections and dispatches
 * ready I/O work requests, then registers all pending newly accepted
 * connections and updates any interest ops on the queue.
 *
 * An exception thrown by one iteration is logged and ignored. Once the loop
 * ends, the remaining keys and queued channels are cleaned up; the selector is
 * closed and the factory stopped in the finally block.
 */
public void run() {
try {
while (!stopped) {
try {
select();
processAcceptedConnections();
processInterestOpsUpdateRequests();
} catch (RuntimeException e) {
LOG.warn("Ignoring unexpected runtime exception", e);
} catch (Exception e) {
LOG.warn("Ignoring unexpected exception", e);
}
}
// Close connections still pending on the selector. Any others
// with in-flight work, let drain out of the work queue.
for (SelectionKey key : selector.keys()) {
NIOServerCnxn cnxn = (NIOServerCnxn) key.attachment();
if (cnxn.isSelectable()) {
cnxn.close();
}
cleanupSelectionKey(key);
}
// Channels that were accepted but never registered are closed directly.
SocketChannel accepted;
while ((accepted = acceptedQueue.poll()) != null) {
fastCloseSock(accepted);
}
updateQueue.clear();
} finally {
closeSelector();
// This will wake up the accept thread and the other selector
// threads, and tell the worker thread pool to begin shutdown.
NIOServerCnxnFactory.this.stop();
LOG.info("selector thread exitted run method");
}
}
/**
 * Blocks on the selector, then visits the ready keys in shuffled order.
 * Invalid keys are cleaned up, readable or writable keys go to
 * {@code handleIO}, and anything else is logged. An IOException raised while
 * selecting is logged and swallowed.
 */
private void select() {
    try {
        selector.select();
        Set<SelectionKey> readySet = selector.selectedKeys();
        // Iterate over a shuffled copy so the live set can be modified safely.
        ArrayList<SelectionKey> shuffled = new ArrayList<SelectionKey>(readySet);
        Collections.shuffle(shuffled);
        for (SelectionKey current : shuffled) {
            if (stopped) {
                break;
            }
            readySet.remove(current);
            if (!current.isValid()) {
                cleanupSelectionKey(current);
                continue;
            }
            boolean ioReady = current.isReadable() || current.isWritable();
            if (ioReady) {
                handleIO(current);
            } else {
                LOG.warn("Unexpected ops in select " + current.readyOps());
            }
        }
    } catch (IOException e) {
        LOG.warn("Ignoring IOException while selecting", e);
    }
}
进行IO处理
/**
 * Hands the I/O for the connection attached to {@code key} over to the worker
 * pool. When no worker thread pool is in use, the I/O runs directly on this
 * thread.
 *
 * @param key the selection key whose connection has I/O ready
 */
private void handleIO(SelectionKey key) {
    NIOServerCnxn connection = (NIOServerCnxn) key.attachment();
    // Take the key out of selection while its connection is being processed.
    connection.disableSelectable();
    key.interestOps(0);
    touchCnxn(connection);
    workerPool.schedule(new IOWorkRequest(this, key));
}
任务调度
/**
 * Schedules work on the thread assigned to {@code id}. The thread is chosen
 * with a single mod operation on the number of worker threads. Without a
 * worker thread pool the work is executed directly on the calling thread.
 *
 * @param workRequest the work to run; its {@code cleanup()} is invoked instead
 *                    when the service is stopped or the executor rejects it
 * @param id          value used to choose the worker; may be negative
 */
public void schedule(WorkRequest workRequest, long id) {
    if (stopped) {
        workRequest.cleanup();
        return;
    }

    ScheduledWorkRequest task = new ScheduledWorkRequest(workRequest);
    int poolSize = workers.size();
    if (poolSize <= 0) {
        // No worker thread pool: do the work right here and wait for it.
        task.run();
        return;
    }

    try {
        // Fold negative ids into [0, poolSize - 1] as well.
        int slot = ((int) (id % poolSize) + poolSize) % poolSize;
        workers.get(slot).execute(task);
    } catch (RejectedExecutionException e) {
        LOG.warn("ExecutorService rejected execution", e);
        workRequest.cleanup();
    }
}
/**
 * Runnable wrapper around a {@code WorkRequest}: runs the request, or cleans
 * it up if the service was stopped while the request sat on the queue.
 */
private class ScheduledWorkRequest implements Runnable {
    private final WorkRequest workRequest;

    ScheduledWorkRequest(WorkRequest workRequest) {
        this.workRequest = workRequest;
    }

    @Override
    public void run() {
        try {
            if (!stopped) {
                workRequest.doWork();
            } else {
                // Stopped while the request was waiting on the queue.
                workRequest.cleanup();
            }
        } catch (Exception e) {
            LOG.warn("Unexpected exception", e);
            workRequest.cleanup();
        }
    }
}
任务模板
/**
 * Runs the pending I/O for this request's connection and then hands the key
 * back to the selector thread so that selection can resume.
 *
 * @throws InterruptedException propagated from the connection's {@code doIO}
 */
public void doWork() throws InterruptedException {
    if (!key.isValid()) {
        selectorThread.cleanupSelectionKey(key);
        return;
    }

    boolean ioReady = key.isReadable() || key.isWritable();
    if (ioReady) {
        cnxn.doIO(key);

        // A shutdown may have started, or doIO() may have closed the connection.
        if (stopped) {
            cnxn.close();
            return;
        }
        if (!key.isValid()) {
            selectorThread.cleanupSelectionKey(key);
            return;
        }
        touchCnxn(cnxn);
    }

    // The connection may be selected again from here on.
    cnxn.enableSelectable();

    // Ask the selector thread to resume selecting on the current interest ops,
    // which the I/O just performed may have changed.
    boolean queued = selectorThread.addInterestOpsUpdateRequest(key);
    if (!queued) {
        cnxn.close();
    }
}
/**
 * Handles read/write IO on connection.
 * 1. The I/O layer receives requests from the client.
 * In ZooKeeper an NIOServerCnxn instance maintains each client connection, and all
 * communication between client and server is handled by NIOServerCnxn: it receives
 * every request from the client and reads the complete request content off the
 * underlying network I/O.
 *
 * @param k the selection key whose readiness (readable and/or writable) is served
 */
void doIO(SelectionKey k) throws InterruptedException {
try {
if (isSocketOpen() == false) {
LOG.warn("trying to do i/o on a null socket for session:0x" + Long.toHexString(sessionId));
return;
}
if (k.isReadable()) {
int rc = sock.read(incomingBuffer);
if (rc < 0) {
throw new EndOfStreamException("Unable to read additional data from client sessionid 0x"
+ Long.toHexString(sessionId) + ", likely client has closed socket");
}
// Only act once the current buffer has been filled completely.
if (incomingBuffer.remaining() == 0) {
boolean isPayload;
if (incomingBuffer == lenBuffer) { // start of next request
incomingBuffer.flip();
isPayload = readLength(k);
incomingBuffer.clear();
} else {
// continuation
isPayload = true;
}
if (isPayload) { // not the case for 4letterword
readPayload();
} else {
// four letter words take care
// need not do anything else
return;
}
}
}
if (k.isWritable()) {
handleWrite(k);
// Not initialized and no read/write interest left: close the connection
// via CloseRequestException.
if (!initialized && !getReadInterest() && !getWriteInterest()) {
throw new CloseRequestException("responded to info probe");
}
}
} catch (CancelledKeyException e) {
// ... handling of the various exception types (abridged in this excerpt)
close();
}
}
/**
 * Read the request payload (everything following the length prefix). Once the
 * buffer is full the packet is dispatched and the connection switches back to
 * reading the next length prefix.
 *
 * @throws IOException          if the client closed the stream or reading fails
 * @throws InterruptedException propagated from connect-request processing
 */
private void readPayload() throws IOException, InterruptedException {
    if (incomingBuffer.remaining() != 0) {
        // The socket is non-blocking, so reading here is fine.
        int bytesRead = sock.read(incomingBuffer);
        if (bytesRead < 0) {
            throw new EndOfStreamException("Unable to read additional data from client sessionid 0x"
                    + Long.toHexString(sessionId) + ", likely client has closed socket");
        }
    }

    if (incomingBuffer.remaining() != 0) {
        // Payload still incomplete; nothing more to do in this call.
        return;
    }

    packetReceived();
    incomingBuffer.flip();
    // 2. Decide whether this is the client's "session creation" request.
    if (initialized) {
        // Already initialized: handle as an ordinary request.
        readRequest();
    } else {
        // This NIOServerCnxn has not been initialized yet, so the packet is
        // treated as the "session creation" (connect) request.
        readConnectRequest();
    }
    // Go back to reading the length prefix of the next packet.
    lenBuffer.clear();
    incomingBuffer = lenBuffer;
}
/**
 * Passes the buffered connect request to the server and marks this connection
 * as initialized.
 *
 * @throws IOException          if the ZooKeeper server is not running
 * @throws InterruptedException propagated from the server's connect processing
 */
private void readConnectRequest() throws IOException, InterruptedException {
    if (isZKServerRunning()) {
        // 3. The server deserializes the ConnectRequest.
        zkServer.processConnectRequest(this, incomingBuffer);
        initialized = true;
        return;
    }
    throw new IOException("ZooKeeperServer not running");
}
/**
 * Passes the buffered packet of this connection to the server's processPacket.
 *
 * @throws IOException propagated from {@code zkServer.processPacket}
 */
private void readRequest() throws IOException {
zkServer.processPacket(this, incomingBuffer);
}
ZooKeeperServer.processPacket
/**
 * Deserializes the request header from the packet and dispatches on its type:
 * auth packets are answered here (the connection is closed on failure), sasl
 * packets are answered here, and every other type is wrapped in a Request and
 * submitted to the request processor chain.
 *
 * @param cnxn           the connection the packet arrived on
 * @param incomingBuffer the packet bytes, starting with the request header
 * @throws IOException if deserialization or sending a response fails
 */
public void processPacket(ServerCnxn cnxn, ByteBuffer incomingBuffer) throws IOException {
// We have the request, now process and setup for next
InputStream bais = new ByteBufferInputStream(incomingBuffer);
BinaryInputArchive bia = BinaryInputArchive.getArchive(bais);
RequestHeader h = new RequestHeader();
h.deserialize(bia, "header");
// slice() creates a view that starts at the buffer's current position.
// NOTE(review): presumably reading the header above advanced the position, so
// the slice begins at the request body (the original comment's "will not be
// pointing to the start of the txn" looks like a typo for "will now").
incomingBuffer = incomingBuffer.slice();
if (h.getType() == OpCode.auth) {
LOG.info("got auth packet " + cnxn.getRemoteSocketAddress());
AuthPacket authPacket = new AuthPacket();
ByteBufferInputStream.byteBuffer2Record(incomingBuffer, authPacket);
String scheme = authPacket.getScheme();
AuthenticationProvider ap = ProviderRegistry.getProvider(scheme);
// Defaults to failure; only a provider returning OK changes it.
Code authReturn = KeeperException.Code.AUTHFAILED;
if (ap != null) {
try {
authReturn = ap.handleAuthentication(cnxn, authPacket.getAuth());
} catch (RuntimeException e) {
LOG.warn("Caught runtime exception from AuthenticationProvider: " + scheme + " due to "
+ e);
authReturn = KeeperException.Code.AUTHFAILED;
}
}
if (authReturn == KeeperException.Code.OK) {
if (LOG.isDebugEnabled()) {
LOG.debug("Authentication succeeded for scheme: " + scheme);
}
LOG.info("auth success " + cnxn.getRemoteSocketAddress());
ReplyHeader rh = new ReplyHeader(h.getXid(), 0, KeeperException.Code.OK.intValue());
cnxn.sendResponse(rh, null, null);
} else {
if (ap == null) {
LOG.warn("No authentication provider for scheme: " + scheme + " has "
+ ProviderRegistry.listProviders());
} else {
LOG.warn("Authentication failed for scheme: " + scheme);
}
// send a response...
ReplyHeader rh = new ReplyHeader(h.getXid(), 0, KeeperException.Code.AUTHFAILED.intValue());
cnxn.sendResponse(rh, null, null);
// ... and close connection
cnxn.sendBuffer(ServerCnxnFactory.closeConn);
cnxn.disableRecv();
}
// Auth packets return here, so incrOutstandingRequests below is not reached.
return;
} else {
if (h.getType() == OpCode.sasl) {
Record rsp = processSasl(incomingBuffer, cnxn);
ReplyHeader rh = new ReplyHeader(h.getXid(), 0, KeeperException.Code.OK.intValue());
cnxn.sendResponse(rh, rsp, "response"); // not sure about 3rd arg..what is it?
// Sasl packets also return before incrOutstandingRequests.
return;
} else {
Request si = new Request(cnxn, cnxn.getSessionId(), h.getXid(), h.getType(), incomingBuffer,
cnxn.getAuthInfo());
si.setOwner(ServerCnxn.me);
// Always treat packet from the client as a possible
// local request.
setLocalSessionFlag(si);
// Submit the request to the processor chain
submitRequest(si);
}
}
// Reached only for requests that were submitted to the processor chain.
cnxn.incrOutstandingRequests(h);
}
提交请求到处理链进行处理
/**
 * Submits a request to the request processor chain. If the chain has not been
 * set up yet, the caller waits while the server is in its INITIAL state and
 * fails if the server is not RUNNING afterwards.
 *
 * @param si the request to process
 * @throws RuntimeException with message "Not started" if no processor chain is
 *                          available or the server is not in the RUNNING state
 */
public void submitRequest(Request si) {
    if (firstProcessor == null) {
        synchronized (this) {
            try {
                // Every request goes through the processor chain, so wait for
                // the chain to be set up; the state becomes RUNNING afterwards.
                while (state == State.INITIAL) {
                    wait(1000);
                }
            } catch (InterruptedException e) {
                LOG.warn("Unexpected interruption", e);
            }
            boolean started = firstProcessor != null && state == State.RUNNING;
            if (!started) {
                throw new RuntimeException("Not started");
            }
        }
    }

    try {
        touch(si.cnxn);
        if (!Request.isValid(si.type)) {
            LOG.warn("Received packet at server of unknown type " + si.type);
            new UnimplementedRequestProcessor().processRequest(si);
        } else {
            // Hand the request to the first processor of the chain.
            firstProcessor.processRequest(si);
            if (si.cnxn != null) {
                incInProcess();
            }
        }
    } catch (MissingSessionException e) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Dropping request: " + e.getMessage());
        }
    } catch (RequestProcessorException e) {
        LOG.error("Unable to process request:" + e.getMessage(), e);
    }
}
后续就是各个处理器进行处理了。
总结一下整个流程:
- 首先AcceptThread等待客户端的连接请求,当有新连接到来时,会以轮询(round-robin)的方式从selectorThreads中选出一个SelectorThread并调用其addAcceptedConnection,也就是将该连接存放到这个SelectorThread的acceptedQueue队列中。
acceptedQueue = new LinkedBlockingQueue();
- SelectorThread线程会通过selector.selectedKeys()获取就绪事件并进行IO处理(handleIO(key)):将其抽象成一个IOWorkRequest任务,再交给WorkerService进行调度执行
- 每个IOWorkRequest都包含有NIOServerCnxn连接对象,会转交NIOServerCnxn进行IO处理,最后会由ZooKeeperServer将请求进行反序列化并提交给调用链依次进行处理
/**
 * Creates a work request for the connection attached to the given key.
 *
 * @param selectorThread the selector thread that owns the key
 * @param key            the selection key; its attachment is taken as the NIOServerCnxn
 */
IOWorkRequest(SelectorThread selectorThread, SelectionKey key) {
this.selectorThread = selectorThread;
this.key = key;
this.cnxn = (NIOServerCnxn) key.attachment();
}
- 首先是请求预处理器PrepRequestProcessor识别出当前客户端请求是否是事务请求。对于事务请求,处理器会对其进行一系列预处理,诸如创建请求事务头、事务体、会话检查、ACL检查和版本检查等;然后是SyncRequestProcessor进行持久化记录;最后由FinalRequestProcessor进行收尾工作,包括创建客户端请求的响应。针对事务请求,该处理器还会负责将事务应用到内存数据库中去。