Hadoop's communication layer uses its own RPC implementation rather than an existing, more complex RPC framework; the homegrown version is easier to control and more compact. In Hadoop, RPC carries the communication and data transfer between client and server: the heartbeat traffic between DataNode and NameNode, and the communication between JobTracker and TaskTracker.
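Before walking through the server internals, it helps to see the surface this machinery serves. The sketch below is illustrative only: EchoProtocol, its echo() method, and the address are hypothetical names invented for this article, while RPC.getProxy, RPC.stopProxy, and VersionedProtocol are the real Hadoop 1.x ipc API:

import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.VersionedProtocol;

// Hypothetical protocol: every Hadoop 1.x RPC protocol extends VersionedProtocol.
interface EchoProtocol extends VersionedProtocol {
  long versionID = 1L;
  String echo(String msg);  // parameter/return types must be RPC-serializable
}

// Client side: obtain a proxy and invoke the remote method like a local call.
// (The server side passes an EchoProtocol implementation to RPC.getServer.)
class EchoClient {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    EchoProtocol proxy = (EchoProtocol) RPC.getProxy(EchoProtocol.class,
        EchoProtocol.versionID, new InetSocketAddress("localhost", 9000), conf);
    System.out.println(proxy.echo("hello"));
    RPC.stopProxy(proxy);  // release the underlying connection
  }
}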
ipc.Server Source Code Analysis
ipc.Server is an abstract class, extended by ipc.RPC.Server. When the NameNode starts up, it initializes an ipc.RPC.Server, and with it the ipc.Server machinery.
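Schematically, the relationship between the two classes looks like this (an abbreviated orientation sketch with trimmed signatures, not the actual source):

// Abbreviated sketch, not the real class bodies:
public abstract class Server {                    // org.apache.hadoop.ipc.Server
  public synchronized void start() { /* ... */ }  // generic NIO server machinery
  public abstract Writable call(Class<?> protocol, Writable param,
                                long receiveTime) throws IOException;
}

public class RPC {                                // org.apache.hadoop.ipc.RPC
  public static class Server extends org.apache.hadoop.ipc.Server {
    // implements call() by reflectively invoking the wrapped protocol instance
  }
}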
Initializing and Starting the Server
The server is initialized in the NameNode's initialize() method:
/**
 * Initialize name-node.
 *
 * @param conf the configuration
 */
private void initialize(Configuration conf) throws IOException {
  InetSocketAddress socAddr = NameNode.getAddress(conf);
  UserGroupInformation.setConfiguration(conf);
  SecurityUtil.login(conf, DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY,
      DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, socAddr.getHostName());
  int handlerCount = conf.getInt("dfs.namenode.handler.count", 10);

  // set service-level authorization security policy
  if (serviceAuthEnabled =
        conf.getBoolean(
          ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, false)) {
    ServiceAuthorizationManager.refresh(conf, new HDFSPolicyProvider());
  }

  myMetrics = NameNodeInstrumentation.create(conf);
  this.namesystem = new FSNamesystem(this, conf);

  if (UserGroupInformation.isSecurityEnabled()) {
    namesystem.activateSecretManager();
  }

  // create rpc server
  InetSocketAddress dnSocketAddr = getServiceRpcServerAddress(conf);
  if (dnSocketAddr != null) {
    int serviceHandlerCount =
      conf.getInt(DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_KEY,
                  DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_DEFAULT);
    this.serviceRpcServer = RPC.getServer(this, dnSocketAddr.getHostName(),
        dnSocketAddr.getPort(), serviceHandlerCount,
        false, conf, namesystem.getDelegationTokenSecretManager());
    this.serviceRPCAddress = this.serviceRpcServer.getListenerAddress();
    setRpcServiceServerAddress(conf);
  }
  // RPC's getServer() method supplies the server; it returns an
  // ipc.RPC.Server, assigned to an ipc.Server field:
  //   ipc.Server server = RPC.getServer(...);
  this.server = RPC.getServer(this, socAddr.getHostName(),
      socAddr.getPort(), handlerCount, false, conf, namesystem
      .getDelegationTokenSecretManager());

  // The rpc-server port can be ephemeral... ensure we have the correct info
  this.serverAddress = this.server.getListenerAddress();
  FileSystem.setDefaultUri(conf, getUri(serverAddress));
  LOG.info("Namenode up at: " + this.serverAddress);

  startHttpServer(conf);
  this.server.start();  // start the RPC server
  if (serviceRpcServer != null) {
    serviceRpcServer.start();
  }
  startTrashEmptier(conf);
}
The getServer() factory method in ipc.RPC:
/** Construct a server for a protocol implementation instance listening on a
 * port and address, with a secret manager. */
public static Server getServer(final Object instance, final String bindAddress, final int port,
                               final int numHandlers,
                               final boolean verbose, Configuration conf,
                               SecretManager<? extends TokenIdentifier> secretManager)
  throws IOException {
  return new Server(instance, conf, bindAddress, port, numHandlers, verbose, secretManager);
  // returns an ipc.RPC.Server instance
}
This yields an RPC server. Since ipc.RPC.Server is a subclass of ipc.Server, it inherits its parent's methods, so the inherited start() method can be used to start the RPC server. The start() method in ipc.Server:
/** Starts the service. Must be called before any calls will be handled. */
public synchronized void start() {
  // start the responder, the listener, and the handler threads
  responder.start();
  listener.start();
  handlers = new Handler[handlerCount];

  for (int i = 0; i < handlerCount; i++) {
    handlers[i] = new Handler(i);
    handlers[i].start();
  }
}
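Putting getServer() and start() together, a minimal server bootstrap might look like the following sketch. EchoProtocolImpl is a hypothetical implementation of the EchoProtocol interface sketched earlier, and passing null for the secret manager is an assumption made for brevity:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.Server;

// Hypothetical server bootstrap for the EchoProtocol sketched earlier.
public class EchoServer {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Object impl = new EchoProtocolImpl();  // hypothetical implementation class
    // 10 handler threads, verbose off, no delegation-token secret manager
    Server server = RPC.getServer(impl, "0.0.0.0", 9000, 10, false, conf, null);
    server.start();  // starts the responder, the listener, and the handlers
    server.join();   // block until the server is stopped
  }
}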
How the Server Handles Client Requests
The first step is establishing the connection. The server side of Hadoop RPC uses Java NIO to accept connections, so it can serve many concurrent clients. The Server uses a Listener to listen for client connections. Listener is an inner class of Server that extends Thread; its constructor is as follows:
public Listener() throws IOException {
  address = new InetSocketAddress(bindAddress, port);
  // Create a new server socket and set to non blocking mode
  acceptChannel = ServerSocketChannel.open();
  acceptChannel.configureBlocking(false);

  // Bind the server socket to the local host and port
  bind(acceptChannel.socket(), address, backlogLength);
  port = acceptChannel.socket().getLocalPort();  // Could be an ephemeral port
  // create a selector (java.nio.channels.Selector)
  selector = Selector.open();

  // Reader is an inner class of Listener that implements Runnable
  readers = new Reader[readThreads];
  readPool = Executors.newFixedThreadPool(readThreads);
  // Create and start several reader threads, to keep server response
  // latency down when many clients are sending requests.
  for (int i = 0; i < readThreads; i++) {
    Selector readSelector = Selector.open();  // each reader gets its own selector
    Reader reader = new Reader(readSelector);
    readers[i] = reader;
    // readPool is a java.util.concurrent.ExecutorService; execute() takes a
    // Runnable, and Reader implements Runnable, so passing the reader
    // instance to execute() starts the reader thread.
    readPool.execute(reader);
  }

  // Register accepts on the server socket with the selector.
  acceptChannel.register(selector, SelectionKey.OP_ACCEPT);
  this.setName("IPC Server listener on " + port);
  this.setDaemon(true);
}
Starting the Listener: its run() method:
@Override
public void run() {
  LOG.info(getName() + ": starting");
  // SERVER is a member of ipc.Server:
  //   private static final ThreadLocal<Server> SERVER = new ThreadLocal<Server>();
  // set() stores Server.this as the value in this thread's own thread-local
  // map, keyed by Thread.currentThread().
  SERVER.set(Server.this);
  while (running) {
    SelectionKey key = null;
    try {
      selector.select();
      Iterator<SelectionKey> iter = selector.selectedKeys().iterator();
      while (iter.hasNext()) {
        key = iter.next();
        iter.remove();
        try {
          if (key.isValid()) {
            if (key.isAcceptable())
              doAccept(key);  // accept the connection
          }
        } catch (IOException e) {
        }
        key = null;
      }
    } catch (OutOfMemoryError e) {
      // we can run out of memory if we have too many threads
      // log the event and sleep for a minute and give
      // some thread(s) a chance to finish
      LOG.warn("Out of Memory in server select", e);
      closeCurrentConnection(key, e);
      cleanupConnections(true);
      try { Thread.sleep(60000); } catch (Exception ie) {}
    } catch (Exception e) {
      closeCurrentConnection(key, e);
    }
    cleanupConnections(false);
  }
  LOG.info("Stopping " + this.getName());

  synchronized (this) {
    try {
      acceptChannel.close();
      selector.close();
    } catch (IOException e) { }

    selector = null;
    acceptChannel = null;

    // clean up all connections
    while (!connectionList.isEmpty()) {
      closeConnection(connectionList.remove(0));
    }
  }
}
The doAccept() method of Listener:
void doAccept(SelectionKey key) throws IOException, OutOfMemoryError {
  Connection c = null;
  ServerSocketChannel server = (ServerSocketChannel) key.channel();
  SocketChannel channel;
  while ((channel = server.accept()) != null) {  // accept the socket connection
    channel.configureBlocking(false);
    channel.socket().setTcpNoDelay(tcpNoDelay);
    Reader reader = getReader();  // pick a reader thread from the pool
    try {
      reader.startAdd();  // sets adding = true and calls readSelector.wakeup()
      // register the channel with this reader's selector; the returned
      // readKey marks the reader's interest in this channel
      SelectionKey readKey = reader.registerChannel(channel);
      // create a Connection object for this channel
      c = new Connection(readKey, channel, System.currentTimeMillis());
      readKey.attach(c);  // attach the Connection object to the readKey
      synchronized (connectionList) {
        connectionList.add(numConnections, c);
        numConnections++;
      }
      if (LOG.isDebugEnabled())
        LOG.debug("Server connection from " + c.toString() +
            "; # active connections: " + numConnections +
            "; # queued calls: " + callQueue.size());
    } finally {
      // finishAdd() sets adding = false and calls this.notify(), waking the
      // reader: every reader created in the Listener constructor is waiting
      // in wait(); see Reader's run() method below.
      reader.finishAdd();
    }
  }
}
Once woken up, the reader resumes execution; see Listener.Reader's run() method:
public void run() {
  LOG.info("Starting SocketReader");
  synchronized (this) {
    while (running) {
      SelectionKey key = null;
      try {
        readSelector.select();
        while (adding) {
          this.wait(1000);  // every reader thread waits here after starting
        }

        Iterator<SelectionKey> iter = readSelector.selectedKeys().iterator();
        while (iter.hasNext()) {
          key = iter.next();
          iter.remove();
          if (key.isValid()) {
            if (key.isReadable()) {
              doRead(key);  // once woken, the reader runs Listener's doRead()
            }
          }
          key = null;
        }
      } catch (InterruptedException e) {
        if (running) {                     // unexpected -- log it
          LOG.info(getName() + " caught: " +
                   StringUtils.stringifyException(e));
        }
      } catch (IOException ex) {
        LOG.error("Error in Reader", ex);
      }
    }
  }
}
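The startAdd()/wait()/finishAdd() handshake exists because SelectableChannel.register() cannot complete while another thread is blocked in select() on the same selector. The acceptor therefore wakes the reader, parks it in wait(), registers the channel, and then notifies. The standalone sketch below (illustrative names, not Hadoop code) isolates that pattern:

import java.nio.channels.*;

// Minimal sketch of the handoff Hadoop's Reader uses (names are illustrative).
class ReaderSketch implements Runnable {
  private final Selector readSelector;
  private volatile boolean adding = false;

  ReaderSketch(Selector s) { this.readSelector = s; }

  // Called by the acceptor thread before registering a new channel.
  public void startAdd() {
    adding = true;
    readSelector.wakeup();  // kick the reader out of select()
  }

  public synchronized SelectionKey registerChannel(SocketChannel ch)
      throws ClosedChannelException {
    return ch.register(readSelector, SelectionKey.OP_READ);
  }

  // Called by the acceptor thread once registration is done.
  public synchronized void finishAdd() {
    adding = false;
    notify();  // release the reader from wait()
  }

  @Override
  public synchronized void run() {
    try {
      while (true) {
        readSelector.select();
        while (adding) {
          wait(1000);  // park (releasing the monitor) while channels are registered
        }
        // ... drain selectedKeys() and read from ready channels ...
      }
    } catch (Exception e) {
      // sketch only: real code logs and keeps going
    }
  }
}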
Listener's doRead() method:
void doRead(SelectionKey key) throws InterruptedException {
  int count = 0;
  Connection c = (Connection)key.attachment();  // fetch the attached connection
  if (c == null) {
    return;
  }
  c.setLastContact(System.currentTimeMillis());

  try {
    // the Connection object c receives and processes the request
    count = c.readAndProcess();
  } catch (InterruptedException ieo) {
    LOG.info(getName() + ": readAndProcess caught InterruptedException", ieo);
    throw ieo;
  } catch (Exception e) {
    LOG.info(getName() + ": readAndProcess threw exception " + e + ". Count of bytes read: " + count, e);
    count = -1;  // so that the (count < 0) block is executed
  }

  if (count < 0) {
    if (LOG.isDebugEnabled())
      LOG.debug(getName() + ": disconnecting client " +
                c + ". Number of active connections: " +
                numConnections);
    closeConnection(c);
    c = null;
  }
  else {
    c.setLastContact(System.currentTimeMillis());
  }
}
Connection's readAndProcess() method:
public int readAndProcess() throws IOException, InterruptedException {
  while (true) {
    /* Read at most one RPC. If the header is not read completely yet
     * then iterate until we read first RPC or until there is no data left.
     */
    int count = -1;
    if (dataLengthBuffer.remaining() > 0) {
      count = channelRead(channel, dataLengthBuffer);
      if (count < 0 || dataLengthBuffer.remaining() > 0)
        return count;
    }

    if (!rpcHeaderRead) {
      // Every connection is expected to send the header.
      if (rpcHeaderBuffer == null) {
        rpcHeaderBuffer = ByteBuffer.allocate(2);
      }
      count = channelRead(channel, rpcHeaderBuffer);
      if (count < 0 || rpcHeaderBuffer.remaining() > 0) {
        return count;
      }
      int version = rpcHeaderBuffer.get(0);
      byte[] method = new byte[] {rpcHeaderBuffer.get(1)};
      authMethod = AuthMethod.read(new DataInputStream(
          new ByteArrayInputStream(method)));
      dataLengthBuffer.flip();
      if (!HEADER.equals(dataLengthBuffer) || version != CURRENT_VERSION) {
        // Warning is ok since this is not supposed to happen.
        LOG.warn("Incorrect header or version mismatch from " +
                 hostAddress + ":" + remotePort +
                 " got version " + version +
                 " expected version " + CURRENT_VERSION);
        return -1;
      }
      dataLengthBuffer.clear();
      if (authMethod == null) {
        throw new IOException("Unable to read authentication method");
      }
      if (isSecurityEnabled && authMethod == AuthMethod.SIMPLE) {
        AccessControlException ae = new AccessControlException(
            "Authentication is required");
        setupResponse(authFailedResponse, authFailedCall, Status.FATAL,
            null, ae.getClass().getName(), ae.getMessage());
        responder.doRespond(authFailedCall);
        throw ae;
      }
      if (!isSecurityEnabled && authMethod != AuthMethod.SIMPLE) {
        doSaslReply(SaslStatus.SUCCESS, new IntWritable(
            SaslRpcServer.SWITCH_TO_SIMPLE_AUTH), null, null);
        authMethod = AuthMethod.SIMPLE;
        // client has already sent the initial Sasl message and we
        // should ignore it. Both client and server should fall back
        // to simple auth from now on.
        skipInitialSaslHandshake = true;
      }
      if (authMethod != AuthMethod.SIMPLE) {
        useSasl = true;
      }

      rpcHeaderBuffer = null;
      rpcHeaderRead = true;
      continue;
    }

    if (data == null) {
      dataLengthBuffer.flip();
      dataLength = dataLengthBuffer.getInt();

      if (dataLength == Client.PING_CALL_ID) {
        if (!useWrap) {  // covers the !useSasl too
          dataLengthBuffer.clear();
          return 0;  // ping message
        }
      }
      if (dataLength < 0) {
        LOG.warn("Unexpected data length " + dataLength + "!! from " +
            getHostAddress());
      }
      data = ByteBuffer.allocate(dataLength);
    }

    count = channelRead(channel, data);  // read the request payload

    if (data.remaining() == 0) {
      dataLengthBuffer.clear();
      data.flip();
      if (skipInitialSaslHandshake) {
        data = null;
        skipInitialSaslHandshake = false;
        continue;
      }
      boolean isHeaderRead = headerRead;
      if (useSasl) {
        saslReadAndProcess(data.array());
      } else {
        processOneRpc(data.array());  // process the request
      }
      data = null;
      if (!isHeaderRead) {
        continue;
      }
    }
    return count;
  }
}
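Reading readAndProcess() backwards yields the wire format a client must send: a 4-byte magic (buffered in dataLengthBuffer on the first pass and compared against HEADER), one version byte plus one auth-method byte, and from then on length-prefixed payloads. A client-side sketch of that framing follows; it is an assumption-laden illustration, since the real framing lives in org.apache.hadoop.ipc.Client, and the "hrpc" magic, version 4, and auth code 80 are values believed correct for Hadoop 1.x rather than taken from this excerpt:

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Hypothetical sketch of the connection preamble readAndProcess() expects.
class FramingSketch {
  static void writePreamble(OutputStream rawOut) throws IOException {
    DataOutputStream out = new DataOutputStream(rawOut);
    out.write("hrpc".getBytes("UTF-8"));  // matches the server's HEADER check
    out.writeByte(4);                     // matches CURRENT_VERSION on the server
    out.writeByte(80);                    // auth method code (e.g. SIMPLE)
    out.flush();
  }

  // After the preamble, each message is a 4-byte big-endian length followed
  // by that many payload bytes; the first carries the connection header,
  // later ones carry individual calls.
  static void writeFrame(OutputStream rawOut, byte[] payload) throws IOException {
    DataOutputStream out = new DataOutputStream(rawOut);
    out.writeInt(payload.length);  // the server reads this into dataLengthBuffer
    out.write(payload);            // the server reads this into the data buffer
    out.flush();
  }
}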
Connection's processOneRpc() method:
private void processOneRpc(byte[] buf) throws IOException,
    InterruptedException {
  if (headerRead) {
    processData(buf);  // process the call
  } else {
    processHeader(buf);
    headerRead = true;
    if (!authorizeConnection()) {
      throw new AccessControlException("Connection from " + this
          + " for protocol " + header.getProtocol()
          + " is unauthorized for user " + user);
    }
  }
}
Connection's processData() method wraps the request in a Call object and puts it on callQueue:
private void processData(byte[] buf) throws IOException, InterruptedException {
  DataInputStream dis =
    new DataInputStream(new ByteArrayInputStream(buf));
  int id = dis.readInt();  // try to read an id

  if (LOG.isDebugEnabled())
    LOG.debug(" got #" + id);

  Writable param = ReflectionUtils.newInstance(paramClass, conf);  // read param
  param.readFields(dis);

  // wrap the id and parameters read above into a Call object
  Call call = new Call(id, param, this);
  // queue the call, maybe blocking here; callQueue is a
  // java.util.concurrent.BlockingQueue<Call>
  callQueue.put(call);
  incRpcCount();  // increment the RPC count
}
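So each call payload starts with a 4-byte call id followed by the serialized parameter Writable (for ipc.RPC, the parameter class is its Invocation writable, which carries the method name and arguments). A small sketch of building such a payload on the client side, mirroring what processData() parses:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Hypothetical sketch of the call body that processData() parses:
// [ int callId | Writable param ], serialized via Hadoop's Writable protocol.
class CallBodySketch {
  static byte[] encodeCall(int callId, Writable param) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bos);
    out.writeInt(callId);  // processData() reads this with dis.readInt()
    param.write(out);      // processData() reads this with param.readFields(dis)
    out.flush();
    return bos.toByteArray();  // becomes the payload of one length-prefixed frame
  }
}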
Once a request has been wrapped in a Call object and placed on callQueue, what happens next?
That work falls to another inner class of Server, Handler. Handler's run() method:
@Override
public void run() {
  LOG.info(getName() + ": starting");
  SERVER.set(Server.this);
  ByteArrayOutputStream buf =
    new ByteArrayOutputStream(INITIAL_RESP_BUF_SIZE);
  while (running) {
    try {
      final Call call = callQueue.take();  // pop the queue; maybe blocked here

      if (LOG.isDebugEnabled())
        LOG.debug(getName() + ": has #" + call.id + " from " +
                  call.connection);

      String errorClass = null;
      String error = null;
      Writable value = null;

      CurCall.set(call);  // CurCall is a ThreadLocal<Call>
      try {
        // Make the call as the user via Subject.doAs, thus associating
        // the call with the Subject
        if (call.connection.user == null) {
          // invoke ipc.Server's call(); the concrete implementation is
          // ipc.RPC.Server's call() method
          value = call(call.connection.protocol, call.param,
                       call.timestamp);
        } else {
          value =
            call.connection.user.doAs
              (new PrivilegedExceptionAction<Writable>() {
                 @Override
                 public Writable run() throws Exception {
                   // make the call
                   return call(call.connection.protocol,
                               call.param, call.timestamp);
                 }
               }
              );
        }
      } catch (Throwable e) {
        LOG.info(getName()+", call "+call+": error: " + e, e);
        errorClass = e.getClass().getName();
        error = StringUtils.stringifyException(e);
      }
      CurCall.set(null);
      synchronized (call.connection.responseQueue) {
        // setupResponse() needs to be sync'ed together with
        // responder.doResponse() since setupResponse may use
        // SASL to encrypt response data and SASL enforces
        // its own message ordering.
        setupResponse(buf, call,
                      (error == null) ? Status.SUCCESS : Status.ERROR,
                      value, errorClass, error);
        // Discard the large buf and reset it back to
        // smaller size to freeup heap
        if (buf.size() > maxRespSize) {
          LOG.warn("Large response size " + buf.size() + " for call " +
              call.toString());
          buf = new ByteArrayOutputStream(INITIAL_RESP_BUF_SIZE);
        }
        responder.doRespond(call);  // hand the response to the Responder
      }
    } catch (InterruptedException e) {
      if (running) {                         // unexpected -- log it
        LOG.info(getName() + " caught: " +
                 StringUtils.stringifyException(e));
      }
    } catch (Exception e) {
      LOG.info(getName() + " caught: " +
               StringUtils.stringifyException(e));
    }
  }
  LOG.info(getName() + ": exiting");
}
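The reader and handler threads are decoupled through callQueue in a classic bounded producer/consumer arrangement: readers block in put() when the handlers fall behind (throttling reads), and handlers block in take() when the queue is empty. A standalone sketch of just that arrangement, with illustrative names:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

// Minimal producer-consumer sketch of the Reader -> callQueue -> Handler flow
// (illustrative names, not Hadoop's classes).
public class CallQueueSketch {
  static class Call {
    final int id;
    Call(int id) { this.id = id; }
  }

  public static void main(String[] args) throws InterruptedException {
    // Bounded like Hadoop's callQueue: when it fills up, readers block in
    // put(), which applies back-pressure to the network side.
    final BlockingQueue<Call> callQueue = new ArrayBlockingQueue<Call>(100);

    Thread handler = new Thread(new Runnable() {  // plays the Handler role
      public void run() {
        try {
          while (true) {
            Call call = callQueue.take();  // blocks while the queue is empty
            System.out.println("handling call #" + call.id);
          }
        } catch (InterruptedException e) { /* exit */ }
      }
    });
    handler.setDaemon(true);
    handler.start();

    for (int id = 0; id < 5; id++) {   // plays the Reader role
      callQueue.put(new Call(id));     // blocks if the queue is full
    }
    Thread.sleep(200);  // give the handler time to drain before the JVM exits
  }
}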
The doRespond() method of ipc.Server's inner class Responder. Note the fast path: when the newly queued call is the only entry in responseQueue, the handler thread calls processResponse() itself, so most responses are written without ever handing off to the Responder thread:
//
// Enqueue a response from the application.
//
void doRespond(Call call) throws IOException {
  synchronized (call.connection.responseQueue) {
    // append the response to the tail of responseQueue
    // (declared in Connection as a LinkedList<Call>)
    call.connection.responseQueue.addLast(call);
    if (call.connection.responseQueue.size() == 1) {
      processResponse(call.connection.responseQueue, true);  // send it right away
    }
  }
}
Responder's processResponse() method:
//
// Processes one response. Returns true if there are no more pending
// data for this channel.
//
private boolean processResponse(LinkedList<Call> responseQueue,
                                boolean inHandler) throws IOException {
  boolean error = true;
  boolean done = false;  // there is more data for this channel.
  int numElements = 0;
  Call call = null;
  try {
    synchronized (responseQueue) {
      //
      // If there are no items for this channel, then we are done
      //
      numElements = responseQueue.size();  // number of pending responses
      if (numElements == 0) {
        error = false;
        return true;  // no more data for this channel.
      }
      //
      // Extract the first call
      //
      call = responseQueue.removeFirst();  // take one pending response
      SocketChannel channel = call.connection.channel;
      if (LOG.isDebugEnabled()) {
        LOG.debug(getName() + ": responding to #" + call.id + " from " +
                  call.connection);
      }
      //
      // Send as much data as we can in the non-blocking fashion
      //
      int numBytes = channelWrite(channel, call.response);  // non-blocking write
      if (numBytes < 0) {
        return true;
      }
      if (!call.response.hasRemaining()) {
        // the whole response (call.response is a ByteBuffer) has been sent
        call.connection.decRpcCount();  // one fewer outstanding RPC
        if (numElements == 1) {  // last call fully processes.
          done = true;           // no more data for this channel.
        } else {
          done = false;          // more calls pending to be sent.
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug(getName() + ": responding to #" + call.id + " from " +
                    call.connection + " Wrote " + numBytes + " bytes.");
        }
      } else {
        // the response could not be sent in full
        //
        // If we were unable to write the entire response out, then
        // insert in Selector queue.
        //
        call.connection.responseQueue.addFirst(call);  // put it back at the head

        if (inHandler) {
          // set the serve time when the response has to be sent later
          call.timestamp = System.currentTimeMillis();

          incPending();  // pending++
          try {
            // Wakeup the thread blocked on select, only then can the call
            // to channel.register() complete.
            writeSelector.wakeup();
            channel.register(writeSelector, SelectionKey.OP_WRITE, call);
          } catch (ClosedChannelException e) {
            // Its ok. channel might be closed else where.
            done = true;
          } finally {
            decPending();  // pending--
          }
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug(getName() + ": responding to #" + call.id + " from " +
                    call.connection + " Wrote partial " + numBytes +
                    " bytes.");
        }
      }
      error = false;  // everything went off well
    }
  } finally {
    if (error && call != null) {
      LOG.warn(getName()+", call " + call + ": output error");
      done = true;  // error. no more data for this channel.
      closeConnection(call.connection);
    }
  }
  return done;
}
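The partial-write branch is the standard non-blocking output pattern: write whatever the socket buffer accepts now; if bytes remain, put the call back and register OP_WRITE so a selector thread can finish the job once the socket drains. A standalone sketch of the pattern (illustrative names, not Hadoop code):

import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;

// Sketch of the non-blocking partial-write pattern used by the Responder.
class PartialWriteSketch {
  // Try to send in one shot; fall back to OP_WRITE if the socket buffer fills.
  static void send(SocketChannel channel, ByteBuffer response,
                   Selector writeSelector) throws Exception {
    channel.write(response);  // writes as much as fits right now
    if (response.hasRemaining()) {
      // Wake the selector thread first, otherwise register() can block while
      // that thread sits in select() (same trick as Responder and Reader).
      writeSelector.wakeup();
      channel.register(writeSelector, SelectionKey.OP_WRITE, response);
    }
  }

  // Selector thread: finish pending writes as channels become writable.
  static void writeLoop(Selector writeSelector) throws Exception {
    while (writeSelector.select() >= 0) {
      for (SelectionKey key : writeSelector.selectedKeys()) {
        ByteBuffer pending = (ByteBuffer) key.attachment();
        SocketChannel ch = (SocketChannel) key.channel();
        ch.write(pending);  // may again be a partial write
        if (!pending.hasRemaining()) {
          key.interestOps(0);  // done: stop watching for OP_WRITE
        }
      }
      writeSelector.selectedKeys().clear();
    }
  }
}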