DN(DataNode)的主要工作是存取数据,与它进行块交互的主要有两种角色:客户端和其他DN。数据块的收发是比较繁重的工作,虽然DN不必面对高并发的场景,但如果采用串行服务,必然会降低效率。为此,DN每次接到块操作请求时,都会创建一个线程专门为其服务。这里的dataXceiverServer类似一个餐馆老板,每来一位客人,就派一个小伙计出来一对一地服务,DataXceiver就是小伙计的角色。dataXceiverServer的创建时机如下:
/**
 * Excerpt of the DataNode start-up routine: obtains the server socket used
 * for block transfer, records the bound port, and wraps a DataXceiverServer
 * in a Daemon assigned to a dedicated thread group. The visible code only
 * constructs that Daemon; it does not call start() on it here.
 * Dotted lines mark portions of the original method that were elided.
 *
 * @param conf      configuration handed on to the DataXceiverServer
 * @param dataDirs  not used in the visible excerpt
 * @param resources supplies the streaming socket on the else-branch below
 * @throws IOException declared by the method; the socket calls below can raise it
 */
void startDataNode(Configuration conf,
AbstractList<File> dataDirs, SecureResources resources
) throws IOException {
.............
// Obtain the ServerSocket that will accept block-transfer connections.
ServerSocket ss;
// NOTE(review): this tests `secureResources` although the parameter is named
// `resources`; presumably a field assigned from it in the elided code - confirm.
if(secureResources == null) {
// With a positive write timeout the socket is taken from a ServerSocketChannel
// (channel-backed socket); otherwise a plain ServerSocket is created.
ss = (socketWriteTimeout > 0) ?
ServerSocketChannel.open().socket() : new ServerSocket();
// Bind the socket to socAddr (the article states the default port is 50010;
// that value is not visible here - confirm).
Server.bind(ss, socAddr, 0);
} else {
ss = resources.getStreamingSocket();
}
// Set the receive buffer size on the listening socket.
ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE);
// Read back the port the socket is actually bound to.
tmpPort = ss.getLocalPort();
selfAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(),
tmpPort);
// Store "machineName:port" in the registration object (presumably what is
// later reported to the NameNode - confirm).
this.dnRegistration.setName(machineName + ":" + tmpPort);
LOG.info("Opened info server at " + tmpPort);
// Thread group for the listening DataXceiverServer; DataXceiverServer.run()
// also places each per-connection DataXceiver thread into datanode.threadGroup.
this.threadGroup = new ThreadGroup("dataXceiverServer");
// Construct the DataXceiverServer thread; it is not started in this excerpt
// (the article says it is started just before the DataNode's main loop).
this.dataXceiverServer = new Daemon(threadGroup,
new DataXceiverServer(ss, conf, this));
this.threadGroup.setDaemon(true); // auto destroy when empty
................
}
下面看下dataXceiverServer的线程执行体,和我们自己编写的网络服务端差不多
/**
 * Accept loop of the DataXceiverServer thread.
 *
 * While {@code datanode.shouldRun} is true, accepts a connection on
 * {@code ss}, disables Nagle's algorithm on it, and starts a new Daemon
 * running a DataXceiver for that connection. When the loop ends the server
 * socket is closed.
 */
public void run() {
  while (datanode.shouldRun) {
    try {
      // Wait for the next peer to connect.
      final Socket peer = ss.accept();
      peer.setTcpNoDelay(true);

      // One DataXceiver per connection, started in the datanode's thread group.
      final DataXceiver worker = new DataXceiver(peer, datanode, this);
      new Daemon(datanode.threadGroup, worker).start();
    } catch (SocketTimeoutException ignored) {
      // Accept timed out: fall through so the loop condition is re-checked.
    } catch (AsynchronousCloseException closedAsync) {
      // The socket's channel was closed by another thread: log and stop.
      LOG.warn(datanode.dnRegistration
          + ":DataXceiveServer:"
          + StringUtils.stringifyException(closedAsync));
      datanode.shouldRun = false;
    } catch (IOException ioFailure) {
      // Other I/O failures are logged and the loop keeps going.
      LOG.warn(datanode.dnRegistration
          + ":DataXceiveServer: IOException due to:"
          + StringUtils.stringifyException(ioFailure));
    } catch (Throwable fatal) {
      // Anything else ends the loop.
      LOG.error(datanode.dnRegistration
          + ":DataXceiveServer: Exiting due to:"
          + StringUtils.stringifyException(fatal));
      datanode.shouldRun = false;
    }
  }

  // Loop finished: release the listening socket.
  try {
    ss.close();
  } catch (IOException closeFailure) {
    LOG.warn(datanode.dnRegistration
        + ":DataXceiveServer: Close exception due to: "
        + StringUtils.stringifyException(closeFailure));
  }
  LOG.info("Exiting DataXceiveServer");
}
上面这个run方法并不是核心,真正干活的是DataXceiver,让我们看看他的执行体
/**
 * Serves a single connection: reads the protocol version and the opcode
 * from socket {@code s}, dispatches to the matching block operation, and
 * records per-operation metrics. Any Throwable is logged; the input stream
 * and the socket are always closed and the socket is removed from
 * {@code dataXceiverServer.childSockets}.
 */
public void run() {
  DataInputStream in = null;
  try {
    // Buffered input over the socket; the request header is read from it first.
    in = new DataInputStream(
        new BufferedInputStream(NetUtils.getInputStream(s), SMALL_BUFFER_SIZE));

    // Reject peers that speak a different transfer-protocol version.
    final short wireVersion = in.readShort();
    if (wireVersion != DataTransferProtocol.DATA_TRANSFER_VERSION) {
      throw new IOException( "Version Mismatch" );
    }

    // True when the peer address equals this socket's local address;
    // only used to pick the local/remote metric counter.
    final boolean fromLocalPeer = s.getInetAddress().equals(s.getLocalAddress());

    final byte opcode = in.readByte();

    // Refuse the request when the active xceiver count exceeds the configured maximum.
    final int activeXceivers = datanode.getXceiverCount();
    if (activeXceivers > dataXceiverServer.maxXceiverCount) {
      throw new IOException("xceiverCount " + activeXceivers
          + " exceeds the limit of concurrent xcievers "
          + dataXceiverServer.maxXceiverCount);
    }

    final long startTime = DataNode.now();

    switch (opcode) {
      case DataTransferProtocol.OP_READ_BLOCK: {
        readBlock(in);
        datanode.myMetrics.addReadBlockOp(DataNode.now() - startTime);
        if (fromLocalPeer) {
          datanode.myMetrics.incrReadsFromLocalClient();
        } else {
          datanode.myMetrics.incrReadsFromRemoteClient();
        }
        break;
      }
      case DataTransferProtocol.OP_WRITE_BLOCK: {
        writeBlock(in);
        datanode.myMetrics.addWriteBlockOp(DataNode.now() - startTime);
        if (fromLocalPeer) {
          datanode.myMetrics.incrWritesFromLocalClient();
        } else {
          datanode.myMetrics.incrWritesFromRemoteClient();
        }
        break;
      }
      case DataTransferProtocol.OP_REPLACE_BLOCK: {
        // for balancing purpose; send to a destination
        replaceBlock(in);
        datanode.myMetrics.addReplaceBlockOp(DataNode.now() - startTime);
        break;
      }
      case DataTransferProtocol.OP_COPY_BLOCK: {
        // for balancing purpose; send to a proxy source
        copyBlock(in);
        datanode.myMetrics.addCopyBlockOp(DataNode.now() - startTime);
        break;
      }
      case DataTransferProtocol.OP_BLOCK_CHECKSUM: {
        // get the checksum of a block
        getBlockChecksum(in);
        datanode.myMetrics.addBlockChecksumOp(DataNode.now() - startTime);
        break;
      }
      default:
        throw new IOException("Unknown opcode " + opcode + " in data stream");
    }
  } catch (Throwable failure) {
    LOG.error(datanode.dnRegistration + ":DataXceiver",failure);
  } finally {
    LOG.debug(datanode.dnRegistration + ":Number of active connections is: "
        + datanode.getXceiverCount());
    // Release the stream and socket, then drop the socket from the server's set.
    IOUtils.closeStream(in);
    IOUtils.closeSocket(s);
    dataXceiverServer.childSockets.remove(s);
  }
}
以读取数据块为例,看下是如何操作的
/**
 * Handles a read-block request: parses the request header from {@code in},
 * optionally verifies the access token, then streams the requested range of
 * the block back over socket {@code s} through a BlockSender.
 *
 * @param in request stream, positioned just after the opcode
 * @throws IOException if token verification fails, the BlockSender cannot be
 *         created, or sending fails with a non-socket I/O error
 */
private void readBlock(DataInputStream in) throws IOException {
  // Request header, in wire order: block id, stamp, start offset, length,
  // client name, access token.
  final long blockId = in.readLong();
  final long stamp = in.readLong();
  final Block block = new Block(blockId, 0, stamp);
  final long startOffset = in.readLong();
  final long length = in.readLong();
  final String clientName = Text.readString(in);
  final Token<BlockTokenIdentifier> accessToken = new Token<BlockTokenIdentifier>();
  accessToken.readFields(in);

  // Reply streams: the raw socket stream plus a buffered wrapper around it.
  final OutputStream rawOut =
      NetUtils.getOutputStream(s, datanode.socketWriteTimeout);
  final DataOutputStream out =
      new DataOutputStream(new BufferedOutputStream(rawOut, SMALL_BUFFER_SIZE));

  if (datanode.isBlockTokenEnabled) {
    try {
      datanode.blockTokenSecretManager.checkAccess(accessToken, null, block,
          BlockTokenSecretManager.AccessMode.READ);
    } catch (InvalidToken invalid) {
      // Tell the client why it is being rejected, then fail the request.
      try {
        out.writeShort(DataTransferProtocol.OP_STATUS_ERROR_ACCESS_TOKEN);
        out.flush();
        throw new IOException("Access token verification failed, for client "
            + remoteAddress + " for OP_READ_BLOCK for block " + block);
      } finally {
        IOUtils.closeStream(out);
      }
    }
  }

  // Trace text handed to the BlockSender: either a client-trace format whose
  // three "%d" placeholders are left for later substitution, or a plain
  // "Served block" message.
  final String clientTraceFmt;
  if (clientName.length() > 0 && ClientTraceLog.isInfoEnabled()) {
    clientTraceFmt = String.format(DN_CLIENTTRACE_FORMAT, localAddress, remoteAddress,
        "%d", "HDFS_READ", clientName, "%d",
        datanode.dnRegistration.getStorageID(), block, "%d");
  } else {
    clientTraceFmt = datanode.dnRegistration + " Served block " + block + " to " +
        s.getInetAddress();
  }

  BlockSender blockSender = null;
  try {
    try {
      blockSender = new BlockSender(block, startOffset, length,
          true, true, false, datanode, clientTraceFmt);
    } catch (IOException creationFailure) {
      // Report the error status to the client before propagating.
      out.writeShort(DataTransferProtocol.OP_STATUS_ERROR);
      throw creationFailure;
    }

    // Status first, then the block data itself.
    out.writeShort(DataTransferProtocol.OP_STATUS_SUCCESS);
    final long bytesSent = blockSender.sendBlock(out, rawOut, null);

    if (blockSender.isBlockReadFully()) {
      // The whole block was sent: read the client's status reply and, when it
      // reports a good checksum, pass the block to the block scanner.
      try {
        if (in.readShort() == DataTransferProtocol.OP_STATUS_CHECKSUM_OK &&
            datanode.blockScanner != null) {
          datanode.blockScanner.verifiedByClient(block);
        }
      } catch (IOException ignored) {}
    }

    datanode.myMetrics.incrBytesRead((int) bytesSent);
    datanode.myMetrics.incrBlocksRead();
  } catch (SocketException ignored) {
    // Its ok for remote side to close the connection anytime.
    datanode.myMetrics.incrBlocksRead();
  } catch (IOException sendFailure) {
    /* What exactly should we do here?
     * Earlier version shutdown() datanode if there is disk error.
     */
    LOG.warn(datanode.dnRegistration + ":Got exception while serving " +
        block + " to " +
        s.getInetAddress() + ":\n" +
        StringUtils.stringifyException(sendFailure) );
    throw sendFailure;
  } finally {
    // Always release the reply stream and the sender.
    IOUtils.closeStream(out);
    IOUtils.closeStream(blockSender);
  }
}
发送数据的操作是我们所关心的,这也是BlockSender的主要功能:它首先会向客户端发送校验信息(比如校验时chunk的大小),然后确定发送包的大小,这些都确定好之后,才真正开始发送:
long sendBlock(DataOutputStream out, OutputStream baseStream,
BlockTransferThrottler throttler) throws IOException {
if( out == null ) {
throw new IOException( "out stream is null" );
}
this.throttler = throttler;//节流器
long initialOffset = offset;//偏移量
long totalRead = 0;//本次读取的总长度,用于返回
OutputStream streamForSendChunks = out;
//记录开始时间
final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0;
try {
try {
//写入校验头信息,告诉客户端如何校验
checksum.writeHeader(out);
if ( chunkOffsetOK ) {
out.writeLong( offset );//从哪里开始校验
}
out.flush();//刷新到客户端
} catch (IOException e) { //socket error
throw ioeToSocketException(e);
}
//下面这段判断主要用于确定发送的包大小
int maxChunksPerPacket;
int pktSize = DataNode.PKT_HEADER_LEN + SIZE_OF_INTEGER;
if (transferToAllowed && !verifyChecksum &&
baseStream instanceof SocketOutputStream &&
blockIn instanceof FileInputStream) {
FileChannel fileChannel = ((FileInputStream)blockIn).getChannel();
// 记录文件位置,发送块的其实位置
blockInPosition = fileChannel.position();
streamForSendChunks = baseStream;
// 计算每个包发送多少chunks
maxChunksPerPacket = (Math.max(BUFFER_SIZE,
MIN_BUFFER_WITH_TRANSFERTO)
+ bytesPerChecksum - 1)/bytesPerChecksum;
// 计算包大小,此值会确定分配缓存的大小
pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
} else {
maxChunksPerPacket = Math.max(1,
(BUFFER_SIZE + bytesPerChecksum - 1)/bytesPerChecksum);
pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
}
//分配缓冲区
ByteBuffer pktBuf = ByteBuffer.allocate(pktSize);
//循环发送chunks
while (endOffset > offset) {
long len = sendChunks(pktBuf, maxChunksPerPacket,
streamForSendChunks);
offset += len;
totalRead += len + ((len + bytesPerChecksum - 1)/bytesPerChecksum*
checksumSize);
seqno++;
}
try {
写结束标记位并刷新
out.writeInt(0); // mark the end of block
out.flush();
} catch (IOException e) { //socket error
throw ioeToSocketException(e);
}
}
catch (RuntimeException e) {
LOG.error("unexpected exception sending block", e);
throw new IOException("unexpected runtime exception", e);
}
finally {
if (clientTraceFmt != null) {
//记录发送时间
final long endTime = System.nanoTime();
ClientTraceLog.info(String.format(clientTraceFmt, totalRead, initialOffset, endTime - startTime));
}
close();
}
//记录是否发送完毕
blockReadFully = (initialOffset == 0 && offset >= blockLength);
return totalRead;
}
那么chunk具体是如何发送的呢?
/**
 * Builds and sends one packet containing up to {@code maxChunks} chunks
 * starting at the current {@code offset}.
 *
 * Packet layout written into {@code pkt}: packet length (int), offset (long),
 * sequence number (long), last-packet flag (byte), data length (int), then
 * the chunk checksums, then the data.
 *
 * @param pkt       reusable packet buffer; its backing array is written directly
 * @param maxChunks maximum number of chunks to place in this packet
 * @param out       destination stream; must be a SocketOutputStream when the
 *                  transferTo path is active (blockInPosition >= 0)
 * @return number of data bytes sent, or 0 if nothing remains
 * @throws IOException on read, checksum or socket failure
 */
private int sendChunks(ByteBuffer pkt, int maxChunks, OutputStream out)
    throws IOException {
  // At most one packet's worth of data. The minimum is taken in long
  // arithmetic BEFORE narrowing: casting (endOffset - offset) to int first
  // would wrap for ranges of 2 GiB or more and could yield a negative or
  // zero length.
  int len = (int) Math.min(endOffset - offset,
                           (long) bytesPerChecksum * (long) maxChunks);

  // When more than one chunk is sent, trim to a whole number of chunks so the
  // receiver can verify every chunk in the packet.
  if (len > bytesPerChecksum && len % bytesPerChecksum != 0) {
    len -= len % bytesPerChecksum;
  }
  if (len == 0) {
    return 0;
  }

  int numChunks = (len + bytesPerChecksum - 1) / bytesPerChecksum;
  int packetLen = len + numChunks * checksumSize + 4;
  pkt.clear();

  // Packet header: length, offset, sequence number, last-packet flag, data length.
  pkt.putInt(packetLen);
  pkt.putLong(offset);
  pkt.putLong(seqno);
  pkt.put((byte) ((offset + len >= endOffset) ? 1 : 0));
  //why no ByteBuf.putBoolean()?
  pkt.putInt(len);

  int checksumOff = pkt.position();
  int checksumLen = numChunks * checksumSize;
  // Checksums and data are written straight into the buffer's backing array.
  byte[] buf = pkt.array();

  if (checksumSize > 0 && checksumIn != null) {
    try {
      // Read the stored checksums for this packet's chunks.
      checksumIn.readFully(buf, checksumOff, checksumLen);
    } catch (IOException e) {
      LOG.warn(" Could not read or failed to veirfy checksum for data" +
               " at offset " + offset + " for block " + block + " got : "
               + StringUtils.stringifyException(e));
      IOUtils.closeStream(checksumIn);
      checksumIn = null;
      if (corruptChecksumOk) {
        // Zero the whole checksum region. Arrays.fill takes an exclusive END
        // index, so it must be checksumOff + checksumLen, not the length.
        Arrays.fill(buf, checksumOff, checksumOff + checksumLen, (byte) 0);
      } else {
        throw e;
      }
    }
  }

  // Data follows the checksums in the packet.
  int dataOff = checksumOff + checksumLen;

  if (blockInPosition < 0) {
    //normal transfer
    IOUtils.readFully(blockIn, buf, dataOff, len);

    if (verifyChecksum) {
      // Recompute each chunk's checksum and compare it with the stored one.
      int dOff = dataOff;
      int cOff = checksumOff;
      int dLeft = len;

      for (int i = 0; i < numChunks; i++) {
        checksum.reset();
        int dLen = Math.min(dLeft, bytesPerChecksum);
        checksum.update(buf, dOff, dLen);
        if (!checksum.compare(buf, cOff)) {
          throw new ChecksumException("Checksum failed at " +
                                      (offset + len - dLeft), len);
        }
        dLeft -= dLen;
        dOff += dLen;
        cOff += checksumSize;
      }
    }

    // only recompute checksum if we can't trust the meta data due to
    // concurrent writes
    if (memoizedBlock.hasBlockChanged(len)) {
      ChecksumUtil.updateChunkChecksum(
        buf, checksumOff, dataOff, len, checksum
      );
    }

    try {
      out.write(buf, 0, dataOff + len);
    } catch (IOException e) {
      throw ioeToSocketException(e);
    }
  } else {
    try {
      //use transferTo(). Checks on out and blockIn are already done.
      SocketOutputStream sockOut = (SocketOutputStream) out;
      FileChannel fileChannel = ((FileInputStream) blockIn).getChannel();

      if (memoizedBlock.hasBlockChanged(len)) {
        // Block changed underneath us: read the data into the buffer so the
        // checksums can be recomputed, then send header, checksums and data
        // in one write.
        fileChannel.position(blockInPosition);
        IOUtils.readFileChannelFully(
          fileChannel,
          buf,
          dataOff,
          len
        );

        ChecksumUtil.updateChunkChecksum(
          buf, checksumOff, dataOff, len, checksum
        );
        sockOut.write(buf, 0, dataOff + len);
      } else {
        // Send header and checksums from the buffer, then the data via transferTo.
        sockOut.write(buf, 0, dataOff);
        // no need to flush. since we know out is not a buffered stream.
        sockOut.transferToFully(fileChannel, blockInPosition, len);
      }

      blockInPosition += len;

    } catch (IOException e) {
      /* exception while writing to the client (well, with transferTo(),
       * it could also be while reading from the local file).
       */
      throw ioeToSocketException(e);
    }
  }

  // After each packet, hand its size to the throttler (if one was supplied).
  if (throttler != null) {
    throttler.throttle(packetLen);
  }

  return len;
}