在Linux下的hdfs客户端编写,写操作(客户端) - HDFS的文件操作流_服务器应用_Linux公社-Linux系统门户网站...

在前面的博文中,我主要从客户端的角度讲述了HDFS文件写操作的工作流程,但是客户端是如何把数据块传送到数据节点,数据节点又是如何接收来自客户端的数据块的呢?这就是本文将要讨论的内容。

上一次我们讲到了DataStreamer线程,那么现在我们就来具体地看看客户端是如何传输数据的。先来看看底层文件写入流DFSOutputStream的核心代码:

/**

* @param b                要写入文件的数据块

* @param offset         开始位置

* @param len             数据长度

* @param checksum  数据块b[offset~offset+len]的校验码

*/

protected synchronized void writeChunk(byte[] b, int offset, int len, byte[] checksum)  throws IOException {

checkOpen();

isClosed();

int cklen = checksum.length;

int bytesPerChecksum = this.checksum.getBytesPerChecksum();

if (len > bytesPerChecksum) {

throw new IOException("writeChunk() buffer size is " + len +

" is larger than supported  bytesPerChecksum " +

bytesPerChecksum);

}

if (checksum.length != this.checksum.getChecksumSize()) {

throw new IOException("writeChunk() checksum size is supposed to be " +

this.checksum.getChecksumSize() +

" but found to be " + checksum.length);

}

synchronized (dataQueue) {

// If queue is full, then wait till we can create  enough space

while (!closed && dataQueue.size() + ackQueue.size()  > maxPackets) {

try {

dataQueue.wait();

} catch (InterruptedException  e) {

}

}

isClosed();

if (currentPacket == null) {

currentPacket = new Packet(packetSize, chunksPerPacket,bytesCurBlock);

}

currentPacket.writeChecksum(checksum, 0, cklen);

currentPacket.writeData(b, offset, len);

currentPacket.numChunks++;

bytesCurBlock += len;

if (currentPacket.numChunks == currentPacket.maxChunks || bytesCurBlock == blockSize) {

//当前Block是否已满

if (bytesCurBlock == blockSize) {

currentPacket.lastPacketInBlock = true;

bytesCurBlock = 0;

lastFlushOffset = -1;

}

LOG.debug("the packet is full,so put it to dataQueue");

dataQueue.addLast(currentPacket);

dataQueue.notifyAll();

currentPacket = null;

if (appendChunk) {

appendChunk = false;

resetChecksumChunk(bytesPerChecksum);

}

int psize = Math.min((int)(blockSize-bytesCurBlock), writePacketSize);

computePacketChunkSize(psize, bytesPerChecksum);

}

}

}

从DFSOutputStream的核心函数writeChunk()我们可以看出,DFSOutputStream先把写入的数据缓存到packet中,当packet满了,或者当前Block满了,就把packet放入队列dataQueue,等待其它的工作者把该packet发送到目标数据节点上。其实,这个工作者就是DataStreamer,它是DFSOutputStream的一个内部线程类,下面就来看看DataStreamer是如何工作的吧!

/**
 * Daemon thread that takes packets off {@code dataQueue}, writes them to
 * {@code blockStream}, and moves each sent packet onto {@code ackQueue} where
 * it stays until acknowledged.
 *
 * <p>When no {@code blockStream} exists it sets one up through
 * {@code nextBlockOutputStream(src)} and starts a {@code ResponseProcessor}.
 * After sending the last packet of a block it waits for {@code ackQueue} to
 * drain, then tears the streams down so the next packet triggers a new block.
 * Errors are flagged through {@code hasError} and handled at the top of the
 * next loop iteration via {@code processDatanodeError}.
 */
private class DataStreamer extends Daemon {
// Stop flag of this thread; set by close() and polled by run().
private volatile boolean closed = false;
// Main loop: runs until this streamer is closed or the client stops running.
public void run() {

while (!closed && clientRunning) {
// On error, first shut down the responder thread and wait for it to exit.
if (hasError && response != null) {

try {

response.close();

response.join();

response = null;

} catch (InterruptedException  e) {

}

}

Packet one = null;

synchronized (dataQueue) {

// Handle any pending I/O error; a true result asks for one wait round first.

boolean doSleep = processDatanodeError(hasError, false);

// wait for a packet to be sent.

while ((!closed && !hasError && clientRunning && dataQueue.size() == 0) || doSleep) {

try {

dataQueue.wait(1000);

} catch (InterruptedException  e) {

}

doSleep = false;

}
// Nothing to send, or state changed while waiting: restart the loop.
if (closed || hasError || dataQueue.size() == 0 || !clientRunning) {

continue;

}

try {

// Peek at the next packet to send; it is removed only after the checks below.

one = dataQueue.getFirst();

long offsetInBlock = one.offsetInBlock;

// No block output stream is available yet.

if (blockStream == null) {

// Obtain a new block and its pipeline (nextBlockOutputStream), then start a responder for it.

nodes = nextBlockOutputStream(src);

this.setName("DataStreamer for file " + src +  " block " + block);

response = new ResponseProcessor(nodes);

response.start();

}

if (offsetInBlock >= blockSize) {

throw new IOException("BlockSize " + blockSize +  " is smaller than data size. " +   " Offset of packet in block " +   offsetInBlock +  " Aborting file " + src);

}

ByteBuffer buf = one.getBuffer();

// move packet from dataQueue to ackQueue

dataQueue.removeFirst();

dataQueue.notifyAll();

synchronized (ackQueue) {

ackQueue.addLast(one);

ackQueue.notifyAll();

}

// write out data to remote datanode

blockStream.write(buf.array(), buf.position(), buf.remaining());

// Is this the final packet of the block?

if (one.lastPacketInBlock) {

blockStream.writeInt(0); // indicate end-of-block

}

blockStream.flush();
// Any failure is logged and flagged; recovery happens on the next iteration.
} catch (Throwable e) {

LOG.warn("DataStreamer Exception: " +

StringUtils.stringifyException(e));

if (e instanceof IOException) {

setLastException((IOException)e);

}

hasError = true;

}

}

if (closed || hasError || !clientRunning) {

continue;

}

// Once every packet of the block has been sent, wait for the ack queue to drain.

if (one.lastPacketInBlock) {

synchronized (ackQueue) {

while (!hasError && ackQueue.size() != 0 && clientRunning) {

try {

ackQueue.wait();   // wait for acks to arrive from datanodes

} catch (InterruptedException  e) {

}

}

}

this.setName("DataStreamer for file " + src);

response.close();        // ignore all errors in Response

try {

response.join();

response = null;

} catch (InterruptedException  e) {

}

if (closed || hasError || !clientRunning) {

continue;

}
// Block finished: close both streams and clear pipeline state so the next packet sets up a new one.
synchronized (dataQueue) {

try {

blockStream.close();

blockReplyStream.close();

} catch (IOException e) {

}

nodes = null;

response = null;

blockStream = null;

blockReplyStream = null;

}

}

if (progress != null) { progress.progress(); }

// This is used by unit test to trigger race conditions.

if (artificialSlowdown != 0 && clientRunning) {

try {

Thread.sleep(artificialSlowdown);

} catch (InterruptedException e) {}

}

}

}

// shutdown thread: set the stop flag, wake waiters on both queues, then interrupt this thread

void close() {

closed = true;

synchronized (dataQueue) {

dataQueue.notifyAll();

}

synchronized (ackQueue) {

ackQueue.notifyAll();

}

this.interrupt();

}

}

上面的代码值得我们注意的是,在Hadoop的官网上有一句介绍HDFS的话:A client request to create a file does not reach the NameNode immediately. In fact, initially the HDFS client caches the file data into a temporary local file. Application writes are transparently redirected to this temporary local file. When the local file accumulates data worth over one HDFS block size, the client contacts the NameNode. 翻译这句话,我就不在这里献丑了。很多分析过源代码的朋友都认为这句话说得有问题,但是我想说的是,这句话在本质上是没有问题的,因为DataStreamer总是一个数据块接着一个数据块地向目标数据节点发送,也就是说,向某一个数据节点发送完一个Block后,DataStreamer并不是马上发送下一个Block,而是要等到该Block的所有packet都得到确认后才发送下一个Block。假设一个用户调用HDFS的API写入了2个Block的数据,此时DataStreamer还在等待第一个Block的所有packet的ack,那么用户的第2个Block的数据还缓存在dataQueue中,同时DataStreamer也还没有向NameNode申请第二个Block。现在大家再来体会一下刚才那句话,是不是有点意思呢?另外,用户不能一味地发送数据,否则缓存扛不住,所以就有一个限制,也就是总的缓存数据不能超过maxPackets个packet,这个值视运行环境而定,目前默认是80或者1000。ok,再来看看nextBlockOutputStream函数为了把数据块传送到数据节点到底干了哪些工作。

/**
 * Obtains a new block for {@code src} via {@code locateFollowingBlock} and
 * opens the write pipeline to its datanodes via {@code createBlockOutputStream}.
 *
 * <p>If the pipeline cannot be set up, the block is abandoned at the
 * namenode and the whole sequence is retried after a 6 second pause, up to
 * {@code dfs.client.block.write.retries} (default 3) additional times.
 * Each attempt resets {@code hasError}, {@code lastException} and
 * {@code errorIndex}.
 *
 * @param client not referenced by this method; the {@code clientName} field
 *               is used instead
 * @return the datanodes of the pipeline that was set up successfully
 * @throws IOException if no pipeline could be created within the retry
 *         budget, or if a namenode call fails
 */
private DatanodeInfo[] nextBlockOutputStream(String client) throws IOException {
  LocatedBlock lb = null;
  boolean retry = false;
  DatanodeInfo[] nodes;
  int count = conf.getInt("dfs.client.block.write.retries", 3);
  boolean success;
  do {
    // Start every attempt from a clean error state.
    hasError = false;
    lastException = null;
    errorIndex = 0;
    retry = false;
    nodes = null;
    success = false;

    long startTime = System.currentTimeMillis();

    // Ask the namenode for a new block.
    lb = locateFollowingBlock(startTime);
    block = lb.getBlock();
    nodes = lb.getLocations();
    // Arrays.toString: concatenating the array itself would only print its
    // identity ("[L...;@hash"), not the datanodes.
    LOG.debug("locate a block["+block.getBlockId()+"] for file["+src+"]: "+Arrays.toString(nodes));

    // Open the network connection to the first datanode of the pipeline.
    success = createBlockOutputStream(nodes, clientName, false);

    if (!success) {
      // Tell the namenode to give up this block of src.
      namenode.abandonBlock(block, src, clientName);

      // Connection failed.  Let's wait a little bit and retry
      retry = true;
      try {
        if (System.currentTimeMillis() - startTime > 5000) {
          LOG.info("Waiting to find target node: " + nodes[0].getName());
        }
        Thread.sleep(6000);
      } catch (InterruptedException iex) {
        // Deliberately ignored: an interrupted pause just retries sooner.
      }
    }
  } while (retry && --count >= 0);

  if (!success) {
    throw new IOException("Unable to create new block.");
  }
  return nodes;
}

/**
 * Asks the namenode, via {@code namenode.addBlock}, to allocate the next
 * block of {@code src}.
 *
 * <p>A {@code NotReplicatedYetException} reported by the namenode is retried
 * up to {@code dfs.client.block.write.locateFollowingBlock.retries}
 * (default 5) times, sleeping 400 ms before the first retry and doubling the
 * pause after every completed sleep. Any other remote failure is rethrown
 * immediately.
 *
 * @param start not referenced by this method
 * @return the newly allocated block together with its target datanodes
 * @throws IOException the unwrapped {@code FileNotFoundException},
 *         {@code AccessControlException} or quota exception when the namenode
 *         reports one of those; otherwise the original {@code RemoteException}
 *         once it is not retriable or the retries are used up
 */
private LocatedBlock locateFollowingBlock(long start) throws IOException {
  int retries = conf.getInt("dfs.client.block.write.locateFollowingBlock.retries", 5);
  long sleeptime = 400;
  while (true) {
    try {
      // Remote call to the namenode's addBlock to request a block for src.
      return namenode.addBlock(src, clientName);
    } catch (RemoteException e) {
      IOException ue =
          e.unwrapRemoteException(FileNotFoundException.class,
                                  AccessControlException.class,
                                  NSQuotaExceededException.class,
                                  DSQuotaExceededException.class);
      if (ue != e) {
        throw ue; // no need to retry these exceptions
      }

      boolean notReplicatedYet =
          NotReplicatedYetException.class.getName().equals(e.getClassName());
      if (!notReplicatedYet || retries == 0) {
        throw e;
      }

      --retries;
      try {
        Thread.sleep(sleeptime);
        sleeptime *= 2;
      } catch (InterruptedException ignored) {
        // Deliberately ignored: retry right away without growing the back-off.
      }
    }
  }
}

private boolean createBlockOutputStream(DatanodeInfo[] nodes, String client, boolean recoveryFlag) {

String firstBadLink = "";

persistBlocks = true;

try {

InetSocketAddress target = NetUtils.createSocketAddr(nodes[0].getName());

s = socketFactory.createSocket();

int timeoutValue = 3000 * nodes.length + socketTimeout;

NetUtils.connect(s, target, timeoutValue);

s.setSoTimeout(timeoutValue);

s.setSendBufferSize(DEFAULT_DATA_SOCKET_SIZE);

LOG.debug("Send buf size " + s.getSendBufferSize());

long writeTimeout = HdfsConstants.WRITE_TIMEOUT_EXTENSION * nodes.length + datanodeWriteTimeout;

DataOutputStream out = new DataOutputStream(new BufferedOutputStream(NetUtils.getOutputStream(s, writeTimeout), DataNode.SMALL_BUFFER_SIZE));

//由确认线程ResponseProcessor使用,获取数据节点对发送的packet的确认包

blockReplyStream = new DataInputStream(NetUtils.getInputStream(s));

out.writeShort( DataTransferProtocol.DATA_TRANSFER_VERSION );//数据传输协议的版本号

out.write( DataTransferProtocol.OP_WRITE_BLOCK );//数据节点应该执行的操作

out.writeLong( block.getBlockId() );//Block的id号

out.writeLong( block.getGenerationStamp() );//Block创建的时间

out.writeInt( nodes.length );//Block所有副本数量

out.writeBoolean( recoveryFlag );       // 是否是恢复一个Block

Text.writeString( out, client );//客户端名字

out.writeBoolean(false); // Not sending src node information

out.writeInt( nodes.length - 1 );//剩余副本数量

for (int i = 1; i < nodes.length; i++) {

nodes[i].write(out);//存放剩余副本的数据节点信息

}

checksum.writeHeader( out );//数据校验信息

out.flush();

// receive ack for connect

firstBadLink = Text.readString(blockReplyStream);

if (firstBadLink.length() != 0) {

throw new IOException("Bad connect ack with firstBadLink " + firstBadLink);

}

blockStream = out;

return true;     // success

} catch (IOException ie) {

if (firstBadLink.length() != 0) {

for (int i = 0; i < nodes.length; i++) {

if (nodes[i].getName().equals(firstBadLink)) {

errorIndex = i;

break;

}

}

}

hasError = true;

setLastException(ie);

blockReplyStream = null;

return false;  // error

}

}

在客户端向数据节点传送数据的过程中,难免会发生错误,这些错误包括:客户端向第一个数据节点写数据时发生网络错误、数据节点向数据节点写数据时发生错误、从数据节点获取packet的确认信息时发生错误等,它们都统一交给DFSOutputStream中的函数processDatanodeError来处理。

/**
 * Recovers the write pipeline after an I/O error.
 *
 * <p>Closes the current streams, puts every unacknowledged packet back at the
 * head of {@code dataQueue}, drops the datanode at {@code errorIndex} from the
 * pipeline, asks the smallest remaining datanode (per {@code Collections.min})
 * to recover the block, and then re-creates the pipeline with
 * {@code createBlockOutputStream}. When recovery is impossible the stream is
 * marked closed and the streamer thread is shut down.
 *
 * @param hasError whether an error is pending; shadows the field of the same
 *                 name, which is cleared through {@code this.hasError}
 * @param isAppend passed through to {@code recoverBlock}
 * @return {@code true} when the caller should wait and call again (the
 *         responder is still present, or block recovery failed but may be
 *         retried); {@code false} when there was nothing to do, recovery
 *         succeeded, or the stream was aborted
 */
private boolean processDatanodeError(boolean hasError, boolean isAppend) {

if (!hasError) {

return false;

}
// The responder has not been shut down yet; report back so the caller waits before retrying.
if (response != null) {

return true;

}

if (blockStream != null) {

try {

blockStream.close();

blockReplyStream.close();

} catch (IOException e) {

}

}

blockStream = null;

blockReplyStream = null;

// Put the unacknowledged packets back at the head of dataQueue and empty ackQueue.

synchronized (ackQueue) {

dataQueue.addAll(0, ackQueue);

ackQueue.clear();

}

boolean success = false;

while (!success && clientRunning) {

DatanodeInfo[] newnodes = null;
// Without a known pipeline there is nothing to recover: abort the stream.
if (nodes == null) {

String msg = "Could not get block locations. " + "Source file \"" + src + "\" - Aborting...";

LOG.warn(msg);

setLastException(new IOException(msg));

closed = true;

if (streamer != null) streamer.close();

return false;

}
// Comma-separated list of the current pipeline, used in error messages.
StringBuilder pipelineMsg = new StringBuilder();

for (int j = 0; j < nodes.length; j++) {

pipelineMsg.append(nodes[j].getName());

if (j < nodes.length - 1) {

pipelineMsg.append(", ");

}

}
// A negative errorIndex means no datanode is singled out as bad: keep the pipeline as is.
if (errorIndex < 0) {

newnodes = nodes;

} else {

if (nodes.length <= 1) {

lastException = new IOException("All datanodes " + pipelineMsg +   " are bad. Aborting...");

closed = true;

if (streamer != null) streamer.close();

return false;

}

// Drop the failed datanode (nodes[errorIndex]) from the pipeline.

newnodes =  new DatanodeInfo[nodes.length-1];

System.arraycopy(nodes, 0, newnodes, 0, errorIndex);

System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex, newnodes.length-errorIndex);

}

LocatedBlock newBlock = null;

ClientDatanodeProtocol primary =  null;

DatanodeInfo primaryNode = null;

try {

// Pick one of the remaining datanodes to recover the block.

primaryNode = Collections.min(Arrays.asList(newnodes));

primary = createClientDatanodeProtocolProxy(primaryNode, conf);

newBlock = primary.recoverBlock(block, isAppend, newnodes);

} catch (IOException e) {

recoveryErrorCount++;

if (recoveryErrorCount > maxRecoveryErrorCount) {

if (nodes.length > 1) {

for (int j = 0; j < nodes.length; j++) {

if (nodes[j].equals(primaryNode)) {

errorIndex = j; // forget original bad node.

}

}

// Remove the datanode that failed to act as recovery primary.

newnodes =  new DatanodeInfo[nodes.length-1];

System.arraycopy(nodes, 0, newnodes, 0, errorIndex);

System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex,  newnodes.length-errorIndex);

nodes = newnodes;

recoveryErrorCount = 0;

errorIndex = -1;

return true;          // other datanodes remain: return for now and retry later

}

String emsg = "Error Recovery for block " + block + " failed " + " because recovery from primary datanode " +  primaryNode + " failed " + recoveryErrorCount +   " times. "  + " Pipeline was " + pipelineMsg +  ". Aborting...";

LOG.warn(emsg);

lastException = new IOException(emsg);

closed = true;

if (streamer != null) streamer.close();

return false;       // no usable datanode is left, the error cannot be handled; the output stream is closed

}

return true;          // retry budget not exhausted yet: return for now and retry later

} finally {

RPC.stopProxy(primary);

}

recoveryErrorCount = 0; // block recovery successful

// If the block recovery generated a new generation stamp, use that from now on.  Also, setup new pipeline

if (newBlock != null) {

block = newBlock.getBlock();

nodes = newBlock.getLocations();

}

this.hasError = false;

lastException = null;

errorIndex = 0;
// Re-open the pipeline with the recovery flag set; on failure the loop runs again.
success = createBlockOutputStream(nodes, clientName, true);

}
// Start a fresh responder for the pipeline.
response = new ResponseProcessor(nodes);

response.start();

return false; // do not sleep, continue processing

}

    哎,看到上面的代码,烦都烦死了,我还是简单地描述一下processDatanodeError函数是如何处理I/O过程中的错误的吧!当有错误发生时,肯定是某一个数据节点出了问题,那么首先会把这个有问题的数据节点删除掉,然后从剩余的可用数据节点中选取一个,让它来恢复当前的这个Block。如果成功了就ok;如果失败,则再删除这个出问题的节点,继续选节点来恢复Block,直到成功。如果最后没有可用的数据节点来恢复Block,则宣告这个Block写入失败,并关闭DFSOutputStream流,当用户再次写入时抛出异常。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值