Hadoop源码分析之读文件时NameNode和DataNode的处理过程选取datanode详解

最新推荐文章于 2022-06-22 15:36:03 发布

linuxheik

最新推荐文章于 2022-06-22 15:36:03 发布

阅读量1.2k

点赞数

分类专栏：云数据库

云数据库专栏收录该内容

52 篇文章 0 订阅

订阅专栏

从NameNode节点获取数据块所在节点等信息

客户端在和数据节点建立流式接口的TCP连接、读取文件数据之前，需要先定位数据的位置，所以客户端首先在DFSClient.callGetBlockLocations()方法中调用远程方法ClientProtocol.getBlockLocations()，该方法返回一个LocatedBlocks对象，其中包含了一系列的LocatedBlock实例，通过这些信息客户端就知道需要到哪些数据节点上去获取数据。这个远程方法在NameNode端对应NameNode.getBlockLocations()，它进而调用FSNamesystem中的同名方法来完成实际的处理，FSNamesystem中该方法有三个重载版本，代码如下：

 
   01. 
   <code class="language-Java">LocatedBlocks getBlockLocations(String clientMachine, String src, 
  
   02. 
   long offset, long length) throws IOException { 
  
   03. 
   LocatedBlocks blocks = getBlockLocations(src, offset, length, true, true, 
  
   04. 
   true); 
  
   05. 
   if (blocks != null) {//如果blocks不为空，那么就对数据块所在的数据节点进行排序 
  
   06. 
   //sort the blocks 
  
   07. 
   // In some deployment cases, cluster is with separation of task tracker 
  
   08. 
   // and datanode which means client machines will not always be recognized 
  
   09. 
   // as known data nodes, so here we should try to get node (but not 
  
   10. 
   // datanode only) for locality based sort. 
  
   11. 
   Node client = host2DataNodeMap.getDatanodeByHost( 
  
   12. 
   clientMachine); 
  
   13. 
   if (client == null) { 
  
   14. 
   List<String> hosts = new ArrayList<String> (1); 
  
   15. 
   hosts.add(clientMachine); 
  
   16. 
   String rName = dnsToSwitchMapping.resolve(hosts).get(0); 
  
   17. 
   if (rName != null) 
  
   18. 
   client = new NodeBase(clientMachine, rName); 
  
   19. 
   }   
  
   20. 
     
   21. 
   DFSUtil.StaleComparator comparator = null; 
  
   22. 
   if (avoidStaleDataNodesForRead) { 
  
   23. 
   comparator = new DFSUtil.StaleComparator(staleInterval); 
  
   24. 
   } 
  
   25. 
   // Note: the last block is also included and sorted 
  
   26. 
   for (LocatedBlock b : blocks.getLocatedBlocks()) { 
  
   27. 
   clusterMap.pseudoSortByDistance(client, b.getLocations()); 
  
   28. 
   if (avoidStaleDataNodesForRead) { 
  
   29. 
   Arrays.sort(b.getLocations(), comparator); 
  
   30. 
   } 
  
   31. 
   } 
  
   32. 
   } 
  
   33. 
   return blocks; 
  
   34. 
   } 
  
   35. 
     
   36. 
   /** 
  
   37. 
   * Get block locations within the specified range. 
  
   38. 
   * @see ClientProtocol#getBlockLocations(String, long, long) 
  
   39. 
   */ 
  
   40. 
   public LocatedBlocks getBlockLocations(String src, long offset, long length 
  
   41. 
   ) throws IOException { 
  
   42. 
   return getBlockLocations(src, offset, length, false, true, true); 
  
   43. 
   } 
  
   44. 
     
   45. 
   /** 
  
   46. 
   * Get block locations within the specified range. 
  
   47. 
   * @see ClientProtocol#getBlockLocations(String, long, long) 
  
   48. 
   */ 
  
   49. 
   public LocatedBlocks getBlockLocations(String src, long offset, long length, 
  
   50. 
   boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode) 
  
   51. 
   throws IOException { 
  
   52. 
   if (isPermissionEnabled) {//读权限检查 
  
   53. 
   FSPermissionChecker pc = getPermissionChecker(); 
  
   54. 
   checkPathAccess(pc, src, FsAction.READ); 
  
   55. 
   } 
  
   56. 
     
   57. 
   if (offset < 0) { 
  
   58. 
   throw new IOException("Negative offset is not supported. File: " + src ); 
  
   59. 
   } 
  
   60. 
   if (length < 0) { 
  
   61. 
   throw new IOException("Negative length is not supported. File: " + src ); 
  
   62. 
   } 
  
   63. 
   final LocatedBlocks ret = getBlockLocationsInternal(src, 
  
   64. 
   offset, length, Integer.MAX_VALUE, doAccessTime, needBlockToken);  
  
   65. 
   if (auditLog.isInfoEnabled() && isExternalInvocation()) { 
  
   66. 
   logAuditEvent(UserGroupInformation.getCurrentUser(), 
  
   67. 
   Server.getRemoteIp(), 
  
   68. 
   "open", src, null, null); 
  
   69. 
   } 
  
   70. 
   if (checkSafeMode && isInSafeMode()) { 
  
   71. 
   for (LocatedBlock b : ret.getLocatedBlocks()) { 
  
   72. 
   // if safemode & no block locations yet then throw safemodeException 
  
   73. 
   if ((b.getLocations() == null) || (b.getLocations().length == 0)) { 
  
   74. 
   throw new SafeModeException("Zero blocklocations for " + src, 
  
   75. 
   safeMode); 
  
   76. 
   } 
  
   77. 
   } 
  
   78. 
   } 
  
   79. 
   return ret; 
  
   80. 
   }</code>

从上面的代码可以看出，前两个方法都调用了第三个重载方法。其中第一个方法（带clientMachine参数的那个）获取到数据块之后，还会根据客户端和获取到的节点列表进行“排序”，“排序”调用的方法是：

 
   01. 
   <code class="language-java">public void pseudoSortByDistance( Node reader, Node[] nodes ) { 
  
   02. 
   int tempIndex = 0; 
  
   03. 
   if (reader != null ) { 
  
   04. 
   int localRackNode = -1; 
  
   05. 
   //scan the array to find the local node & local rack node 
  
   06. 
   for(int i=0; i<nodes.length; i++) {//遍历nodes，看reader是否在nodes中 
  
   07. 
   if(tempIndex == 0 && reader == nodes[i]) { //local node 
  
   08. 
   //swap the local node and the node at position 0 
  
   09. 
   //第i个数据节点与客户端是一台机器 
  
   10. 
   if( i != 0 ) { 
  
   11. 
   swap(nodes, tempIndex, i); 
  
   12. 
   } 
  
   13. 
   tempIndex=1; 
  
   14. 
   if(localRackNode != -1 ) { 
  
   15. 
   if(localRackNode == 0) {//localRackNode==0表示在没有交换之前，第0个节点是 
  
   16. 
   //与reader位于同一机架上的节点，现在交换了，那么第i个就是与reader在同一机架上的节点 
  
   17. 
   localRackNode = i; 
  
   18. 
   } 
  
   19. 
   break;//本地节点（已交换到第0个位置）和同机架节点都已经找到，无需再扫描剩余的节点，跳出循环 
  
   20. 
   } 
  
   21. 
   } else if(localRackNode == -1 && isOnSameRack(reader, nodes[i])) { 
  
   22. 
   //local rack，节点i和Reader在同一个机架上 
  
   23. 
   localRackNode = i; 
  
   24. 
   if(tempIndex != 0 ) break;//tempIndex ！= 0表示reader在nodes中 
  
   25. 
   } 
  
   26. 
   } 
  
   27. 
   //如果reader在nodes中，那么tempIndex==1，否则tempIndex = 0，如果localRackNode ！= -1，那么localRackNode节点就 
  
   28. 
   //是与reader位于同一机架上的节点，交换localRackNode到tempIndex，这样如果reader在nodes中，localRackNode与reader 
  
   29. 
   //在同一个机架上，那么第0个就是reader节点，第1个就是localRackNode节点，如果reader不在nodes中， 
  
   30. 
   //localRackNode与reader在同一个机架上，那么第0个就是localRackNode节点，否则就随机找一个 
  
   31. 
   if(localRackNode != -1 && localRackNode != tempIndex ) { 
  
   32. 
   swap(nodes, tempIndex, localRackNode); 
  
   33. 
   tempIndex++; 
  
   34. 
   } 
  
   35. 
   } 
  
   36. 
   //tempIndex == 0，则在nodes中既没有reader，也没有与reader在同一机架上的节点 
  
   37. 
   if(tempIndex == 0 && nodes.length != 0) { 
  
   38. 
   swap(nodes, 0, r.nextInt(nodes.length)); 
  
   39. 
   } 
  
   40. 
   }</code>

“排序”的规则是如果reader节点在nodes节点列表中，那么将reader放在nodes的第0个位置，如果在nodes中有与reader在同一机架上的节点localRackNode，那么就将localRackNode节点放在reader后面（如果reader不在nodes中，可以将reader视作在nodes的第-1个位置），如果也不存在与reader在同一机架上的节点，那么就在nodes中随机选择一个节点放在第0个位置。
在FSNamesystem.getBlockLocations()的第三个重载方法中，调用了FSNamesystem.getBlockLocationsInternal()方法，从NameNode节点的目录树中找到文件所对应的数据块，这个方法代码如下：

 
   001. 
   <code class="language-java">private synchronized LocatedBlocks getBlockLocationsInternal(String src, 
  
   002. 
   long offset, 
  
   003. 
   long length, 
  
   004. 
   int nrBlocksToReturn, 
  
   005. 
   boolean doAccessTime, 
  
   006. 
   boolean needBlockToken) 
  
   007. 
   throws IOException { 
  
   008. 
   //获取src路径上最后一个节点即文件节点 
  
   009. 
   INodeFile inode = dir.getFileINode(src); 
  
   010. 
   if(inode == null) { 
  
   011. 
   return null; 
  
   012. 
   } 
  
   013. 
   if (doAccessTime && isAccessTimeSupported()) { 
  
   014. 
   //修改最后访问时间 
  
   015. 
   dir.setTimes(src, inode, -1, now(), false); 
  
   016. 
   } 
  
   017. 
   //返回文件的数据块 
  
   018. 
   Block[] blocks = inode.getBlocks(); 
  
   019. 
   if (blocks == null) { 
  
   020. 
   return null; 
  
   021. 
   } 
  
   022. 
   if (blocks.length == 0) {//节点为空 
  
   023. 
   return inode.createLocatedBlocks(new ArrayList<LocatedBlock>(blocks.length)); 
  
   024. 
   } 
  
   025. 
     
   026. 
   //下面开始遍历所有该文件的所有数据块，直到到达offset所在的数据块 
  
   027. 
   List<LocatedBlock> results; 
  
   028. 
   results = new ArrayList<LocatedBlock>(blocks.length); 
  
   029. 
     
   030. 
   int curBlk = 0; 
  
   031. 
   long curPos = 0, blkSize = 0; 
  
   032. 
   //数据块的个数 
  
   033. 
   int nrBlocks = (blocks[0].getNumBytes() == 0) ? 0 : blocks.length; 
  
   034. 
   for (curBlk = 0; curBlk < nrBlocks; curBlk++) { 
  
   035. 
   blkSize = blocks[curBlk].getNumBytes(); 
  
   036. 
   assert blkSize > 0 : "Block of size 0"; 
  
   037. 
   if (curPos + blkSize > offset) {//如果curPos + blkSize > offset则遍历到了offset所在的数据块 
  
   038. 
   break; 
  
   039. 
   } 
  
   040. 
   curPos += blkSize; 
  
   041. 
   } 
  
   042. 
   //curBlk == nrBlocks说明offset超过了文件的长度 
  
   043. 
   if (nrBlocks > 0 && curBlk == nrBlocks)   // offset >= end of file 
  
   044. 
   return null; 
  
   045. 
   //找到了offset所在的数据块 
  
   046. 
   long endOff = offset + length; 
  
   047. 
   //下面对于每一个curBlk和其后的每个数据块，先获取其副本，然后检查该副本是否已经损坏，如果是部分损坏，则过滤掉其余的损坏的副本 
  
   048. 
   //将正常的副本加入到machineSet中，返回，如果所有的副本都损坏，则将所有的副本都加入这个数据块对应的machineSet中，再对 
  
   049. 
   //machineSet构造LocatedBlock对象 
  
   050. 
   do { 
  
   051. 
   // get block locations，获取数据块所在的数据节点 
  
   052. 
   int numNodes = blocksMap.numNodes(blocks[curBlk]);//有numNodes个数据节点保存这个数据块 
  
   053. 
   int numCorruptNodes = countNodes(blocks[curBlk]).corruptReplicas();//损坏的副本数量 
  
   054. 
   int numCorruptReplicas = corruptReplicas.numCorruptReplicas(blocks[curBlk]); 
  
   055. 
   if (numCorruptNodes != numCorruptReplicas) { 
  
   056. 
   LOG.warn("Inconsistent number of corrupt replicas for " + 
  
   057. 
   blocks[curBlk] + "blockMap has " + numCorruptNodes + 
  
   058. 
   " but corrupt replicas map has " + numCorruptReplicas); 
  
   059. 
   } 
  
   060. 
   DatanodeDescriptor[] machineSet = null; 
  
   061. 
   boolean blockCorrupt = false; 
  
   062. 
   if (inode.isUnderConstruction() && curBlk == blocks.length - 1 
  
   063. 
   && blocksMap.numNodes(blocks[curBlk]) == 0) {//最后一个数据块处于构建状态且blocksMap中还没有它的副本记录，不用检查是否有损坏的副本 
  
   064. 
   // get unfinished block locations 
  
   065. 
   INodeFileUnderConstruction cons = (INodeFileUnderConstruction)inode; 
  
   066. 
   machineSet = cons.getTargets(); 
  
   067. 
   blockCorrupt = false; 
  
   068. 
   } else { 
  
   069. 
   blockCorrupt = (numCorruptNodes == numNodes);//数据块的所有副本是否都已经损坏 
  
   070. 
   int numMachineSet = blockCorrupt ? numNodes : 
  
   071. 
   (numNodes - numCorruptNodes);//未损坏的副本数量 
  
   072. 
   machineSet = new DatanodeDescriptor[numMachineSet]; 
  
   073. 
   if (numMachineSet > 0) { 
  
   074. 
   numNodes = 0; 
  
   075. 
   for(Iterator<DatanodeDescriptor> it = 
  
   076. 
   blocksMap.nodeIterator(blocks[curBlk]); it.hasNext();) {//遍历所有副本 
  
   077. 
   DatanodeDescriptor dn = it.next(); 
  
   078. 
   boolean replicaCorrupt = corruptReplicas.isReplicaCorrupt(blocks[curBlk], dn); 
  
   079. 
   if (blockCorrupt || (!blockCorrupt && !replicaCorrupt))//数据块已经损坏或者部分副本损坏 
  
   080. 
   machineSet[numNodes++] = dn; 
  
   081. 
   } 
  
   082. 
   } 
  
   083. 
   } 
  
   084. 
   LocatedBlock b = new LocatedBlock(blocks[curBlk], machineSet, curPos, 
  
   085. 
   blockCorrupt); 
  
   086. 
   if(isAccessTokenEnabled && needBlockToken) { 
  
   087. 
   b.setBlockToken(accessTokenHandler.generateToken(b.getBlock(), 
  
   088. 
   EnumSet.of(BlockTokenSecretManager.AccessMode.READ))); 
  
   089. 
   } 
  
   090. 
     
   091. 
   results.add(b); 
  
   092. 
   curPos += blocks[curBlk].getNumBytes(); 
  
   093. 
   curBlk++; 
  
   094. 
   } while (curPos < endOff 
  
   095. 
   && curBlk < blocks.length 
  
   096. 
   && results.size() < nrBlocksToReturn); 
  
   097. 
     
   098. 
   return inode.createLocatedBlocks(results); 
  
   099. 
   }</code>

这个方法比较长，首先执行INodeFile inode = dir.getFileINode(src);这行代码获取src路径上的文件节点，FSDirectory.getFileINode()方法根据文件路径，查找以路径分隔符分隔的最后一个元素，如果这个元素代表的文件存在，则返回该文件的对象，如果不存在，就返回null。需要说明的是，在HDFS的目录树中，根目录的名字是一个空字符串即“”，使用rootDir表示，那么rootDir/home/hadoop这个路径的真实值为“/home/hadoop”。
并且在INode类中，保存文件/目录名的变量name是一个字节数组，如果name.length为0，则是根节点。FSDirectory.getFileINode(String src)方法会通过rootDir.getNode(src);获取src的文件节点对象，即src文件所对应的INode对象。这个过程中会调用INode.getPathComponents(String path)方法，该方法返回路径path上每个以/分隔的字符串的字节数组，即得到路径中的每个目录和文件名的字节数组。为什么要获取路径中目录和文件名的字节数组？因为INode.name是二进制格式，INodeDirectory.getExistingPathINodes()方法会使用二分查找，看目录或文件是否存在，具体代码比较简单。
如果通过FSDirectory.getFileINode(String src)返回的INode对象为null，那么直接返回null值，否则，根据参数doAccessTime以及isAccessTimeSupported()的结果来确定是否修改文件的最后访问时间。
继续向下执行getBlockLocationsInternal方法，接下来根据以上获取到的INode对象获取到这个文件对应的数据块信息，如果数据块为null，则返回null，如果数据块数组长度为0，那么创建一个LocatedBlocks对象，这个对象中对应的数据块数组元素个数为0，稍后会继续分析如何根据数据块数组来创建LocatedBlocks对象。
如果该文件对应的数据块数组元素个数大于0，那么就遍历该文件的所有数据块，直到到达参数offset所在的数据块，其中offset是文件中数据的偏移，正常情况下它落在某个数据块中（如果offset超过了文件的长度，方法直接返回null）。具体的方法是：设置一个指针curPos表示当前的偏移，每次访问一个数据块，就看curPos与数据块大小的和是否大于offset，如果不大于就让curPos的值加上数据块的大小，如果大于就停止遍历，这样就找到了offset所在的数据块。
接下来就根据剩余的数据块副本来构造DataNode数据节点列表，对于每个数据块，检查其损坏副本的数量，首先对同一个数据块检查blocksMap中的损坏副本数量与corruptReplicas中记录的损坏副本数量是否相同，如果不同就记录log信息。numCorruptNodes和numCorruptReplicas虽然都代表损坏副本的数量，但是求这两个值的方式不同：numCorruptNodes是先根据数据块从blocksMap中取出这个数据块对应的DataNode节点，再看这个数据块对应的DataNode节点是否在corruptReplicas(这个变量保存了已经损坏的数据块副本)中，numCorruptNodes表示这个数据块对应的DataNode节点有多少在corruptReplicas中，而numCorruptReplicas则是根据数据块来检查在corruptReplicas中有多少对应的节点，有可能这两个值不一致。
对于每一个数据块，找到这个数据块所有的正常副本，然后构造一个LocatedBlock对象，这个对象保存了对应的数据块的所有正常副本所在的DataNode节点，以及这个数据块在文件中的偏移等信息。如果一个数据块的所有副本都损坏，则将这个数据块的所有副本都返回给客户端，但是LocatedBlock中的corrupt属性记录为true，它表示这个数据块的所有副本都损坏了。此外，如果当前数据块是文件的最后一个数据块，并且这个数据块还处于构建状态，则不用检查是否有损坏的副本，直接将它的所有副本返回给客户端。
执行以上的过程，就完成了客户端从NameNode获取文件数据块及其副本位置信息的过程。

从数据节点获取数据块内容

客户端获取到数据块以及其所在的DataNode节点信息后，就可以联系DataNode节点来读文件数据了。HDFS提供了DataXceiverServer类和DataXceiver类来处理客户端的读请求，其中DataXceiverServer类用于接收客户端的Socket连接请求，然后创建一个DataXceiver线程来接收客户端的读数据请求，这个DataXceiver线程接收到客户端的读数据请求后，就可以将数据发送给客户端了。这个过程使用的是基本的Java Socket通信，与Java提供的NIO不同，这种通信方式下每个客户端读请求在DataNode中都对应一个单独的线程。
客户端读数据是基于TCP数据流，使用了Java的基本套接字的功能。在HDFS启动DataNode时，执行DataNode.startDataNode()方法过程中创建了一个java.net.ServerSocket对象，然后构造一个DataXceiverServer线程专门用于accept客户端。DataXceiverServer线程启动后就阻塞在accept方法中，等待着客户端的连接请求，只要有客户端连接过来，accept方法就会返回，然后创建一个DataXceiver线程用于处理客户端的读数据请求，accept客户端的这部分代码实现在DataXceiverServer.run()方法中，代码比较简单。
客户端的连接被接收后DataNode节点就建立了一个DataXceiver线程，在DataXceiver线程的run方法中处理客户端的读数据请求，方法代码如下：

 
   01. 
   <code class="language-java">public void run() { 
  
   02. 
   DataInputStream in=null; 
  
   03. 
   try { 
  
   04. 
   //创建输入流 
  
   05. 
   in = new DataInputStream( 
  
   06. 
   new BufferedInputStream(NetUtils.getInputStream(s), 
  
   07. 
   SMALL_BUFFER_SIZE)); 
  
   08. 
   //进行版本检查 
  
   09. 
   short version = in.readShort(); 
  
   10. 
   if ( version != DataTransferProtocol.DATA_TRANSFER_VERSION ) { 
  
   11. 
   throw new IOException( "Version Mismatch" ); 
  
   12. 
   } 
  
   13. 
   boolean local = s.getInetAddress().equals(s.getLocalAddress());//socket连接的远程地址是否是本地机器的地址，即是否连接到了本地机器 
  
   14. 
   byte op = in.readByte();//读入请求码 
  
   15. 
   // Make sure the xciver count is not exceeded，DataNode中读写请求的数量，即DataXceiver线程的数量有个阈值 
  
   16. 
   int curXceiverCount = datanode.getXceiverCount(); 
  
   17. 
   if (curXceiverCount > dataXceiverServer.maxXceiverCount) {//该请求是否超出数据节点的支撑能力，以确保数据节点的服务质量 
  
   18. 
   throw new IOException("xceiverCount " + curXceiverCount 
  
   19. 
   + " exceeds the limit of concurrent xcievers " 
  
   20. 
   + dataXceiverServer.maxXceiverCount); 
  
   21. 
   } 
  
   22. 
   long startTime = DataNode.now(); 
  
   23. 
   switch ( op ) { 
  
   24. 
   case DataTransferProtocol.OP_READ_BLOCK://客户端读数据 
  
   25. 
   readBlock( in ); 
  
   26. 
   datanode.myMetrics.addReadBlockOp(DataNode.now() - startTime); 
  
   27. 
   if (local) 
  
   28. 
   datanode.myMetrics.incrReadsFromLocalClient(); 
  
   29. 
   else 
  
   30. 
   datanode.myMetrics.incrReadsFromRemoteClient(); 
  
   31. 
   break; 
  
   32. 
   case DataTransferProtocol.OP_WRITE_BLOCK://客户端写数据 
  
   33. 
   writeBlock( in ); 
  
   34. 
   datanode.myMetrics.addWriteBlockOp(DataNode.now() - startTime); 
  
   35. 
   if (local) 
  
   36. 
   datanode.myMetrics.incrWritesFromLocalClient(); 
  
   37. 
   else 
  
   38. 
   datanode.myMetrics.incrWritesFromRemoteClient(); 
  
   39. 
   break; 
  
   40. 
   case DataTransferProtocol.OP_REPLACE_BLOCK: // for balancing purpose; send to a destination，数据块替换 
  
   41. 
   replaceBlock(in); 
  
   42. 
   datanode.myMetrics.addReplaceBlockOp(DataNode.now() - startTime); 
  
   43. 
   break; 
  
   44. 
   case DataTransferProtocol.OP_COPY_BLOCK://数据块拷贝 
  
   45. 
   // for balancing purpose; send to a proxy source 
  
   46. 
   copyBlock(in); 
  
   47. 
   datanode.myMetrics.addCopyBlockOp(DataNode.now() - startTime); 
  
   48. 
   break; 
  
   49. 
   case DataTransferProtocol.OP_BLOCK_CHECKSUM: //get the checksum of a block，读数据块的校验信息 
  
   50. 
   getBlockChecksum(in); 
  
   51. 
   datanode.myMetrics.addBlockChecksumOp(DataNode.now() - startTime); 
  
   52. 
   break; 
  
   53. 
   default: 
  
   54. 
   throw new IOException("Unknown opcode " + op + " in data stream"); 
  
   55. 
   } 
  
   56. 
   } catch (Throwable t) { 
  
   57. 
   LOG.error(datanode.dnRegistration + ":DataXceiver",t); 
  
   58. 
   } finally { 
  
   59. 
   LOG.debug(datanode.dnRegistration + ":Number of active connections is: " 
  
   60. 
   + datanode.getXceiverCount()); 
  
   61. 
   IOUtils.closeStream(in); 
  
   62. 
   IOUtils.closeSocket(s); 
  
   63. 
   dataXceiverServer.childSockets.remove(s); 
  
   64. 
   } 
  
   65. 
   }</code>

DataXceiver线程除了用于处理客户端的读数据请求，还处理客户端写数据请求，DataNode节点之间的数据块替换，数据块拷贝和读数据块校验信息等功能，暂时只分析客户端读数据请求的部分。在上面的DataXceiver.run()方法中，首先根据参数创建一个输入流，用于读取客户端发送过来的请求数据，然后读取一个short类型的版本信息，检查客户端的数据传输接口版本是否和DataNode节点一致，再读取请求操作码op，DataXceiver线程会根据客户端的操作请求码op来进行不同的操作（switch语句），DataTransferProtocol.OP_READ_BLOCK操作码代表读操作，如果是这个操作码，就执行DataXceiver.readBlock()方法，这个方法代码如下：

 
   01. 
   <code class="language-java">private void readBlock(DataInputStream in) throws IOException { 
  
   02. 
   long blockId = in.readLong();  //要读取的数据块标识，数据节点通过它定位数据块        
  
   03. 
   Block block = new Block( blockId, 0 , in.readLong());//这个in.readLong()方法读取数据版本号 
  
   04. 
     
   05. 
   long startOffset = in.readLong();//要读取数据位于数据块中的位置 
  
   06. 
   long length = in.readLong();//客户端要读取的数据长度 
  
   07. 
   String clientName = Text.readString(in);//发起读请求的客户端名字 
  
   08. 
   Token<BlockTokenIdentifier> accessToken = new Token<BlockTokenIdentifier>(); 
  
   09. 
   accessToken.readFields(in);//安全相关 
  
   10. 
   OutputStream baseStream = NetUtils.getOutputStream(s, 
  
   11. 
   datanode.socketWriteTimeout);//Socket对应的输出流 
  
   12. 
   DataOutputStream out = new DataOutputStream( 
  
   13. 
   new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE)); 
  
   14. 
     
   15. 
   if (datanode.isBlockTokenEnabled) { 
  
   16. 
   try { 
  
   17. 
   datanode.blockTokenSecretManager.checkAccess(accessToken, null, block, 
  
   18. 
   BlockTokenSecretManager.AccessMode.READ); 
  
   19. 
   } catch (InvalidToken e) { 
  
   20. 
   try { 
  
   21. 
   out.writeShort(DataTransferProtocol.OP_STATUS_ERROR_ACCESS_TOKEN); 
  
   22. 
   out.flush(); 
  
   23. 
   throw new IOException("Access token verification failed, for client " 
  
   24. 
   + remoteAddress + " for OP_READ_BLOCK for " + block); 
  
   25. 
   } finally { 
  
   26. 
   IOUtils.closeStream(out); 
  
   27. 
   } 
  
   28. 
   } 
  
   29. 
   } 
  
   30. 
   // send the block 
  
   31. 
   BlockSender blockSender = null; 
  
   32. 
   final String clientTraceFmt = 
  
   33. 
   clientName.length() > 0 && ClientTraceLog.isInfoEnabled() 
  
   34. 
   ? String.format(DN_CLIENTTRACE_FORMAT, localAddress, remoteAddress, 
  
   35. 
   "%d", "HDFS_READ", clientName, "%d", 
  
   36. 
   datanode.dnRegistration.getStorageID(), block, "%d") 
  
   37. 
   : datanode.dnRegistration + " Served " + block + " to " + 
  
   38. 
   s.getInetAddress(); 
  
   39. 
   try { 
  
   40. 
   try { 
  
   41. 
   blockSender = new BlockSender(block, startOffset, length, 
  
   42. 
   true, true, false, datanode, clientTraceFmt); 
  
   43. 
   } catch(IOException e) {//BlockSender的构造方法会进行一系列的检查，这些检查通过后，才会成功创建对象，否则通过异常返回给客户端 
  
   44. 
   out.writeShort(DataTransferProtocol.OP_STATUS_ERROR); 
  
   45. 
   throw e; 
  
   46. 
   } 
  
   47. 
     
   48. 
   out.writeShort(DataTransferProtocol.OP_STATUS_SUCCESS); // send op status，操作成功状态 
  
   49. 
   long read = blockSender.sendBlock(out, baseStream, null); // send data,发送数据 
  
   50. 
     
   51. 
   if (blockSender.isBlockReadFully()) {//客户端是否校验成功，这是一个客户端可选的响应 
  
   52. 
   // See if client verification succeeded. 
  
   53. 
   // This is an optional response from client. 
  
   54. 
   try { 
  
   55. 
   if (in.readShort() == DataTransferProtocol.OP_STATUS_CHECKSUM_OK  && 
  
   56. 
   datanode.blockScanner != null) {//客户端已经进行了数据块的校验，数据节点就可以省略重复的工作，减轻系统负载 
  
   57. 
   datanode.blockScanner.verifiedByClient(block); 
  
   58. 
   } 
  
   59. 
   } catch (IOException ignored) {} 
  
   60. 
   } 
  
   61. 
     
   62. 
   datanode.myMetrics.incrBytesRead((int) read); 
  
   63. 
   datanode.myMetrics.incrBlocksRead(); 
  
   64. 
   } catch ( SocketException ignored ) { 
  
   65. 
   // Its ok for remote side to close the connection anytime. 
  
   66. 
   datanode.myMetrics.incrBlocksRead(); 
  
   67. 
   } catch ( IOException ioe ) { 
  
   68. 
   /* What exactly should we do here? 
  
   69. 
   * Earlier version shutdown() datanode if there is disk error. 
  
   70. 
   */ 
  
   71. 
   LOG.warn(datanode.dnRegistration +  ":Got exception while serving " + 
  
   72. 
   block + " to " + s.getInetAddress() + ":\n" + 
  
   73. 
   StringUtils.stringifyException(ioe) ); 
  
   74. 
   throw ioe; 
  
   75. 
   } finally { 
  
   76. 
   IOUtils.closeStream(out); 
  
   77. 
   IOUtils.closeStream(blockSender); 
  
   78. 
   } 
  
   79. 
   }</code>

这个方法先读取数据块标识和数据版本号，创建一个数据块对象(Block对象)，然后依次读取数据位于数据块中的位置(startOffset)，要读取的数据长度(length)，发起读请求的客户端名字(clientName)，安全标识(accessToken)，再创建到客户端的输出流。
接下来就构造一个BlockSender对象用于向客户端发送数据，响应客户端的读数据请求，BlockSender的构造方法会进行一系列的检查，这些检查通过后，才会成功创建对象，否则通过异常返回给客户端。如果调用BlockSender构造方法没有抛出异常，则BlockSender对象创建成功，那么就向客户端写出一个DataTransferProtocol.OP_STATUS_SUCCESS标识，接着调用BlockSender.sendBlock()方法发送数据。
如果客户端接收数据后校验成功，客户端会向DataNode节点发送一个DataTransferProtocol.OP_STATUS_CHECKSUM_OK标识，DataNode节点可以通过这个标识通知数据块扫描器，让扫描器标识该数据块扫描成功，也可以看作客户端替这个DataNode节点的数据块扫描器检查了这个数据块，那么数据块扫描器就不用重复检查了，这样设计，数据节点就可以省略重复的工作，减轻系统负载。
上面分析到了构造BlockSender对象时会进行一系列检查，那么这些检查是怎么进行的呢？下面就来看看BlockSender对象的处理过程，其构造方法如下：

 
   001. 
   <code class="language-java">BlockSender(Block block, long startOffset, long length, 
  
   002. 
   boolean corruptChecksumOk, boolean chunkOffsetOK, 
  
   003. 
   boolean verifyChecksum, DataNode datanode, String clientTraceFmt) 
  
   004. 
   throws IOException { 
  
   005. 
   try { 
  
   006. 
   this.block = block;//要发送的数据块 
  
   007. 
   this.chunkOffsetOK = chunkOffsetOK; 
  
   008. 
   this.corruptChecksumOk = corruptChecksumOk; 
  
   009. 
   this.verifyChecksum = verifyChecksum; 
  
   010. 
   this.blockLength = datanode.data.getVisibleLength(block); 
  
   011. 
   this.transferToAllowed = datanode.transferToAllowed; 
  
   012. 
   this.clientTraceFmt = clientTraceFmt; 
  
   013. 
   this.readaheadLength = datanode.getReadaheadLength(); 
  
   014. 
   this.readaheadPool = datanode.readaheadPool; 
  
   015. 
   this.shouldDropCacheBehindRead = datanode.shouldDropCacheBehindReads(); 
  
   016. 
     
   017. 
   if ( !corruptChecksumOk || datanode.data.metaFileExists(block) ) { 
  
   018. 
   checksumIn = new DataInputStream( 
  
   019. 
   new BufferedInputStream(datanode.data.getMetaDataInputStream(block), 
  
   020. 
   BUFFER_SIZE)); 
  
   021. 
     
   022. 
   // read and handle the common header here. For now just a version 
  
   023. 
   BlockMetadataHeader header = BlockMetadataHeader.readHeader(checksumIn); 
  
   024. 
   short version = header.getVersion(); 
  
   025. 
     
   026. 
   if (version != FSDataset.METADATA_VERSION) { 
  
   027. 
   LOG.warn("Wrong version (" + version + ") for metadata file for " 
  
   028. 
   + block + " ignoring ..."); 
  
   029. 
   } 
  
   030. 
   checksum = header.getChecksum(); 
  
   031. 
   } else { 
  
   032. 
   LOG.warn("Could not find metadata file for " + block); 
  
   033. 
   // This only decides the buffer size. Use BUFFER_SIZE? 
  
   034. 
   checksum = DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_NULL, 
  
   035. 
   16 * 1024); 
  
   036. 
   } 
  
   037. 
     
   038. 
   /* If bytesPerChecksum is very large, then the metadata file 
  
   039. 
   * is mostly corrupted. For now just truncate bytesPerchecksum to 
  
   040. 
   * blockLength. 
  
   041. 
   */        
  
   042. 
   bytesPerChecksum = checksum.getBytesPerChecksum(); 
  
   043. 
   if (bytesPerChecksum > 10*1024*1024 && bytesPerChecksum > blockLength){ 
  
   044. 
   checksum = DataChecksum.newDataChecksum(checksum.getChecksumType(), 
  
   045. 
   Math.max((int)blockLength, 10*1024*1024)); 
  
   046. 
   bytesPerChecksum = checksum.getBytesPerChecksum();        
  
   047. 
   } 
  
   048. 
   checksumSize = checksum.getChecksumSize(); 
  
   049. 
     
   050. 
   if (length < 0) { 
  
   051. 
   length = blockLength; 
  
   052. 
   } 
  
   053. 
     
   054. 
   endOffset = blockLength; 
  
   055. 
   if (startOffset < 0 || startOffset > endOffset 
  
   056. 
   || (length + startOffset) > endOffset) { 
  
   057. 
   String msg = " Offset " + startOffset + " and length " + length 
  
   058. 
   + " don't match " + block + " ( blockLen " + endOffset + " )"; 
  
   059. 
   LOG.warn(datanode.dnRegistration + ":sendBlock() : " + msg); 
  
   060. 
   throw new IOException(msg); 
  
   061. 
   } 
  
   062. 
     
   063. 
   //应答数据在数据块中的开始位置 
  
   064. 
   offset = (startOffset - (startOffset % bytesPerChecksum)); 
  
   065. 
   if (length >= 0) { 
  
   066. 
   // Make sure endOffset points to end of a checksumed chunk. 
  
   067. 
   long tmpLen = startOffset + length; 
  
   068. 
   if (tmpLen % bytesPerChecksum != 0) { 
  
   069. 
   //用户读取数据的结束位置 
  
   070. 
   tmpLen += (bytesPerChecksum - tmpLen % bytesPerChecksum); 
  
   071. 
   } 
  
   072. 
   if (tmpLen < endOffset) { 
  
   073. 
   endOffset = tmpLen; 
  
   074. 
   } 
  
   075. 
   } 
  
   076. 
     
   077. 
   // seek to the right offsets，设置读校验信息文件的位置信息 
  
   078. 
   if (offset > 0) { 
  
   079. 
   long checksumSkip = (offset / bytesPerChecksum) * checksumSize; 
  
   080. 
   // note blockInStream is  seeked when created below 
  
   081. 
   if (checksumSkip > 0) { 
  
   082. 
   // Should we use seek() for checksum file as well?，跳过不需要的部分 
  
   083. 
   IOUtils.skipFully(checksumIn, checksumSkip); 
  
   084. 
   } 
  
   085. 
   } 
  
   086. 
   seqno = 0; 
  
   087. 
   //打开数据块的文件输入流 
  
   088. 
   blockIn = datanode.data.getBlockInputStream(block, offset); // seek to offset 
  
   089. 
   if (blockIn instanceof FileInputStream) { 
  
   090. 
   blockInFd = ((FileInputStream) blockIn).getFD(); 
  
   091. 
   } else { 
  
   092. 
   blockInFd = null; 
  
   093. 
   } 
  
   094. 
   memoizedBlock = new MemoizedBlock(blockIn, blockLength, datanode.data, block); 
  
   095. 
   } catch (IOException ioe) { 
  
   096. 
   IOUtils.closeStream(this); 
  
   097. 
   IOUtils.closeStream(blockIn); 
  
   098. 
   throw ioe; 
  
   099. 
   } 
  
   100. 
   }</code>

在这个方法中，首先根据构造函数的参数为BlockSender的部分成员变量赋值，其中block为要发送的数据块对象，startOffset为要读取数据位于数据块中的位置，length为要读取的数据长度。corruptChecksumOk为true表示可以容忍数据块文件对应的校验文件缺失（此时只记录一条警告日志，并使用DataChecksum.CHECKSUM_NULL类型的空校验），否则就必须读取数据块文件对应的校验文件。
如果corruptChecksumOk为false，或者数据块文件对应的校验文件存在，那么就创建这个校验文件的输入流checksumIn，然后读入这个文件的头部信息，即文件中校验数据之前的数据，并且将读入的元数据版本号与FSDataset.METADATA_VERSION比较，如果不一致则只记录一条警告日志。
方法中客户端要读取的偏移起点用startOffset表示，结束点用endOffset表示，由于校验块大小是一定的(默认为512字节)，若startOffset落在一个校验块的内部，那么直接从这里传输，客户端就会校验出错，即如果DataNode节点从startOffset处开始发送，那么客户端收到的数据校验后就与校验数据不一致(校验数据无法拆分)，所以就必须从startOffset所在的那个校验块的起点开始发送数据。同理，endOffset如果落在一个校验块的内部，那么就要截止到这个校验块的结束，如下图所示

图中有三个校验块，阴影部分为要读取的数据部分，这部分的起始和结尾处刚好分别落在了第1个校验块和第3个校验块中。

根据上面的分析，DataNode节点发送的数据起点是计算得到的offset值，结束点是计算得到的endOffset值，然后就创建数据块文件的输入数据流blockIn，这样就成功创建了BlockSender对象。
创建完BlockSender对象，就可以通过这个对象向客户端发送数据了，具体过程实现在BlockSender.sendBlock()方法中，代码如下：

 
   01. 
   <code class="language-java">long sendBlock(DataOutputStream out, OutputStream baseStream, 
  
   02. 
   DataTransferThrottler throttler) throws IOException { 
  
   03. 
   if( out == null ) { 
  
   04. 
   throw new IOException( "out stream is null" ); 
  
   05. 
   } 
  
   06. 
   this.throttler = throttler; 
  
   07. 
     
   08. 
   initialOffset = offset; 
  
   09. 
   long totalRead = 0; 
  
   10. 
   OutputStream streamForSendChunks = out; 
  
   11. 
     
   12. 
   lastCacheDropOffset = initialOffset; 
  
   13. 
     
   14. 
   // Advise that this file descriptor will be accessed sequentially. 
  
   15. 
   //调用<a href="http://www.it165.net/os/oslin/" target="_blank" class="keylink">Linux</a>的posix_fadvise函数来声明blockInfd的访问方式 
  
   16. 
   if (isLongRead() && blockInFd != null) {//如果要读取的数据长度超过一定值，并且文件描述符不为空，那么就设置对文件的访问方式 
  
   17. 
   NativeIO.posixFadviseIfPossible(blockInFd, 0, 0, 
  
   18. 
   NativeIO.POSIX_FADV_SEQUENTIAL); 
  
   19. 
   } 
  
   20. 
     
   21. 
   // Trigger readahead of beginning of file if configured. 
  
   22. 
   manageOsCache(); 
  
   23. 
     
   24. 
   final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0; 
  
   25. 
   //发送应答头部信息，包含数据校验类型，校验块大小，偏移量，其中对于客户端请求，偏移量是必选参数 
  
   26. 
   try { 
  
   27. 
   try { 
  
   28. 
   checksum.writeHeader(out);//发送数据校验类型，校验块大小 
  
   29. 
   if ( chunkOffsetOK ) { 
  
   30. 
   out.writeLong( offset );//发送偏移量 
  
   31. 
   } 
  
   32. 
   out.flush(); 
  
   33. 
   } catch (IOException e) { //socket error 
  
   34. 
   throw ioeToSocketException(e); 
  
   35. 
   } 
  
   36. 
   //根据缓冲区大小配置，计算一次能够发送多少校验块的数据，并分配工作缓冲区 
  
   37. 
   int maxChunksPerPacket; 
  
   38. 
   int pktSize = DataNode.PKT_HEADER_LEN + SIZE_OF_INTEGER; 
  
   39. 
     
   40. 
   if (transferToAllowed && !verifyChecksum && 
  
   41. 
   baseStream instanceof SocketOutputStream && 
  
   42. 
   blockIn instanceof FileInputStream) {//零拷贝传输方式 
  
   43. 
     
   44. 
   FileChannel fileChannel = ((FileInputStream)blockIn).getChannel(); 
  
   45. 
     
   46. 
   // blockInPosition also indicates sendChunks() uses transferTo. 
  
   47. 
   blockInPosition = fileChannel.position(); 
  
   48. 
   streamForSendChunks = baseStream; 
  
   49. 
     
   50. 
   // assure a mininum buffer size. 
  
   51. 
   maxChunksPerPacket = (Math.max(BUFFER_SIZE, 
  
   52. 
   MIN_BUFFER_WITH_TRANSFERTO) 
  
   53. 
   + bytesPerChecksum - 1)/bytesPerChecksum;//一次发送几个校验块 
  
   54. 
     
   55. 
   // packet buffer has to be able to do a normal transfer in the case 
  
   56. 
   // of recomputing checksum，缓冲区需要能够执行一次普通传输 
  
   57. 
   pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket; 
  
   58. 
   } else {//传统的传输方式 
  
   59. 
   maxChunksPerPacket = Math.max(1, 
  
   60. 
   (BUFFER_SIZE + bytesPerChecksum - 1)/bytesPerChecksum); 
  
   61. 
   pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket; 
  
   62. 
   } 
  
   63. 
     
   64. 
   ByteBuffer pktBuf = ByteBuffer.allocate(pktSize); 
  
   65. 
   //循环发送数据块中的校验块 
  
   66. 
   while (endOffset > offset) { 
  
   67. 
   manageOsCache(); 
  
   68. 
   long len = sendChunks(pktBuf, maxChunksPerPacket, 
  
   69. 
   streamForSendChunks); 
  
   70. 
   offset += len; 
  
   71. 
   totalRead += len + ((len + bytesPerChecksum - 1)/bytesPerChecksum* 
  
   72. 
   checksumSize); 
  
   73. 
   seqno++; 
  
   74. 
   } 
  
   75. 
   try { 
  
   76. 
   out.writeInt(0); // mark the end of block        
  
   77. 
   out.flush(); 
  
   78. 
   } catch (IOException e) { //socket error 
  
   79. 
   throw ioeToSocketException(e); 
  
   80. 
   } 
  
   81. 
   } 
  
   82. 
   catch (RuntimeException e) { 
  
   83. 
   LOG.error("unexpected exception sending block", e); 
  
   84. 
     
   85. 
   throw new IOException("unexpected runtime exception", e); 
  
   86. 
   } 
  
   87. 
   finally { 
  
   88. 
   if (clientTraceFmt != null) { 
  
   89. 
   final long endTime = System.nanoTime(); 
  
   90. 
   ClientTraceLog.info(String.format(clientTraceFmt, totalRead, initialOffset, endTime - startTime)); 
  
   91. 
   } 
  
   92. 
   close(); 
  
   93. 
   } 
  
   94. 
     
   95. 
   blockReadFully = (initialOffset == 0 && offset >= blockLength); 
  
   96. 
     
   97. 
   return totalRead; 
  
   98. 
   }</code>

这个方法有两个输出流参数：一个是DataOutputStream类型的out对象，另一个是OutputStream类型的baseStream对象。在DataXceiver.readBlock()方法中可以看到，out对象其实就是对baseStream对象的封装，而baseStream主要用于向客户端发送数据的“零拷贝”过程(稍后分析)。
sendBlock方法首先进行一些发送数据前的预处理，比如通过本地方法调用Linux的posix_fadvise函数，声明blockInFd的访问方式为顺序访问，并调用manageOsCache()方法设置操作系统缓存等。然后向客户端发送读请求的应答头部信息，包含数据校验类型、校验块大小和偏移量，其中对于客户端请求，偏移量是必选参数。为什么说偏移量是必选的？因为BlockSender不但用于支持客户端读数据，也用于数据块复制。数据块复制是对整个数据块进行的操作，不需要提供数据块内的偏移量；但是对于客户端读请求来说，偏移量是一个必须的参数。
输出完响应头部信息后，就可以开始向客户端输出数据块数据了。输出数据有两种方式：一种是传统的传输方式，即先从数据块文件中读入文件数据，然后通过Socket输出流输出，这种方式容易理解，但是效率比较低；另一种是Linux/Unix中的“零拷贝”输出，关于“零拷贝”输出可以参考《通过零拷贝实现有效数据传输》一文。不论是传统的输出方式，还是“零拷贝”输出，都是循环调用BlockSender.sendChunks()方法进行输出的：因为一个数据块可能比较大，DataNode节点会分多次将这个数据块的数据发送完成，每次发送一个数据包，这个数据包带有包长度、在数据块中的偏移量、序列号等信息。BlockSender.sendChunks()方法的代码如下：

 
   001. 
   <code class="language-java">  private int sendChunks(ByteBuffer pkt, int maxChunks, OutputStream out) 
  
   002. 
   throws IOException { 
  
   003. 
   // Sends multiple chunks in one packet with a single write(). 
  
   004. 
   int len = (int) Math.min(endOffset - offset, 
  
   005. 
   (((long) bytesPerChecksum) * ((long) maxChunks))); 
  
   006. 
     
   007. 
   // truncate len so that any partial chunks will be sent as a final packet. 
  
   008. 
   // this is not necessary for correctness, but partial chunks are 
  
   009. 
   // ones that may be recomputed and sent via buffer copy, so try to minimize 
  
   010. 
   // those bytes 
  
   011. 
   if (len > bytesPerChecksum && len % bytesPerChecksum != 0) { 
  
   012. 
   len -= len % bytesPerChecksum; 
  
   013. 
   } 
  
   014. 
     
   015. 
   if (len == 0) { 
  
   016. 
   return 0; 
  
   017. 
   } 
  
   018. 
     
   019. 
   int numChunks = (len + bytesPerChecksum - 1)/bytesPerChecksum; 
  
   020. 
   int packetLen = len + numChunks*checksumSize + 4; 
  
   021. 
   pkt.clear(); 
  
   022. 
     
   023. 
   // write packet header，应答包头部 
  
   024. 
   pkt.putInt(packetLen);//包长度 
  
   025. 
   pkt.putLong(offset);//偏移量 
  
   026. 
   pkt.putLong(seqno);//序列号 
  
   027. 
   pkt.put((byte)((offset + len >= endOffset) ? 1 : 0));//最后应答包标识，该数据包是否是应答的最后一个数据包 
  
   028. 
   //why no ByteBuf.putBoolean()? 
  
   029. 
   pkt.putInt(len);//数据长度 
  
   030. 
   int checksumOff = pkt.position(); 
  
   031. 
   int checksumLen = numChunks * checksumSize;//校验信息的长度 
  
   032. 
   byte[] buf = pkt.array();//获取字节缓冲区对应的字节数组 
  
   033. 
     
   034. 
   if (checksumSize > 0 && checksumIn != null) { 
  
   035. 
   try { 
  
   036. 
   checksumIn.readFully(buf, checksumOff, checksumLen);//将校验信息发送写到发送缓冲区中 
  
   037. 
   } catch (IOException e) { 
  
   038. 
   LOG.warn(" Could not read or failed to veirfy checksum for data" + 
  
   039. 
   " at offset " + offset + " for block " + block + " got : " 
  
   040. 
   + StringUtils.stringifyException(e)); 
  
   041. 
   IOUtils.closeStream(checksumIn); 
  
   042. 
   checksumIn = null; 
  
   043. 
   if (corruptChecksumOk) { 
  
   044. 
   if (checksumOff < checksumLen) { 
  
   045. 
   // Just fill the array with zeros. 
  
   046. 
   Arrays.fill(buf, checksumOff, checksumLen, (byte) 0); 
  
   047. 
   } 
  
   048. 
   } else { 
  
   049. 
   throw e; 
  
   050. 
   } 
  
   051. 
   } 
  
   052. 
   } 
  
   053. 
     
   054. 
   int dataOff = checksumOff + checksumLen;//数据部分的偏移 
  
   055. 
     
   056. 
   if (blockInPosition < 0) {//blockInPosition < 0表示不能进行“零拷贝”传输 
  
   057. 
   //normal transfer，进行传统的传输，即先将数据从文件读入内存缓冲区，再将数据通过Socket发送给客户端 
  
   058. 
   IOUtils.readFully(blockIn, buf, dataOff, len); 
  
   059. 
     
   060. 
   if (verifyChecksum) { 
  
   061. 
   int dOff = dataOff; 
  
   062. 
   int cOff = checksumOff; 
  
   063. 
   int dLeft = len; 
  
   064. 
     
   065. 
   for (int i=0; i<numChunks; i++) { 
  
   066. 
   checksum.reset(); 
  
   067. 
   int dLen = Math.min(dLeft, bytesPerChecksum); 
  
   068. 
   checksum.update(buf, dOff, dLen); 
  
   069. 
   if (!checksum.compare(buf, cOff)) { 
  
   070. 
   throw new ChecksumException("Checksum failed at " + 
  
   071. 
   (offset + len - dLeft), len); 
  
   072. 
   } 
  
   073. 
   dLeft -= dLen; 
  
   074. 
   dOff += dLen; 
  
   075. 
   cOff += checksumSize; 
  
   076. 
   } 
  
   077. 
   } 
  
   078. 
     
   079. 
   // only recompute checksum if we can't trust the meta data due to 
  
   080. 
   // concurrent writes 
  
   081. 
   if (memoizedBlock.hasBlockChanged(len)) {//如果数据发生了变化 
  
   082. 
   ChecksumUtil.updateChunkChecksum( 
  
   083. 
   buf, checksumOff, dataOff, len, checksum 
  
   084. 
   ); 
  
   085. 
   } 
  
   086. 
     
   087. 
   try { 
  
   088. 
   out.write(buf, 0, dataOff + len); 
  
   089. 
   } catch (IOException e) { 
  
   090. 
   throw ioeToSocketException(e); 
  
   091. 
   } 
  
   092. 
   } else { 
  
   093. 
   try { 
  
   094. 
   //use transferTo(). Checks on out and blockIn are already done. 
  
   095. 
   //使用transferTo()方法需要获得Socket的输出流和输入文件通道 
  
   096. 
   SocketOutputStream sockOut = (SocketOutputStream) out; 
  
   097. 
   FileChannel fileChannel = ((FileInputStream) blockIn).getChannel(); 
  
   098. 
     
   099. 
   if (memoizedBlock.hasBlockChanged(len)) {//有竞争存在 
  
   100. 
   //文件发生变化，假定出现读写竞争 
  
   101. 
   fileChannel.position(blockInPosition); 
  
   102. 
   IOUtils.readFileChannelFully( 
  
   103. 
   fileChannel, 
  
   104. 
   buf, 
  
   105. 
   dataOff, 
  
   106. 
   len 
  
   107. 
   ); 
  
   108. 
   //计算校验和 
  
   109. 
   ChecksumUtil.updateChunkChecksum( 
  
   110. 
   buf, checksumOff, dataOff, len, checksum 
  
   111. 
   );          
  
   112. 
   sockOut.write(buf, 0, dataOff + len); 
  
   113. 
   } else { 
  
   114. 
   //first write the packet，写数据和包校验信息 
  
   115. 
   sockOut.write(buf, 0, dataOff); 
  
   116. 
   // no need to flush. since we know out is not a buffered stream.零拷贝发送数据 
  
   117. 
   sockOut.transferToFully(fileChannel, blockInPosition, len); 
  
   118. 
   } 
  
   119. 
     
   120. 
   blockInPosition += len; 
  
   121. 
     
   122. 
   } catch (IOException e) { 
  
   123. 
   /* exception while writing to the client (well, with transferTo(), 
  
   124. 
   * it could also be while reading from the local file). 
  
   125. 
   */ 
  
   126. 
   throw ioeToSocketException(e); 
  
   127. 
   } 
  
   128. 
   } 
  
   129. 
     
   130. 
   if (throttler != null) { // rebalancing so throttle 
  
   131. 
   throttler.throttle(packetLen);//调用节流器对象的throttle()方法 
  
   132. 
   } 
  
   133. 
     
   134. 
   return len; 
  
   135. 
   }</code>

该方法有三个参数。第一个参数是ByteBuffer类型的pkt，即发送缓冲区：不论是传统输出方式还是“零拷贝”输出方式，都需要发送包头信息；使用传统发送方式时，pkt中既有包头和校验信息，也有数据；而在“零拷贝”方式中（数据块未发生变化时），pkt只包含包头和校验信息，数据则由transferToFully()直接从文件通道发送。第二个参数是maxChunks，表示一个数据包最多发送几个校验块。第三个参数是输出流对象。
BlockSender.sendChunks()方法逻辑比较清晰：先在发送缓冲区中写入包头部信息，然后是校验信息，最后通过blockInPosition变量来区分是通过传统输出方式发送还是“零拷贝”方式发送。blockInPosition变量默认是-1，在BlockSender.sendBlock()方法中可能会通过blockInPosition = fileChannel.position();这行代码改变，而这行代码只有在判断可以使用“零拷贝”方式发送之后才会执行。如果blockInPosition仍为-1（小于零），说明它在BlockSender.sendBlock()方法中并未被改变，即使用传统方式发送；如果值改变了，那么一定是一个非负值，因为FileChannel.position()方法的返回值是非负的，它代表FileChannel当前的position位置（请参考Java NIO中关于FileChannel的部分）。