Hadoop DataNode启动之dataXceiverServer

  DN的主要工作是存取数据,与其进行块交互的主要有两种角色:客户端和其他DN。数据块的收发是比较繁重的工作,虽然DN不必面临高并发的场景,但如果串行服务必然会降低效率。为此,DN每次接到块操作请求时,都会创建一个线程用于服务。这里说的DataXceiverServer就类似一个餐馆老板,每来一个客人,就派一个小伙计出来服务,一对一的;DataXceiver就是小伙计的角色。DataXceiverServer的创建时机如下:


  /**
   * Excerpt of the DataNode start-up routine (elided parts are marked with
   * dotted lines): obtains the data-transfer server socket and prepares the
   * DataXceiverServer daemon thread.
   *
   * @param conf      configuration handed to the DataXceiverServer
   * @param dataDirs  data directories (not used in the visible excerpt)
   * @param resources supplies the streaming socket when secureResources is
   *                  non-null
   * @throws IOException declared by the method; the socket calls below may
   *                     raise it
   */
  void startDataNode(Configuration conf, 
                     AbstractList<File> dataDirs, SecureResources resources
                     ) throws IOException {
  .............

    // Create the server socket used for block transfer.
    ServerSocket ss;
    if(secureResources == null) {
     // When a write timeout is configured the socket is obtained from a
     // ServerSocketChannel; the article notes that such sockets can run in
     // non-blocking mode, which helps on poor networks.
      ss = (socketWriteTimeout > 0) ? 
        ServerSocketChannel.open().socket() : new ServerSocket();
      // Bind the socket to socAddr (the article cites 50010 as the default port).
      Server.bind(ss, socAddr, 0);
    } else {
      // Otherwise reuse the streaming socket carried by the resources argument.
      ss = resources.getStreamingSocket();
    }
    // Set the receive buffer size.
    ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE); 
    // Read back the port that was actually bound.
    tmpPort = ss.getLocalPort();
    selfAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(),
                                     tmpPort);
    // Update the registration name (per the article, this is what gets
    // reported to the NameNode).
    this.dnRegistration.setName(machineName + ":" + tmpPort);
    LOG.info("Opened info server at " + tmpPort);
    // Thread group shared by the listening DataXceiverServer and the
    // per-connection DataXceiver workers; the group is marked daemon below.
    this.threadGroup = new ThreadGroup("dataXceiverServer");
    // Build the DataXceiverServer thread; it is only constructed here, not
    // started (the article says it is started just before the DN main loop).
    this.dataXceiverServer = new Daemon(threadGroup, 
        new DataXceiverServer(ss, conf, this));
    this.threadGroup.setDaemon(true); // auto destroy when empty

  ................
  }

下面看下dataXceiverServer的线程执行体,和我们自己编写的网络服务端差不多

  /**
   * Accept loop of the DataXceiverServer: while the datanode is flagged to
   * run, accepts connections and hands each one to a new DataXceiver daemon
   * thread. The server socket is closed once the loop ends.
   */
  public void run() {
    while (datanode.shouldRun) {
      try {
        // Block until a peer connects.
        final Socket peer = ss.accept();
        peer.setTcpNoDelay(true);
        // One worker per connection, placed in the datanode's thread group.
        final DataXceiver worker = new DataXceiver(peer, datanode, this);
        final Daemon workerThread = new Daemon(datanode.threadGroup, worker);
        workerThread.start();
      } catch (SocketTimeoutException ignored) {
        // wake up to see if should continue to run
      } catch (AsynchronousCloseException ace) {
        // The channel was closed underneath accept(): log and stop looping.
        final String trace = StringUtils.stringifyException(ace);
        LOG.warn(datanode.dnRegistration + ":DataXceiveServer:" + trace);
        datanode.shouldRun = false;
      } catch (IOException ie) {
        // Other I/O errors are logged and the loop keeps accepting.
        final String trace = StringUtils.stringifyException(ie);
        LOG.warn(datanode.dnRegistration
            + ":DataXceiveServer: IOException due to:" + trace);
      } catch (Throwable te) {
        // Anything else is logged as an error and ends the loop.
        final String trace = StringUtils.stringifyException(te);
        LOG.error(datanode.dnRegistration
            + ":DataXceiveServer: Exiting due to:" + trace);
        datanode.shouldRun = false;
      }
    }

    // Loop finished: release the listening socket.
    try {
      ss.close();
    } catch (IOException ie) {
      final String trace = StringUtils.stringifyException(ie);
      LOG.warn(datanode.dnRegistration
          + ":DataXceiveServer: Close exception due to: " + trace);
    }
    LOG.info("Exiting DataXceiveServer");
  }

上面这个run方法并不是核心,真正干活的是DataXceiver,让我们看看他的执行体

    /**
     * Serves one data-transfer request on socket {@code s}: checks the
     * protocol version, reads the opcode, enforces the xceiver limit, then
     * dispatches to the matching block operation and records its metrics.
     * Any failure is logged; the stream and socket are always closed.
     */
    public void run() {
      DataInputStream input = null;
      try {
        // Buffered reader over the socket; the request header is read first.
        input = new DataInputStream(
            new BufferedInputStream(NetUtils.getInputStream(s),
                                    SMALL_BUFFER_SIZE));

        // Reject peers that speak a different transfer protocol version.
        final short wireVersion = input.readShort();
        if (wireVersion != DataTransferProtocol.DATA_TRANSFER_VERSION) {
          throw new IOException( "Version Mismatch" );
        }

        // Peer address equal to our own address: counted as a local client
        // in the read/write metrics below.
        final boolean fromLocalHost =
            s.getInetAddress().equals(s.getLocalAddress());

        // Operation code of the request.
        final byte opcode = input.readByte();

        // Refuse the request when the active xceiver count exceeds the
        // configured maximum (the article cites a default of 256).
        final int activeXceivers = datanode.getXceiverCount();
        if (activeXceivers > dataXceiverServer.maxXceiverCount) {
          throw new IOException("xceiverCount " + activeXceivers
                                + " exceeds the limit of concurrent xcievers "
                                + dataXceiverServer.maxXceiverCount);
        }

        // Start of the timed section used for the per-operation metrics.
        final long startedAt = DataNode.now();

        switch (opcode) {
          case DataTransferProtocol.OP_READ_BLOCK: {
            // Read a block.
            readBlock(input);
            datanode.myMetrics.addReadBlockOp(DataNode.now() - startedAt);
            if (fromLocalHost) {
              datanode.myMetrics.incrReadsFromLocalClient();
            } else {
              datanode.myMetrics.incrReadsFromRemoteClient();
            }
            break;
          }
          case DataTransferProtocol.OP_WRITE_BLOCK: {
            // Write a block.
            writeBlock(input);
            datanode.myMetrics.addWriteBlockOp(DataNode.now() - startedAt);
            if (fromLocalHost) {
              datanode.myMetrics.incrWritesFromLocalClient();
            } else {
              datanode.myMetrics.incrWritesFromRemoteClient();
            }
            break;
          }
          case DataTransferProtocol.OP_REPLACE_BLOCK: {
            // for balancing purpose; send to a destination
            replaceBlock(input);
            datanode.myMetrics.addReplaceBlockOp(DataNode.now() - startedAt);
            break;
          }
          case DataTransferProtocol.OP_COPY_BLOCK: {
            // for balancing purpose; send to a proxy source
            copyBlock(input);
            datanode.myMetrics.addCopyBlockOp(DataNode.now() - startedAt);
            break;
          }
          case DataTransferProtocol.OP_BLOCK_CHECKSUM: {
            // get the checksum of a block
            getBlockChecksum(input);
            datanode.myMetrics.addBlockChecksumOp(DataNode.now() - startedAt);
            break;
          }
          default:
            throw new IOException("Unknown opcode " + opcode + " in data stream");
        }
      } catch (Throwable failure) {
        LOG.error(datanode.dnRegistration + ":DataXceiver",failure);
      } finally {
        LOG.debug(datanode.dnRegistration + ":Number of active connections is: "
                                 + datanode.getXceiverCount());
        // Close the stream and socket, then drop the socket from the
        // server's set of child sockets.
        IOUtils.closeStream(input);
        IOUtils.closeSocket(s);
        dataXceiverServer.childSockets.remove(s);
      }
    }
  

以读取数据块为例,看下是如何操作的

    /**
     * Handles a read-block request: parses the request header from
     * {@code in}, optionally verifies the access token, then streams the
     * requested byte range back to the peer through a BlockSender.
     *
     * @param in request stream positioned just after the opcode
     * @throws IOException if token verification fails, the BlockSender
     *                     cannot be created, or sending fails with a
     *                     non-SocketException I/O error
     */
    private void readBlock(DataInputStream in) throws IOException {
    //
    // Read the request header
    //
    long blockId = in.readLong(); // block id
    Block block = new Block( blockId, 0 , in.readLong());// second long: generation stamp


    long startOffset = in.readLong();// start offset
    long length = in.readLong();// number of bytes requested
    String clientName = Text.readString(in);// client name
    Token<BlockTokenIdentifier> accessToken = new Token<BlockTokenIdentifier>();
    accessToken.readFields(in);
    // Output stream used to send the reply and the block data
    OutputStream baseStream = NetUtils.getOutputStream(s, 
        datanode.socketWriteTimeout);
    DataOutputStream out = new DataOutputStream(
                 new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));
    
    if (datanode.isBlockTokenEnabled) {
      try {
        datanode.blockTokenSecretManager.checkAccess(accessToken, null, block,
            BlockTokenSecretManager.AccessMode.READ);
      } catch (InvalidToken e) {
        // Report the token error to the peer, then abort; out is closed
        // in the finally clause either way.
        try {
          out.writeShort(DataTransferProtocol.OP_STATUS_ERROR_ACCESS_TOKEN);
          out.flush();
          throw new IOException("Access token verification failed, for client "
              + remoteAddress + " for OP_READ_BLOCK for block " + block);
        } finally {
          IOUtils.closeStream(out);
        }
      }
    }
    // BlockSender that will stream the block
    BlockSender blockSender = null;
    // Build the client-trace log format; it produces log lines like the one
    // below, which show up frequently in DataNode logs:
    //src: /127.0.0.1:50010, dest: /127.0.0.1:50243, bytes: %d, op: HDFS_READ, cliID: DFSClient_-880133444, offset: %d, srvID: DS-1789183053-125.120.30.128-50010-1379249313769, blockid: blk_5420252401562768646_1005, duration: %d
    // (source address, destination address, bytes sent, operation, client name, offset, ...)
    // The "%d" arguments keep three placeholders in the result to be filled in later.
    final String clientTraceFmt =
      clientName.length() > 0 && ClientTraceLog.isInfoEnabled()
        ? String.format(DN_CLIENTTRACE_FORMAT, localAddress, remoteAddress,
            "%d", "HDFS_READ", clientName, "%d", 
            datanode.dnRegistration.getStorageID(), block, "%d")
        : datanode.dnRegistration + " Served block " + block + " to " +
            s.getInetAddress();
    try {
      try {
       // Article note: when a read spans more than one block, length is
       // capped at one block, the HDFS I/O unit (not verifiable from this
       // excerpt).
        blockSender = new BlockSender(block, startOffset, length,
            true, true, false, datanode, clientTraceFmt);
      } catch(IOException e) {
        // Tell the peer the request failed before propagating the error.
        out.writeShort(DataTransferProtocol.OP_STATUS_ERROR);
        throw e;
      }

      out.writeShort(DataTransferProtocol.OP_STATUS_SUCCESS); // send operation status
      long read = blockSender.sendBlock(out, baseStream, null); // actually send the data

      if (blockSender.isBlockReadFully()) {
        // The whole block was sent: read the status the client sends back
        // and, if it reports the checksum as OK, notify the block scanner.
        try {
          if (in.readShort() == DataTransferProtocol.OP_STATUS_CHECKSUM_OK  && 
              datanode.blockScanner != null) {
            datanode.blockScanner.verifiedByClient(block);
          }
        } catch (IOException ignored) {}
      }
      // Update read metrics
      datanode.myMetrics.incrBytesRead((int) read);
      datanode.myMetrics.incrBlocksRead();
    } catch ( SocketException ignored ) {
      // Its ok for remote side to close the connection anytime.
      datanode.myMetrics.incrBlocksRead();
    } catch ( IOException ioe ) {
      /* What exactly should we do here?
       * Earlier version shutdown() datanode if there is disk error.
       */
      LOG.warn(datanode.dnRegistration +  ":Got exception while serving " + 
          block + " to " +
                s.getInetAddress() + ":\n" + 
                StringUtils.stringifyException(ioe) );
      throw ioe;
    } finally {
     // Close the output stream and the sender
      IOUtils.closeStream(out);
      IOUtils.closeStream(blockSender);
    }
  }

发送数据的操作是我们关心的,这也是BlockSender的主要功能。它首先会向客户端发送校验信息(比如校验时chunk的大小),然后确定发送包的大小,这些都确定好后,才真正开始发送:

  long sendBlock(DataOutputStream out, OutputStream baseStream, 
                 BlockTransferThrottler throttler) throws IOException {
    if( out == null ) {
      throw new IOException( "out stream is null" );
    }
    this.throttler = throttler;//节流器

    long initialOffset = offset;//偏移量
    long totalRead = 0;//本次读取的总长度,用于返回
    OutputStream streamForSendChunks = out;
    //记录开始时间
    final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0; 
    try {
      try {
       //写入校验头信息,告诉客户端如何校验
        checksum.writeHeader(out);
        if ( chunkOffsetOK ) {
          out.writeLong( offset );//从哪里开始校验
        }
        out.flush();//刷新到客户端
      } catch (IOException e) { //socket error
        throw ioeToSocketException(e);
      }
      //下面这段判断主要用于确定发送的包大小
      int maxChunksPerPacket;
      int pktSize = DataNode.PKT_HEADER_LEN + SIZE_OF_INTEGER;
      
      if (transferToAllowed && !verifyChecksum && 
          baseStream instanceof SocketOutputStream && 
          blockIn instanceof FileInputStream) {
        
        FileChannel fileChannel = ((FileInputStream)blockIn).getChannel();
        
        // 记录文件位置,发送块的其实位置
        blockInPosition = fileChannel.position();
        streamForSendChunks = baseStream;
        
        // 计算每个包发送多少chunks
        maxChunksPerPacket = (Math.max(BUFFER_SIZE,
                                       MIN_BUFFER_WITH_TRANSFERTO)
                              + bytesPerChecksum - 1)/bytesPerChecksum;
        
        // 计算包大小,此值会确定分配缓存的大小
        pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
      } else {
        maxChunksPerPacket = Math.max(1,
                 (BUFFER_SIZE + bytesPerChecksum - 1)/bytesPerChecksum);
        pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
      }
      //分配缓冲区
      ByteBuffer pktBuf = ByteBuffer.allocate(pktSize);
      //循环发送chunks
      while (endOffset > offset) {
        long len = sendChunks(pktBuf, maxChunksPerPacket, 
                              streamForSendChunks);
        offset += len;
        totalRead += len + ((len + bytesPerChecksum - 1)/bytesPerChecksum*
                            checksumSize);
        seqno++;
      }
      try {
       写结束标记位并刷新
        out.writeInt(0); // mark the end of block        
        out.flush();
      } catch (IOException e) { //socket error
        throw ioeToSocketException(e);
      }
    }
    catch (RuntimeException e) {
      LOG.error("unexpected exception sending block", e);
      
      throw new IOException("unexpected runtime exception", e);
    } 
    finally {
      if (clientTraceFmt != null) {
       //记录发送时间
        final long endTime = System.nanoTime();
        ClientTraceLog.info(String.format(clientTraceFmt, totalRead, initialOffset, endTime - startTime));
      }
      close();
    }
  //记录是否发送完毕
    blockReadFully = (initialOffset == 0 && offset >= blockLength);

    return totalRead;
  }

如何发送chunk的呢?

 

  /**
   * Builds and sends one packet: header, checksums, then up to
   * {@code maxChunks} chunks of block data starting at the current offset.
   *
   * @param pkt       array-backed buffer large enough for one packet
   * @param maxChunks maximum number of chunks to put in the packet
   * @param out       stream to write the packet to; must be a
   *                  SocketOutputStream when blockInPosition is non-negative
   * @return number of data bytes sent (0 if nothing remained)
   * @throws IOException on read or write failure; failures writing to the
   *                     peer are converted via ioeToSocketException
   */
  private int sendChunks(ByteBuffer pkt, int maxChunks, OutputStream out) 
                         throws IOException {
    // Send at most one packet's worth of chunks. The clamp is done in long
    // arithmetic and only then narrowed: narrowing (endOffset - offset) first
    // would wrap for a remaining range larger than Integer.MAX_VALUE.
    int len = (int) Math.min(endOffset - offset,
                             ((long) bytesPerChecksum) * maxChunks);
    
    // Round down to a whole number of chunks so that every packet except
    // possibly the last one ends on a chunk boundary.
    if (len > bytesPerChecksum && len % bytesPerChecksum != 0) {
      len -= len % bytesPerChecksum;
    }
    
    if (len == 0) {
      return 0;
    }
    // Number of chunks in this packet
    int numChunks = (len + bytesPerChecksum - 1)/bytesPerChecksum;
    int packetLen = len + numChunks*checksumSize + 4;
    pkt.clear();
    
    // Packet header: packet length, offset in block, sequence number,
    // last-packet flag, data length
    pkt.putInt(packetLen);
    pkt.putLong(offset);
    pkt.putLong(seqno);
    pkt.put((byte)((offset + len >= endOffset) ? 1 : 0));
               //why no ByteBuf.putBoolean()?
    pkt.putInt(len);
    
    int checksumOff = pkt.position();
    int checksumLen = numChunks * checksumSize;
    // Backing array of the packet buffer; checksums and data are written
    // into it directly
    byte[] buf = pkt.array();
    
    if (checksumSize > 0 && checksumIn != null) {
      try {
        // Read the checksums for all chunks of this packet
        checksumIn.readFully(buf, checksumOff, checksumLen);
      } catch (IOException e) {
        LOG.warn(" Could not read or failed to veirfy checksum for data" +
                 " at offset " + offset + " for block " + block + " got : "
                 + StringUtils.stringifyException(e));
        IOUtils.closeStream(checksumIn);
        checksumIn = null;
        if (corruptChecksumOk) {
          if (checksumLen > 0) {
            // Just fill the checksum region with zeros. Arrays.fill takes an
            // exclusive end INDEX, so the end is checksumOff + checksumLen;
            // passing checksumLen alone left the region (partly) unzeroed.
            Arrays.fill(buf, checksumOff, checksumOff + checksumLen, (byte) 0);
          }
        } else {
          throw e;
        }
      }
    }
    // Data follows the checksums in the packet
    int dataOff = checksumOff + checksumLen;
    
    if (blockInPosition < 0) {
      //normal transfer
      IOUtils.readFully(blockIn, buf, dataOff, len);

      if (verifyChecksum) {
        int dOff = dataOff;
        int cOff = checksumOff;
        int dLeft = len;

        for (int i=0; i<numChunks; i++) {
          checksum.reset();
          int dLen = Math.min(dLeft, bytesPerChecksum);
          checksum.update(buf, dOff, dLen);
          // Compare the recomputed checksum with the stored one; a mismatch
          // aborts the send with a ChecksumException.
          if (!checksum.compare(buf, cOff)) {
            throw new ChecksumException("Checksum failed at " + 
                                        (offset + len - dLeft), len);
          }
          dLeft -= dLen;
          dOff += dLen;
          cOff += checksumSize;
        }
      }
      
      // only recompute checksum if we can't trust the meta data due to 
      // concurrent writes
      if (memoizedBlock.hasBlockChanged(len)) {
        ChecksumUtil.updateChunkChecksum(
          buf, checksumOff, dataOff, len, checksum
        );
      }
      
      try {
        out.write(buf, 0, dataOff + len);
      } catch (IOException e) {
        throw ioeToSocketException(e);
      }
    } else {
      try {
        //use transferTo(). Checks on out and blockIn are already done. 
        SocketOutputStream sockOut = (SocketOutputStream) out;
        FileChannel fileChannel = ((FileInputStream) blockIn).getChannel();

        if (memoizedBlock.hasBlockChanged(len)) {
          // The block changed underneath us: read the data into the buffer
          // and recompute the checksums before sending the whole packet.
          fileChannel.position(blockInPosition);
          IOUtils.readFileChannelFully(
            fileChannel,
            buf,
            dataOff,
            len
          );
          // Refresh the checksums
          ChecksumUtil.updateChunkChecksum(
            buf, checksumOff, dataOff, len, checksum
          );          
          sockOut.write(buf, 0, dataOff + len);
        } else {
          // Write header and checksums from the buffer, then transfer the
          // data straight from the file channel.
          sockOut.write(buf, 0, dataOff);
          // no need to flush. since we know out is not a buffered stream.
          sockOut.transferToFully(fileChannel, blockInPosition, len);
        }

        blockInPosition += len;

      } catch (IOException e) {
      /* exception while writing to the client (well, with transferTo(),
       * it could also be while reading from the local file).
       */
        throw ioeToSocketException(e);
      }
    }

    // After each packet, hand the packet length to the throttler, if any
    // (per the article, it waits when the bandwidth threshold is exceeded).
    if (throttler != null) {
      throttler.throttle(packetLen);
    }

    return len;
  }


 


 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值