Hadoop3.2.1 【 HDFS 】源码分析 :DatanodeManager解析

 

一.前言

DatanodeManager的主要功能是管理数据节点, 包括退役和其他活动。它记录了在Namenode上注册的Datanode, 以及这些Datanode在网络中的拓扑结构等信息。

 

二.构造方法


  /**
   * Creates the DatanodeManager and initializes its collaborators and tunables
   * from the supplied configuration.
   *
   * <p>In order, the constructor selects the network topology implementation,
   * builds the HeartbeatManager and DatanodeAdminManager, optionally builds the
   * slow-peer and slow-disk trackers, reads the default datanode ports, loads
   * the host include/exclude configuration, sets up the DNS-to-switch mapping,
   * and finally derives the heartbeat, block-invalidation, stale-node and
   * caching settings.
   *
   * <p>The {@code key : value} comments in the body record the property name
   * and the default value as noted in the original annotations; verify them
   * against DFSConfigKeys before relying on them.
   *
   * @param blockManager block manager, stored and handed to the heartbeat and
   *     admin managers
   * @param namesystem namesystem, stored and handed to the heartbeat and
   *     admin managers
   * @param conf configuration every setting below is read from
   * @throws IOException declared by the signature; note that the IOException
   *     raised by hostConfigManager.refresh() is caught and logged here
   * @throws IllegalArgumentException via Preconditions.checkArgument when the
   *     stale-datanode write ratio is not in (0, 1]
   */
  DatanodeManager(final BlockManager blockManager, final Namesystem namesystem,final Configuration conf) throws IOException {

    this.namesystem = namesystem;
    this.blockManager = blockManager;

    // dfs.use.dfs.network.topology : true
    this.useDfsNetworkTopology = conf.getBoolean(
        DFSConfigKeys.DFS_USE_DFS_NETWORK_TOPOLOGY_KEY,
        DFSConfigKeys.DFS_USE_DFS_NETWORK_TOPOLOGY_DEFAULT);


    if (useDfsNetworkTopology) {
      // DFSNetworkTopology is used when the flag above is true.
      networktopology = DFSNetworkTopology.getInstance(conf);
    } else {
      networktopology = NetworkTopology.getInstance(conf);
    }

    // Build the HeartbeatManager.
    this.heartbeatManager = new HeartbeatManager(namesystem, blockManager, conf);

    // Build the DatanodeAdminManager.
    this.datanodeAdminManager = new DatanodeAdminManager(namesystem, blockManager, heartbeatManager);

    this.fsClusterStats = newFSClusterStats();

    // Whether slow-peer (slow DataNode) detection is enabled.
    // dfs.datanode.peer.stats.enabled : false
    this.dataNodePeerStatsEnabled = conf.getBoolean(
        DFSConfigKeys.DFS_DATANODE_PEER_STATS_ENABLED_KEY,
        DFSConfigKeys.DFS_DATANODE_PEER_STATS_ENABLED_DEFAULT);
    // Whether disk statistics are enabled, derived from the sampling percentage.
    // NOTE(review): the original annotation says 0 means disabled; confirm in
    // Util.isDiskStatsEnabled.
    // dfs.datanode.fileio.profiling.sampling.percentage : 0
    this.dataNodeDiskStatsEnabled = Util.isDiskStatsEnabled(conf.getInt(
        DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
        DFSConfigKeys.
            DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_DEFAULT));

    // One Timer instance is shared by both trackers below.
    final Timer timer = new Timer();
    // Create the SlowPeerTracker only when peer stats are enabled; otherwise null.
    this.slowPeerTracker = dataNodePeerStatsEnabled ?  new SlowPeerTracker(conf, timer) : null;

    // Create the SlowDiskTracker only when disk stats are enabled; otherwise null.
    this.slowDiskTracker = dataNodeDiskStatsEnabled ? new SlowDiskTracker(conf, timer) : null;

    // Only the port part of each configured address is kept below.
    // dfs.datanode.address : 0.0.0.0:9866
    this.defaultXferPort = NetUtils.createSocketAddr(
          conf.getTrimmed(DFSConfigKeys.DFS_DATANODE_ADDRESS_KEY,
              DFSConfigKeys.DFS_DATANODE_ADDRESS_DEFAULT)).getPort();

    // dfs.datanode.http.address : 0.0.0.0:9864
    this.defaultInfoPort = NetUtils.createSocketAddr(
          conf.getTrimmed(DFSConfigKeys.DFS_DATANODE_HTTP_ADDRESS_KEY,
              DFSConfigKeys.DFS_DATANODE_HTTP_ADDRESS_DEFAULT)).getPort();

    // dfs.datanode.https.address : 0.0.0.0:9865
    this.defaultInfoSecurePort = NetUtils.createSocketAddr(
        conf.getTrimmed(DFSConfigKeys.DFS_DATANODE_HTTPS_ADDRESS_KEY,
            DFSConfigKeys.DFS_DATANODE_HTTPS_ADDRESS_DEFAULT)).getPort();

    // dfs.datanode.ipc.address : 0.0.0.0:9867
    this.defaultIpcPort = NetUtils.createSocketAddr(
          conf.getTrimmed(DFSConfigKeys.DFS_DATANODE_IPC_ADDRESS_KEY,
              DFSConfigKeys.DFS_DATANODE_IPC_ADDRESS_DEFAULT)).getPort();

    // dfs.namenode.hosts.provider.classname
    // The default implementation passed to getClass is HostFileManager;
    // HostConfigManager is the base type the configured class must conform to.
    this.hostConfigManager = ReflectionUtils.newInstance(
        conf.getClass(DFSConfigKeys.DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY,
            HostFileManager.class, HostConfigManager.class), conf);
    try {
      // Reload the hosts configuration.
      this.hostConfigManager.refresh();
    } catch (IOException e) {
      // A failed reload is logged only; construction continues.
      LOG.error("error reading hosts files: ", e);
    }

    // net.topology.node.switch.mapping.impl :  default ScriptBasedMapping
    this.dnsToSwitchMapping = ReflectionUtils.newInstance(
        conf.getClass(DFSConfigKeys.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, 
            ScriptBasedMapping.class, DNSToSwitchMapping.class), conf);

    // dfs.namenode.reject-unresolved-dn-topology-mapping : false
    this.rejectUnresolvedTopologyDN = conf.getBoolean(
        DFSConfigKeys.DFS_REJECT_UNRESOLVED_DN_TOPOLOGY_MAPPING_KEY,
        DFSConfigKeys.DFS_REJECT_UNRESOLVED_DN_TOPOLOGY_MAPPING_DEFAULT);

    // If the dns to switch mapping supports cache, resolve network
    // locations of those hosts in the include list and store the mapping
    // in the cache; so future calls to resolve will be fast.
    if (dnsToSwitchMapping instanceof CachedDNSToSwitchMapping) {
      final ArrayList<String> locations = new ArrayList<>();
      for (InetSocketAddress addr : hostConfigManager.getIncludes()) {
        locations.add(addr.getAddress().getHostAddress());
      }
      dnsToSwitchMapping.resolve(locations);
    }

    // dfs.heartbeat.interval  : 3 seconds
    heartbeatIntervalSeconds = conf.getTimeDuration(
        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT, TimeUnit.SECONDS);

    // dfs.namenode.heartbeat.recheck-interval : 5 minutes
    heartbeatRecheckInterval = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 
        DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT); // 5 minutes

    // Heartbeat expiry interval in milliseconds. With the defaults noted above:
    // 2 * (5*60*1000) + 10 * 1000 * 3 = 630,000 ms = 630 s = 10.5 minutes.
    this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval
        + 10 * 1000 * heartbeatIntervalSeconds;
    // dfs.block.invalidate.limit : 1000
    // The effective block invalidate limit is the larger of the configured
    // value and 20 * heartbeat interval (seconds). With the configured value at
    // 1000, a heartbeat interval of up to 50 s leaves the limit at 1000; above
    // 50 s the limit becomes 20 * interval.
    final int configuredBlockInvalidateLimit = conf.getInt(
        DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY,
        DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_DEFAULT);
    final int countedBlockInvalidateLimit = 20*(int)(heartbeatIntervalSeconds);
    this.blockInvalidateLimit = Math.max(countedBlockInvalidateLimit,
        configuredBlockInvalidateLimit);

    LOG.info(DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY
        + ": configured=" + configuredBlockInvalidateLimit
        + ", counted=" + countedBlockInvalidateLimit
        + ", effected=" + blockInvalidateLimit);
    // dfs.namenode.datanode.registration.ip-hostname-check : true
    // When enabled, registerDatanode rejects a datanode whose remote address
    // cannot be resolved to a hostname. The original annotation notes this
    // typically happens when datanodes are configured by raw IP address
    // instead of hostnames resolvable through DNS or the hosts file.
    this.checkIpHostnameInRegistration = conf.getBoolean(
        DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_KEY,
        DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_DEFAULT);
    LOG.info(DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_KEY
        + "=" + checkIpHostnameInRegistration);


    // dfs.namenode.avoid.read.stale.datanode false
    this.avoidStaleDataNodesForRead = conf.getBoolean(
        DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_KEY,
        DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_DEFAULT);

    //dfs.namenode.avoid.write.stale.datanode false
    this.avoidStaleDataNodesForWrite = conf.getBoolean(
        DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY,
        DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT);
    // The stale interval is derived from the configuration together with the
    // heartbeat expiry interval computed above.
    this.staleInterval = getStaleIntervalFromConf(conf, heartbeatExpireInterval);

    // dfs.namenode.write.stale.datanode.ratio : 0.5f
    this.ratioUseStaleDataNodesForWrite = conf.getFloat(
        DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY,
        DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT);


    // The ratio must lie in (0, 1]; otherwise construction fails.
    Preconditions.checkArgument(
        (ratioUseStaleDataNodesForWrite > 0 && 
            ratioUseStaleDataNodesForWrite <= 1.0f),
        DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY +
        " = '" + ratioUseStaleDataNodesForWrite + "' is invalid. " +
        "It should be a positive non-zero float value, not greater than 1.0f.");


    // dfs.namenode.path.based.cache.retry.interval.ms : 30_000L
    this.timeBetweenResendingCachingDirectivesMs = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS,
        DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS_DEFAULT);

    // dfs.namenode.blocks.per.postponedblocks.rescan : 10_000
    this.blocksPerPostponedMisreplicatedBlocksRescan = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY,
        DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY_DEFAULT);
  }

三.属性

datanodeMap: 维护StorageId -> DatanodeDescriptor的映射关系。

// Known datanodes keyed by a String identifier.
// NOTE(review): presumably the datanode UUID / storage ID used by getDatanode() — confirm.
private final Map<String, DatanodeDescriptor> datanodeMap  = new HashMap<>();


host2DatanodeMap: 维护host -> DatanodeDescriptor的映射关系。

这里为什么要维护两个Datanode的映射关系呢? 这是因为Datanode的storageId是有可能发生变化的。 一般情况下这两个映射关系是一致的, 但是可能会出现Datanode重新启动后使用一个新storageId注册的情况, 这时候就需要对datanodeMap字段进行更新。 


  /**
   * Host names to datanode descriptors mapping.
   *
   * <p>Per the original annotation (confirm against Host2NodesMap), two
   * mappings are kept:
   * 1. hostname --> IP address
   * 2. IP address --> DatanodeDescriptor[]; one IP may carry several
   *    datanodes in the rare case that a host runs more than one.
   */
  private final Host2NodesMap host2DatanodeMap = new Host2NodesMap();


networktopology: 维护整个网络的拓扑结构。 [这部分需要单独拎出来说一下]

默认使用  DFSNetworkTopology  , 否则使用 NetworkTopology

 

DFSNetworkTopology : HDFS专用的网络拓扑类。执行此子类化的主要目的是添加存储类型感知chooseRandom方法。所有剩余部分应相同。当前是用于测试存储类型信息的占位符。

NetworkTopology : 该类表示具有树状层次网络拓扑结构的计算机集群。例如, 集群可能由多个数据中心组成, 每个数据中心中又有若干放置计算机的机架。在网络拓扑中, 叶子节点代表数据节点(计算机), 而内部节点代表管理进出数据中心或机架流量的交换机/路由器。

 

四.添加和撤销Datanode

HDFS的一个重要特征就是具有弹性, 也就是当HDFS需要增加容量时, 可以动态地向集群中添加新的Datanode;同理, 当HDFS需要减小规模时, 可以动态地撤销已经存在的Datanode。 无论是添加还是撤销Datanode的操作, 都不会影响HDFS服务。

HDFS提供了dfs.hosts文件(由配置项dfs.hosts指定, 又称include文件) 以及dfs.hosts.exclude文件(由配置项dfs.hosts.exclude指定, 简称exclude文件) 管理接入到HDFS的Datanode。 include文件指定了可以连接到Namenode的Datanode列表, exclude文件指定了不能连接到Namenode的Datanode列表, 这两个文件都是文本文件, 一行表示一个Datanode。 一个未定义或空的include文件意味着所有Datanode都可以连接到Namenode。

4.1.refreshNodes()

执行“dfsadmin -refreshNodes”命令会调用RPC方法ClientProtocol.refreshNodes()通知Namenode更新include文件和exclude文件, 这个操作最终会由DatanodeManager.refreshNodes()方法响应。 refreshNodes()方法会首先调用refreshHostsReader()方法将include文件与exclude文件加载到hostConfigManager对象中, 之后在持有namesystem写锁的情况下调用refreshDatanodes()刷新所有的数据节点。


  /**
   * Rereads conf to get hosts and exclude list file names.
   * Rereads the files to update the hosts and exclude lists.  It
   * checks if any of the hosts have changed states:
   *
   * @param conf configuration passed on to refreshHostsReader
   * @throws IOException declared by the signature (presumably raised while
   *     re-reading the host files — confirm in refreshHostsReader)
   */
  public void refreshNodes(final Configuration conf) throws IOException {

    // Load the include and exclude files into hostConfigManager.
    // This happens before the namesystem write lock is taken.
    refreshHostsReader(conf);
    namesystem.writeLock();
    try {
      // Re-evaluate every known datanode against the refreshed host lists.
      refreshDatanodes();
      countSoftwareVersions();
    } finally {
      // Always release the write lock, even if the refresh throws.
      namesystem.writeUnlock();
    }
  }

refreshDatanodes()方法会遍历DatanodeManager.datanodeMap字段中保存的所有DatanodeDescriptor对象: 对于不在include文件中、不可以连接到Namenode的Datanode, 调用DatanodeDescriptor.setDisallowed(true), 表明当前Datanode不可以接入HDFS集群; 对于允许接入的节点, 如果其维护期尚未过期, 则调用startMaintenance()进入维护状态; 否则, 对于exclude文件中的节点, 调用startDecommission()开始撤销操作; 其余节点则调用stopMaintenance()和stopDecommission()停止维护/撤销操作。


  /**
   * Re-applies datanode membership and the requested admin operation from the
   * host configuration to every known datanode.
   *
   * <p>A node that hostConfigManager does not include is marked disallowed.
   * An included node is moved towards maintenance, decommission, or back to
   * service, see {@link #applyRequestedAdminState(DatanodeDescriptor)}.
   * In every case the node's upgrade domain is refreshed afterwards.
   *
   * <p>To use maintenance mode or upgrade domain, set
   * DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY to
   * CombinedHostFileManager.class.
   */
  private void refreshDatanodes() {
    // Iterate over a snapshot so the monitor is held only while copying.
    final Map<String, DatanodeDescriptor> snapshot;
    synchronized (this) {
      snapshot = new HashMap<>(datanodeMap);
    }

    for (DatanodeDescriptor dn : snapshot.values()) {
      if (hostConfigManager.isIncluded(dn)) {
        applyRequestedAdminState(dn);
      } else {
        // Not in the include list: only flag the node as disallowed here.
        // NOTE(review): the original annotation advises adding a node to the
        // exclude file first and removing it from the include file only after
        // decommission finishes, because this path does not copy its blocks.
        dn.setDisallowed(true);
      }
      // The upgrade domain is refreshed for allowed and disallowed nodes alike.
      dn.setUpgradeDomain(hostConfigManager.getUpgradeDomain(dn));
    }
  }

  /**
   * Moves an included datanode to the admin state the host configuration asks
   * for: maintenance if its maintenance window has not expired, otherwise
   * decommission if it is excluded, otherwise back to normal service.
   */
  private void applyRequestedAdminState(DatanodeDescriptor dn) {
    // Maintenance expiration time, in milliseconds.
    final long expiresAtMs =
        hostConfigManager.getMaintenanceExpirationTimeInMS(dn);

    if (dn.maintenanceNotExpired(expiresAtMs)) {
      datanodeAdminManager.startMaintenance(dn, expiresAtMs);
      return;
    }
    if (hostConfigManager.isExcluded(dn)) {
      datanodeAdminManager.startDecommission(dn);
      return;
    }
    // Neither in maintenance nor excluded: cancel any ongoing admin operation.
    datanodeAdminManager.stopMaintenance(dn);
    datanodeAdminManager.stopDecommission(dn);
  }

DatanodeInfo.adminState字段用于标识当前Datanode的状态。

// Administrative states of a datanode (excerpt; the rest of the enum body is omitted).
public enum AdminStates {
    NORMAL("In Service"), // in service
    DECOMMISSION_INPROGRESS("Decommission In Progress"), // decommission in progress
    DECOMMISSIONED("Decommissioned"),   // decommissioned
    ENTERING_MAINTENANCE("Entering Maintenance"), // entering maintenance
    IN_MAINTENANCE("In Maintenance"); // in maintenance

    // remaining code omitted....
}

startDecommission()方法会首先调用hbManager(即HeartbeatManager)的startDecommission()方法, 将当前Datanode对应的DatanodeDescriptor.adminState设置为AdminStates.DECOMMISSION_INPROGRESS状态(按代码中的注释, 已经dead的节点会被直接置为DECOMMISSIONED)。之后如果节点处于DECOMMISSION_INPROGRESS状态, 则记录开始时间并把节点加入pendingNodes队列, 由后面介绍的Monitor线程检查撤销操作是否完成, 如果完成则将DatanodeDescriptor.adminState设置为AdminStates.DECOMMISSIONED。

 


  /**
   * Begins decommissioning the given datanode unless it is already
   * decommissioning or decommissioned.
   *
   * @param node the datanode to decommission
   */
  @VisibleForTesting
  public void startDecommission(DatanodeDescriptor node) {
    // Nothing to do for a node that is already in a decommission state.
    if (node.isDecommissionInProgress() || node.isDecommissioned()) {
      LOG.trace("startDecommission: Node {} in {}, nothing to do.",
          node, node.getAdminState());
      return;
    }

    // Let the HeartbeatManager update the node's admin state.
    // hbManager.startDecommission will set dead node to decommissioned.
    hbManager.startDecommission(node);
    if (!node.isDecommissionInProgress()) {
      return;
    }

    for (DatanodeStorageInfo info : node.getStorageInfos()) {
      LOG.info("Starting decommission of {} {} with {} blocks",
          node, info, info.numBlocks());
    }
    // Record when the node started leaving service.
    node.getLeavingServiceStatus().setStartTime(monotonicNow());
    // Queue the node so that it gets picked up for tracking.
    pendingNodes.add(node);
  }

stopDecommission()方法的逻辑也是类似的。首先调用hbManager.stopDecommission()方法将DatanodeDescriptor.adminState设置为null, 之后(如果节点仍然存活)调用blockManager.processExtraRedundancyBlocksOnInService对超出备份数目的数据块进行判断, 并进行删除操作。最后将该datanode从pendingNodes和outOfServiceNodeBlocks中移除。

  /**
   * Cancels decommissioning of the given datanode if it is currently
   * decommissioning or already decommissioned.
   *
   * @param node the datanode to return to service
   */
  @VisibleForTesting
  public void stopDecommission(DatanodeDescriptor node) {
    final boolean notInDecommission =
        !node.isDecommissionInProgress() && !node.isDecommissioned();
    if (notInDecommission) {
      LOG.trace("stopDecommission: Node {} in {}, nothing to do.",
          node, node.getAdminState());
      return;
    }

    // Update DN stats maintained by HeartbeatManager.
    hbManager.stopDecommission(node);

    // For a live node, handle blocks that now have extra redundancy. For a
    // dead node this is deferred: extra redundancy blocks will be detected
    // and processed when it comes back and sends its full block report.
    if (node.isAlive()) {
      blockManager.processExtraRedundancyBlocksOnInService(node);
    }

    // Stop tracking the node: drop it from the pending queue and from the
    // set of out-of-service nodes.
    pendingNodes.remove(node);
    outOfServiceNodeBlocks.remove(node);
  }

 

4.2.DatanodeAdminManager#Monitor: run()

startDecommission会将datanode节点加入到pendingNodes队列中, 而stopDecommission则会将节点从pendingNodes队列和outOfServiceNodeBlocks中移除。

DatanodeAdminManager的内部类Monitor会在一个线程中运行, 用于处理挂起/退役的datanode节点。


    /**
     * One monitor tick: resets the per-tick counters, then, under the
     * namesystem write lock, moves pending nodes into tracking and runs
     * check(). Does nothing when the namesystem is not running.
     */
    @Override
    public void run() {
      LOG.debug("DatanodeAdminMonitor is running.");
      if (!namesystem.isRunning()) {
        LOG.info("Namesystem is not running, skipping " +
            "decommissioning/maintenance checks.");
        return;
      }
      // Reset the checked count at beginning of each iteration
      numBlocksChecked = 0;
      numBlocksCheckedPerLock = 0;
      numNodesChecked = 0;
      // Check decommission or maintenance progress.
      namesystem.writeLock();
      try {

        // Move pending nodes into the tracked set, then check progress.
        processPendingNodes();
        check();
      } catch (Exception e) {
        // Any exception is logged and swallowed so the tick ends normally.
        LOG.warn("DatanodeAdminMonitor caught exception when processing node.",
            e);
      } finally {
        namesystem.writeUnlock();
      }
      // Log a summary only when something was actually checked this tick.
      if (numBlocksChecked + numNodesChecked > 0) {
        LOG.info("Checked {} blocks and {} nodes this tick", numBlocksChecked,
            numNodesChecked);
      }
    }

    /**
     * Drains datanodes from the pending queue into outOfServiceNodeBlocks
     * until the queue is empty or the maxConcurrentTrackedNodes limit is
     * reached. A limit of 0 means no limit is applied.
     */
    private void processPendingNodes() {
      while (true) {
        if (pendingNodes.isEmpty()) {
          return;
        }
        final boolean limited = maxConcurrentTrackedNodes != 0;
        if (limited
            && outOfServiceNodeBlocks.size() >= maxConcurrentTrackedNodes) {
          return;
        }
        // Each node starts out tracked with a null value.
        outOfServiceNodeBlocks.put(pendingNodes.poll(), null);
      }
    }

后面还有个check() 方法, 那个太长了,我就不啰嗦了.自己看一下....

 

4.3.Datanode的启动

Datanode启动时, 需要与Namenode进行握手、 注册和数据块上报三个操作。这三个操作分别对应于DatanodeProtocol的versionRequest()、 registerDatanode()以及blockReport()方法。

握手请求由NameNodeRpcServer实现, 非常简单, 它直接返回命名空间的信息

  /**
   * Handshake RPC: returns the namespace information of this NameNode.
   *
   * @return the NamespaceInfo obtained from the namesystem
   * @throws IOException declared by the signature (presumably from the
   *     startup check — confirm in checkNNStartup)
   */
  @Override // DatanodeProtocol, NamenodeProtocol
  public NamespaceInfo versionRequest() throws IOException {
    // Check that the NameNode has started.
    checkNNStartup();
    // Return the namespace information held by the namesystem.
    return namesystem.getNamespaceInfo();
  }

4.4.注册

注册请求同样先由NameNodeRpcServer接收, 它在检查Namenode已启动并校验Datanode的软件版本之后, 将请求转交给FSNamesystem处理。

Datanode的注册请求会通过NameNodeRpcServer.registerDatanode()方法, 调用FSNamesystem.registerDatanode()方法响应。Namenode会为注册的Datanode分配唯一的storageId作为标识(storageId在datanodeMap中作为key, 用于获取DatanodeDescriptor对象)。 需要注意的是, 数据节点可以重复发送注册信息, 并且Datanode的storageId是有可能发生改变的(当数据节点上的数据都被擦除时)。 同时在Datanode注册时, DatanodeManager还需要对Datanode是否可以接入进行判断(include/exclude文件判断)。

 

Datanode的注册情况可以分为以下三种。

■ 该Datanode没有注册过。
■ 该Datanode注册过, 但是这次注册使用了新的storageId, 表明该数据节点的存储空间已经被清理过, 原有的数据块副本都被删除了。
■ 该Datanode注册过, 这次是重复注册。

 

NameNodeRpcServer#registerDatanode:

  /**
   * RPC entry point for datanode registration: runs the startup and
   * software-version checks, delegates to the namesystem, and returns the
   * same registration object that was passed in.
   *
   * @param nodeReg the registration sent by the datanode
   * @return nodeReg, after the namesystem has processed it
   * @throws IOException declared by the signature; the raising call is not
   *     visible in this snippet
   */
  @Override // DatanodeProtocol
  public DatanodeRegistration registerDatanode(DatanodeRegistration nodeReg)
      throws IOException {
    checkNNStartup();
    verifySoftwareVersion(nodeReg);
    namesystem.registerDatanode(nodeReg);
    return nodeReg;
  }

 

FSNamesystem#registerDatanode: 

  /**
   * Register Datanode.
   * <p>
   * The purpose of registration is to identify whether the new datanode
   * serves a new data storage, and will report new data block copies,
   * which the namenode was not aware of; or the datanode is a replacement
   * node for the data storage that was previously served by a different
   * or the same (in terms of host:port) datanode.
   * The data storages are distinguished by their storageIDs. When a new
   * data storage is reported the namenode issues a new unique storageID.
   * <p>
   * Finally, the namenode returns its namespaceID as the registrationID
   * for the datanodes.
   * namespaceID is a persistent attribute of the name space.
   * The registrationID is checked every time the datanode is communicating
   * with the namenode.
   * Datanodes with inappropriate registrationID are rejected.
   * If the namenode stops, and then restarts it can restore its
   * namespaceID and will continue serving the datanodes that has previously
   * registered with the namenode without restarting the whole cluster.
   * <p>
   * This method only takes the write lock and delegates to the block manager.
   *
   * @param nodeReg the registration sent by the datanode
   * @throws IOException declared by the signature; the raising call is not
   *     visible in this snippet
   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
   */
  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
    writeLock();
    try {
      // Register the datanode while holding the write lock.
      blockManager.registerDatanode(nodeReg);
    } finally {
      writeUnlock("registerDatanode");
    }
  }

BlockManager#registerDatanode:

  /**
   * Registers the datanode with the DatanodeManager and then runs the
   * safe-mode check. The caller must already hold the namesystem write lock.
   *
   * @param nodeReg the registration sent by the datanode
   * @throws IOException declared by the signature; the raising call is not
   *     visible in this snippet
   */
  public void registerDatanode(DatanodeRegistration nodeReg)
      throws IOException {
    assert namesystem.hasWriteLock();
    datanodeManager.registerDatanode(nodeReg);
    bmSafeMode.checkSafeMode();
  }

DatanodeManager#registerDatanode:


  /**
   * Register the given datanode with the namenode.
   * NB: the given registration is mutated and given back to the datanode.
   *
   * <p>The method looks the registration up twice: by datanode UUID (nodeS)
   * and by transfer address ip:port (nodeN). A descriptor found at the
   * address that differs from nodeS is removed. If nodeS exists, it is
   * re-registered in place with the new registration info. Otherwise a new
   * DatanodeDescriptor is created and added. If either path fails part-way,
   * the descriptor is removed again and the software version counts are
   * recomputed.
   *
   * @param nodeReg the datanode registration
   * @throws DisallowedDatanodeException if the registration request is
   *    denied because the datanode does not match includes/excludes, or
   *    because the hostname check is enabled and its address cannot be
   *    resolved
   * @throws UnresolvedTopologyException if the registration request is 
   *    denied because resolving datanode network location fails.
   *
   */
  public void registerDatanode(DatanodeRegistration nodeReg)
      throws DisallowedDatanodeException, UnresolvedTopologyException {

    // IP address of the remote (calling) side; may be null.
    InetAddress dnAddress = Server.getRemoteIp();
    if (dnAddress != null) {
      // Mostly called inside an RPC, update ip and peer hostname
      String hostname = dnAddress.getHostName();
      String ip = dnAddress.getHostAddress();

      if (checkIpHostnameInRegistration && !isNameResolved(dnAddress)) {
        // Reject registration of unresolved datanode to prevent performance
        // impact of repetitive DNS lookups later.
        final String message = "hostname cannot be resolved (ip="
            + ip + ", hostname=" + hostname + ")";
        LOG.warn("Unresolved datanode registration: " + message);
        throw new DisallowedDatanodeException(nodeReg, message);
      }


      // update node registration with the ip and hostname from rpc request
      nodeReg.setIpAddr(ip);
      nodeReg.setPeerHostName(hostname);
    }
    
    try {

      nodeReg.setExportedKeys(blockManager.getBlockKeys());

      // Checks if the node is not on the hosts list.
      // If it is not, then it will be disallowed from registering.
      if (!hostConfigManager.isIncluded(nodeReg)) {
        throw new DisallowedDatanodeException(nodeReg);
      }
        
      NameNode.stateChangeLog.info("BLOCK* registerDatanode: from " + nodeReg + " storage " + nodeReg.getDatanodeUuid());



      // nodeS: descriptor already known under this datanode UUID, if any.
      DatanodeDescriptor nodeS = getDatanode(nodeReg.getDatanodeUuid());



      // nodeN: descriptor already known at this transfer address, if any.
      DatanodeDescriptor nodeN = host2DatanodeMap.getDatanodeByXferAddr(nodeReg.getIpAddr(), nodeReg.getXferPort());



      if (nodeN != null && nodeN != nodeS) {


        NameNode.LOG.info("BLOCK* registerDatanode: " + nodeN);

        // nodeN previously served a different data storage,  which is not served by anybody anymore.
        removeDatanode(nodeN);


        // Physically remove the node from datanodeMap.
        wipeDatanode(nodeN);
        nodeN = null;
      }


      if (nodeS != null) {

        if (nodeN == nodeS) {

          // The same datanode has been just restarted to serve the same data 
          // storage. We do not need to remove old data blocks, the delta will
          // be calculated on the next block report from the datanode
          if(NameNode.stateChangeLog.isDebugEnabled()) {
            NameNode.stateChangeLog.debug("BLOCK* registerDatanode: "
                + "node restarted.");
          }
        } else {
          // nodeS is found
          /* The registering datanode is a replacement node for the existing 
            data storage, which from now on will be served by a new node.
            If this message repeats, both nodes might have same storageID 
            by (insanely rare) random chance. User needs to restart one of the
            nodes with its data cleared (or user can just remove the StorageID
            value in "VERSION" file under the data directory of the datanode,
            but this is might not work if VERSION file format has changed 
         */        
          NameNode.stateChangeLog.info("BLOCK* registerDatanode: " + nodeS
              + " is replaced by " + nodeReg + " with the same storageID "
              + nodeReg.getDatanodeUuid());
        }
        
        // Re-register the existing descriptor; roll back in finally on failure.
        boolean success = false;
        try {

          // update cluster map
          getNetworkTopology().remove(nodeS);

          if(shouldCountVersion(nodeS)) {
            decrementVersionCount(nodeS.getSoftwareVersion());
          }

          nodeS.updateRegInfo(nodeReg);

          nodeS.setSoftwareVersion(nodeReg.getSoftwareVersion());

          nodeS.setDisallowed(false); // Node is in the include list

          // resolve network location
          if(this.rejectUnresolvedTopologyDN) {
            nodeS.setNetworkLocation(resolveNetworkLocation(nodeS));
            nodeS.setDependentHostNames(getNetworkDependencies(nodeS));
          } else {
            nodeS.setNetworkLocation(
                resolveNetworkLocationWithFallBackToDefaultLocation(nodeS));
            nodeS.setDependentHostNames(
                getNetworkDependenciesWithDefault(nodeS));
          }


          getNetworkTopology().add(nodeS);


          resolveUpgradeDomain(nodeS);

          // also treat the registration message as a heartbeat
          heartbeatManager.register(nodeS);


          incrementVersionCount(nodeS.getSoftwareVersion());


          startAdminOperationIfNecessary(nodeS);

          success = true;
        } finally {
          if (!success) {
            // Undo the partial re-registration and recount software versions.
            removeDatanode(nodeS);
            wipeDatanode(nodeS);
            countSoftwareVersions();
          }
        }
        return;
      }

      // No descriptor exists for this UUID: create one, initially on the
      // default rack.
      DatanodeDescriptor nodeDescr  = new DatanodeDescriptor(nodeReg, NetworkTopology.DEFAULT_RACK);

      boolean success = false;
      try {


        // resolve network location
        if(this.rejectUnresolvedTopologyDN) {
          nodeDescr.setNetworkLocation(resolveNetworkLocation(nodeDescr));
          nodeDescr.setDependentHostNames(getNetworkDependencies(nodeDescr));
        } else {
          nodeDescr.setNetworkLocation(
              resolveNetworkLocationWithFallBackToDefaultLocation(nodeDescr));
          nodeDescr.setDependentHostNames(
              getNetworkDependenciesWithDefault(nodeDescr));
        }

        nodeDescr.setSoftwareVersion(nodeReg.getSoftwareVersion());

        resolveUpgradeDomain(nodeDescr);


        // register new datanode

        addDatanode(nodeDescr);


        blockManager.getBlockReportLeaseManager().register(nodeDescr);



        // also treat the registration message as a heartbeat
        // no need to update its timestamp
        // because its is done when the descriptor is created
        heartbeatManager.addDatanode(nodeDescr);
        heartbeatManager.updateDnStat(nodeDescr);


        incrementVersionCount(nodeReg.getSoftwareVersion());

        startAdminOperationIfNecessary(nodeDescr);

        success = true;
      } finally {
        if (!success) {
          // Undo the partial registration and recount software versions.
          removeDatanode(nodeDescr);
          wipeDatanode(nodeDescr);
          countSoftwareVersions();
        }
      }
    } catch (InvalidTopologyException e) {

      // If the network location is invalid, clear the cached mappings
      // so that we have a chance to re-add this DataNode with the
      // correct network location later.
      List<String> invalidNodeNames = new ArrayList<>(3);

      // clear cache for nodes in IP or Hostname
      invalidNodeNames.add(nodeReg.getIpAddr());
      invalidNodeNames.add(nodeReg.getHostName());
      invalidNodeNames.add(nodeReg.getPeerHostName());

      dnsToSwitchMapping.reloadCachedMappings(invalidNodeNames);

      throw e;
    }
  }

4.5.数据块上报

Datanode成功完成握手和注册操作后, 就会进行数据块上报操作, 将当前Datanode上存储的数据块上报给Namenode。 这个方法最终会由BlockManager.processReport()方法响应, 如果是第一次块汇报, 则调用processFirstBlockReport()方法, 这个方法的效率会高很多; 否则调用私有的processReport()方法。 当进行过一次块汇报后, 我们还需要判断这个节点之前是否为stale状态, 如果是, 则需要对postponedMisreplicatedBlocks进行重新扫描, 删除那些已经不是stale状态的数据块。


  /**
   * The given storage is reporting all its blocks.
   * Update the (storage-->block list) and (block-->storage list) maps.
   *
   * <p>If this is the first block report for the storage (its block report
   * count is 0), the report is handled by processFirstBlockReport(), which
   * can be processed a lot more efficiently. Otherwise the private
   * processReport() overload handles it, except for PROVIDED storage, whose
   * reports are skipped here. While the namesystem is in startup safe mode,
   * non-initial reports are discarded.
   *
   * @param nodeID the datanode sending the report
   * @param storage the storage the report is for
   * @param newReport the reported block list
   * @param context block report context; may be null, in which case the
   *     report id is logged as an empty string
   * @return true if all known storages of the given DN have finished reporting.
   * @throws IOException if the datanode is unknown or not registered
   */
  public boolean processReport(final DatanodeID nodeID,
      final DatanodeStorage storage,
      final BlockListAsLongs newReport,
      BlockReportContext context) throws IOException {

    namesystem.writeLock();
    final long startTime = Time.monotonicNow(); //after acquiring write lock
    final long endTime;
    DatanodeDescriptor node;
    Collection<Block> invalidatedBlocks = Collections.emptyList();
    String strBlockReportId =
        context != null ? Long.toHexString(context.getReportId()) : "";

    try {
      node = datanodeManager.getDatanode(nodeID);
      if (node == null || !node.isRegistered()) {
        throw new IOException(
            "ProcessReport from dead or unregistered node: " + nodeID);
      }

      // To minimize startup time, we discard any second (or later) block reports
      // that we receive while still in startup phase.
      // Register DN with provided storage, not with storage owned by DN
      // DN should still have a ref to the DNStorageInfo.
      DatanodeStorageInfo storageInfo =
          providedStorageMap.getStorage(node, storage);

      if (storageInfo == null) {
        // We handle this for backwards compatibility.
        storageInfo = node.updateStorage(storage);
      }
      if (namesystem.isInStartupSafeMode()
          && storageInfo.getBlockReportCount() > 0) {
        blockLog.info("BLOCK* processReport 0x{}: "
            + "discarded non-initial block report from {}"
            + " because namenode still in startup phase",
            strBlockReportId, nodeID);
        blockReportLeaseManager.removeLease(node);
        return !node.hasStaleStorages();
      }

      if (storageInfo.getBlockReportCount() == 0) {
        // The first block report can be processed a lot more efficiently than
        // ordinary block reports.  This shortens restart times.
        blockLog.info("BLOCK* processReport 0x{}: Processing first "
            + "storage report for {} from datanode {}",
            strBlockReportId,
            storageInfo.getStorageID(),
            nodeID.getDatanodeUuid());

        // First block report for this storage: use processFirstBlockReport().
        processFirstBlockReport(storageInfo, newReport);
      } else {
        // Block reports for provided storage are not
        // maintained by DN heartbeats
        if (!StorageType.PROVIDED.equals(storageInfo.getStorageType())) {

          // Not the first block report: use the private processReport() overload.
          invalidatedBlocks = processReport(storageInfo, newReport, context);
        }
      }
      storageInfo.receivedBlockReport();
    } finally {
      // Capture the end time before releasing the write lock.
      endTime = Time.monotonicNow();
      namesystem.writeUnlock();
    }

    // Logging below happens after the write lock has been released.
    for (Block b : invalidatedBlocks) {
      blockLog.debug("BLOCK* processReport 0x{}: {} on node {} size {} does not"
          + " belong to any file", strBlockReportId, b, node, b.getNumBytes());
    }

    // Log the block report processing stats from Namenode perspective
    final NameNodeMetrics metrics = NameNode.getNameNodeMetrics();
    if (metrics != null) {
      metrics.addStorageBlockReport((int) (endTime - startTime));
    }
    blockLog.info("BLOCK* processReport 0x{}: from storage {} node {}, " +
        "blocks: {}, hasStaleStorage: {}, processing time: {} msecs, " +
        "invalidatedBlocks: {}", strBlockReportId, storage.getStorageID(),
        nodeID, newReport.getNumberOfBlocks(),
        node.hasStaleStorages(), (endTime - startTime),
        invalidatedBlocks.size());
    return !node.hasStaleStorages();
  }

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

参考:
Hadoop 2.X HDFS源码剖析 -- 徐鹏

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值