HDFS源码解析---Decommission

本文深入剖析了Hadoop HDFS中数据节点(DataNode)的下线(Decommission)过程。DecommissionManager通过后台监控线程定期检查节点状态,管理数据节点的停用流程。当节点被添加到exclude文件中时,开始下线过程。DecommissionManager监控线程逐个检查节点,根据节点状态决定是否开始或停止下线,并更新节点状态。此过程中涉及节点状态的刷新、副本复制调度以及节点健康检查,确保数据安全和集群稳定性。

Introduction

DecommissionManager负责管理数据节点的停用(Decommission)。后台监控线程会定期检查正在停用的数据节点的状态。

启动流程

启动命令

sudo -u hdfs hdfs dfsadmin -refreshNodes

FSNamesystem.java

// Call-site fragment: the caller keeps the result of refreshNodes() as an exit code.
// NOTE(review): the method shown below is declared void, so this call presumably
// targets a different refreshNodes() (e.g. on the admin-command side) — confirm.
exitCode = refreshNodes();
/**
 * Reloads the include/exclude host lists and then asks the datanode manager
 * to re-evaluate its nodes against them.
 *
 * @throws IOException declared by the signature; presumably raised by the
 *     checks or by re-reading the host files — confirm against the callees
 */
void refreshNodes() throws IOException {
    // A fresh HdfsConfiguration is built on every call.
    Configuration conf = new HdfsConfiguration();
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();

    // Look up the include-list and exclude-list file paths in the
    // configuration (empty string when a key is unset) and hand them to the
    // hosts reader.
    RPC.Server.hostsReader.updateFileNames(
        conf.get(DFSConfigKeys.DFS_HOSTS_WHITELIST, ""),
        conf.get(DFSConfigKeys.DFS_HOSTS_EXCLUDE, ""));
    // Re-read the host names from those files.
    RPC.Server.hostsReader.refresh();
    reloadNeverdelete(conf);
    dir.permissionCheckpatch.refreshAclMap();
    // Refresh the datanodes.
    getBlockManager().getDatanodeManager().refreshNodes(conf);
  }

HostFileReader.java

refresh()把include和exclude文件中的主机名读入内存。这里的includes和excludes是两个Set<String>,用于保存主机名。

/**
 * Reloads the include and exclude host lists from their backing files.
 *
 * <p>A list is only replaced when its file name is non-empty. Both files are
 * read into fresh collections first; the live fields are swapped afterwards,
 * so an exception while reading leaves the current lists untouched.
 *
 * @param hostNameString not referenced by this method
 * @throws IOException declared by the signature; presumably raised while
 *     reading a host file — confirm against {@code readFileToSet}
 */
public synchronized void refresh(String hostNameString) throws IOException {
  LOG.info("Refreshing hosts (include/exclude) list");

  // A null local means "this list was not reloaded".
  Set<String> freshIncludes = null;
  HashMap<String, ArrayList<String>> freshValidUsers = null;
  if (!includesFile.isEmpty()) {
    freshIncludes = new HashSet<String>();
    freshValidUsers = new HashMap<String, ArrayList<String>>();
    readFileToSet("included", includesFile, freshIncludes, freshValidUsers);
  }

  Set<String> freshExcludes = null;
  if (!excludesFile.isEmpty()) {
    freshExcludes = new HashSet<String>();
    readFileToSet("excluded", excludesFile, freshExcludes);
  }

  // Publish the new lists only after every read has succeeded.
  if (freshIncludes != null) {
    includes = freshIncludes;
    validuserForHost = freshValidUsers;
  }
  if (freshExcludes != null) {
    excludes = freshExcludes;
  }
}

DatanodeManager.java

refreshNodes(conf)会去refreshDatanodes

/**
   * Rereads conf to get hosts and exclude list file names.
   * Rereads the files to update the hosts and exclude lists, then, while
   * holding the BM write lock, runs {@code refreshDatanodes()} and
   * {@code countSoftwareVersions()}.
   *
   * @param conf configuration handed to {@code refreshHostsReader}
   * @throws IOException declared by the signature; presumably raised while
   *     re-reading the host files — confirm
   */
  public void refreshNodes(final Configuration conf) throws IOException {
    // The host lists are reloaded before the lock is taken.
    refreshHostsReader(conf);
    namesystem.getBMLock().writeLock(OpName.REFRESH_NODES);
    try {
      refreshDatanodes();
      countSoftwareVersions();
    } finally {
      // Release the lock even if the refresh throws.
      namesystem.getBMLock().writeUnlock(OpName.REFRESH_NODES);
    }
  }

判断该节点是否需要开始下线 (DatanodeManager初始化的时候会新建DecommissionManager对象)。

/**
 * Re-evaluates every registered datanode against the current host
 * configuration:
 * <ol>
 *   <li>Not in the include list: the node is marked disallowed.</li>
 *   <li>Included and its maintenance window has not expired: maintenance is
 *       started with that expiration time.</li>
 *   <li>Included and in the exclude list: decommission is started.</li>
 *   <li>Otherwise: any maintenance and any decommission are stopped.</li>
 * </ol>
 * The upgrade domain of every node is refreshed regardless of the branch
 * taken.
 */
private void refreshDatanodes() {
  for (final DatanodeDescriptor dn : datanodeMap.values()) {
    if (hostConfigManager.isIncluded(dn)) {
      final long expiryMs =
          hostConfigManager.getMaintenanceExpirationTimeInMS(dn);
      if (dn.maintenanceNotExpired(expiryMs)) {
        decomManager.startMaintenance(dn, expiryMs);
      } else if (hostConfigManager.isExcluded(dn)) {
        // Listed in the exclude file: begin decommissioning.
        decomManager.startDecommission(dn);
      } else {
        // Neither in maintenance nor excluded: cancel anything in flight.
        decomManager.stopMaintenance(dn);
        decomManager.stopDecommission(dn);
      }
    } else {
      // No longer in the include list.
      dn.setDisallowed(true);
    }
    dn.setUpgradeDomain(hostConfigManager.getUpgradeDomain(dn));
  }
}

判断是否在excluded中

/**
 * Reports whether the given datanode is in the exclude list, by delegating
 * to the address-based overload with the node's resolved address.
 *
 * @param dn datanode to look up
 * @return the result of the address-based {@code isExcluded} check
 */
@Override
public synchronized boolean isExcluded(DatanodeID dn) {
    return isExcluded(dn.getResolvedAddress());
}

/**
 * Checks the given socket address against the exclude list.
 *
 * @param address address to test
 * @return whatever {@code excludes.match(address)} reports
 */
private boolean isExcluded(InetSocketAddress address) {
    final boolean matched = excludes.match(address);
    return matched;
}

源码分析

DecommissionManager

/**
 * Manage node decommissioning.
 * State manager for node decommission operations.
 */
class DecommissionManager {
  static final Log LOG = LogFactory.getLog(DecommissionManager.class);
  
  // The namesystem this manager was created with.
  private final FSNamesystem fsnamesystem;
 
  DecommissionManager(FSNamesystem namesystem) {
    this.fsnamesystem = namesystem;
  }
 
  /** Periodically check decommission status. */
  // Runnable monitor task; its body is elided in this excerpt.
  class Monitor implements Runnable {
    ...
  }
}

startDecommission

@VisibleForTesting
  public void startDecommission(DatanodeDescriptor node) {
    if (!node.isDecommissionInProgress()) {
      if (!node.isAlive()) {
        LOG.info("Dead node {} is decommissioned immediately.", node);
        node.setDecommissioned();
      } else if (!node.isDecommissioned()) {
        for (DatanodeStorageInfo storage : node.getStorageInfos()) {
          LOG.info("Starting decommission of {} {} with {} blocks", 
              node, storage, storage.numBlocks());
        }
        // Update DN stats maintained by HeartbeatManager
        hbManager.startDecommission(node);
        node.decommissioningStatus.setStartTime(now());
        // 添加到下线节点
        pendingNodes.add(node);
      }
    } else {
      LOG.trace("startDecommission: Node {} is already decommission in "
              + "progress, nothing to do.", node);
    }
  }

在监控线程(Monitor)的run()方法中,先调用processPendingNodes()取出待下线的datanode,再调用check()检查下线进度。

// Excerpt from the monitor's run(): the lock released in the finally block
// is acquired just before this try statement in the full method.
try {
  // Move queued nodes from pendingNodes into outOfServiceNodeBlocks.
  processPendingNodes();
  // Examine the tracked nodes.
  check();
} finally {
  // Release the BM write lock whether or not the checks threw.
  namesystem.getBMLock().writeUnlock(OpName.DECOMMISSION_MONITOR);
}

从pendingNodes中poll()出来,放入outOfServiceNodeBlocks。

/**
 * Moves nodes from {@code pendingNodes} into {@code outOfServiceNodeBlocks}
 * until the pending queue is empty or the tracking limit is reached.
 * A {@code maxConcurrentTrackedNodes} of 0 disables the limit. Each node is
 * inserted with a {@code null} block list.
 */
private void processPendingNodes() {
  while (!pendingNodes.isEmpty()) {
    final boolean limited = maxConcurrentTrackedNodes != 0;
    if (limited
        && outOfServiceNodeBlocks.size() >= maxConcurrentTrackedNodes) {
      // Tracking capacity exhausted; leave the rest queued.
      break;
    }
    outOfServiceNodeBlocks.put(pendingNodes.poll(), null);
  }
}

activate

启动Decommission监听器

// Creates the decommission monitor and schedules it. The code before and
// after the scheduling call is elided ("...") in this excerpt.
void activate(Configuration conf) {
  ...
    // Build the monitor with its per-tick block and node limits and the cap
    // on concurrently tracked nodes.
    monitor = new Monitor(blocksPerInterval, 
        nodesPerInterval, maxConcurrentTrackedNodes);
    // Schedule it with intervalSecs as both the initial delay and the period.
    // NOTE(review): assumes executor is a
    // java.util.concurrent.ScheduledExecutorService — confirm.
    executor.scheduleAtFixedRate(monitor, intervalSecs, intervalSecs,
        TimeUnit.SECONDS);
  ...
}

Monitor

一个常驻的监控线程

// 自定义一个Monitor类实现Runnable接口
private class Monitor implements Runnable {
    /**
     * The maximum number of blocks to check per tick.
     */
    private final int numBlocksPerCheck;
    /**
     * The maximum number of nodes to check per tick.
     */
    private final int numNodesPerCheck;
    /**
     * The maximum number of nodes to track in outOfServiceNodeBlocks.
     * A value of 0 means no limit.
     */
    private final int maxConcurrentTrackedNodes;
    /**
     * The number of blocks that have been checked on this tick.
     */
    private int numBlocksChecked = 0;
    /**
     * The number of nodes that have been checked on this tick. Used for 
     * testing.
     */
    private int numNodesChecked = 0;
    /**
     * The last datanode in outOfServiceNodeBlocks that we've processed
     */
    private DatanodeDescriptor iterkey = new DatanodeDescriptor(new 
        DatanodeID("", "", "", 0, 0, 0, 0));
​
    Monitor(int numBlocksPerCheck, int numNodesPerCheck, int 
        maxConcurrentTrackedNodes) {
      this.numBlocksPerCheck = numBlocksPerCheck;
      this.numNodesPerCheck = numNodesPerCheck;
      this.maxConcurrentTrackedNodes = maxConcurrentTrackedNodes;
    }
​
    private boolean exceededNumBlocksPerCheck() {
      LOG.trace("Processed {} blocks so far this tick", numBlocksChecked);
      return numBlocksChecked >= numBlocksPerCheck;
    }
​
    @Deprecated
    private boolean exceededNumNodesPerCheck() {
      LOG.trace("Processed {} nodes so far this tick", numNodesChecked);
      return numNodesChecked >= numNodesPerCheck;
    }
​
    // 重写run方法
    @Override
    public void run() {
      if (!namesystem.isRunning()) {
        LOG.info("Namesystem is not running, skipping decommissioning checks"
            + ".");
        return;
      }
      // Reset the checked count at beginning of each iteration
      numBlocksChecked = 0;
      numNodesChecked = 0;
      // Check decom progress
      namesystem.getBMLock().writeLock(OpName.DECOMMISSION_MONITOR);
      try {
        // 将datanode从挂起列表中弹出到decomNodeBlocks中。
        processPendingNodes();
        // 扫描datanode所有的blocks,并且生成需要进行副本复制的blocks。
        check();
      } finally {
        namesystem.getBMLock().writeUnlock(OpName.DECOMMISSION_MONITOR);
      }
      if (numBlocksChecked + numNodesChecked > 0) {
        LOG.info("Checked {} blocks and {} nodes this tick", numBlocksChecked,
            numNodesChecked);
      }
    }
​
    /**
     * Pop datanodes off the pending list and into decomNodeBlocks, 
     * subject to the maxConcurrentTrackedNodes limit.
     */
    private void processPendingNodes() {
      while (!pendingNodes.isEmpty() &&
          (maxConcurrentTrackedNodes == 0 ||
          outOfServiceNodeBlocks.size() < maxConcurrentTrackedNodes)) {
        outOfServiceNodeBlocks.put(pendingNodes.poll(), null);
      }
    }
​
    // only need bm write lock
    private void check() {
      final Iterator<Map.Entry<DatanodeDescriptor, AbstractList<BlockInfo>>>
          it = new CyclicIteration<>(outOfServiceNodeBlocks,
              iterkey).iterator();
      final LinkedList<DatanodeDescriptor> toRemove = new LinkedList<>();
​
      while (it.hasNext()
          && !exceededNumBlocksPerCheck()
          && !exceededNumNodesPerCheck()) {
        numNodesChecked++;
        final Map.Entry<DatanodeDescriptor, AbstractList<BlockInfo>>
            entry = it.next();
        final DatanodeDescriptor dn = entry.getKey();
        // 找到datanode的所有blocks
        AbstractList<BlockInfo> blocks = entry.getValue();
        boolean fullScan = false;
        if (dn.isMaintenance()) {
          // TODO HDFS-9390 make sure blocks are minimally replicated
          // before transitioning the node to IN_MAINTENANCE state.
​
          // If maintenance expires, stop tracking it.
          if (dn.maintenanceExpired()) {
            stopMaintenance(dn);
            toRemove.add(dn);
          }
          continue;
        }
        
        // 这是第一次扫描这个datanode,需要把所有的副本数不足的blocks都找出来并且添加到需要复制的队列中。
        // 这里判断条件的blocks是指副本数不足的blocks列表。
        if (blocks == null) {
          LOG.debug("Newly-added node {}, doing full scan to find " +
              "insufficiently-replicated blocks.", dn);
          // 返回datanode上未充分复制的block列表,
          blocks = handleInsufficientlyReplicated(dn);
          // 加入退出服务的node-blocklist的Map
          outOfServiceNodeBlocks.put(dn, blocks);
          fullScan = true;
        } else {
          // 已经扫描过这个datanode了,那么只需要再扫描副本数不足的blocks列表即可。
          LOG.debug("Processing decommission-in-progress node {}", dn);
          pruneSufficientlyReplicated(dn, blocks);
        }
        // 某个datanode没有blocks了,说明可以停用了
        if (blocks.size() == 0) {
          if (!fullScan) {
            // If we didn't just do a full scan, need to re-check with the 
            // full block map.
            //
            // We've replicated all the known insufficiently replicated 
            // blocks. Re-check with the full block map before finally 
            // marking the datanode as decommissioned 
            LOG.debug("Node {} has finished replicating current set of "
                + "blocks, checking with the full block map.", dn);
            blocks = handleInsufficientlyReplicated(dn);
            outOfServiceNodeBlocks.put(dn, blocks);
          }
          final boolean isHealthy =
              blockManager.isNodeHealthyForDecommission(dn);
          if (blocks.size() == 0 && isHealthy) {
            setDecommissioned(dn);
            toRemove.add(dn);
            LOG.debug("Node {} is sufficiently replicated and healthy, "
                + "marked as decommissioned.", dn);
          } else {
            if (LOG.isDebugEnabled()) {
              StringBuilder b = new StringBuilder("Node {} ");
              if (isHealthy) {
                b.append("is ");
              } else {
                b.append("isn't ");
              }
              b.append("healthy and still needs to replicate {} more blocks," +
                  " decommissioning is still in progress.");
              LOG.debug(b.toString(), dn, blocks.size());
            }
          }
        } else {
          LOG.debug("Node {} still has {} blocks to replicate "
                  + "before it is a candidate to finish decommissioning.",
              dn, blocks.size());
        }
        iterkey = dn;
      }
      // Remove the datanodes that are decommissioned or in service after
      // maintenance expiration.
      for (DatanodeDescriptor dn : toRemove) {
        Preconditions.checkState(dn.isDecommissioned() || dn.isInService(),
            "Removing a node that is not yet decommissioned or in service!");
        outOfServiceNodeBlocks.remove(dn);
      }
    }
​
    // 非第一次扫描走这个逻辑
    private void pruneSufficientlyReplicated(final DatanodeDescriptor datanode,
        AbstractList<BlockInfo> blocks) {
      // blocks表示第一次扫描生成的列表(包含该datanode所有副本数不足的blocks)
      // 副本数够了就从这个列表中移除,直到长度为0。
      processBlocksForDecomInternal(datanode, blocks.iterator(), null, true);
    }
​
    // 第一次扫描走这个逻辑
    private AbstractList<BlockInfo> handleInsufficientlyReplicated(
        final DatanodeDescriptor datanode) {
      AbstractList<BlockInfo> insufficient = new ChunkedArrayList<>();
      // 生成一个blocks列表(insufficient),包含该datanode所有副本数不足的blocks。
      // 下一次扫描就去扫这个insufficient,副本数够了就移走,直到这个列表长度为0。
      processBlocksForDecomInternal(datanode, datanode.getBlockIterator(),
          insufficient, false);
      return insufficient;
    }
​
    // 第一次扫描,List<BlockInfo> insufficientlyReplicated为空(但是不是null),副本数不足的block的不断往里加,pruneSufficientlyReplicated为false,此时不可以移除这个block,因为此时扫描的是待下线的datanode上的原始blocks,如果移除,datanode的blocks就不完整了。返回一个副本数不足的blocks列表。
    // 不是第一次扫描,List<BlockInfo> insufficientlyReplicated为空,因为此时的操作都是针对第一次扫描扫出来的副本数不足的blocks列表。pruneSufficientlyReplicated为true,一旦副本数够了,就从副本数不足的blocks列表中移除这个block。
    private void processBlocksForDecomInternal(
        final DatanodeDescriptor datanode,
        final Iterator<BlockInfo> it,
        final List<BlockInfo> insufficientlyReplicated,
        boolean pruneSufficientlyReplicated) {
      boolean firstReplicationLog = true;
      int underReplicatedBlocks = 0;
      int decommissionOnlyReplicas = 0;
      int underReplicatedInOpenFiles = 0;
      while (it.hasNext()) {
        numBlocksChecked++;
        final BlockInfo block = it.next();
        // Remove the block from the list if it's no longer in the block map,
        // e.g. the containing file has been deleted
        if (blockManager.blocksMap.getStoredBlock(block) == null) {
          LOG.trace("Removing unknown block {}", block);
          it.remove();
          continue;
        }
        BlockCollection bc = blockManager.blocksMap.getBlockCollection(block);
        if (bc == null) {
          // Orphan block, will be invalidated eventually. Skip.
          continue;
        }
        synchronized (bc){
          if (blockManager.blocksMap.getBlockCollection(block) == null) {
            // Orphan block, will be invalidated eventually. Skip.
            continue;
          }
​
          final NumberReplicas num = blockManager.countNodes(block);
          final int liveReplicas = num.liveReplicas();
          final int curReplicas = liveReplicas;
​
          // Schedule under-replicated blocks for replication if not already
          // pending
          // 如果需要复制
          if (blockManager.isNeededReplication(block, bc.getBlockReplication(),
                  liveReplicas)) {
            if (!blockManager.neededReplications.contains(block) &&
                    blockManager.pendingReplications.getNumReplicas(block) == 0 &&
                    namesystem.isPopulatingReplQueues()) {
              // Process these blocks only when active NN is out of safe mode.
              // 加入复制队列
              blockManager.neededReplications.add(block,
                      curReplicas,
                      num.decommissionedAndDecommissioning(),
                      bc.getBlockReplication());
            }
          }
​
          // 如果副本数够了,(从候选列表,也就是第一次扫描的所有副本数不足的blocks列表)移除
          if (isSufficientlyReplicated(block, bc, num)) {
            if (pruneSufficientlyReplicated) {
              it.remove();
            }
            continue;
          }
​
          // We've found an insufficiently replicated block.
          if (insufficientlyReplicated != null) {
            insufficientlyReplicated.add(block);
          }
          // Log if this is our first time through
          if (firstReplicationLog) {
            logBlockReplicationInfo(block, bc, datanode, num,
                    blockManager.blocksMap.getStorages(block));
            firstReplicationLog = false;
          }
          // Update various counts
          underReplicatedBlocks++;
          if (bc.isUnderConstruction()) {
            underReplicatedInOpenFiles++;
          }
          if ((curReplicas == 0) && (num.decommissionedAndDecommissioning() > 0)) {
            decommissionOnlyReplicas++;
          }
        }
      }
​
      datanode.decommissioningStatus.set(underReplicatedBlocks,
          decommissionOnlyReplicas,
          underReplicatedInOpenFiles);
    }
  }

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值