Introduction
DecommissionManager manages DataNode decommissioning. A background monitor thread periodically checks the status of DataNodes whose decommissioning is in progress.
Startup flow
Trigger command
sudo -u hdfs hdfs dfsadmin -refreshNodes
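Before running the command, the hostname of the DataNode to be decommissioned has to be added to the excludes file that the NameNode configuration points to (in stock Hadoop this is the dfs.hosts.exclude property in hdfs-site.xml; this code base resolves it through DFSConfigKeys.DFS_HOSTS_EXCLUDE, as shown below). A minimal excludes file, with hypothetical hostnames, lists one host per line:

dn-node-01.example.com
dn-node-02.example.com

The -refreshNodes command then tells the NameNode to re-read the include/exclude files and re-evaluate every registered DataNode.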
On the client side, DFSAdmin dispatches the command and issues a refreshNodes RPC to the NameNode:
exitCode = refreshNodes();
FSNamesystem.java
The RPC lands in FSNamesystem#refreshNodes():
void refreshNodes() throws IOException {
  Configuration conf = new HdfsConfiguration();
  checkOperation(OperationCategory.UNCHECKED);
  checkSuperuserPrivilege();
  // Read the XML configuration to find the file paths configured for
  // DFS_HOSTS_WHITELIST and DFS_HOSTS_EXCLUDE
  RPC.Server.hostsReader.updateFileNames(
      conf.get(DFSConfigKeys.DFS_HOSTS_WHITELIST, ""),
      conf.get(DFSConfigKeys.DFS_HOSTS_EXCLUDE, ""));
  // Load the hostnames from those files into memory
  RPC.Server.hostsReader.refresh();
  reloadNeverdelete(conf);
  dir.permissionCheckpatch.refreshAclMap();
  // Refresh the datanodes
  getBlockManager().getDatanodeManager().refreshNodes(conf);
}
HostFileReader.java
refresh() simply reads the included and excluded hostnames from the files into memory; here includes and excludes are two Set<String> fields that hold the hostnames.
public synchronized void refresh(String hostNameString) throws IOException {
  LOG.info("Refreshing hosts (include/exclude) list");
  Set<String> newIncludes = new HashSet<String>();
  Set<String> newExcludes = new HashSet<String>();
  HashMap<String, ArrayList<String>> tmpvalid = new HashMap<String, ArrayList<String>>();
  boolean switchIncludes = false;
  boolean switchExcludes = false;
  if (!includesFile.isEmpty()) {
    readFileToSet("included", includesFile, newIncludes, tmpvalid);
    switchIncludes = true;
  }
  if (!excludesFile.isEmpty()) {
    readFileToSet("excluded", excludesFile, newExcludes);
    switchExcludes = true;
  }
  if (switchIncludes) {
    // switch the new hosts that are to be included
    includes = newIncludes;
    validuserForHost = tmpvalid;
  }
  if (switchExcludes) {
    // switch the excluded hosts
    excludes = newExcludes;
  }
}
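For intuition, the core of readFileToSet is just parsing hostnames out of a flat file into a Set. The following is a minimal illustrative sketch under that assumption, not the actual Hadoop implementation (which this code base has additionally extended to collect per-host valid users, hence the extra tmpvalid argument above):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

// Illustrative sketch only; not the real readFileToSet.
public final class HostFileSketch {
  // Reads whitespace-separated hostnames, skipping blank lines and "#" comments.
  static Set<String> readHosts(String fileName) throws IOException {
    Set<String> hosts = new HashSet<>();
    try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
      String line;
      while ((line = reader.readLine()) != null) {
        for (String token : line.trim().split("\\s+")) {
          if (token.isEmpty() || token.startsWith("#")) {
            break; // blank line, or the rest of the line is a comment
          }
          hosts.add(token);
        }
      }
    }
    return hosts;
  }
}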
DatanodeManager.java
refreshNodes(conf) in turn calls refreshDatanodes():
/**
 * Rereads conf to get hosts and exclude list file names.
 * Rereads the files to update the hosts and exclude lists. It
 * checks if any of the hosts have changed states:
 */
public void refreshNodes(final Configuration conf) throws IOException {
  refreshHostsReader(conf);
  namesystem.getBMLock().writeLock(OpName.REFRESH_NODES);
  try {
    refreshDatanodes();
    countSoftwareVersions();
  } finally {
    namesystem.getBMLock().writeUnlock(OpName.REFRESH_NODES);
  }
}
refreshDatanodes() decides whether each node needs to start decommissioning (the DecommissionManager object is created when DatanodeManager is initialized).
/**
 * 1. Added to hosts --> no further work needed here.
 * 2. Removed from hosts --> mark AdminState as decommissioned.
 * 3. Added to exclude --> start decommission.
 * 4. Removed from exclude --> stop decommission.
 */
private void refreshDatanodes() {
  for (DatanodeDescriptor node : datanodeMap.values()) {
    // Check if not include.
    if (!hostConfigManager.isIncluded(node)) {
      node.setDisallowed(true); // case 2.
    } else {
      long maintenanceExpireTimeInMS =
          hostConfigManager.getMaintenanceExpirationTimeInMS(node);
      if (node.maintenanceNotExpired(maintenanceExpireTimeInMS)) {
        decomManager.startMaintenance(node, maintenanceExpireTimeInMS);
        // Decide whether the node needs to be decommissioned
      } else if (hostConfigManager.isExcluded(node)) {
        decomManager.startDecommission(node); // case 3.
      } else {
        decomManager.stopMaintenance(node);
        decomManager.stopDecommission(node); // case 4.
      }
    }
    node.setUpgradeDomain(hostConfigManager.getUpgradeDomain(node));
  }
}
Checking whether a node is in the excludes list:
@Override
public synchronized boolean isExcluded(DatanodeID dn) {
  return isExcluded(dn.getResolvedAddress());
}

private boolean isExcluded(InetSocketAddress address) {
  return excludes.match(address);
}
Source code analysis
DecommissionManager
/**
 * Manage node decommissioning.
 * Manager for the state of node decommission operations.
 */
class DecommissionManager {
  static final Log LOG = LogFactory.getLog(DecommissionManager.class);

  // The namesystem
  private final FSNamesystem fsnamesystem;

  DecommissionManager(FSNamesystem namesystem) {
    this.fsnamesystem = namesystem;
  }

  /** Periodically check decommission status. */
  // The monitoring thread
  class Monitor implements Runnable {
    ...
  }
}
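The skeleton above omits the two pieces of state that the rest of this walkthrough keeps referring to. In the Apache Hadoop 2.7+ line they look roughly like the sketch below (names as used later in this article; the exact declarations in this code base may differ):

// Sketch only, java.util types implied.
// Nodes queued by startDecommission(), waiting to be picked up by the Monitor.
private final Queue<DatanodeDescriptor> pendingNodes = new LinkedList<>();
// Nodes currently tracked by the Monitor, mapped to their list of
// insufficiently replicated blocks; a null value means the node has not
// had its first full scan yet.
private final TreeMap<DatanodeDescriptor, AbstractList<BlockInfo>>
    outOfServiceNodeBlocks = new TreeMap<>();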
startDecommission
@VisibleForTesting
public void startDecommission(DatanodeDescriptor node) {
  if (!node.isDecommissionInProgress()) {
    if (!node.isAlive()) {
      LOG.info("Dead node {} is decommissioned immediately.", node);
      node.setDecommissioned();
    } else if (!node.isDecommissioned()) {
      for (DatanodeStorageInfo storage : node.getStorageInfos()) {
        LOG.info("Starting decommission of {} {} with {} blocks",
            node, storage, storage.numBlocks());
      }
      // Update DN stats maintained by HeartbeatManager
      hbManager.startDecommission(node);
      node.decommissioningStatus.setStartTime(now());
      // Add the node to the pending decommission queue
      pendingNodes.add(node);
    }
  } else {
    LOG.trace("startDecommission: Node {} is already decommission in "
        + "progress, nothing to do.", node);
  }
}
In the monitor, processPendingNodes() is called to pick up the datanodes waiting to be decommissioned:
try {
  processPendingNodes();
  check();
} finally {
  namesystem.getBMLock().writeUnlock(OpName.DECOMMISSION_MONITOR);
}
Nodes are poll()ed off pendingNodes and put into outOfServiceNodeBlocks (mapped to null until their first full scan).
private void processPendingNodes() {
  while (!pendingNodes.isEmpty() &&
      (maxConcurrentTrackedNodes == 0 ||
       outOfServiceNodeBlocks.size() < maxConcurrentTrackedNodes)) {
    outOfServiceNodeBlocks.put(pendingNodes.poll(), null);
  }
}
activate
activate() starts the decommission monitor:
void activate(Configuration conf) {
  ...
  monitor = new Monitor(blocksPerInterval,
      nodesPerInterval, maxConcurrentTrackedNodes);
  executor.scheduleAtFixedRate(monitor, intervalSecs, intervalSecs,
      TimeUnit.SECONDS);
  ...
}
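The intervalSecs, blocksPerInterval, and maxConcurrentTrackedNodes values handed to the Monitor come from configuration. A sketch of the knobs involved, using the stock Apache Hadoop key names (the exact wiring and defaults in this code base may differ):

import org.apache.hadoop.conf.Configuration;

// Sketch only: the configuration keys that typically feed the Monitor.
// Key names are the stock Apache Hadoop ones; defaults may vary by version.
final class DecomMonitorSettings {
  final int intervalSecs;               // seconds between Monitor ticks
  final int blocksPerInterval;          // max blocks checked per tick
  final int maxConcurrentTrackedNodes;  // max nodes tracked at once, 0 = no limit

  DecomMonitorSettings(Configuration conf) {
    intervalSecs = conf.getInt(
        "dfs.namenode.decommission.interval", 30);
    blocksPerInterval = conf.getInt(
        "dfs.namenode.decommission.blocks.per.interval", 500000);
    maxConcurrentTrackedNodes = conf.getInt(
        "dfs.namenode.decommission.max.concurrent.tracked.nodes", 100);
  }
}

Lowering the interval or raising the per-tick block budget makes decommissioning react faster, at the cost of more NameNode work under the block manager write lock on each tick.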
Monitor
A long-running monitoring thread.
// A custom Monitor class that implements the Runnable interface
private class Monitor implements Runnable {
  /**
   * The maximum number of blocks to check per tick.
   */
  private final int numBlocksPerCheck;
  /**
   * The maximum number of nodes to check per tick.
   */
  private final int numNodesPerCheck;
  /**
   * The maximum number of nodes to track in outOfServiceNodeBlocks.
   * A value of 0 means no limit.
   */
  private final int maxConcurrentTrackedNodes;
  /**
   * The number of blocks that have been checked on this tick.
   */
  private int numBlocksChecked = 0;
  /**
   * The number of nodes that have been checked on this tick. Used for
   * testing.
   */
  private int numNodesChecked = 0;
  /**
   * The last datanode in outOfServiceNodeBlocks that we've processed.
   */
  private DatanodeDescriptor iterkey = new DatanodeDescriptor(new
      DatanodeID("", "", "", 0, 0, 0, 0));

  Monitor(int numBlocksPerCheck, int numNodesPerCheck, int
      maxConcurrentTrackedNodes) {
    this.numBlocksPerCheck = numBlocksPerCheck;
    this.numNodesPerCheck = numNodesPerCheck;
    this.maxConcurrentTrackedNodes = maxConcurrentTrackedNodes;
  }
  private boolean exceededNumBlocksPerCheck() {
    LOG.trace("Processed {} blocks so far this tick", numBlocksChecked);
    return numBlocksChecked >= numBlocksPerCheck;
  }

  @Deprecated
  private boolean exceededNumNodesPerCheck() {
    LOG.trace("Processed {} nodes so far this tick", numNodesChecked);
    return numNodesChecked >= numNodesPerCheck;
  }
  // Override the run() method
  @Override
  public void run() {
    if (!namesystem.isRunning()) {
      LOG.info("Namesystem is not running, skipping decommissioning checks"
          + ".");
      return;
    }
    // Reset the checked count at beginning of each iteration
    numBlocksChecked = 0;
    numNodesChecked = 0;
    // Check decom progress
    namesystem.getBMLock().writeLock(OpName.DECOMMISSION_MONITOR);
    try {
      // Pop datanodes off the pending list into decomNodeBlocks.
      processPendingNodes();
      // Scan the datanodes' blocks and queue the ones that still need
      // to be re-replicated.
      check();
    } finally {
      namesystem.getBMLock().writeUnlock(OpName.DECOMMISSION_MONITOR);
    }
    if (numBlocksChecked + numNodesChecked > 0) {
      LOG.info("Checked {} blocks and {} nodes this tick", numBlocksChecked,
          numNodesChecked);
    }
  }
  /**
   * Pop datanodes off the pending list and into decomNodeBlocks,
   * subject to the maxConcurrentTrackedNodes limit.
   */
  private void processPendingNodes() {
    while (!pendingNodes.isEmpty() &&
        (maxConcurrentTrackedNodes == 0 ||
         outOfServiceNodeBlocks.size() < maxConcurrentTrackedNodes)) {
      outOfServiceNodeBlocks.put(pendingNodes.poll(), null);
    }
  }
  // only need bm write lock
  private void check() {
    final Iterator<Map.Entry<DatanodeDescriptor, AbstractList<BlockInfo>>>
        it = new CyclicIteration<>(outOfServiceNodeBlocks,
            iterkey).iterator();
    final LinkedList<DatanodeDescriptor> toRemove = new LinkedList<>();

    while (it.hasNext()
        && !exceededNumBlocksPerCheck()
        && !exceededNumNodesPerCheck()) {
      numNodesChecked++;
      final Map.Entry<DatanodeDescriptor, AbstractList<BlockInfo>>
          entry = it.next();
      final DatanodeDescriptor dn = entry.getKey();
      // The blocks currently tracked for this datanode (null before the
      // first full scan)
      AbstractList<BlockInfo> blocks = entry.getValue();
      boolean fullScan = false;
      if (dn.isMaintenance()) {
        // TODO HDFS-9390 make sure blocks are minimally replicated
        // before transitioning the node to IN_MAINTENANCE state.

        // If maintenance expires, stop tracking it.
        if (dn.maintenanceExpired()) {
          stopMaintenance(dn);
          toRemove.add(dn);
        }
        continue;
      }
      // First scan of this datanode: find all of its insufficiently
      // replicated blocks and add them to the replication queue.
      // From here on, "blocks" refers to the list of insufficiently
      // replicated blocks.
      if (blocks == null) {
        LOG.debug("Newly-added node {}, doing full scan to find " +
            "insufficiently-replicated blocks.", dn);
        // Returns the list of under-replicated blocks on this datanode
        blocks = handleInsufficientlyReplicated(dn);
        // Record it in the out-of-service node -> block-list map
        outOfServiceNodeBlocks.put(dn, blocks);
        fullScan = true;
      } else {
        // The datanode has been scanned before; only re-check its list of
        // insufficiently replicated blocks.
        LOG.debug("Processing decommission-in-progress node {}", dn);
        pruneSufficientlyReplicated(dn, blocks);
      }
      // If the datanode has no blocks left to replicate, it can be
      // decommissioned.
      if (blocks.size() == 0) {
        if (!fullScan) {
          // If we didn't just do a full scan, need to re-check with the
          // full block map.
          //
          // We've replicated all the known insufficiently replicated
          // blocks. Re-check with the full block map before finally
          // marking the datanode as decommissioned
          LOG.debug("Node {} has finished replicating current set of "
              + "blocks, checking with the full block map.", dn);
          blocks = handleInsufficientlyReplicated(dn);
          outOfServiceNodeBlocks.put(dn, blocks);
        }
        final boolean isHealthy =
            blockManager.isNodeHealthyForDecommission(dn);
        if (blocks.size() == 0 && isHealthy) {
          setDecommissioned(dn);
          toRemove.add(dn);
          LOG.debug("Node {} is sufficiently replicated and healthy, "
              + "marked as decommissioned.", dn);
        } else {
          if (LOG.isDebugEnabled()) {
            StringBuilder b = new StringBuilder("Node {} ");
            if (isHealthy) {
              b.append("is ");
            } else {
              b.append("isn't ");
            }
            b.append("healthy and still needs to replicate {} more blocks," +
                " decommissioning is still in progress.");
            LOG.debug(b.toString(), dn, blocks.size());
          }
        }
      } else {
        LOG.debug("Node {} still has {} blocks to replicate "
            + "before it is a candidate to finish decommissioning.",
            dn, blocks.size());
      }
      iterkey = dn;
    }
    // Remove the datanodes that are decommissioned or in service after
    // maintenance expiration.
    for (DatanodeDescriptor dn : toRemove) {
      Preconditions.checkState(dn.isDecommissioned() || dn.isInService(),
          "Removing a node that is not yet decommissioned or in service!");
      outOfServiceNodeBlocks.remove(dn);
    }
  }
  // Subsequent scans (not the first) go through this path.
  private void pruneSufficientlyReplicated(final DatanodeDescriptor datanode,
      AbstractList<BlockInfo> blocks) {
    // "blocks" is the list produced by the first scan (all of this
    // datanode's insufficiently replicated blocks). Blocks whose
    // replication has caught up are removed from it until it is empty.
    processBlocksForDecomInternal(datanode, blocks.iterator(), null, true);
  }

  // The first scan goes through this path.
  private AbstractList<BlockInfo> handleInsufficientlyReplicated(
      final DatanodeDescriptor datanode) {
    AbstractList<BlockInfo> insufficient = new ChunkedArrayList<>();
    // Build a list ("insufficient") of all insufficiently replicated blocks
    // on this datanode. Subsequent scans only look at this list, removing
    // blocks once they have enough replicas, until the list is empty.
    processBlocksForDecomInternal(datanode, datanode.getBlockIterator(),
        insufficient, false);
    return insufficient;
  }
  // First scan: the List<BlockInfo> insufficientlyReplicated parameter is an
  // empty (but non-null) list, and insufficiently replicated blocks keep
  // getting added to it. pruneSufficientlyReplicated is false: blocks must
  // not be removed here, because the iterator walks the original blocks of
  // the datanode being decommissioned, and removing them would leave that
  // datanode's block list incomplete. The result is a list of insufficiently
  // replicated blocks.
  // Subsequent scans: insufficientlyReplicated is null, because all the work
  // targets the list of insufficiently replicated blocks produced by the
  // first scan. pruneSufficientlyReplicated is true: as soon as a block has
  // enough replicas, it is removed from that list.
  private void processBlocksForDecomInternal(
      final DatanodeDescriptor datanode,
      final Iterator<BlockInfo> it,
      final List<BlockInfo> insufficientlyReplicated,
      boolean pruneSufficientlyReplicated) {
    boolean firstReplicationLog = true;
    int underReplicatedBlocks = 0;
    int decommissionOnlyReplicas = 0;
    int underReplicatedInOpenFiles = 0;
    while (it.hasNext()) {
      numBlocksChecked++;
      final BlockInfo block = it.next();
      // Remove the block from the list if it's no longer in the block map,
      // e.g. the containing file has been deleted
      if (blockManager.blocksMap.getStoredBlock(block) == null) {
        LOG.trace("Removing unknown block {}", block);
        it.remove();
        continue;
      }
      BlockCollection bc = blockManager.blocksMap.getBlockCollection(block);
      if (bc == null) {
        // Orphan block, will be invalidated eventually. Skip.
        continue;
      }
      synchronized (bc) {
        if (blockManager.blocksMap.getBlockCollection(block) == null) {
          // Orphan block, will be invalidated eventually. Skip.
          continue;
        }
        final NumberReplicas num = blockManager.countNodes(block);
        final int liveReplicas = num.liveReplicas();
        final int curReplicas = liveReplicas;
        // Schedule under-replicated blocks for replication if not already
        // pending
        // If the block still needs replication
        if (blockManager.isNeededReplication(block, bc.getBlockReplication(),
            liveReplicas)) {
          if (!blockManager.neededReplications.contains(block) &&
              blockManager.pendingReplications.getNumReplicas(block) == 0 &&
              namesystem.isPopulatingReplQueues()) {
            // Process these blocks only when active NN is out of safe mode.
            // Add the block to the replication queue
            blockManager.neededReplications.add(block,
                curReplicas,
                num.decommissionedAndDecommissioning(),
                bc.getBlockReplication());
          }
        }
        // If the block has enough replicas, remove it (from the candidate
        // list, i.e. the list of insufficiently replicated blocks produced
        // by the first scan)
        if (isSufficientlyReplicated(block, bc, num)) {
          if (pruneSufficientlyReplicated) {
            it.remove();
          }
          continue;
        }
        // We've found an insufficiently replicated block.
        if (insufficientlyReplicated != null) {
          insufficientlyReplicated.add(block);
        }
        // Log if this is our first time through
        if (firstReplicationLog) {
          logBlockReplicationInfo(block, bc, datanode, num,
              blockManager.blocksMap.getStorages(block));
          firstReplicationLog = false;
        }
        // Update various counts
        underReplicatedBlocks++;
        if (bc.isUnderConstruction()) {
          underReplicatedInOpenFiles++;
        }
        if ((curReplicas == 0) && (num.decommissionedAndDecommissioning() > 0)) {
          decommissionOnlyReplicas++;
        }
      }
    }
    datanode.decommissioningStatus.set(underReplicatedBlocks,
        decommissionOnlyReplicas,
        underReplicatedInOpenFiles);
  }
}
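Once the Monitor has called setDecommissioned(dn), the node's admin state is visible from the client side. A simple way to watch decommission progress (standard dfsadmin usage; exact output wording can vary by version):

sudo -u hdfs hdfs dfsadmin -report

The per-DataNode section of the report includes a "Decommission Status" line that moves from "Decommission in progress" to "Decommissioned"; the NameNode web UI shows the same state.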