Based on hadoop-3.3.0
1 What is Safe Mode
Safe mode is a special state of HDFS in which the file system only accepts read requests and rejects change requests such as deletes and modifications.
When the NameNode starts, HDFS first enters safe mode. As DataNodes start up, they report their available blocks and other state to the NameNode, and once the whole system reaches the safety criteria, HDFS leaves safe mode automatically. While HDFS is in safe mode, no block replication is performed: whether a block meets the minimum replica requirement is judged from the state reported by the DataNodes at startup, and no extra replication is scheduled just to reach the minimum replica count.
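To make the read-only behavior concrete, here is a minimal client-side sketch (not taken from the source walkthrough below) that asks the NameNode whether it is in safe mode via DistributedFileSystem#setSafeMode; it assumes a reachable cluster configured through fs.defaultFS and only queries the state:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;

public class SafeModeCheck {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    try (FileSystem fs = FileSystem.get(conf)) {
      if (fs instanceof DistributedFileSystem) {
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        // SAFEMODE_GET only queries the state and changes nothing on the cluster
        boolean inSafeMode = dfs.setSafeMode(SafeModeAction.SAFEMODE_GET);
        System.out.println("NameNode in safe mode: " + inSafeMode);
      }
    }
  }
}
While this returns true, write operations sent to the NameNode are rejected (with a SafeModeException); reads keep working.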
2 Safe Mode Configuration
The cluster can leave safe mode once certain conditions are met. Let's first look at the common safe mode configuration options:
- dfs.namenode.replication.min: the minimum number of block replicas; the default is 1, i.e. a block with a single replica already satisfies the requirement;
- dfs.namenode.safemode.threshold-pct: the fraction of all blocks in the system that must meet the minimum replica requirement. Only when the actual ratio exceeds this value can safe mode be left (provided the other conditions are also met). The default is 0.999f, i.e. more than 99.9% of blocks must satisfy the minimum replica requirement, and the other conditions must hold as well. A value of 0 or less means safe mode can be left without waiting for any blocks; a value greater than 1 means the NameNode never leaves safe mode.
- dfs.namenode.safemode.min.datanodes: the minimum number of live DataNodes required to leave safe mode; the default is 0, meaning safe mode can be left even if no DataNode is alive.
- dfs.namenode.safemode.extension: once the block ratio and the number of live DataNodes both meet their thresholds, the cluster leaves safe mode only if the requirements are still satisfied after this extension period. The unit is milliseconds; the default is 30000, i.e. 30 seconds.
To sum up, leaving safe mode requires all of the following conditions (a simplified sketch of how they combine follows this list):
- the NameNode's own resources must be sufficient (e.g. the resources of the editLog directories); this is the NameNode (FSNamesystem) level safe mode;
The remaining conditions describe how the BlockManager-level safe mode is left:
- the proportion of blocks that meet the minimum replica requirement must reach the configured threshold;
- enough DataNodes must be alive;
- once conditions 2 and 3 are met, they must keep holding for the extension period before safe mode is actually left.
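As a rough illustration (plain Java, not Hadoop code), the conditions on the BlockManager side combine roughly like this; all parameters are hypothetical placeholders for state the NameNode tracks internally:
// Simplified illustration of the BlockManager safe mode exit condition.
class SafeModeExitSketch {
  static boolean canLeaveSafeMode(long blockTotal, long blockSafe, int liveDataNodes,
      double thresholdPct,        // dfs.namenode.safemode.threshold-pct, default 0.999f
      int minDataNodes,           // dfs.namenode.safemode.min.datanodes, default 0
      boolean extensionElapsed) { // has dfs.namenode.safemode.extension (30s) already passed?
    long blockThreshold = (long) (blockTotal * thresholdPct);
    boolean blocksOk = blockSafe >= blockThreshold;
    boolean datanodesOk = liveDataNodes >= minDataNodes;
    // both thresholds must hold, and keep holding until the extension period has passed
    return blocksOk && datanodesOk && extensionElapsed;
  }
}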
3 Related Commands
hadoop dfsadmin -safemode <command>
| command | function |
| --- | --- |
| get | show the current safe mode state |
| enter | enter safe mode |
| leave | force the NameNode to leave safe mode |
| wait | wait until safe mode ends |
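For example, to check the current state and then force the NameNode out of safe mode (in recent versions `hdfs dfsadmin` is the preferred entry point, but the `hadoop dfsadmin` form still works):
hadoop dfsadmin -safemode get
hadoop dfsadmin -safemode leave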
4 Source Code
Picking up from the previous post: when the NameNode starts, it launches the common services inside initialize(), and while starting those services it initializes the safe mode related parameters; in some cases it enters safe mode right there (the BlockManagerSafeMode). The other entry into safe mode, based on the NameNode's resource situation, happens as part of state.enterState() during NameNode construction.
protected void initialize(Configuration conf) throws IOException {
...
// start the common services
startCommonServices(conf);
startMetricsLogger(conf);
}
protected NameNode(Configuration conf, NamenodeRole role) throws IOException {
...
try {
initializeGenericKeys(conf, nsId, namenodeId);
// NameNode initialization
initialize(getConf());
state.prepareToEnterState(haContext);
try {
haContext.writeLock();
// start the services for the current HA state (active / standby); depending on the situation, safe mode may be entered during this step
state.enterState(haContext);
} finally {
haContext.writeUnlock();
}
} catch (IOException e) {
this.stopAtException(e);
throw e;
} catch (HadoopIllegalArgumentException e) {
this.stopAtException(e);
throw e;
}
...
}
4.1 startCommonServices
/**
* NameNode#startCommonServices
* Start the services common to active and standby states
*/
private void startCommonServices(Configuration conf) throws IOException {
// start the common services; the concrete steps are in the FSNamesystem method of the same name below
namesystem.startCommonServices(conf, haContext);
registerNNSMXBean();
// if the current NameNode role is not NAMENODE (i.e. it is a backup or checkpoint node), start its httpServer here
if (NamenodeRole.NAMENODE != role) {
startHttpServer(conf);
httpServer.setNameNodeAddress(getNameNodeAddress());
httpServer.setFSImage(getFSImage());
if (levelDBAliasMapServer != null) {
httpServer.setAliasMap(levelDBAliasMapServer.getAliasMap());
}
}
rpcServer.start();
try {
plugins = conf.getInstances(DFS_NAMENODE_PLUGINS_KEY,
ServicePlugin.class);
} catch (RuntimeException e) {
String pluginsValue = conf.get(DFS_NAMENODE_PLUGINS_KEY);
LOG.error("Unable to load NameNode plugins. Specified list of plugins: " +
pluginsValue, e);
throw e;
}
// start the plugins configured for the NameNode
for (ServicePlugin p: plugins) {
try {
p.start(this);
} catch (Throwable t) {
LOG.warn("ServicePlugin " + p + " could not be started", t);
}
}
LOG.info(getRole() + " RPC up at: " + getNameNodeAddress());
if (rpcServer.getServiceRpcAddress() != null) {
LOG.info(getRole() + " service RPC up at: "
+ rpcServer.getServiceRpcAddress());
}
}
/**
* FSNamesystem#startCommonServices
* Start services common to both active and standby states
*/
void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
this.registerMBean(); // register the MBean for the FSNamesystemState
writeLock();
this.haContext = haContext;
try {
/**
* This object mainly reads three configuration options:
* * dfs.namenode.resource.du.reserved: disk space reserved for NameNode use, default 100MB
* * dfs.namenode.resource.checked.volumes: local directories, in addition to the edits directories, that the NameNode resource checker should check
* * dfs.namenode.resource.checked.volumes.minimum: the minimum number of redundant volumes the NameNode requires
* It also reads the local edits directories from the configuration and adds them to the set of checked volumes; dfs.namenode.edits.dir.required decides which of them are required checks.
*/
nnResourceChecker = new NameNodeResourceChecker(conf);
/**
* Check whether the volumes registered by NameNodeResourceChecker still satisfy the
* duReserved (100MB) requirement and set the hasResourcesAvailable flag accordingly.
* The logic is a simple loop over the volumes added in the previous step:
* 1. for each volume, is it required?
*    true:  check whether it still has more than 100MB free; if not, resources are unavailable
*    false: count it as a redundant volume, then apply the 100MB limit; if it fails, count it as an unavailable redundant volume
* 2. after the loop, are there no redundant volumes at all?
*    true:  resources are available as long as at least one required volume was checked
*    false: resources are available if (redundant volumes - unavailable redundant volumes) is at least the NameNode's minimum redundant volume count
*/
checkAvailableResources();
assert !blockManager.isPopulatingReplQueues();
StartupProgress prog = NameNode.getStartupProgress();
prog.beginPhase(Phase.SAFEMODE);
// get the total number of blocks that are already COMPLETE (replica count meets the minimum requirement)
long completeBlocksTotal = getCompleteBlocksTotal();
// record the total for the startup-progress step that awaits block reports from DataNodes
prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
completeBlocksTotal);
// this also enters the BlockManager safe mode, based mainly on the block threshold (0.999f) and the number of DataNodes
blockManager.activate(conf, completeBlocksTotal);
} finally {
writeUnlock("startCommonServices");
}
registerMXBean();
DefaultMetricsSystem.instance().register(this);
if (inodeAttributeProvider != null) {
inodeAttributeProvider.start();
dir.setINodeAttributeProvider(inodeAttributeProvider);
}
snapshotManager.registerMXBean();
InetSocketAddress serviceAddress = NameNode.getServiceAddress(conf, true);
this.nameNodeHostName = (serviceAddress != null) ?
serviceAddress.getHostName() : "";
}
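The volume check described in the comments above boils down to roughly the following policy. This is a simplified sketch modeled on NameNodeResourcePolicy#areResourcesAvailable, not the actual Hadoop code, and Resource is a hypothetical stand-in for CheckableNameNodeResource:
// Simplified sketch of the resource availability policy used by checkAvailableResources().
class ResourcePolicySketch {
  interface Resource {
    boolean isRequired();
    boolean isResourceAvailable(); // e.g. more than the 100MB duReserved is still free
  }

  static boolean areResourcesAvailable(Iterable<Resource> resources,
                                       int minimumRedundantResources) {
    int required = 0, redundant = 0, disabledRedundant = 0;
    for (Resource r : resources) {
      if (r.isRequired()) {
        required++;
        if (!r.isResourceAvailable()) {
          return false; // a required volume (e.g. a required edits dir) is low on space
        }
      } else {
        redundant++;
        if (!r.isResourceAvailable()) {
          disabledRedundant++;
        }
      }
    }
    if (redundant == 0) {
      // only required volumes are configured: available as long as at least one exists
      return required > 0;
    }
    // otherwise enough redundant volumes must still be healthy
    return redundant - disabledRedundant >= minimumRedundantResources;
  }
}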
4.2 Entering Safe Mode on Insufficient Resources
Here we only analyze the ActiveState logic; ActiveState is one implementation of the abstract class HAState:
// ActiveState#enterState
@Override
public void enterState(HAContext context) throws ServiceFailedException {
try {
// follow into NameNode#startActiveServices
context.startActiveServices();
} catch (IOException e) {
throw new ServiceFailedException("Failed to start active services", e);
}
}
// NameNode#startActiveServices
@Override
public void startActiveServices() throws IOException {
try {
// keep following into FSNamesystem#startActiveServices
namesystem.startActiveServices();
startTrashEmptier(getConf());
} catch (Throwable t) {
doImmediateShutdown(t);
}
}
The following is FSNamesystem#startActiveServices:
/**
* Start services required in active state
* @throws IOException
*/
void startActiveServices() throws IOException {
startingActiveService = true;
LOG.info("Starting services required for active state");
writeLock();
try {
// get the editLog; the FSImage it belongs to is set up when FSNamesystem is initialized, and FSNamesystem itself is created by loadNamesystem during NameNode initialization
FSEditLog editLog = getFSImage().getEditLog();
/**
* isOpenForWrite() checks whether the edit log state is IN_SEGMENT or BETWEEN_LOG_SEGMENTS.
* During a regular startup the log is already open for write after initialization; when this
* NameNode has just transitioned from standby it is not, and the branch below has to
* (re-)initialize the journals for writing.
*/
if (!editLog.isOpenForWrite()) {
// During startup, we're already open for write during initialization.
editLog.initJournalsForWrite();
// May need to recover
editLog.recoverUnclosedStreams();
LOG.info("Catching up to latest edits from old active before " +
"taking over writer role in edits logs");
editLogTailer.catchupDuringFailover();
blockManager.setPostponeBlocksFromFuture(false);
blockManager.getDatanodeManager().markAllDatanodesStale();
blockManager.clearQueues();
blockManager.processAllPendingDNMessages();
blockManager.getBlockIdManager().applyImpendingGenerationStamp();
// Only need to re-process the queue, If not in SafeMode.
if (!isInSafeMode()) {
LOG.info("Reprocessing replication and invalidation queues");
blockManager.initializeReplQueues();
}
if (LOG.isDebugEnabled()) {
LOG.debug("NameNode metadata after re-processing " +
"replication and invalidation queues during failover:\n" +
metaSaveAsString());
}
long nextTxId = getFSImage().getLastAppliedTxId() + 1;
LOG.info("Will take over writing edit logs at txnid " +
nextTxId);
editLog.setNextTxId(nextTxId);
getFSImage().editLog.openForWrite(getEffectiveLayoutVersion());
}
// Initialize the quota.
dir.updateCountForQuota();
// Enable quota checks.
dir.enableQuotaChecks();
dir.ezManager.startReencryptThreads();
if (haEnabled) {
// Renew all of the leases before becoming active.
// This is because, while we were in standby mode,
// the leases weren't getting renewed on this NN.
// Give them all a fresh start here.
leaseManager.renewAllLeases();
}
leaseManager.startMonitor();
startSecretManagerIfNecessary();
//ResourceMonitor required only at ActiveNN. See HDFS-2914
// start a NameNodeResourceMonitor daemon thread; if resources become insufficient at any point, the NameNode enters safe mode
this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
nnrmthread.start();
...
} finally {
startingActiveService = false;
blockManager.checkSafeMode();
writeUnlock("startActiveServices");
}
}
Let's take a look at NameNodeResourceMonitor:
/**
* Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
* there are found to be insufficient resources available, causes the NN to
* enter safe mode. If resources are later found to have returned to
* acceptable levels, this daemon will cause the NN to exit safe mode.
*/
class NameNodeResourceMonitor implements Runnable {
boolean shouldNNRmRun = true;
@Override
public void run () {
try {
while (fsRunning && shouldNNRmRun) {
checkAvailableResources();
// this checks the hasResourcesAvailable flag set in startCommonServices; if resources are not available, enter safe mode
if(!nameNodeHasResourcesAvailable()) {
String lowResourcesMsg = "NameNode low on available disk space. ";
if (!isInSafeMode()) {
LOG.warn(lowResourcesMsg + "Entering safe mode.");
} else {
LOG.warn(lowResourcesMsg + "Already in safe mode.");
}
// enter the FSNamesystem-level safe mode
enterSafeMode(true);
}
try {
Thread.sleep(resourceRecheckInterval);
} catch (InterruptedException ie) {
// Deliberately ignore
}
}
} catch (Exception e) {
FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
}
}
public void stopMonitor() {
shouldNNRmRun = false;
}
}
4.3 BlockManager Safe Mode
In FSNamesystem#startCommonServices we activate the blockManager, and part of activating it is starting its safe mode.
Before diving in, a quick word on BlockManager. If HDFS were a human body, the NameNode would be its brain: it knows the state of every node and controls and manages the limbs (the DataNodes). The BlockManager is the heart of that body: it constantly receives blood (BlockInfo) from the limbs and pumps blood (Command) back out to them so that they keep working.
The BlockManager maintains the information about the blocks stored in the Hadoop cluster. For block state management, it tries to preserve the safety property "number of live replicas == expected redundancy" under any event (e.g. decommissioning, NameNode failover, DataNode failure).
The safe mode startup logic is here:
...
blockManager.activate(conf, completeBlocksTotal);
...
// start and activate the services inside the BlockManager
public void activate(Configuration conf, long blockTotal) {
pendingReconstruction.start();
datanodeManager.activate(conf);
this.redundancyThread.setName("RedundancyMonitor");
this.redundancyThread.start();
storageInfoDefragmenterThread.setName("StorageInfoMonitor");
storageInfoDefragmenterThread.start();
this.blockReportThread.start();
mxBeanName = MBeans.register("NameNode", "BlockStats", this);
// trigger the safe mode startup
bmSafeMode.activate(blockTotal);
}
/**
* Initialize the safe mode information.
* @param total initial total blocks
*/
void activate(long total) {
assert namesystem.hasWriteLock();
assert status == BMSafeModeStatus.OFF;
startTime = monotonicNow();
// set the total number of blocks and compute the block threshold
setBlockTotal(total);
// if the thresholds are already met (see areThresholdsMet below), leave safe mode; otherwise enter it
if (areThresholdsMet()) {
boolean exitResult = leaveSafeMode(false);
Preconditions.checkState(exitResult, "Failed to leave safe mode.");
} else {
// enter safe mode: pending on more safe blocks or live DataNodes; once the thresholds are met, the state moves on to EXTENSION
status = BMSafeModeStatus.PENDING_THRESHOLD;
initializeReplQueuesIfNecessary();
reportStatus("STATE* Safe mode ON.", true);
lastStatusReport = monotonicNow();
}
}
/**
* @return true if both block and datanode threshold are met else false.
*/
private boolean areThresholdsMet() {
assert namesystem.hasWriteLock();
// Calculating the number of live datanodes is time-consuming
// in large clusters. Skip it when datanodeThreshold is zero.
// We need to evaluate getNumLiveDataNodes only when
// (blockSafe >= blockThreshold) is true and hence moving evaluation
// of datanodeNum conditional to isBlockThresholdMet as well
synchronized (this) {
// whether the number of safe blocks reaches the block threshold (threshold-pct defaults to 0.999)
boolean isBlockThresholdMet = (blockSafe >= blockThreshold);
boolean isDatanodeThresholdMet = true;
// only evaluate the live DataNode count when the datanode threshold is > 0
if (isBlockThresholdMet && datanodeThreshold > 0) {
int datanodeNum = blockManager.getDatanodeManager().
getNumLiveDataNodes();
isDatanodeThresholdMet = (datanodeNum >= datanodeThreshold);
}
return isBlockThresholdMet && isDatanodeThresholdMet;
}
}
The BlockManager safe mode is managed mainly by the BlockManagerSafeMode class.
/**
* Block manager safe mode info.
*
* During name node startup, counts the number of <em>safe blocks</em>, those
* that have at least the minimal number of replicas, and calculates the ratio
* of safe blocks to the total number of blocks in the system, which is the size
* of blocks. When the ratio reaches the {@link #threshold} and enough live data
* nodes have registered, it needs to wait for the safe mode {@link #extension}
* interval. After the extension period has passed, it will not leave safe mode
* until the safe blocks ratio reaches the {@link #threshold} and enough live
* data node registered.
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
class BlockManagerSafeMode {
enum BMSafeModeStatus {
PENDING_THRESHOLD, /** Pending on more safe blocks or live datanode. */
EXTENSION, /** In extension period. */
OFF /** Safe mode is off. */
}
...
}
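Putting the enum together with the configuration from section 2, the transitions can be summarized by a small toy model (this is a simplification, not the real checkSafeMode() logic; the boolean parameters stand in for areThresholdsMet() and the internal extension timer):
// Self-contained toy model of the BMSafeModeStatus transitions (not BlockManagerSafeMode code):
// PENDING_THRESHOLD -> EXTENSION once the thresholds are met,
// EXTENSION -> OFF once the extension period has elapsed and the thresholds still hold.
class BMSafeModeSketch {
  enum Status { PENDING_THRESHOLD, EXTENSION, OFF }
  Status status = Status.PENDING_THRESHOLD;

  void check(boolean thresholdsMet, boolean extensionElapsed) {
    if (status == Status.PENDING_THRESHOLD && thresholdsMet) {
      // with dfs.namenode.safemode.extension <= 0 this stage is effectively skipped
      status = Status.EXTENSION;
    } else if (status == Status.EXTENSION && extensionElapsed && thresholdsMet) {
      status = Status.OFF; // safe mode is over
    }
  }
}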