//org.apache.hadoop.hdfs.server.namenode.NameNode#formatprivatestaticbooleanformat(Configuration conf,boolean force,boolean isInteractive)throws IOException {
String nsId = DFSUtil.getNamenodeNameServiceId(conf);
String namenodeId = HAUtil.getNameNodeId(conf, nsId);initializeGenericKeys(conf, nsId, namenodeId);checkAllowFormat(conf);//kerberos 认证登录if(UserGroupInformation.isSecurityEnabled()){
InetSocketAddress socAddr =getAddress(conf);
SecurityUtil.login(conf, DFS_NAMENODE_KEYTAB_FILE_KEY,
DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, socAddr.getHostName());}// namenode fsimage,editlog 等元数据信息封装
Collection<URI> nameDirsToFormat = FSNamesystem.getNamespaceDirs(conf);
List<URI> sharedDirs = FSNamesystem.getSharedEditsDirs(conf);
List<URI> dirsToPrompt =newArrayList<URI>();
dirsToPrompt.addAll(nameDirsToFormat);
dirsToPrompt.addAll(sharedDirs);
List<URI> editDirsToFormat =
FSNamesystem.getNamespaceEditsDirs(conf);// if clusterID is not provided - see if you can find the current one
String clusterId = StartupOption.FORMAT.getClusterId();if(clusterId == null || clusterId.equals("")){//Generate a new cluster id
clusterId = NNStorage.newClusterID();}
System.out.println("Formatting using clusterid: "+ clusterId);//实例化FSImage//关于FSImage的描述:FSImage handles checkpointing and logging of the namespace edits.
FSImage fsImage =newFSImage(conf, nameDirsToFormat, editDirsToFormat);try{//实例化FSNamesystem//FSNamesystem does the actual bookkeeping work for the DataNode.
FSNamesystem fsn =newFSNamesystem(conf, fsImage);
fsImage.getEditLog().initJournalsForWrite();if(!fsImage.confirmFormat(force, isInteractive)){returntrue;// aborted}//namenode 格式化操作
fsImage.format(fsn, clusterId);}catch(IOException ioe){
LOG.warn("Encountered exception during format: ", ioe);
fsImage.close();throw ioe;}returnfalse;}//org.apache.hadoop.hdfs.server.namenode.FSImage#formatvoidformat(FSNamesystem fsn, String clusterId)throws IOException {long fileCount = fsn.getTotalFiles();// Expect 1 file, which is the root inode
Preconditions.checkState(fileCount ==1,"FSImage.format should be called with an uninitialized namesystem, has "+
fileCount +" files");
NamespaceInfo ns = NNStorage.newNamespaceInfo();
LOG.info("Allocated new BlockPoolId: "+ ns.getBlockPoolID());
ns.clusterID = clusterId;//存储,editlog格式化
storage.format(ns);
editLog.formatNonFileJournals(ns);saveFSImageInAllDirs(fsn,0);}
5.2 default
// org.apache.hadoop.hdfs.server.namenode.NameNode#NameNode (constructor excerpt)
try {
  initializeGenericKeys(conf, nsId, namenodeId);
  // The most important call of the constructor.
  initialize(conf);
  try {
    haContext.writeLock();
    state.prepareToEnterState(haContext);
    state.enterState(haContext);
  } finally {
    // Always release the HA context write lock, even if entering the state fails.
    haContext.writeUnlock();
  }
  // ...... (the rest of the constructor is elided in this excerpt)
}

// org.apache.hadoop.hdfs.server.namenode.NameNode#initialize
/**
 * Initializes the NameNode: metrics, HTTP server (NAMENODE role only),
 * namesystem loading, RPC server, JVM pause monitor and the common services.
 *
 * @param conf configuration handed to every component started here
 * @throws IOException propagated from the components started here
 */
protected void initialize(Configuration conf) throws IOException {
  // ........ (leading statements elided in this excerpt)
  NameNode.initMetrics(conf, this.getRole());
  StartupProgressMetrics.register(startupProgress);

  if (NamenodeRole.NAMENODE == role) {
    // 1. Start the HttpServer2 service (port 50070).
    startHttpServer(conf);
  }
  this.spanReceiverHost =
      SpanReceiverHost.get(conf, DFSConfigKeys.DFS_SERVER_HTRACE_PREFIX);

  // 2. Load the on-disk metadata into memory.
  loadNamesystem(conf);

  // 3. Create the RPC server.
  rpcServer = createRpcServer(conf);
  if (clientNamenodeAddress == null) {
    // This is expected for MiniDFSCluster. Set it now using
    // the RPC server's bind address.
    clientNamenodeAddress =
        NetUtils.getHostPortString(rpcServer.getRpcAddress());
    LOG.info("Clients are to use " + clientNamenodeAddress + " to access"
        + " this namenode/service.");
  }
  if (NamenodeRole.NAMENODE == role) {
    httpServer.setNameNodeAddress(getNameNodeAddress());
    httpServer.setFSImage(getFSImage());
  }

  // Monitoring of JVM pauses (memory and similar information).
  pauseMonitor = new JvmPauseMonitor(conf);
  pauseMonitor.start();
  metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);

  // 4. Start the services.
  startCommonServices(conf);
}
/**
 * Create the RPC server implementation. Used as an extension point for the
 * BackupNode.
 *
 * @param conf configuration passed to the {@code NameNodeRpcServer} constructor
 * @return a new {@code NameNodeRpcServer} bound to this NameNode
 * @throws IOException propagated from the {@code NameNodeRpcServer} constructor
 */
protected NameNodeRpcServer createRpcServer(Configuration conf)
    throws IOException {
  return new NameNodeRpcServer(conf, this);
}
// org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer#NameNodeRpcServer
// "This class is responsible for handling all of the RPC calls to the NameNode."
/**
* Start services common to both active and standby states
*/voidstartCommonServices(Configuration conf, HAContext haContext)throws IOException {this.registerMBean();// register the MBean for the FSNamesystemStatewriteLock();this.haContext = haContext;try{//实例化 NameNodeResourceChecker,用于检查磁盘资源//active状态的namenod会启动一个监控线程NameNodeResourceMonitor,定期执行NameNodeResourceChecker#hasAvailableDiskSpace()检查可用的磁盘资源。//主要检查Fsimage,editlog目录
nnResourceChecker =newNameNodeResourceChecker(conf);//检查可用资源是否足够:如果不够,日志打印警告信息,然后进入安全模式checkAvailableResources();// 判断是否进入安全模式,并且副本队列是否应该被同步/复制/**
* 磁盘资源不足的情况下,任何对元数据修改所产生的日志都无法确保能够写入到磁盘,
* 即新产生的edits log和fsimage都无法确保写入磁盘。所以要进入安全模式,
* 来禁止元数据的变动以避免往磁盘写入新的日志数据
* */assert safeMode != null &&!isPopulatingReplQueues();//获取StartupProgress实例用来获取NameNode各任务的启动信息
StartupProgress prog = NameNode.getStartupProgress();//目前NameNode启动,进入到safemode阶段,处于一个等待汇报blocks的状态
prog.beginPhase(Phase.SAFEMODE);long completeBlocksTotal =getCompleteBlocksTotal();
prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,completeBlocksTotal);//设置所有的block,用于后面判断是否进入安全模式setBlockTotal(completeBlocksTotal);//激活BlockManager
blockManager.activate(conf);//关于blockmanager介绍:Keeps information related to the blocks stored in the Hadoop cluster.}finally{writeUnlock();}registerMXBean();
DefaultMetricsSystem.instance().register(this);if(inodeAttributeProvider != null){
inodeAttributeProvider.start();
dir.setINodeAttributeProvider(inodeAttributeProvider);}
snapshotManager.registerMXBean();}
5.2.4.2 NameNodeResourceChecker
/**
 * Builds the set of volumes whose free disk space is monitored: the local
 * edits directories plus any extra volumes named in the configuration.
 *
 * @param conf configuration the reserved-space value, the checked volumes and
 *             the minimum number of redundant volumes are read from
 * @throws IOException propagated from the directory registration calls
 */
public NameNodeResourceChecker(Configuration conf) throws IOException {
  this.conf = conf;
  // Map holding the volumes that have to be checked.
  volumes = new HashMap<String, CheckedVolume>();

  // Minimum free space tolerated on a volume (100 MB by default according to
  // the original note).
  duReserved = conf.getLong(DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_KEY,
      DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_DEFAULT);

  // Extra volumes explicitly configured to be checked.
  Collection<URI> extraCheckedVolumes = Util.stringCollectionAsURIs(conf
      .getTrimmedStringCollection(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_KEY));

  // Edits directories restricted to the local URI scheme; anything else
  // (e.g. shared edits in an HA setup) is filtered out here.
  Collection<URI> localEditDirs = Collections2.filter(
      FSNamesystem.getNamespaceEditsDirs(conf),
      new Predicate<URI>() {
        @Override
        public boolean apply(URI input) {
          // Constant-first comparison: a URI without a scheme is treated as
          // non-local instead of raising a NullPointerException.
          return NNStorage.LOCAL_URI_SCHEME.equals(input.getScheme());
        }
      });

  // Add all the local edits dirs, marking some as required if they are
  // configured as such.
  // The required set only depends on conf, so it is looked up once.
  Collection<URI> requiredEditsDirs =
      FSNamesystem.getRequiredNamespaceEditsDirs(conf);
  for (URI editsDirToCheck : localEditDirs) {
    addDirToCheck(editsDirToCheck,
        requiredEditsDirs.contains(editsDirToCheck));
  }

  // All extra checked volumes are marked "required"
  for (URI extraDirToCheck : extraCheckedVolumes) {
    addDirToCheck(extraDirToCheck, true);
  }

  minimumRedundantVolumes = conf.getInt(
      DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY,
      DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_DEFAULT);
}

// Where is the NameNodeResourceMonitor mentioned earlier actually used?
// org.apache.hadoop.hdfs.server.namenode.FSNamesystem#startActiveServices
// (around line 1130)
// ResourceMonitor required only at ActiveNN. See HDFS-2914
this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
nnrmthread.start();

// The monitor thread keeps re-checking the disk resources; as soon as disk
// space runs low it switches the NameNode into safe mode.
/**
 * Body of the resource monitor thread. Loops while {@code fsRunning} and
 * {@code shouldNNRmRun} are both true, re-checking the available resources
 * and entering safe mode whenever they are reported as insufficient.
 */
public void run() {
  try {
    while (fsRunning && shouldNNRmRun) {
      checkAvailableResources();
      if (!nameNodeHasResourcesAvailable()) {
        String lowResourcesMsg = "NameNode low on available disk space. ";
        if (!isInSafeMode()) {
          LOG.warn(lowResourcesMsg + "Entering safe mode.");
        } else {
          LOG.warn(lowResourcesMsg + "Already in safe mode.");
        }
        enterSafeMode(true);
      }
      try {
        Thread.sleep(resourceRecheckInterval);
      } catch (InterruptedException ie) {
        // Deliberately ignore
      }
    }
  } catch (Exception e) {
    // Any other failure ends the loop; it is only logged.
    FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
  }
}
5.2.4.3 checkAvailableResources
//org.apache.hadoop.hdfs.server.namenode.FSNamesystem#checkAvailableResourcesvoidcheckAvailableResources(){
Preconditions.checkState(nnResourceChecker != null,"nnResourceChecker not initialized");
hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();}//初始化nameNode的时候会主动检查一次,启动后就会通过NameNodeResourceMonitor这个线程不断的去检查(每隔1秒去检查一遍)
long completeBlocksTotal = getCompleteBlocksTotal();
prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS, completeBlocksTotal);
setBlockTotal(completeBlocksTotal);
// The NameNode records the block total through setBlockTotal(). Where does
// that total come from? From getCompleteBlocksTotal():

/**
 * Get the total number of COMPLETE blocks in the system.
 * For safe mode only complete blocks are counted.
 * This is invoked only during NN startup and checkpointing.
 *
 * @return the overall block total minus the number of blocks that are still
 *         under construction
 */
public long getCompleteBlocksTotal() {
  // Calculate number of blocks under construction
  long numUCBlocks = 0;
  readLock();
  try {
    // The original note lists four states (it says "file"; the names match
    // HDFS block states):
    //   1. being written:   UnderConstruction
    //   2. being recovered: UnderRecovery
    //   3. not yet written: Committed
    //   4. already written: Complete
    // At start-up the NameNode only counts blocks in the COMPLETE state.
    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
    return getBlocksTotal() - numUCBlocks;
  } finally {
    readUnlock();
  }
}

/**
 * This method iterates through all the leases and counts the number of blocks
 * which are not COMPLETE. The FSNamesystem read lock MUST be held before
 * calling this method.
 *
 * @return the number of blocks, over all leased files, that are not complete
 */
// Walks every leased file and increments the counter for each incomplete block.
synchronized long getNumUnderConstructionBlocks() {
  // The two halves of the message are joined with a space; the original
  // concatenation produced "wasn'tacquired".
  assert this.fsnamesystem.hasReadLock() : "The FSNamesystem read lock wasn't"
      + " acquired before counting under construction blocks";
  long numUCBlocks = 0;
  for (Long id : getINodeIdWithLeases()) {
    final INodeFile cons =
        fsnamesystem.getFSDirectory().getInode(id).asFile();
    Preconditions.checkState(cons.isUnderConstruction());
    BlockInfo[] blocks = cons.getBlocks();
    if (blocks == null) {
      continue;
    }
    for (BlockInfo b : blocks) {
      if (!b.isComplete()) {
        numUCBlocks++;
      }
    }
  }
  LOG.info("Number of blocks under construction: " + numUCBlocks);
  return numUCBlocks;
}
5.2.4.6 setBlockTotal
setBlockTotal(completeBlocksTotal);

/**
 * Forwards the number of complete blocks to the current safe-mode state.
 * Does nothing when safe mode is no longer active ({@code safeMode} is null).
 *
 * @param completeBlocksTotal number of complete blocks in the namespace
 */
public void setBlockTotal(long completeBlocksTotal) {
  // safeMode is volatile, and may be set to null at any time
  SafeModeInfo safeMode = this.safeMode;
  if (safeMode == null) {
    return;
  }
  // NOTE(review): narrowing cast - a total above Integer.MAX_VALUE would be
  // truncated here.
  safeMode.setBlockTotal((int) completeBlocksTotal);
}

/**
 * Set total number of blocks.
 *
 * @param total total number of blocks the thresholds are derived from
 */
private synchronized void setBlockTotal(int total) {
  // Store the aggregated block total.
  this.blockTotal = total;
  // Derive the thresholds from it, for example 1000 * 0.999f.
  this.blockThreshold = (int) (blockTotal * threshold);
  this.blockReplQueueThreshold = (int) (blockTotal * replQueueThreshold);
  if (haEnabled) {
    // After we initialize the block count, any further namespace
    // modifications done while in safe mode need to keep track
    // of the number of total blocks in the system.
    this.shouldIncrementallyTrackBlocks = true;
  }
  if (blockSafe < 0) {
    this.blockSafe = 0;
  }
  // Decide from the aggregated block count whether safe mode is needed
  // (in the example above, safe mode is left once >= 999 blocks are safe).
  // According to the original notes, checkMode() inspects the safe-mode state:
  //   - entering: needEnter() tells whether the thresholds require safe mode;
  //   - leaving, two conditions:
  //       1. the thresholds allow leaving safe mode;
  //       2. a SafeModeMonitor thread is started which checks every second
  //          whether safe mode can be left.
  checkMode();
}

/**
 * Check and trigger safe mode if needed.
 */
private void checkMode() {
  // Safe mode forbids writes; the caller must already hold the write lock.
  assert hasWriteLock();
  // Nothing to check while this node is transitioning to active.
  if (inTransitionToActive()) {
    return;
  }
  // needEnter() decides whether safe mode has to be entered.
  if (smmthread == null && needEnter()) {
    enter();
    // .................. (elided in this excerpt)
  }
  // ......................... (elided in this excerpt)
}

/**
 * There is no need to enter safe mode
 * if DFS is empty or {@link #threshold} == 0
 *
 * @return {@code true} if any of the three conditions in the body holds
 */
private boolean needEnter() {
  return (threshold != 0 && blockSafe < blockThreshold)
      || (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold)
      || (!nameNodeHasResourcesAvailable());
}
// Conditions under which safe mode is entered:
//   1. the number of reported (safe) blocks is below the block threshold
//      (e.g. blockTotal = 1000, blockThreshold = 1000 * 0.999f);
//   2. the number of live DataNodes is below datanodeThreshold
//      (the original note gives 0 as its value);
//   3. the disks holding the NameNode metadata have less than 100 MB
//      available (per the original note).
// That is the complete NameNode start-up flow.