1. Hadoop Source Code Analysis: NameNode Startup

The NameNode source code lives in the org.apache.hadoop.hdfs.server.namenode package of the hadoop-hdfs-project module. Start with the class-level Javadoc, which describes the NameNode's overall role:

* The NameNode server is responsible for managing the directory namespace and the inode table; the NameNode maintains two tables:

* 1) filename{@literal ->}blocksequence (namespace)

* 2) block{@literal ->}machinelist ("inodes")

* The first table is stored on the NameNode's disk; the second is rebuilt from the block reports that DataNodes send when the cluster starts up.

* The NameNode implements the following interfaces:

* (1) org.apache.hadoop.hdfs.protocol.ClientProtocol: the interface for talking to clients. It is mostly used internally; application code normally connects to the cluster through the FileSystem class (a short sketch follows this list).

* (2) org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol: the interface for communicating with DataNodes.

* (3) org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol: the interface for communicating with other NameNodes.
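As noted for ClientProtocol above, application code does not normally use the RPC interface directly. A minimal sketch of the usual path through FileSystem (the cluster address below is a placeholder for your NameNode's RPC endpoint):

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListHdfsRoot {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // FileSystem wraps ClientProtocol RPCs such as getListing/getBlockLocations
    FileSystem fs = FileSystem.get(URI.create("hdfs://namenode-host:8020/"), conf);
    for (FileStatus stat : fs.listStatus(new Path("/"))) {
      System.out.println(stat.getPath());
    }
    fs.close();
  }
}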

When the cluster starts, the NameNode class's main method is executed directly:

public static void main(String argv[]) throws Exception {
  if (DFSUtil.parseHelpArgument(argv, NameNode.USAGE, System.out, true)) {
    System.exit(0);
  }

  try {
    StringUtils.startupShutdownMessage(NameNode.class, argv, LOG);
    // create the NameNode instance
    NameNode namenode = createNameNode(argv, null);
    if (namenode != null) {
      // join the main thread; waits indefinitely
      namenode.join();
    }
  } catch (Throwable e) {
    LOG.error("Failed to start namenode.", e);
    terminate(1, e);
  }
}


Step into NameNode namenode = createNameNode(argv, null);. At cluster startup this method mainly does two things: load the configuration files and construct the NameNode instance.

 public static NameNode createNameNode(String argv[], Configuration conf)
      throws IOException {
    LOG.info("createNameNode " + Arrays.asList(argv));
    if (conf == null)
      // load the configuration files
      conf = new HdfsConfiguration();
    // Parse out some generic args into Configuration.
    GenericOptionsParser hParser = new GenericOptionsParser(conf, argv);
    argv = hParser.getRemainingArgs();
    // Parse the rest, NN specific args.
    StartupOption startOpt = parseArguments(argv);
    if (startOpt == null) {
      printUsage(System.err);
      return null;
    }
    setStartupOption(conf, startOpt);

    boolean aborted = false;
    switch (startOpt) {
    /*
     * A number of case branches are omitted here. On a normal cluster start
     * the switch falls through to default and returns a newly constructed
     * NameNode object.
     */
    default:
      DefaultMetricsSystem.initialize("NameNode");
      return new NameNode(conf);
    }
  }
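A quick note on the conf = new HdfsConfiguration() line above: HdfsConfiguration registers hdfs-default.xml and hdfs-site.xml as configuration resources, so HDFS keys resolve to the site override when present and the shipped default otherwise. A small sketch (the printed values assume no site overrides):

import org.apache.hadoop.hdfs.HdfsConfiguration;

public class ShowNameNodeConf {
  public static void main(String[] args) {
    // hdfs-site.xml values win; otherwise hdfs-default.xml supplies defaults
    HdfsConfiguration conf = new HdfsConfiguration();
    System.out.println(conf.get("dfs.namenode.http-address", "0.0.0.0:50070"));
    System.out.println(conf.get("dfs.namenode.handler.count", "10"));
  }
}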

The program then enters the overloaded NameNode constructor:

protected NameNode(Configuration conf, NamenodeRole role)
    throws IOException {
  super(conf);
  this.tracer = new Tracer.Builder("NameNode").
      conf(TraceUtils.wrapHadoopConf(NAMENODE_HTRACE_PREFIX, conf)).
      build();
  this.tracerConfigurationManager =
      new TracerConfigurationManager(NAMENODE_HTRACE_PREFIX, conf);
  this.role = role;
  String nsId = getNameServiceId(conf);
  String namenodeId = HAUtil.getNameNodeId(conf, nsId);
  clientNamenodeAddress = NameNodeUtils.getClientNamenodeAddress(
      conf, nsId);

  if (clientNamenodeAddress != null) {
    LOG.info("Clients should use {} to access"
        + " this namenode/service.", clientNamenodeAddress);
  }
  this.haEnabled = HAUtil.isHAEnabled(conf, nsId);
  state = createHAState(getStartupOption(conf));
  this.allowStaleStandbyReads = HAUtil.shouldAllowStandbyReads(conf);
  this.haContext = createHAContext();
  try {
    initializeGenericKeys(conf, nsId, namenodeId);
    // initialize the NameNode's components
    initialize(getConf());
    state.prepareToEnterState(haContext);
    try {
      haContext.writeLock();
      state.enterState(haContext);
    } finally {
      haContext.writeUnlock();
    }
  } catch (IOException e) {
    this.stopAtException(e);
    throw e;
  } catch (HadoopIllegalArgumentException e) {
    this.stopAtException(e);
    throw e;
  }
  notBecomeActiveInSafemode = conf.getBoolean(
      DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE,
      DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT);
  this.started.set(true);
}
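The constructor resolves nsId and namenodeId from the HA configuration. A hedged sketch of the keys involved, using a hypothetical nameservice named mycluster with two NameNodes (these are the standard HA keys that getNameServiceId and HAUtil.getNameNodeId ultimately read):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;

public class HaKeysSketch {
  public static void main(String[] args) {
    Configuration conf = new HdfsConfiguration();
    // hypothetical HA layout: one nameservice, two NameNodes
    conf.set("dfs.nameservices", "mycluster");
    conf.set("dfs.ha.namenodes.mycluster", "nn1,nn2");
    conf.set("dfs.namenode.rpc-address.mycluster.nn1", "host1:8020");
    conf.set("dfs.namenode.rpc-address.mycluster.nn2", "host2:8020");
    System.out.println(conf.get("dfs.ha.namenodes.mycluster")); // nn1,nn2
  }
}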

Step into initialize(getConf()); nearly all of the services the NameNode hosts are started from this method:

protected void initialize(Configuration conf) throws IOException {
  if (conf.get(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS) == null) {
    String intervals = conf.get(DFS_METRICS_PERCENTILES_INTERVALS_KEY);
    if (intervals != null) {
      conf.set(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS,
        intervals);
    }
  }
  // user account / security login setup
  UserGroupInformation.setConfiguration(conf);
  loginAsNameNodeUser(conf);
  // metrics setup
  NameNode.initMetrics(conf, this.getRole());
  StartupProgressMetrics.register(startupProgress);
  // JVM pause monitor setup
  pauseMonitor = new JvmPauseMonitor();
  pauseMonitor.init(conf);
  pauseMonitor.start();
  metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
  // 1. start an HTTP server on port 50070
  if (NamenodeRole.NAMENODE == role) {
    startHttpServer(conf);
  }
  // 2. load the fsimage and edits files from disk
  loadNamesystem(conf);
  startAliasMapServerIfNecessary(conf);
  // 3. create the RPC server (a protobuf-based RPC server)
  rpcServer = createRpcServer(conf);

  initReconfigurableBackoffKey();

  if (clientNamenodeAddress == null) {
    // This is expected for MiniDFSCluster. Set it now using 
    // the RPC server's bind address.
    clientNamenodeAddress = 
        NetUtils.getHostPortString(getNameNodeAddress());
    LOG.info("Clients are to use " + clientNamenodeAddress + " to access"
        + " this namenode/service.");
  }
  if (NamenodeRole.NAMENODE == role) {
    httpServer.setNameNodeAddress(getNameNodeAddress());
    httpServer.setFSImage(getFSImage());
    if (levelDBAliasMapServer != null) {
      httpServer.setAliasMap(levelDBAliasMapServer.getAliasMap());
    }
  }
  // 4. start several services:
  // (1) startCommonServices
  // (2) startHttpServer
  // (3) start the rpcServer
  startCommonServices(conf);
  startMetricsLogger(conf);
}

Setting aside the monitoring and user-permission setup, the main calls in this initialization are:

(1) startHttpServer(conf): start an HTTP server on port 50070

(2) loadNamesystem(conf): load the fsimage and edits files from disk

(3) rpcServer = createRpcServer(conf): create the RPC server, a protobuf-based RPC server

(4) startCommonServices(conf): start a number of services

Let's analyze them one at a time:

(1) startHttpServer(conf): start with the first method, which launches the HTTP server. startHttpServer starts an HTTP server and configures the servlets mapped to the relevant paths. The server is HttpServer2, which embeds a Jetty container. The default address is http://0.0.0.0:50070. Once the HTTP server is up, opening the page shows the state of the entire HDFS file system.

private void startHttpServer(final Configuration conf) throws IOException {
  // getHttpServerBindAddress resolves the bind address from dfs.namenode.http-address / dfs.namenode.http-bind-host
  httpServer = new NameNodeHttpServer(conf, this, getHttpServerBindAddress(conf));
  httpServer.start();
  httpServer.setStartupProgress(startupProgress);
}

getHttpServerBindAddress(conf): reads the HTTP server's host and port from the configuration.

protected InetSocketAddress getHttpServerBindAddress(Configuration conf) {
  // config key dfs.namenode.http-address; default DFS_NAMENODE_HTTP_ADDRESS_DEFAULT = "0.0.0.0:" + DFS_NAMENODE_HTTP_PORT_DEFAULT (50070)
  InetSocketAddress bindAddress = getHttpServerAddress(conf);

  // If DFS_NAMENODE_HTTP_BIND_HOST_KEY exists then it overrides the
  // host name portion of DFS_NAMENODE_HTTP_ADDRESS_KEY.
  //dfs.namenode.http-bind-host
  final String bindHost = conf.getTrimmed(DFS_NAMENODE_HTTP_BIND_HOST_KEY);
  if (bindHost != null && !bindHost.isEmpty()) {
    bindAddress = new InetSocketAddress(bindHost, bindAddress.getPort());
  }

  return bindAddress;
}

In effect, this code just produces an InetSocketAddress bound to port 50070. That is the web UI we visit on port 50070, where we can see the state of each node in the HDFS cluster, the directory tree, block information, metadata, and so on.


Now look at the start() method: it essentially brings up the HTTP server configured above so it can accept browser requests from users.

void start() throws IOException {
  HttpConfig.Policy policy = DFSUtil.getHttpPolicy(conf);
  // the host name to bind to
  final String infoHost = bindAddress.getHostName();

  final InetSocketAddress httpAddr = bindAddress;
  // the HTTPS address; not used by default
  final String httpsAddrString = conf.get(
      DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_KEY,//dfs.namenode.https-address
      DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT);//"0.0.0.0:" + DFS_NAMENODE_HTTPS_PORT_DEFAULT(50470);
  InetSocketAddress httpsAddr = NetUtils.createSocketAddr(httpsAddrString);

  if (httpsAddr != null) {
    // If DFS_NAMENODE_HTTPS_BIND_HOST_KEY exists then it overrides the
    // host name portion of DFS_NAMENODE_HTTPS_ADDRESS_KEY.
    final String bindHost =
        conf.getTrimmed(DFSConfigKeys.DFS_NAMENODE_HTTPS_BIND_HOST_KEY);
    if (bindHost != null && !bindHost.isEmpty()) {
      httpsAddr = new InetSocketAddress(bindHost, httpsAddr.getPort());
    }
  }
  // Hadoop uses HttpServer2 by default, Hadoop's own web server implementation
  // in the org.apache.hadoop.http package: a web container wrapping an
  // embedded Jetty server (the Server webServer field).
  HttpServer2.Builder builder = DFSUtil.httpServerTemplateForNNAndJN(conf,
      httpAddr, httpsAddr, "hdfs",
      DFSConfigKeys.DFS_NAMENODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY,//dfs.namenode.kerberos.internal.spnego.principal
      DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY);//dfs.namenode.keytab.file

  httpServer = builder.build();

  if (policy.isHttpsEnabled()) {
    // assume same ssl port for all datanodes
    InetSocketAddress datanodeSslPort = NetUtils.createSocketAddr(conf.get(
        DFSConfigKeys.DFS_DATANODE_HTTPS_ADDRESS_KEY, infoHost + ":"
            + DFSConfigKeys.DFS_DATANODE_HTTPS_DEFAULT_PORT));
    httpServer.setAttribute(DFSConfigKeys.DFS_DATANODE_HTTPS_PORT_KEY,
        datanodeSslPort.getPort());
  }

  initWebHdfs(conf);

  httpServer.setAttribute(NAMENODE_ATTRIBUTE_KEY, nn);
  httpServer.setAttribute(JspHelper.CURRENT_CONF, conf);
  // set up the servlet mappings, similar to request-path mapping in Spring MVC:
  // decide which requests are handled by which servlets
  setupServlets(httpServer, conf);
  // start the HTTP service
  httpServer.start();

  int connIdx = 0;
  if (policy.isHttpEnabled()) {
    httpAddress = httpServer.getConnectorAddress(connIdx++);
    conf.set(DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY,
        NetUtils.getHostPortString(httpAddress));
  }
  // whether HTTPS is enabled
  if (policy.isHttpsEnabled()) {
    httpsAddress = httpServer.getConnectorAddress(connIdx);
    conf.set(DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_KEY,
        NetUtils.getHostPortString(httpsAddress));
  }
}

With that, an HttpServer2 service backed by Jetty is running, with the path-to-servlet mappings bound, so it can respond to web requests on port 50070.
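To poke at the running server, you can fetch one of the default servlets HttpServer2 registers, for example /jmx, which dumps the NameNode's metrics as JSON. A minimal sketch (the host is a placeholder, and /jmx availability is assumed from HttpServer2's default servlets):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class FetchNameNodeJmx {
  public static void main(String[] args) throws Exception {
    // placeholder host; the NameNode web UI listens on 50070 by default
    URL url = new URL("http://namenode-host:50070/jmx");
    try (BufferedReader in = new BufferedReader(
        new InputStreamReader(url.openStream(), "UTF-8"))) {
      String line;
      while ((line = in.readLine()) != null) {
        System.out.println(line);
      }
    }
  }
}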

(2) loadNamesystem(conf):

protected void loadNamesystem(Configuration conf) throws IOException {
  // create and initialize the FSNamesystem; namesystem represents the fsimage,
  // i.e. the cluster's metadata, read and merged from disk and kept in memory
  this.namesystem = FSNamesystem.loadFromDisk(conf);
}

This method creates and initializes the FSNamesystem. namesystem is the class that wraps the fsimage, i.e. the cluster's metadata. Step into FSNamesystem.loadFromDisk(conf) and look at the method's Javadoc first:

* Instantiates an FSNamesystem loaded from the image and edits

* directories specified in the passed Configuration.

That is, it instantiates an FSNamesystem by reading the files in the fsimage and edits log directories named in the configuration and merging them.

static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
  // check the cluster configuration
  checkConfiguration(conf);
  // the FSImage finally appears: it represents HDFS's metadata, constructed by reading the fsimage file and the files in the edits log directory
  FSImage fsImage = new FSImage(conf,
      // the directories configured via dfs.namenode.name.dir, i.e. where the
      // fsimage is stored on disk. By default it is a directory under /tmp;
      // it is usually changed, and usually points to multiple directories
      FSNamesystem.getNamespaceDirs(conf),
      // dfs.namenode.edits.dir (and dfs.namenode.shared.edits.dir in HA): the edits log directories; by default the fsimage and edits log live in the same directory
      FSNamesystem.getNamespaceEditsDirs(conf));
  // build an FSNamesystem object from the fsImage
  FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
  //dfs.namenode.startup
  StartupOption startOpt = NameNode.getStartupOption(conf);
  // if the startup option is "-recover", enter safe mode; during a normal start this branch is not taken
  if (startOpt == StartupOption.RECOVER) {
    namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
  }

  long loadStart = now();
  try {
    // load the fsimage and edits log from disk and merge them in memory
    namesystem.loadFSImage(startOpt);
  } catch (IOException ioe) {
    LOG.warn("Encountered exception loading fsimage", ioe);
    fsImage.close();
    throw ioe;
  }
  long timeTakenToLoadFSImage = now() - loadStart;
  LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
  NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
  if (nnMetrics != null) {
    nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
  }
  return namesystem;
}

Now look at the namesystem.loadFSImage(startOpt); method:

private void loadFSImage(StartupOption startOpt) throws IOException {
  final FSImage fsImage = getFSImage();

  // format before starting up if requested
  if (startOpt == StartupOption.FORMAT) {
    
    fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

    startOpt = StartupOption.REGULAR;
  }
  boolean success = false;
  // take the exclusive write lock while loading the fsimage
  writeLock();
  try {
    // We shouldn't be calling saveNamespace if we've come up in standby state.
    MetaRecoveryContext recovery = startOpt.createRecoveryContext();
    final boolean staleImage = fsImage.recoverTransitionRead(startOpt, this, recovery);
    if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
        RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
      rollingUpgradeInfo = null;
    }
    final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
    LOG.info("Need to save fs image? " + needToSave
        + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
        + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
    if (needToSave) {
      // write the merged fsimage to each configured directory
      fsImage.saveNamespace(this);
    } else {
      updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
          startOpt);
      // No need to save, so mark the phase done.
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginPhase(Phase.SAVING_CHECKPOINT);
      prog.endPhase(Phase.SAVING_CHECKPOINT);
    }
    // This will start a new log segment and write to the seen_txid file, so
    // we shouldn't do it when coming up in standby state
    if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
        || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
      // open a new edits log segment
      fsImage.openEditLogForWrite();
    }
    success = true;
  } finally {
    if (!success) {
      fsImage.close();
    }
    writeUnlock();
  }
  imageLoadComplete();
}

So this method constructs an FSNamesystem object that holds the FSImage object. At startup the fsimage is produced by merging the on-disk fsimage with the edits log; the merged fsimage is then written back to each fsimage directory, and a new edits log segment is opened.
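For reference, a small sketch of where those two directory lists come from in configuration (assuming the usual behavior that the edits directories fall back to the image directories when dfs.namenode.edits.dir is unset):

import org.apache.hadoop.hdfs.HdfsConfiguration;

public class ShowImageAndEditsDirs {
  public static void main(String[] args) {
    HdfsConfiguration conf = new HdfsConfiguration();
    // both keys take comma-separated URI lists; multiple dirs give redundancy
    String nameDirs = conf.get("dfs.namenode.name.dir");
    String editsDirs = conf.get("dfs.namenode.edits.dir", nameDirs);
    System.out.println("fsimage dirs: " + nameDirs);
    System.out.println("edits dirs:   " + editsDirs);
  }
}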


(3) rpcServer = createRpcServer(conf)

Let's look at the source that creates the RPC server:

protected NameNodeRpcServer createRpcServer(Configuration conf)
    throws IOException {
  return new NameNodeRpcServer(conf, this);
}

NameNodeRpcServer(conf, this):

public NameNodeRpcServer(Configuration conf, NameNode nn)
     throws IOException {
   this.nn = nn;
   this.namesystem = nn.getNamesystem();
   this.metrics = NameNode.getNameNodeMetrics();
   // dfs.namenode.handler.count
   // The NameNode has a pool of worker threads that handles RPCs from clients
   // and from cluster daemons. More handlers means a bigger pool for the
   // concurrent heartbeats from DataNodes and the concurrent metadata
   // operations from clients. For large clusters, or clusters with many
   // clients, the default of 10 usually needs to be raised. A common rule of
   // thumb is 20 * ln(N), where N is the cluster size.
   int handlerCount = 
     conf.getInt(DFS_NAMENODE_HANDLER_COUNT_KEY, 
                 DFS_NAMENODE_HANDLER_COUNT_DEFAULT);//DFS_NAMENODE_HANDLER_COUNT_DEFAULT:10

   RPC.setProtocolEngine(conf, ClientNamenodeProtocolPB.class,
       ProtobufRpcEngine.class);
   // Below, all of the RPC interfaces through which the NameNode talks to the
   // outside world are wired up. First, the NameNode-client interface, which
   // defines client operations such as getBlockLocations, create, append,
   // and setReplication
   ClientNamenodeProtocolServerSideTranslatorPB 
      clientProtocolServerTranslator = 
        new ClientNamenodeProtocolServerSideTranslatorPB(this);
    BlockingService clientNNPbService = ClientNamenodeProtocol.
        newReflectiveBlockingService(clientProtocolServerTranslator);
     // the NameNode-DataNode interface, defining methods such as registerDatanode, sendHeartbeat, and blockReport
   DatanodeProtocolServerSideTranslatorPB dnProtoPbTranslator = 
       new DatanodeProtocolServerSideTranslatorPB(this);
   BlockingService dnProtoPbService = DatanodeProtocolService
       .newReflectiveBlockingService(dnProtoPbTranslator);
   // the interface for communication between NameNodes
   NamenodeProtocolServerSideTranslatorPB namenodeProtocolXlator = 
       new NamenodeProtocolServerSideTranslatorPB(this);
   BlockingService NNPbService = NamenodeProtocolService
         .newReflectiveBlockingService(namenodeProtocolXlator);
   // authorization-related interfaces
   RefreshAuthorizationPolicyProtocolServerSideTranslatorPB refreshAuthPolicyXlator = 
       new RefreshAuthorizationPolicyProtocolServerSideTranslatorPB(this);
   BlockingService refreshAuthService = RefreshAuthorizationPolicyProtocolService
       .newReflectiveBlockingService(refreshAuthPolicyXlator);

   RefreshUserMappingsProtocolServerSideTranslatorPB refreshUserMappingXlator = 
       new RefreshUserMappingsProtocolServerSideTranslatorPB(this);
   BlockingService refreshUserMappingService = RefreshUserMappingsProtocolService
       .newReflectiveBlockingService(refreshUserMappingXlator);

   RefreshCallQueueProtocolServerSideTranslatorPB refreshCallQueueXlator = 
       new RefreshCallQueueProtocolServerSideTranslatorPB(this);
   BlockingService refreshCallQueueService = RefreshCallQueueProtocolService
       .newReflectiveBlockingService(refreshCallQueueXlator);

   GenericRefreshProtocolServerSideTranslatorPB genericRefreshXlator =
       new GenericRefreshProtocolServerSideTranslatorPB(this);
   BlockingService genericRefreshService = GenericRefreshProtocolService
       .newReflectiveBlockingService(genericRefreshXlator);

   GetUserMappingsProtocolServerSideTranslatorPB getUserMappingXlator = 
       new GetUserMappingsProtocolServerSideTranslatorPB(this);
   BlockingService getUserMappingService = GetUserMappingsProtocolService
       .newReflectiveBlockingService(getUserMappingXlator);
   // HA-related interface: standby/active transitions, etc.
   HAServiceProtocolServerSideTranslatorPB haServiceProtocolXlator = 
       new HAServiceProtocolServerSideTranslatorPB(this);
   BlockingService haPbService = HAServiceProtocolService
       .newReflectiveBlockingService(haServiceProtocolXlator);

   TraceAdminProtocolServerSideTranslatorPB traceAdminXlator =
       new TraceAdminProtocolServerSideTranslatorPB(this);
   BlockingService traceAdminService = TraceAdminService
       .newReflectiveBlockingService(traceAdminXlator);
   
   WritableRpcEngine.ensureInitialized();
   // dfs.namenode.servicerpc-address: the address and port the RPC server for
   // DataNode requests listens on. The optional dfs.namenode.servicerpc-bind-host,
   // if set, overrides only the host-name portion of that address; it can be set
   // per NameNode or per nameservice in HA/Federation, and setting it to 0.0.0.0
   // makes the NameNode listen on all interfaces.
   InetSocketAddress serviceRpcAddr = nn.getServiceRpcServerAddress(conf);
   if (serviceRpcAddr != null) {
       //dfs.namenode.servicerpc-bind-host
     String bindHost = nn.getServiceRpcServerBindHost(conf);
     if (bindHost == null) {
       bindHost = serviceRpcAddr.getHostName();
     }
     LOG.info("Service RPC server is binding to " + bindHost + ":" +
         serviceRpcAddr.getPort());

     int serviceHandlerCount =
       conf.getInt(DFS_NAMENODE_SERVICE_HANDLER_COUNT_KEY,//dfs.namenode.service.handler.count
                   DFS_NAMENODE_SERVICE_HANDLER_COUNT_DEFAULT);//default 10
       // this server handles DataNode requests, with serviceHandlerCount handler threads
     this.serviceRpcServer = new RPC.Builder(conf)
         .setProtocol(
             org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB.class)
         .setInstance(clientNNPbService)
         .setBindAddress(bindHost)
         .setPort(serviceRpcAddr.getPort()).setNumHandlers(serviceHandlerCount)
         .setVerbose(false)
         .setSecretManager(namesystem.getDelegationTokenSecretManager())
         .build();

     // Add all the RPC protocols that the namenode implements
     // bind these services to the serviceRpcServer
     DFSUtil.addPBProtocol(conf, HAServiceProtocolPB.class, haPbService,
         serviceRpcServer);
     DFSUtil.addPBProtocol(conf, NamenodeProtocolPB.class, NNPbService,
         serviceRpcServer);
     DFSUtil.addPBProtocol(conf, DatanodeProtocolPB.class, dnProtoPbService,
         serviceRpcServer);
     DFSUtil.addPBProtocol(conf, RefreshAuthorizationPolicyProtocolPB.class,
         refreshAuthService, serviceRpcServer);
     DFSUtil.addPBProtocol(conf, RefreshUserMappingsProtocolPB.class, 
         refreshUserMappingService, serviceRpcServer);
     // We support Refreshing call queue here in case the client RPC queue is full
     DFSUtil.addPBProtocol(conf, RefreshCallQueueProtocolPB.class,
         refreshCallQueueService, serviceRpcServer);
     DFSUtil.addPBProtocol(conf, GenericRefreshProtocolPB.class,
         genericRefreshService, serviceRpcServer);
     DFSUtil.addPBProtocol(conf, GetUserMappingsProtocolPB.class, 
         getUserMappingService, serviceRpcServer);
     DFSUtil.addPBProtocol(conf, TraceAdminProtocolPB.class,
         traceAdminService, serviceRpcServer);

     // Update the address with the correct port
     InetSocketAddress listenAddr = serviceRpcServer.getListenerAddress();
     serviceRPCAddress = new InetSocketAddress(
           serviceRpcAddr.getHostName(), listenAddr.getPort());
     nn.setRpcServiceServerAddress(conf, serviceRPCAddress);
   } else {
     serviceRpcServer = null;
     serviceRPCAddress = null;
   }
   InetSocketAddress rpcAddr = nn.getRpcServerAddress(conf);
   String bindHost = nn.getRpcServerBindHost(conf);
   if (bindHost == null) {
     bindHost = rpcAddr.getHostName();
   }
   LOG.info("RPC server is binding to " + bindHost + ":" + rpcAddr.getPort());
   // this server handles client requests, using handlerCount threads
   this.clientRpcServer = new RPC.Builder(conf)
       .setProtocol(
           org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB.class)
       .setInstance(clientNNPbService).setBindAddress(bindHost)
       .setPort(rpcAddr.getPort()).setNumHandlers(handlerCount)
       .setVerbose(false)
       .setSecretManager(namesystem.getDelegationTokenSecretManager()).build();

   // Add all the RPC protocols that the namenode implements
   // bind these services to the clientRpcServer
   DFSUtil.addPBProtocol(conf, HAServiceProtocolPB.class, haPbService,
       clientRpcServer);
   DFSUtil.addPBProtocol(conf, NamenodeProtocolPB.class, NNPbService,
       clientRpcServer);
   DFSUtil.addPBProtocol(conf, DatanodeProtocolPB.class, dnProtoPbService,
       clientRpcServer);
   DFSUtil.addPBProtocol(conf, RefreshAuthorizationPolicyProtocolPB.class, 
       refreshAuthService, clientRpcServer);
   DFSUtil.addPBProtocol(conf, RefreshUserMappingsProtocolPB.class, 
       refreshUserMappingService, clientRpcServer);
   DFSUtil.addPBProtocol(conf, RefreshCallQueueProtocolPB.class,
       refreshCallQueueService, clientRpcServer);
   DFSUtil.addPBProtocol(conf, GenericRefreshProtocolPB.class,
       genericRefreshService, clientRpcServer);
   DFSUtil.addPBProtocol(conf, GetUserMappingsProtocolPB.class, 
       getUserMappingService, clientRpcServer);
   DFSUtil.addPBProtocol(conf, TraceAdminProtocolPB.class,
       traceAdminService, clientRpcServer);

   // set service-level authorization security policy
   if (serviceAuthEnabled =
         conf.getBoolean(
           CommonConfigurationKeys.HADOOP_SECURITY_AUTHORIZATION, false)) {
     clientRpcServer.refreshServiceAcl(conf, new HDFSPolicyProvider());
     if (serviceRpcServer != null) {
       serviceRpcServer.refreshServiceAcl(conf, new HDFSPolicyProvider());
     }
   }

   // The rpc-server port can be ephemeral... ensure we have the correct info
   InetSocketAddress listenAddr = clientRpcServer.getListenerAddress();
     clientRpcAddress = new InetSocketAddress(
         rpcAddr.getHostName(), listenAddr.getPort());
   nn.setRpcServerAddress(conf, clientRpcAddress);
   
   minimumDataNodeVersion = conf.get(
       DFSConfigKeys.DFS_NAMENODE_MIN_SUPPORTED_DATANODE_VERSION_KEY,
       DFSConfigKeys.DFS_NAMENODE_MIN_SUPPORTED_DATANODE_VERSION_DEFAULT);

   // Set terse exception whose stack trace won't be logged
   this.clientRpcServer.addTerseExceptions(SafeModeException.class,
       FileNotFoundException.class,
       HadoopIllegalArgumentException.class,
       FileAlreadyExistsException.class,
       InvalidPathException.class,
       ParentNotDirectoryException.class,
       UnresolvedLinkException.class,
       AlreadyBeingCreatedException.class,
       QuotaExceededException.class,
       RecoveryInProgressException.class,
       AccessControlException.class,
       InvalidToken.class,
       LeaseExpiredException.class,
       NSQuotaExceededException.class,
       DSQuotaExceededException.class,
       AclException.class,
       FSLimitException.PathComponentTooLongException.class,
       FSLimitException.MaxDirectoryItemsExceededException.class,
       UnresolvedPathException.class);
}

This code constructs a NameNodeRpcServer object, which carries the NameNode's external RPC services. The class contains two RPC servers, serviceRpcServer and clientRpcServer:

// requests from DataNodes are handled by this server
private final RPC.Server serviceRpcServer;
// requests from clients are handled by this server
protected final RPC.Server clientRpcServer;

serviceRpcServer mainly handles requests from DataNodes, and clientRpcServer handles requests from clients. Many services are bound behind each server (haPbService, refreshUserMappingService, and so on); when a request arrives, the RPC interface dispatches it to the corresponding service. serviceRpcServer and clientRpcServer may invoke the same service, since clients and DataNodes can share some operations. Two thread-count settings also appear here, handlerCount and serviceHandlerCount, which are the numbers of RPC handler threads for client requests and DataNode requests respectively. The usual rule of thumb is to set them to 20 times the natural logarithm of the cluster size, i.e. 20 ln N for N nodes, and it is generally worth increasing them somewhat. With that, the NameNodeRpcServer object with its two internal RPC servers is fully constructed.
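A quick worked version of that 20 ln N rule of thumb (my arithmetic, not Hadoop code):

public class HandlerCountRule {
  public static void main(String[] args) {
    // dfs.namenode.handler.count ~= 20 * ln(N), N = cluster size
    int[] clusterSizes = {20, 50, 200, 1000};
    for (int n : clusterSizes) {
      System.out.printf("%d nodes -> about %d handlers%n",
          n, (int) Math.ceil(20 * Math.log(n)));
    }
    // prints roughly 60, 79, 106, 139
  }
}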


(4) startCommonServices(conf)

startCommonServices starts several pieces of functionality:

private void startCommonServices(Configuration conf) throws IOException {
  // start the namesystem's common services
  namesystem.startCommonServices(conf, haContext);
  registerNNSMXBean();
  if (NamenodeRole.NAMENODE != role) {
    startHttpServer(conf);
    httpServer.setNameNodeAddress(getNameNodeAddress());
    httpServer.setFSImage(getFSImage());
  }
  rpcServer.start();
  plugins = conf.getInstances(DFS_NAMENODE_PLUGINS_KEY,
      ServicePlugin.class);
  for (ServicePlugin p: plugins) {
    try {
      p.start(this);
    } catch (Throwable t) {
      LOG.warn("ServicePlugin " + p + " could not be started", t);
    }
  }
  LOG.info(getRole() + " RPC up at: " + rpcServer.getRpcAddress());
  if (rpcServer.getServiceRpcAddress() != null) {
    LOG.info(getRole() + " service RPC up at: "
        + rpcServer.getServiceRpcAddress());
  }
}

1. Check disk space for the edits log directories

Step into namesystem.startCommonServices(conf, haContext);:

void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
  this.registerMBean(); // register the MBean for the FSNamesystemState
  writeLock();
  this.haContext = haContext;
  try {
    // create the NameNodeResourceChecker (nnResourceChecker) object
    nnResourceChecker = new NameNodeResourceChecker(conf);
    // check whether there is enough disk space
    checkAvailableResources();
    assert safeMode != null && !isPopulatingReplQueues();
    StartupProgress prog = NameNode.getStartupProgress();
    prog.beginPhase(Phase.SAFEMODE);
    prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
      getCompleteBlocksTotal());
    // check whether to enter safe mode, and set the total of reported blocks
    setBlockTotal();
    // start the blockManager
    blockManager.activate(conf);
  } finally {
    writeUnlock();
  }
  
  registerMXBean();
  DefaultMetricsSystem.instance().register(this);
  snapshotManager.registerMXBean();
}

First, a NameNodeResourceChecker (nnResourceChecker) object is created; look at its constructor:

public NameNodeResourceChecker(Configuration conf) throws IOException {
  this.conf = conf;
  volumes = new HashMap<String, CheckedVolume>();
  // read dfs.namenode.resource.du.reserved from the configuration; the default is 100 MB, and production systems often set it to around 10 GB
  duReserved = conf.getLong(DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_KEY,
      DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_DEFAULT);
  
  Collection<URI> extraCheckedVolumes = Util.stringCollectionAsURIs(conf
      .getTrimmedStringCollection(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_KEY));
  
  Collection<URI> localEditDirs = Collections2.filter(
      FSNamesystem.getNamespaceEditsDirs(conf),
      new Predicate<URI>() {
        @Override
        public boolean apply(URI input) {
          if (input.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
            return true;
          }
          return false;
        }
      });

  // Add all the local edits dirs, marking some as required if they are
  // configured as such.
  for (URI editsDirToCheck : localEditDirs) {
    addDirToCheck(editsDirToCheck,
        FSNamesystem.getRequiredNamespaceEditsDirs(conf).contains(
            editsDirToCheck));
  }

  // All extra checked volumes are marked "required"
  for (URI extraDirToCheck : extraCheckedVolumes) {
    addDirToCheck(extraDirToCheck, true);
  }
  
  minimumRedundantVolumes = conf.getInt(
      DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY,
      DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_DEFAULT);
}

The constructor mainly reads from the configuration how much disk space each volume must keep free, duReserved, which the currently available space is later compared against; on production systems it is usually set larger. It also collects the edits log directories (and any extra checked volumes) from the configuration for the checks that follow.
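Raising the reservation is a plain configuration change; a minimal sketch of reading the same key through the public API (the 100 MB default is from the code above, the 10 GB figure is just the example from the text):

import org.apache.hadoop.hdfs.HdfsConfiguration;

public class ShowDuReserved {
  public static void main(String[] args) {
    HdfsConfiguration conf = new HdfsConfiguration();
    // dfs.namenode.resource.du.reserved is in bytes; default 100 MB
    long duReserved = conf.getLong("dfs.namenode.resource.du.reserved",
        100L * 1024 * 1024);
    System.out.println("reserved bytes per volume: " + duReserved);
  }
}

After initialization, the check itself runs: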

void checkAvailableResources() {
  Preconditions.checkState(nnResourceChecker != null,
      "nnResourceChecker not initialized");
  hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
}

The nnResourceChecker.hasAvailableDiskSpace() method:

public boolean hasAvailableDiskSpace() {
  return NameNodeResourcePolicy.areResourcesAvailable(volumes.values(),
      minimumRedundantVolumes);
}

Following the call chain further:

static boolean areResourcesAvailable(
    Collection<? extends CheckableNameNodeResource> resources,
    int minimumRedundantResources) {

  // TODO: workaround:
  // - during startup, if there are no edits dirs on disk, then there is
  // a call to areResourcesAvailable() with no dirs at all, which was
  // previously causing the NN to enter safemode
  if (resources.isEmpty()) {
    return true;
  }
  // counters for required and redundant volumes
  int requiredResourceCount = 0;
  int redundantResourceCount = 0;
  // number of redundant volumes that are low on space
  int disabledRedundantResourceCount = 0;
  for (CheckableNameNodeResource resource : resources) {
    if (!resource.isRequired()) {
      redundantResourceCount++;
      // the actual space check
      if (!resource.isResourceAvailable()) {
        disabledRedundantResourceCount++;
      }
    } else {
      requiredResourceCount++;
      if (!resource.isResourceAvailable()) {
        // Short circuit - a required resource is not available.
        return false;
      }
    }
  }
  
  if (redundantResourceCount == 0) {
    // If there are no redundant resources, return true if there are any
    // required resources available.
    return requiredResourceCount > 0;
  } else {
    // finally, return whether enough redundant volumes have space
    return redundantResourceCount - disabledRedundantResourceCount >=
        minimumRedundantResources;
  }
}
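To make the counting rule concrete, here is a standalone re-implementation of the same logic with plain objects (a sketch for illustration, not the Hadoop types):

import java.util.Arrays;
import java.util.List;

public class ResourcePolicySketch {
  static class Volume {
    final boolean required;
    final boolean available;
    Volume(boolean required, boolean available) {
      this.required = required;
      this.available = available;
    }
  }

  static boolean areResourcesAvailable(List<Volume> volumes, int minimumRedundant) {
    if (volumes.isEmpty()) {
      return true;
    }
    int requiredCount = 0, redundantCount = 0, disabledRedundantCount = 0;
    for (Volume v : volumes) {
      if (!v.required) {
        redundantCount++;
        if (!v.available) {
          disabledRedundantCount++;
        }
      } else {
        requiredCount++;
        if (!v.available) {
          return false; // one failed required volume is fatal
        }
      }
    }
    if (redundantCount == 0) {
      return requiredCount > 0;
    }
    return redundantCount - disabledRedundantCount >= minimumRedundant;
  }

  public static void main(String[] args) {
    // three redundant edits volumes, one out of space, minimum 1 healthy:
    // 3 - 1 = 2 >= 1, so resources are still considered available
    List<Volume> vols = Arrays.asList(new Volume(false, true),
        new Volume(false, true), new Volume(false, false));
    System.out.println(areResourcesAvailable(vols, 1)); // true
  }
}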

isResourceAvailable() is where the disk space actually gets checked:

public boolean isResourceAvailable() {
  long availableSpace = df.getAvailable();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Space available on volume '" + volume + "' is "
        + availableSpace);
  }
  if (availableSpace < duReserved) {
    LOG.warn("Space available on volume '" + volume + "' is "
        + availableSpace +
        ", which is below the configured reserved amount " + duReserved);
    return false;
  } else {
    return true;
  }
}

Step into df.getAvailable():

public long getAvailable() {
  return dirFile.getUsableSpace();
}

--> dirFile.getUsableSpace()

public long getUsableSpace() {
    SecurityManager sm = System.getSecurityManager();
    if (sm != null) {
        sm.checkPermission(new RuntimePermission("getFileSystemAttributes"));
        sm.checkRead(path);
    }
    if (isInvalid()) {
        return 0L;
    }
    return fs.getSpace(this, FileSystem.SPACE_USABLE);
}

In the end this calls the Java File class's getUsableSpace method to compute the available space, and compares it against duReserved to decide whether there is enough room. The final result is the boolean: hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();

So what this check ultimately changes is FSNamesystem's boolean member hasResourcesAvailable, i.e. whether the edits log directories have enough space.
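The same JDK call is easy to try outside Hadoop; a tiny sketch mirroring the comparison (the path and the 100 MB reservation are placeholders):

import java.io.File;

public class EditsDirSpaceCheck {
  public static void main(String[] args) {
    File editsDir = new File("/data/hadoop/name"); // placeholder path
    long usable = editsDir.getUsableSpace();
    long duReserved = 100L * 1024 * 1024; // default reservation, in bytes
    System.out.println("usable=" + usable + " ok=" + (usable >= duReserved));
  }
}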

With checkAvailableResources() done, look at setBlockTotal(); this method is the key to deciding whether to enter safe mode:

 /**
 * Set the total number of blocks in the system. 
 */
public void setBlockTotal() {
  // safeMode is volatile, and may be set to null at any time
  SafeModeInfo safeMode = this.safeMode;
  if (safeMode == null)
    return;
  safeMode.setBlockTotal((int)getCompleteBlocksTotal());
}

This fetches the SafeModeInfo instance and sets its count of completed blocks via getCompleteBlocksTotal(). Step into getCompleteBlocksTotal():

/**
 * Get the total number of COMPLETE blocks in the system.
 * For safe mode only complete blocks are counted.
 */
private long getCompleteBlocksTotal() {
  // Calculate number of blocks under construction
  long numUCBlocks = 0;
  readLock();
  numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
  try {
    return getBlocksTotal() - numUCBlocks;
  } finally {
    readUnlock();
  }
}

This method counts the blocks in COMPLETE state, i.e. the total block count minus the blocks still under construction. A block is under construction while its file is still being written (the file is still held under a lease), so the COMPLETE count is the number of blocks currently usable.

Step into the getNumUnderConstructionBlocks() method:

/**
 * This method iterates through all the leases and counts the number of blocks
 * which are not COMPLETE. The FSNamesystem read lock MUST be held before
 * calling this method.
 * @return
 */
synchronized long getNumUnderConstructionBlocks() {
  assert this.fsnamesystem.hasReadLock() : "The FSNamesystem read lock wasn't"
    + "acquired before counting under construction blocks";
  long numUCBlocks = 0;
  for (Lease lease : sortedLeases) {
    for (String path : lease.getPaths()) {
      final INodeFile cons;
      try {
        // the file at this path
        cons = this.fsnamesystem.getFSDirectory().getINode(path).asFile();
        // is the file still under construction?
        if (!cons.isUnderConstruction()) {
          LOG.warn("The file " + cons.getFullPathName()
              + " is not under construction but has lease.");
          continue;
        }
      } catch (UnresolvedLinkException e) {
        throw new AssertionError("Lease files should reside on this FS");
      }
      // if the file is under construction, walk all of its blocks
      BlockInfo[] blocks = cons.getBlocks();
      if(blocks == null)
        continue;
      for(BlockInfo b : blocks) {
        // any of the file's blocks that is not COMPLETE counts as under construction
        if(!b.isComplete())
          numUCBlocks++;
      }
    }
  }
  LOG.info("Number of blocks under construction: " + numUCBlocks);
  return numUCBlocks;
}

Roughly, this method iterates over all leases (files still being written), resolves each leased path to its file node in FSNamesystem's directory tree, fetches that file's blocks, and increments the counter for every block that is not COMPLETE, producing the number of under-construction blocks in the whole file system.

As for getBlocksTotal(): it simply returns the total number of blocks in the cluster, obtained from the BlockManager; we won't dig into it here.

So the SafeModeInfo instance's block total is set to the number of completed blocks: the cluster's total block count minus the under-construction block count.

Back to the call above, safeMode.setBlockTotal((int)getCompleteBlocksTotal());:

private synchronized void setBlockTotal(int total) {
  this.blockTotal = total;
  // threshold: 99.9% by default, i.e. with 10000 blocks, 9990 must have at least the minimum number of replicas (1)
  this.blockThreshold = (int) (blockTotal * threshold);
  //replQueueThreshold:99.9%
  this.blockReplQueueThreshold = 
    (int) (blockTotal * replQueueThreshold);
  if (haEnabled) {
    // After we initialize the block count, any further namespace
    // modifications done while in safe mode need to keep track
    // of the number of total blocks in the system.
    this.shouldIncrementallyTrackBlocks = true;
  }
  if(blockSafe < 0)
    this.blockSafe = 0;
  // check whether safe mode is needed
  checkMode();
}

Step into checkMode():

/**
 * Check and trigger safe mode if needed. 
 */
private void checkMode() {
  // Have to have write-lock since leaving safemode initializes
  // repl queues, which requires write lock
  assert hasWriteLock();
  if (inTransitionToActive()) {
    return;
  }
  // if smmthread is already running, the block threshold must have been 
  // reached before, there is no need to enter the safe mode again
  // needEnter(): decides whether to enter safe mode
  if (smmthread == null && needEnter()) {
    enter();
    // check if we are ready to initialize replication queues
    if (canInitializeReplQueues() && !isPopulatingReplQueues()
        && !haEnabled) {
      initializeReplQueues();
    }
    reportStatus("STATE* Safe mode ON.", false);
    return;
  }
  // the threshold is reached or was reached before
  if (!isOn() ||                           // safe mode is off
      extension <= 0 || threshold <= 0) {  // don't need to wait
    this.leave(); // leave safe mode
    return;
  }
  if (reached > 0) {  // threshold has already been reached before
    reportStatus("STATE* Safe mode ON.", false);
    return;
  }
  // start monitor
  reached = now();
  if (smmthread == null) {
    smmthread = new Daemon(new SafeModeMonitor());
    smmthread.start();
    reportStatus("STATE* Safe mode extension entered.", true);
  }

  // check if we are ready to initialize replication queues
  if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
    initializeReplQueues();
  }
}

The most important part here is the code that decides whether to enter safe mode: needEnter()

private boolean needEnter() {

  /*
  Three conditions:
  1. blockSafe is below the blockThreshold
  2. the DataNode count is below the datanodeThreshold
  3. there is not enough disk space
   */
  return (threshold != 0 && blockSafe < blockThreshold) ||
    (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
    (!nameNodeHasResourcesAvailable());
}

The blockSafe counter keeps increasing after the DataNodes start, as they report their blocks.

The three conditions in the code above:

1. blockSafe is below the blockThreshold

2. the DataNode count is below the datanodeThreshold

3. there is not enough disk space

If any one of them holds, safe mode is entered via the enter() method (for example, with the default threshold of 0.999 and 10000 total blocks, blockThreshold is 9990, so the NameNode stays in safe mode until at least 9990 blocks have been reported safe):

private void enter() {
  this.reached = 0;
}

reached is the marker that records whether safe mode has been entered:

/** Time when threshold was reached.
 * <br> -1 safe mode is off
 * <br> 0 safe mode is on, and threshold is not reached yet
 * <br> >0 safe mode is on, but we are in extension period 
 */
private long reached = -1; 

Finally, a safe mode monitor thread is started:

if (smmthread == null) {
  smmthread = new Daemon(new SafeModeMonitor());
  smmthread.start();
  reportStatus("STATE* Safe mode extension entered.", true);
}

This thread checks the state about once a second and leaves safe mode once things are back to normal.
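The shape of that monitor is roughly the following (a sketch assuming the one-second poll described above, not the actual SafeModeMonitor class):

public class SafeModeMonitorSketch implements Runnable {
  private static final long RECHECK_INTERVAL_MS = 1000; // assumed 1s poll

  @Override
  public void run() {
    // poll until the safe-mode thresholds are satisfied, then leave
    while (!safeToLeave()) {
      try {
        Thread.sleep(RECHECK_INTERVAL_MS);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        return;
      }
    }
    leaveSafeMode();
  }

  private boolean safeToLeave() {
    // stand-in for the negation of needEnter(): enough safe blocks,
    // enough live DataNodes, and enough disk space
    return true;
  }

  private void leaveSafeMode() {
    System.out.println("STATE* Safe mode is OFF.");
  }

  public static void main(String[] args) {
    new Thread(new SafeModeMonitorSketch(), "SafeModeMonitor-sketch").start();
  }
}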

2. Start the BlockManager's background threads

Once the disk space check is complete, the next important call is blockManager.activate(conf, completeBlocksTotal);, which starts a whole batch of threads:

public void activate(Configuration conf, long blockTotal) {
  // start the PendingReconstructionMonitor thread.
  // When transferring files or replicating blocks, target nodes are picked to
  // receive the data; those targets are expected to hold specific blocks soon,
  // so PendingReconstructionBlocks.pendingReconstructions records the mapping
  // from each BlockInfo to its expected storage locations.
  pendingReconstruction.start();
  // start the datanodeAdminManager (DataNode admin) and heartbeatManager (heartbeat expiry detection) threads
  datanodeManager.activate(conf);
  this.redundancyThread.setName("RedundancyMonitor");
  // thread that periodically checks for blocks needing recovery
  this.redundancyThread.start();
  storageInfoDefragmenterThread.setName("StorageInfoMonitor");
  // The last standalone worker thread is StorageInfoDefragmenter, which
  // optimizes the in-memory block footprint of each Storage. Every storage
  // directory on a DataNode is abstracted as a StorageInfo object, which holds
  // a FoldedTreeSet (a red-black-tree-based set) of the blocks on that storage.
  // As the DataNode keeps running, new replicas are created and old ones are
  // removed; without periodic compaction the FoldedTreeSet's memory usage
  // keeps growing and hurts performance. To address this, the BlockManager
  // starts a StorageInfoDefragmenter thread that periodically calls
  // scanAndCompactStorages() to find trees with a low fillRatio and compacts
  // their nodes to reduce memory usage.
  storageInfoDefragmenterThread.start();
  // thread that processes incoming block reports
  this.blockReportThread.start();
  mxBeanName = MBeans.register("NameNode", "BlockStats", this);
  bmSafeMode.activate(blockTotal);
}

3. Start the HTTP server

//start the HTTP server for a Backup Node or Checkpoint Node (role != NAMENODE)
if (NamenodeRole.NAMENODE != role) {
  startHttpServer(conf);
  httpServer.setNameNodeAddress(getNameNodeAddress());
  httpServer.setFSImage(getFSImage());
  if (levelDBAliasMapServer != null) {
    httpServer.setAliasMap(levelDBAliasMapServer.getAliasMap());
  }
}


4. Start the RPC servers: the three RPC servers initialized earlier are started

void start() {
  clientRpcServer.start();
  if (serviceRpcServer != null) {
    serviceRpcServer.start();      
  }
  if (lifelineRpcServer != null) {
    lifelineRpcServer.start();
  }
}

At this point, NameNode startup is complete.
