Reference: http://www.cnblogs.com/ggjucheng/archive/2013/02/04/2889386.html
1. StringUtils.startupShutdownMessage(NameNode.class, argv, LOG);
Arguments:
public static final Log LOG = LogFactory.getLog(NameNode.class.getName());
{
final String hostname = getHostname(); // via InetAddress.getLocalHost(); note how liberally the surrounding code uses throws Exception
final String classname = clazz.getSimpleName();
LOG.info(
toStartupShutdownString("STARTUP_MSG: ", new String[] {
"Starting " + classname,
" host = " + hostname,
" args = " + Arrays.asList(args),
" version = " + VersionInfo.getVersion(), //qes:需要分析下(何时初始化)
" build = " + VersionInfo.getUrl() + " -r "
+ VersionInfo.getRevision()
+ "; compiled by '" + VersionInfo.getUser()
+ "' on " + VersionInfo.getDate()}
)
);
Runtime.getRuntime().addShutdownHook(new Thread() { // key: shutdown hook
public void run() {
LOG.info(toStartupShutdownString("SHUTDOWN_MSG: ", new String[]{
"Shutting down " + classname + " at " + hostname}));
}
});
}
private static String toStartupShutdownString(String prefix, String [] msg) {
StringBuffer b = new StringBuffer(prefix);
b.append("\n/************************************************************");
for(String s : msg)
b.append("\n" + prefix + s);
b.append("\n************************************************************/");
return b.toString();
}
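For contrast, here is the same shutdown-hook technique in isolation (a minimal standalone sketch using only the JDK; the class and message names are mine, not Hadoop's):
public class ShutdownHookDemo {
  public static void main(String[] args) throws Exception {
    final String hostname = java.net.InetAddress.getLocalHost().getHostName();
    System.out.println("STARTUP_MSG: Starting Demo at " + hostname);
    Runtime.getRuntime().addShutdownHook(new Thread() {
      public void run() {
        // runs on normal exit, System.exit(), and SIGTERM -- but not on kill -9
        System.out.println("SHUTDOWN_MSG: Shutting down Demo at " + hostname);
      }
    });
  }
}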
2. createNameNode
public static NameNode createNameNode(String argv[],
Configuration conf) throws IOException {
if (conf == null)
conf = new Configuration();
StartupOption startOpt = parseArguments(argv);
if (startOpt == null) {
printUsage();
return null;
}
setStartupOption(conf, startOpt);
switch (startOpt) {
case FORMAT: // format on first NameNode startup, or to re-initialize an existing NameNode
boolean aborted = format(conf, true);
System.exit(aborted ? 1 : 0);
case FINALIZE: // finalize a Hadoop upgrade by deleting the backup
aborted = finalize(conf, true);
System.exit(aborted ? 1 : 0);
default:
}
DefaultMetricsSystem.initialize("NameNode"); // question: to be studied
NameNode namenode = new NameNode(conf);
return namenode;
}
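Note that both the FORMAT and FINALIZE branches end in System.exit, so control never falls through to the default case; only a regular startup reaches the metrics initialization and the NameNode constructor.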
2.1
private static void setStartupOption(Configuration conf, StartupOption opt) {
conf.set("dfs.namenode.startup", opt.toString());
}
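For reference, the matching getter (used later, in 2.3.1) reads the same key back, roughly like this (a sketch from the Hadoop 1.x sources; verify against your tree):
static StartupOption getStartupOption(Configuration conf) {
  return StartupOption.valueOf(conf.get("dfs.namenode.startup",
                                        StartupOption.REGULAR.toString()));
}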
2.2
DefaultMetricsSystem.initialize("NameNode"); // question: to be studied. Console output:
13/10/24 21:32:22 INFO impl.MetricsConfig: loaded properties from hadoop-metrics2.properties
13/10/24 21:32:22 INFO impl.MetricsSourceAdapter: MBean for source MetricsSystem,sub=Stats registered.
13/10/24 21:32:22 INFO impl.MetricsSystemImpl: Scheduled snapshot period at 10 second(s).
13/10/24 21:32:22 INFO impl.MetricsSystemImpl: NameNode metrics system started
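The properties it loads come from hadoop-metrics2.properties on the classpath. A minimal hypothetical example, consistent with the 10-second snapshot period in the log above:
# snapshot all metrics sources every 10 seconds
*.period=10
# optionally dump NameNode metrics to a local file via the bundled FileSink
namenode.sink.file.class=org.apache.hadoop.metrics2.sink.FileSink
namenode.sink.file.filename=namenode-metrics.out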
2.3
NameNode namenode = new NameNode(conf);
->
public NameNode(Configuration conf) throws IOException {
try {
initialize(conf);
} catch (IOException e) {
this.stop(); // key: to be studied
throw e;
}
}
->
/**
* Initialize name-node.
*
* @param conf the configuration
*/
private void initialize(Configuration conf) throws IOException {
InetSocketAddress socAddr = NameNode.getAddress(conf);
UserGroupInformation.setConfiguration(conf);
SecurityUtil.login(conf, DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY,
DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, socAddr.getHostName());
int handlerCount = conf.getInt("dfs.namenode.handler.count", 10);
// set service-level authorization security policy
if (serviceAuthEnabled =
conf.getBoolean(
ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, false)) {
ServiceAuthorizationManager.refresh(conf, new HDFSPolicyProvider());
}
myMetrics = NameNodeInstrumentation.create(conf);
// load metadata from the fsimage and the edits log
this.namesystem = new FSNamesystem(this, conf);
if (UserGroupInformation.isSecurityEnabled()) {
namesystem.activateSecretManager();
}
// create rpc server
InetSocketAddress dnSocketAddr = getServiceRpcServerAddress(conf);
if (dnSocketAddr != null) {
int serviceHandlerCount =
conf.getInt(DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_KEY,
DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_DEFAULT);
this.serviceRpcServer = RPC.getServer(this, dnSocketAddr.getHostName(),
dnSocketAddr.getPort(), serviceHandlerCount,
false, conf, namesystem.getDelegationTokenSecretManager());
this.serviceRPCAddress = this.serviceRpcServer.getListenerAddress();
setRpcServiceServerAddress(conf);
}
// create the RPC server; the default handler thread count is 10 and the default port is 8020
this.server = RPC.getServer(this, socAddr.getHostName(),
socAddr.getPort(), handlerCount, false, conf, namesystem
.getDelegationTokenSecretManager());
// The rpc-server port can be ephemeral... ensure we have the correct info
this.serverAddress = this.server.getListenerAddress();
FileSystem.setDefaultUri(conf, getUri(serverAddress));
LOG.info("Namenode up at: " + this.serverAddress);
startHttpServer(conf); // start the HTTP server; once it is up, the HDFS admin page is available at http://namenode:50070
this.server.start(); //start RPC server
if (serviceRpcServer != null) {
serviceRpcServer.start();
}
startTrashEmptier(conf); // start the trash emptier thread, which permanently deletes trashed files once they expire
}
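The knobs read above can be overridden in hdfs-site.xml; a hypothetical example using the keys seen in the code (the dfs.http.address key is my assumption for what startHttpServer consults in Hadoop 1.x):
<configuration>
  <property>
    <name>dfs.namenode.handler.count</name>
    <value>10</value> <!-- RPC handler threads, read at the top of initialize() -->
  </property>
  <property>
    <name>dfs.http.address</name>
    <value>0.0.0.0:50070</value> <!-- assumed key behind startHttpServer -->
  </property>
</configuration>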
The most intricate part of NameNode startup is the initialization of FSNamesystem; that class is the core of the startup logic, while the rest is straightforward enough to read on your own.
Since org.apache.hadoop.hdfs.server.namenode.FSNamesystem underpins all of the basic services the NameNode provides, its implementation complexity is to be expected.
public class FSNamesystem {
// stores the namespace file tree
public FSDirectory dir;
// BlocksMap maps each Block to its metadata: the inode the block belongs to and the DataNodes storing it
final BlocksMap blocksMap = new BlocksMap(DEFAULT_INITIAL_MAP_CAPACITY,DEFAULT_MAP_LOAD_FACTOR);
// map of corrupt block replicas
public CorruptReplicasMap corruptReplicas = new CorruptReplicasMap();
// DataNode -> blocks mapping
NavigableMap<String, DatanodeDescriptor> datanodeMap = new TreeMap<String, DatanodeDescriptor>();
// subset of datanodeMap containing only the DatanodeDescriptors considered alive; HeartbeatMonitor periodically evicts expired entries
ArrayList<DatanodeDescriptor> heartbeats = new ArrayList<DatanodeDescriptor>();
// tracks blocks whose replica count is below target; each block is assigned a priority, and the set is managed as a priority queue
private UnderReplicatedBlocks neededReplications = new UnderReplicatedBlocks();
// list of blocks whose replication has been scheduled but not yet completed
private PendingReplicationBlocks pendingReplications;
// manages file leases
public LeaseManager leaseManager = new LeaseManager(this);
Daemon hbthread = null; // periodically calls FSNamesystem.heartbeatCheck to monitor DataNode heartbeat state and react to it
public Daemon lmthread = null; // LeaseMonitor thread
Daemon smmthread = null; // periodically checks whether the conditions for leaving safe mode are met; it must therefore be started only after safe mode has been entered (i.e. the threshold is reached)
public Daemon replthread = null; // periodically does two things: computes replica counts to plan and schedule DataNode work, and processes replicas of blocks whose pipelined replication is incomplete
private ReplicationMonitor replmon = null; // Replication metrics
// maps a DataNode host to its array of DatanodeDescriptors
private Host2NodesMap host2DataNodeMap = new Host2NodesMap();
// models a cluster with a tree-shaped network topology, e.g. one spanning several data centers, each holding many racks of machines
NetworkTopology clusterMap = new NetworkTopology();
// pluggable resolver interface for the DNS-name/IP-address -> RackID mapping
private DNSToSwitchMapping dnsToSwitchMapping;
// chooses where to place replicas of a given block
ReplicationTargetChooser replicator;
// tracks which DataNodes may and may not connect to the NameNode, via the include/exclude lists it holds
private HostsFileReader hostsReader;
}
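To make the relationships among these maps concrete, here is a toy model (not Hadoop code; every name is invented) of blocksMap, datanodeMap, and neededReplications:
import java.util.*;

class ToyNamesystem {
  // blocksMap in miniature: block id -> DataNodes holding a replica
  final Map<Long, Set<String>> blocksMap = new HashMap<Long, Set<String>>();
  // datanodeMap in miniature: storage id -> last heartbeat timestamp
  final NavigableMap<String, Long> datanodeMap = new TreeMap<String, Long>();

  void heartbeat(String storageId) {
    datanodeMap.put(storageId, System.currentTimeMillis());
  }

  void blockReceived(long blockId, String storageId) {
    Set<String> holders = blocksMap.get(blockId);
    if (holders == null) {
      holders = new HashSet<String>();
      blocksMap.put(blockId, holders);
    }
    holders.add(storageId);
  }

  // neededReplications in miniature: blocks with fewer replicas than expected
  List<Long> underReplicated(int expectedReplication) {
    List<Long> result = new ArrayList<Long>();
    for (Map.Entry<Long, Set<String>> e : blocksMap.entrySet())
      if (e.getValue().size() < expectedReplication)
        result.add(e.getKey());
    return result;
  }
}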
2.3.1
FSNamesystem
/**
* FSNamesystem constructor.
*/
FSNamesystem(NameNode nn, Configuration conf) throws IOException {
try {
initialize(nn, conf);
} catch(IOException e) {
LOG.error(getClass().getSimpleName() + " initialization failed.", e);
close();
throw e;
}
}
/**
* Initialize FSNamesystem.
*/
private void initialize(NameNode nn, Configuration conf) throws IOException {
this.systemStart = now();
setConfigurationParameters(conf);
dtSecretManager = createDelegationTokenSecretManager(conf);
this.nameNodeAddress = nn.getNameNodeAddress();
this.registerMBean(conf); // register the MBean for the FSNamesystemStatus // key
this.dir = new FSDirectory(this, conf);
StartupOption startOpt = NameNode.getStartupOption(conf);
this.dir.loadFSImage(getNamespaceDirs(conf),
getNamespaceEditsDirs(conf), startOpt);
long timeTakenToLoadFSImage = now() - systemStart;
LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
NameNode.getNameNodeMetrics().setFsImageLoadTime(timeTakenToLoadFSImage);
this.safeMode = new SafeModeInfo(conf);
setBlockTotal();
pendingReplications = new PendingReplicationBlocks(
conf.getInt("dfs.replication.pending.timeout.sec",
-1) * 1000L);
if (isAccessTokenEnabled) {
accessTokenHandler = new BlockTokenSecretManager(true,
accessKeyUpdateInterval, accessTokenLifetime);
}
this.hbthread = new Daemon(new HeartbeatMonitor()); // background thread that monitors DataNode heartbeat state
this.lmthread = new Daemon(leaseManager.new Monitor()); // background thread that manages file leases
this.replmon = new ReplicationMonitor();
this.replthread = new Daemon(replmon); // handles replicas of blocks whose pipelined replication is incomplete
hbthread.start();
lmthread.start();
replthread.start();
// read the DataNode include/exclude lists from configuration
this.hostsReader = new HostsFileReader(conf.get("dfs.hosts",""),
conf.get("dfs.hosts.exclude",""));
// handle decommissioning nodes, typically by migrating their blocks elsewhere
this.dnthread = new Daemon(new DecommissionManager(this).new Monitor(
conf.getInt("dfs.namenode.decommission.interval", 30),
conf.getInt("dfs.namenode.decommission.nodes.per.interval", 5)));
dnthread.start();
this.dnsToSwitchMapping = ReflectionUtils.newInstance(
conf.getClass("topology.node.switch.mapping.impl", ScriptBasedMapping.class,
DNSToSwitchMapping.class), conf);
/* If the dns to switch mapping supports cache, resolve network
* locations of those hosts in the include list,
* and store the mapping in the cache; so future calls to resolve
* will be fast.
*/
if (dnsToSwitchMapping instanceof CachedDNSToSwitchMapping) {
dnsToSwitchMapping.resolve(new ArrayList<String>(hostsReader.getHosts()));
}
InetSocketAddress socAddr = NameNode.getAddress(conf);
this.nameNodeHostName = socAddr.getHostName();
registerWith(DefaultMetricsSystem.INSTANCE);
}
2.3.1.2
/**
* Initializes some of the members from configuration
*/
private void setConfigurationParameters(Configuration conf)
throws IOException {
fsNamesystemObject = this;
fsOwner = UserGroupInformation.getCurrentUser(); // key: to be studied
LOG.info("fsOwner=" + fsOwner);
this.supergroup = conf.get("dfs.permissions.supergroup", "supergroup");
this.isPermissionEnabled = conf.getBoolean("dfs.permissions", true);
LOG.info("supergroup=" + supergroup);
LOG.info("isPermissionEnabled=" + isPermissionEnabled);
short filePermission = (short)conf.getInt("dfs.upgrade.permission", 0777);
this.defaultPermission = PermissionStatus.createImmutable(
fsOwner.getShortUserName(), supergroup, new FsPermission(filePermission));
this.replicator = new ReplicationTargetChooser(
conf.getBoolean("dfs.replication.considerLoad", true),
this,
clusterMap);
this.defaultReplication = conf.getInt("dfs.replication", 3);
this.maxReplication = conf.getInt("dfs.replication.max", 512);
this.minReplication = conf.getInt("dfs.replication.min", 1);
if (minReplication <= 0)
throw new IOException(
"Unexpected configuration parameters: dfs.replication.min = "
+ minReplication
+ " must be greater than 0");
if (maxReplication >= (int)Short.MAX_VALUE)
throw new IOException(
"Unexpected configuration parameters: dfs.replication.max = "
+ maxReplication + " must be less than " + (Short.MAX_VALUE));
if (maxReplication < minReplication)
throw new IOException(
"Unexpected configuration parameters: dfs.replication.min = "
+ minReplication
+ " must be less than dfs.replication.max = "
+ maxReplication);
this.maxReplicationStreams = conf.getInt("dfs.max-repl-streams", 2);
long heartbeatInterval = conf.getLong("dfs.heartbeat.interval", 3) * 1000;
this.heartbeatRecheckInterval = conf.getInt(
"heartbeat.recheck.interval", 5 * 60 * 1000); // 5 minutes
this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval +
10 * heartbeatInterval;
this.replicationRecheckInterval =
conf.getInt("dfs.replication.interval", 3) * 1000L;
this.defaultBlockSize = conf.getLong("dfs.block.size", DEFAULT_BLOCK_SIZE);
this.maxFsObjects = conf.getLong("dfs.max.objects", 0);
//default limit
this.blockInvalidateLimit = Math.max(this.blockInvalidateLimit,
20*(int)(heartbeatInterval/1000));
//use conf value if it is set.
this.blockInvalidateLimit = conf.getInt(
DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY, this.blockInvalidateLimit);
LOG.info(DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY + "=" + this.blockInvalidateLimit);
this.accessTimePrecision = conf.getLong("dfs.access.time.precision", 0);
this.supportAppends = conf.getBoolean("dfs.support.append", false);
this.isAccessTokenEnabled = conf.getBoolean(
DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, false);
if (isAccessTokenEnabled) {
this.accessKeyUpdateInterval = conf.getLong(
DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY, 600) * 60 * 1000L; // 10 hrs
this.accessTokenLifetime = conf.getLong(
DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY, 600) * 60 * 1000L; // 10 hrs
}
LOG.info("isAccessTokenEnabled=" + isAccessTokenEnabled
+ " accessKeyUpdateInterval=" + accessKeyUpdateInterval / (60 * 1000)
+ " min(s), accessTokenLifetime=" + accessTokenLifetime / (60 * 1000)
+ " min(s)");
}
Console output:
13/10/24 22:59:35 INFO namenode.FSNamesystem: fsOwner=user // the Linux user
13/10/24 22:59:35 INFO namenode.FSNamesystem: supergroup=supergroup ?
13/10/24 22:59:35 INFO namenode.FSNamesystem: isPermissionEnabled=true ?
13/10/24 22:59:35 INFO namenode.FSNamesystem: dfs.block.invalidate.limit=100 ?
13/10/24 22:59:35 INFO namenode.FSNamesystem: isAccessTokenEnabled=false accessKeyUpdateInterval=0 min(s), accessTokenLifetime=0 min(s)
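The replication limits map directly onto hdfs-site.xml; a hypothetical example using the keys read above:
<property>
  <name>dfs.replication</name>
  <value>3</value>    <!-- default replication factor -->
</property>
<property>
  <name>dfs.replication.min</name>
  <value>1</value>    <!-- must be > 0, per the check above -->
</property>
<property>
  <name>dfs.replication.max</name>
  <value>512</value>  <!-- must be < Short.MAX_VALUE and >= dfs.replication.min -->
</property>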
2.3.1.3
/*
* Delegation Token ????
*/
private DelegationTokenSecretManager createDelegationTokenSecretManager(
Configuration conf) {
return new DelegationTokenSecretManager(conf.getLong(
"dfs.namenode.delegation.key.update-interval", 24*60*60*1000),
conf.getLong(
"dfs.namenode.delegation.token.max-lifetime", 7*24*60*60*1000),
conf.getLong(
"dfs.namenode.delegation.token.renew-interval", 24*60*60*1000),
DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL, this);
}
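Reading off the defaults: the master key used to sign delegation tokens is rolled every 24 hours, a token lives at most 7 days, it must be renewed at least every 24 hours, and DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL controls how often expired tokens are swept out.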
2.3.1.4
2.3.1.5
The FSDirectory class stores the state of the filesystem directory tree. It handles writing data to and loading it from disk, and records changes to the directory contents in the journal. It keeps an up-to-date filename → blockset mapping and persists it to disk. Most of its work is carried out by its FSImage fsImage member.
this.dir = new FSDirectory(this, conf);
FSDirectory
/** Access an existing dfs name directory. */
FSDirectory(FSNamesystem ns, Configuration conf) {
this(new FSImage(), ns, conf);
fsImage.setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null),
FSImage.getCheckpointEditsDirs(conf, null));
}
-->
FSDirectory(FSImage fsImage, FSNamesystem ns, Configuration conf) {
rootDir = new INodeDirectoryWithQuota(INodeDirectory.ROOT_NAME,
ns.createFsOwnerPermissions(new FsPermission((short)0755)), // "":user:supergroup:rwxr-xr-x
Integer.MAX_VALUE, -1);
this.fsImage = fsImage;
fsImage.setRestoreRemovedDirs(conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY,
DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT));
namesystem = ns;
int configuredLimit = conf.getInt(
DFSConfigKeys.DFS_LIST_LIMIT, DFSConfigKeys.DFS_LIST_LIMIT_DEFAULT);
this.lsLimit = configuredLimit>0 ?
configuredLimit : DFSConfigKeys.DFS_LIST_LIMIT_DEFAULT;
int threshold = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_NAME_CACHE_THRESHOLD_KEY,
DFSConfigKeys.DFS_NAMENODE_NAME_CACHE_THRESHOLD_DEFAULT);
NameNode.LOG.info("Caching file names occuring more than " + threshold
+ " times ");
nameCache = new NameCache<ByteArray>(threshold);
}
FSImage.getCheckpointDirs(conf, null) → [/usr/local/hadoop/tmp/dfs/namesecondary]
FSImage.getCheckpointEditsDirs(conf, null) → [/usr/local/hadoop/tmp/dfs/namesecondary]
2.3.1.6
2.3.1.6.1
// Once FSNamesystem has initialized its FSDirectory dir member, it calls loadFSImage to load the metadata from fsimage and edits
FSNamesystem: this.dir.loadFSImage(getNamespaceDirs(conf), getNamespaceEditsDirs(conf), startOpt);
public static Collection<File> getNamespaceDirs(Configuration conf) {
Collection<String> dirNames = conf.getStringCollection("dfs.name.dir"); //[/usr/local/hadoop/tmp/dfs/name]
if (dirNames.isEmpty())
dirNames.add("/tmp/hadoop/dfs/name");
Collection<File> dirs = new ArrayList<File>(dirNames.size());
for(String name : dirNames) {
dirs.add(new File(name));
}
return dirs;
}
->
/**
* Get the comma delimited values of the <code>name</code> property as
* a collection of <code>String</code>s.
* If no such property is specified then empty collection is returned.
* <p>
* This is an optimized version of {@link #getStrings(String)}
*
* @param name property name.
* @return property value as a collection of <code>String</code>s.
*/
public Collection<String> getStringCollection(String name) {
String valueString = get(name); // /usr/local/hadoop/tmp/dfs/name
return StringUtils.getStringCollection(valueString);
}
From StringUtils.java, for reference:
/**
* Returns a collection of strings.
* @param str comma separated string values
* @return an <code>ArrayList</code> of string values
*/
public static Collection<String> getStringCollection(String str){
List<String> values = new ArrayList<String>();
if (str == null)
return values;
StringTokenizer tokenizer = new StringTokenizer (str,",");
values = new ArrayList<String>();
while (tokenizer.hasMoreTokens()) {
values.add(tokenizer.nextToken());
}
return values;
}
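A hypothetical usage, for clarity:
Collection<String> dirs =
    StringUtils.getStringCollection("/data/1/dfs/name,/data/2/dfs/name");
// -> ["/data/1/dfs/name", "/data/2/dfs/name"]
// Note: values are not trimmed, so "a, b" yields "a" and " b".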
public static Collection<File> getNamespaceEditsDirs(Configuration conf) {
Collection<String> editsDirNames =
conf.getStringCollection("dfs.name.edits.dir"); //[/usr/local/hadoop/tmp/dfs/name]
if (editsDirNames.isEmpty())
editsDirNames.add("/tmp/hadoop/dfs/name");
Collection<File> dirs = new ArrayList<File>(editsDirNames.size());
for(String name : editsDirNames) {
dirs.add(new File(name));
}
return dirs;
}
2.3.1.6.2
void loadFSImage(Collection<File> dataDirs,
Collection<File> editsDirs,
StartupOption startOpt) throws IOException {
// format before starting up if requested
if (startOpt == StartupOption.FORMAT) {// if the startup option is FORMAT, format before starting up
fsImage.setStorageDirectories(dataDirs, editsDirs);// set the FSImage storage directories to ${dfs.name.dir} (default /tmp/hadoop/dfs/name); this is an array of directories
fsImage.format();
startOpt = StartupOption.REGULAR;
}
try {
if (fsImage.recoverTransitionRead(dataDirs, editsDirs, startOpt)) {// analyze the storage directories (${dfs.name.dir}) for the given startup option, recovering from a prior transition if necessary
fsImage.saveNamespace(true);
}
FSEditLog editLog = fsImage.getEditLog();
assert editLog != null : "editLog must be initialized";
if (!editLog.isOpen())
editLog.open();
fsImage.setCheckpointDirectories(null, null);
} catch(IOException e) {
fsImage.close();
throw e;
}
synchronized (this) {
this.ready = true;
this.nameCache.initialized();
this.notifyAll();
}
}
loadFSImage shows the steps of bringing up an FSImage: first the in-memory FSImage object is formatted if requested; then the EditLog in the configured storage directories is replayed onto that freshly loaded in-memory image; finally a new, empty EditLog is opened to record subsequent namespace modifications, so that the checkpoint process can later fold the EditLog back into the FSImage, keeping the image current and the two in sync.
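After a successful format and a regular startup, ${dfs.name.dir} typically looks like this (layout pieced together from the NameNodeFile enum in 2.3.1.6.2.1.3 and the VERSION/lock handling below; treat the exact names as an assumption):
/usr/local/hadoop/tmp/dfs/name/
├── current/
│   ├── VERSION   # storage metadata: layoutVersion, namespaceID, cTime, storageType
│   ├── fsimage   # checkpoint of the namespace
│   ├── edits     # journal of changes since the last checkpoint
│   └── fstime    # checkpoint timestamp
├── image/        # legacy marker directory
└── in_use.lock   # lock file acquired in tryLock(), see 2.3.1.6.2.1.2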
For more analysis of FSDirectory, see http://blog.csdn.net/shirdrn/article/details/4631518
Summary
We have walked through the members and initialization flow of the NameNode's core classes; here is the call chain in brief (sketched below):
Directory and file creation and deletion in HDFS, as well as file reads, writes, and appends, are all initiated by clients via RPC calls to the NameNode's interfaces.
The NameNode delegates the file operations to its FSNamesystem namesystem member, which also handles lease management, network topology, file permissions, and so on.
namesystem in turn calls its FSDirectory dir member, which manages the filename → block mapping.
dir then calls its FSImage fsImage member, which appends every HDFS change to the EditLog for persistence.
The Secondary NameNode periodically (hourly by default) merges the NameNode's EditLog and fsimage into a single new fsimage, keeping the EditLog small.
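Condensed into a sketch:
client ──RPC──▶ NameNode ──▶ FSNamesystem (leases, topology, permissions)
                               └─▶ FSDirectory (filename → blocks)
                                     └─▶ FSImage ──append──▶ EditLog (persistence)
SecondaryNameNode: periodically merges fsimage + edits ──▶ new fsimage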
This post covers only the responsibilities and call logic of the NameNode's core classes; for the details, read the relevant Hadoop source yourself.
2.3.1.6.2.1 fsImage.recoverTransitionRead(dataDirs, editsDirs, startOpt)
/**
* Analyze storage directories.
* Recover from previous transitions if required.
* Perform fs state transition if necessary depending on the namespace info.
* Read storage info.
*
* @param dataDirs
* @param startOpt startup option
* @throws IOException
* @return true if the image needs to be saved or false otherwise
*/
boolean recoverTransitionRead(Collection<File> dataDirs,
Collection<File> editsDirs,
StartupOption startOpt
) throws IOException {
assert startOpt != StartupOption.FORMAT :
"NameNode formatting should be performed before reading the image";
// none of the data dirs exist
if (dataDirs.size() == 0 || editsDirs.size() == 0)
throw new IOException(
"All specified directories are not accessible or do not exist.");
if(startOpt == StartupOption.IMPORT
&& (checkpointDirs == null || checkpointDirs.isEmpty()))
throw new IOException("Cannot import image from a checkpoint. "
+ "\"fs.checkpoint.dir\" is not set." );
if(startOpt == StartupOption.IMPORT
&& (checkpointEditsDirs == null || checkpointEditsDirs.isEmpty()))
throw new IOException("Cannot import image from a checkpoint. "
+ "\"fs.checkpoint.edits.dir\" is not set." );
setStorageDirectories(dataDirs, editsDirs);
// 1. For each data directory calculate its state and
// check whether all is consistent before transitioning.
Map<StorageDirectory, StorageState> dataDirStates =
new HashMap<StorageDirectory, StorageState>();
boolean isFormatted = false;
for (Iterator<StorageDirectory> it =
dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
StorageState curState;
try {
curState = sd.analyzeStorage(startOpt);
// sd is locked but not opened
switch(curState) {
case NON_EXISTENT:
// name-node fails if any of the configured storage dirs are missing
throw new InconsistentFSStateException(sd.getRoot(),
"storage directory does not exist or is not accessible.");
case NOT_FORMATTED:
break;
case NORMAL:
break;
default: // recovery is possible
sd.doRecover(curState);
}
if (curState != StorageState.NOT_FORMATTED
&& startOpt != StartupOption.ROLLBACK) {
sd.read(); // read and verify consistency with other directories
isFormatted = true;
}
if (startOpt == StartupOption.IMPORT && isFormatted)
// import of a checkpoint is allowed only into empty image directories
throw new IOException("Cannot import image from a checkpoint. "
+ " NameNode already contains an image in " + sd.getRoot());
} catch (IOException ioe) {
sd.unlock();
throw ioe;
}
dataDirStates.put(sd,curState);
}
if (!isFormatted && startOpt != StartupOption.ROLLBACK
&& startOpt != StartupOption.IMPORT)
throw new IOException("NameNode is not formatted.");
if (layoutVersion < LAST_PRE_UPGRADE_LAYOUT_VERSION) {
checkVersionUpgradable(layoutVersion);
}
if (startOpt != StartupOption.UPGRADE
&& layoutVersion < LAST_PRE_UPGRADE_LAYOUT_VERSION
&& layoutVersion != FSConstants.LAYOUT_VERSION)
throw new IOException(
"\nFile system image contains an old layout version " + layoutVersion
+ ".\nAn upgrade to version " + FSConstants.LAYOUT_VERSION
+ " is required.\nPlease restart NameNode with -upgrade option.");
// check whether a distributed upgrade is required and/or should be continued
verifyDistributedUpgradeProgress(startOpt);
// 2. Format unformatted dirs.
this.checkpointTime = 0L;
for (Iterator<StorageDirectory> it =
dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
StorageState curState = dataDirStates.get(sd);
switch(curState) {
case NON_EXISTENT:
assert false : StorageState.NON_EXISTENT + " state cannot be here";
case NOT_FORMATTED:
LOG.info("Storage directory " + sd.getRoot() + " is not formatted.");
LOG.info("Formatting ...");
sd.clearDirectory(); // create empty current dir
break;
default:
break;
}
}
// 3. Do transitions
switch(startOpt) {
case UPGRADE:
doUpgrade();
return false; // upgrade saved image already
case IMPORT:
doImportCheckpoint();
return true;
case ROLLBACK:
doRollback();
break;
case REGULAR:
// just load the image
}
return loadFSImage();
}
2.3.1.6.2.1.1
void setStorageDirectories(Collection<File> fsNameDirs,
Collection<File> fsEditsDirs
) throws IOException {
storageDirs = new ArrayList<StorageDirectory>();
removedStorageDirs = new ArrayList<StorageDirectory>();
// Add all name dirs with appropriate NameNodeDirType
for (File dirName : fsNameDirs) {
boolean isAlsoEdits = false;
for (File editsDirName : fsEditsDirs) {
if (editsDirName.compareTo(dirName) == 0) {
isAlsoEdits = true;
fsEditsDirs.remove(editsDirName);
break;
}
}
NameNodeDirType dirType = (isAlsoEdits) ?
NameNodeDirType.IMAGE_AND_EDITS :
NameNodeDirType.IMAGE;
addStorageDir(new StorageDirectory(dirName, dirType));
}
// Add edits dirs if they are different from name dirs
for (File dirName : fsEditsDirs) {
addStorageDir(new StorageDirectory(dirName, NameNodeDirType.EDITS));
}
}
protected void addStorageDir(StorageDirectory sd) {
storageDirs.add(sd);
}
2.3.1.6.2.1.2
Storage.java
/**
* Check consistency of the storage directory
*
* @param startOpt a startup option.
*
* @return state {@link StorageState} of the storage directory
* @throws InconsistentFSStateException if directory state is not
* consistent and cannot be recovered.
* @throws IOException
*/
public StorageState analyzeStorage(StartupOption startOpt) throws IOException {
assert root != null : "root is null";
String rootPath = root.getCanonicalPath(); // /usr/local/hadoop/tmp/dfs/name
try { // check that storage exists
if (!root.exists()) {
// storage directory does not exist
if (startOpt != StartupOption.FORMAT) {
LOG.info("Storage directory " + rootPath + " does not exist.");
return StorageState.NON_EXISTENT;
}
LOG.info(rootPath + " does not exist. Creating ...");
if (!root.mkdirs())
throw new IOException("Cannot create directory " + rootPath);
}
// or is inaccessible
if (!root.isDirectory()) {
LOG.info(rootPath + "is not a directory.");
return StorageState.NON_EXISTENT;
}
if (!root.canWrite()) {
LOG.info("Cannot access storage directory " + rootPath);
return StorageState.NON_EXISTENT;
}
} catch(SecurityException ex) {
LOG.info("Cannot access storage directory " + rootPath, ex);
return StorageState.NON_EXISTENT;
}
this.lock(); // lock storage if it exists
if (startOpt == HdfsConstants.StartupOption.FORMAT)
return StorageState.NOT_FORMATTED;
if (startOpt != HdfsConstants.StartupOption.IMPORT) {
//make sure no conversion is required
checkConversionNeeded(this); // question: what is this for?
}
// check whether current directory is valid
File versionFile = getVersionFile(); // /usr/local/hadoop/tmp/dfs/name/current/VERSION
boolean hasCurrent = versionFile.exists();
// check which directories exist (all false when debugging a regular startup)
boolean hasPrevious = getPreviousDir().exists(); // /usr/local/hadoop/tmp/dfs/name/previous
boolean hasPreviousTmp = getPreviousTmp().exists(); // /usr/local/hadoop/tmp/dfs/name/previous.tmp
boolean hasRemovedTmp = getRemovedTmp().exists(); // /usr/local/hadoop/tmp/dfs/name/removed.tmp
boolean hasFinalizedTmp = getFinalizedTmp().exists();
boolean hasCheckpointTmp = getLastCheckpointTmp().exists();
if (!(hasPreviousTmp || hasRemovedTmp
|| hasFinalizedTmp || hasCheckpointTmp)) {
// no temp dirs - no recovery
if (hasCurrent)
return StorageState.NORMAL;
if (hasPrevious)
throw new InconsistentFSStateException(root,
"version file in current directory is missing.");
return StorageState.NOT_FORMATTED;
}
if ((hasPreviousTmp?1:0) + (hasRemovedTmp?1:0)
+ (hasFinalizedTmp?1:0) + (hasCheckpointTmp?1:0) > 1)
// more than one temp dirs
throw new InconsistentFSStateException(root,
"too many temporary directories.");
// # of temp dirs == 1 should either recover or complete a transition
if (hasCheckpointTmp) {
return hasCurrent ? StorageState.COMPLETE_CHECKPOINT
: StorageState.RECOVER_CHECKPOINT;
}
if (hasFinalizedTmp) {
if (hasPrevious)
throw new InconsistentFSStateException(root,
STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_FINALIZED
+ "cannot exist together.");
return StorageState.COMPLETE_FINALIZE;
}
if (hasPreviousTmp) {
if (hasPrevious)
throw new InconsistentFSStateException(root,
STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_PREVIOUS
+ " cannot exist together.");
if (hasCurrent)
return StorageState.COMPLETE_UPGRADE;
return StorageState.RECOVER_UPGRADE;
}
assert hasRemovedTmp : "hasRemovedTmp must be true";
if (!(hasCurrent ^ hasPrevious))
throw new InconsistentFSStateException(root,
"one and only one directory " + STORAGE_DIR_CURRENT
+ " or " + STORAGE_DIR_PREVIOUS
+ " must be present when " + STORAGE_TMP_REMOVED
+ " exists.");
if (hasCurrent)
return StorageState.COMPLETE_ROLLBACK;
return StorageState.RECOVER_ROLLBACK;
}
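The branches above, tabulated (at most one temp dir can be present; "has current" means current/VERSION exists; temp-dir names as returned by the accessors in the code):

temp dir present     has current?   resulting StorageState
------------------   ------------   --------------------------------------------
none                 yes            NORMAL
none                 no             NOT_FORMATTED (error if previous/ exists)
lastcheckpoint.tmp   yes            COMPLETE_CHECKPOINT
lastcheckpoint.tmp   no             RECOVER_CHECKPOINT
finalized.tmp        -              COMPLETE_FINALIZE (previous/ must not exist)
previous.tmp         yes            COMPLETE_UPGRADE
previous.tmp         no             RECOVER_UPGRADE
removed.tmp          yes            COMPLETE_ROLLBACK
removed.tmp          no             RECOVER_ROLLBACK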
/**
* Lock storage to provide exclusive access.
*
* <p> Locking is not supported by all file systems.
* E.g., NFS does not consistently support exclusive locks.
*
* <p> If locking is supported we guarantee exclusive access to the
* storage directory. Otherwise, no guarantee is given.
*
* @throws IOException if locking fails
*/
public void lock() throws IOException {
this.lock = tryLock();
if (lock == null) {
String msg = "Cannot lock storage " + this.root
+ ". The directory is already locked.";
LOG.info(msg);
throw new IOException(msg);
}
}
/**
* Attempts to acquire an exclusive lock on the storage.
*
* @return A lock object representing the newly-acquired lock or
* <code>null</code> if storage is already locked.
* @throws IOException if locking fails.
*/
FileLock tryLock() throws IOException {
File lockF = new File(root, STORAGE_FILE_LOCK);
lockF.deleteOnExit();
RandomAccessFile file = new RandomAccessFile(lockF, "rws");
FileLock res = null;
try {
res = file.getChannel().tryLock();
} catch(OverlappingFileLockException oe) {
file.close();
return null;
} catch(IOException e) {
LOG.error("Cannot create lock on " + lockF, e);
file.close();
throw e;
}
return res;
}
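The same locking technique in isolation (a standalone JDK sketch; the directory and file names are illustrative):
import java.io.File;
import java.io.RandomAccessFile;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;

public class DirLockDemo {
  public static void main(String[] args) throws Exception {
    File lockF = new File("/tmp/demo-storage", "in_use.lock");
    lockF.getParentFile().mkdirs();
    lockF.deleteOnExit();
    RandomAccessFile file = new RandomAccessFile(lockF, "rws");
    FileLock lock = null;
    try {
      lock = file.getChannel().tryLock(); // null if another process holds it
    } catch (OverlappingFileLockException e) {
      // this JVM already holds a lock on the file
    }
    System.out.println(lock != null ? "acquired" : "already locked");
  }
}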
2.3.1.6.2.1.3
sd.read() reads the VERSION file and verifies consistency with the other directories. First, the NameNodeFile enum in FSImage names the per-directory storage files:
enum NameNodeFile {
IMAGE ("fsimage"),
TIME ("fstime"),
EDITS ("edits"),
IMAGE_NEW ("fsimage.ckpt"),
EDITS_NEW ("edits.new");
private String fileName = null;
private NameNodeFile(String name) {this.fileName = name;}
String getName() {return fileName;}
}
Below is the Storage code that read() invokes:
/**
* Read version file.
*
* @throws IOException if file cannot be read or contains inconsistent data
*/
public void read() throws IOException {
read(getVersionFile());
}
public void read(File from) throws IOException {
RandomAccessFile file = new RandomAccessFile(from, "rws"); // /usr/local/hadoop/tmp/dfs/name/current/VERSION
FileInputStream in = null;
try {
in = new FileInputStream(file.getFD());
file.seek(0);
Properties props = new Properties();
props.load(in);
getFields(props, this);
} finally {
if (in != null) {
in.close();
}
file.close();
}
}
protected void getFields(Properties props,
StorageDirectory sd
) throws IOException {
super.getFields(props, sd);
if (layoutVersion == 0)
throw new IOException("NameNode directory "
+ sd.getRoot() + " is not formatted.");
String sDUS, sDUV;
sDUS = props.getProperty("distributedUpgradeState"); // null
sDUV = props.getProperty("distributedUpgradeVersion"); // null
setDistributedUpgradeState(
sDUS == null? false : Boolean.parseBoolean(sDUS),
sDUV == null? getLayoutVersion() : Integer.parseInt(sDUV));
this.checkpointTime = readCheckpointTime(sd);
}
/**
* Get common storage fields.
* Should be overloaded if additional fields need to be get.
*
* @param props
* @throws IOException
*/
protected void getFields(Properties props,
StorageDirectory sd
) throws IOException {
String sv, st, sid, sct;
sv = props.getProperty("layoutVersion");
st = props.getProperty("storageType");
sid = props.getProperty("namespaceID");
sct = props.getProperty("cTime");
if (sv == null || st == null || sid == null || sct == null)
throw new InconsistentFSStateException(sd.root,
"file " + STORAGE_FILE_VERSION + " is invalid.");
int rv = Integer.parseInt(sv);
NodeType rt = NodeType.valueOf(st);
int rid = Integer.parseInt(sid);
long rct = Long.parseLong(sct);
if (!storageType.equals(rt) ||
!((namespaceID == 0) || (rid == 0) || namespaceID == rid))
throw new InconsistentFSStateException(sd.root,
"is incompatible with others.");
if (rv < FSConstants.LAYOUT_VERSION) // future version
throw new IncorrectVersionException(rv, "storage directory "
+ sd.root.getCanonicalPath());
layoutVersion = rv;
storageType = rt;
namespaceID = rid;
cTime = rct;
}
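A VERSION file that passes these checks is an ordinary Java properties file; hypothetical contents (the values, including layoutVersion, are made up for illustration):
#Thu Oct 24 21:32:22 CST 2013
namespaceID=1823012442
cTime=0
storageType=NAME_NODE
layoutVersion=-32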
long readCheckpointTime(StorageDirectory sd) throws IOException {
File timeFile = getImageFile(sd, NameNodeFile.TIME); // /usr/local/hadoop/tmp/dfs/name/current/fstime
long timeStamp = 0L;
if (timeFile.exists() && timeFile.canRead()) {
DataInputStream in = new DataInputStream(new FileInputStream(timeFile));
try {
timeStamp = in.readLong();
} catch (IOException e) {
LOG.info("Could not read fstime file in storage directory " + sd, e);
} finally {
in.close();
}
}
return timeStamp;
}