DataNode在启动过程中会对数据目录进行较多的校验,比如是否需要升级或格式化、是否与NameNode(NN)的版本一致、是否需要回滚(rollback),并最终更新current目录下的VERSION文件信息。下面从startDataNode这个函数入手介绍整个校验流程。这个函数比较长,包含的内容也很多,这里只针对校验部分做详细分析,其余内容留待以后介绍。
void startDataNode(Configuration conf,
AbstractList<File>dataDirs, SecureResources resources
) throws IOException {
if(UserGroupInformation.isSecurityEnabled()&& resources == null)
throw new RuntimeException("Cannotstart secure cluster without " +
"privilegedresources.");
this.secureResources = resources;
//获得本地主机名,如果没有设置slave.host.name则通过下面两个参数,通过网口和DNS来获得
if (conf.get("slave.host.name")!= null) {
machineName =conf.get("slave.host.name");
}
if (machineName == null) {
//注意两个可配置参数,网口:ethX DNS服务器:1.1.1.1类似这种形式
machineName = DNS.getDefaultHost(
conf.get("dfs.datanode.dns.interface","default"),
conf.get("dfs.datanode.dns.nameserver","default"));
}
//获得配置文件中NN的地址,用于下面创建动态代理,因为在创建动态代理时要和NN通信
InetSocketAddress nameNodeAddr =NameNode.getServiceAddress(conf, true);
//socket连接超时时间
this.socketTimeout = conf.getInt("dfs.socket.timeout",
HdfsConstants.READ_TIMEOUT);
//socket写超时时间
this.socketWriteTimeout =conf.getInt("dfs.datanode.socket.write.timeout",
HdfsConstants.WRITE_TIMEOUT);
//影响本datanode向客户端或其他datanode发送数据块的缓存分配尺寸,具体可见sendBlock函数
this.transferToAllowed =conf.getBoolean("dfs.datanode.transferTo.allowed",
true);
//写包的大小
this.writePacketSize =conf.getInt("dfs.write.packet.size", 64*1024);
//创建注册体,用于DN向NN注册时,服务端的校验
InetSocketAddress socAddr =DataNode.getStreamingAddr(conf);
int tmpPort = socAddr.getPort();
storage = new DataStorage();
// construct registration
this.dnRegistration = newDatanodeRegistration(machineName + ":" + tmpPort);
// 创建代理并通过握手获得NN的版本、ID信息
this.namenode = (DatanodeProtocol)
RPC.waitForProxy(DatanodeProtocol.class,
DatanodeProtocol.versionID,
nameNodeAddr,
conf);
// get version and id info from thename-node
NamespaceInfo nsInfo = handshake();
StartupOption startOpt =getStartupOption(conf);
assert startOpt != null : "Startupoption must be set.";
boolean simulatedFSDataset =
conf.getBoolean("dfs.datanode.simulateddatastorage", false);
if (simulatedFSDataset) {
//因为我这里不是伪分布式,所以会走下面的逻辑
} else {
// 这里才开始校验,也是我们重点关注的部分
storage.recoverTransitionRead(nsInfo,dataDirs, startOpt);
// adjust
this.dnRegistration.setStorageInfo(storage);
//initialize data node internal structure
this.data = new FSDataset(storage, conf);
}
.................
}
下面看一下真正的校验是如何进行的。总的来说分三步:1、对数据目录做一致性检查;2、执行升级或回滚的操作流程;3、更新版本文件信息。
void recoverTransitionRead(NamespaceInfo nsInfo,
Collection<File> dataDirs,
StartupOptionstartOpt
) throws IOException {
assert FSConstants.LAYOUT_VERSION == nsInfo.getLayoutVersion() :
"Data-node and name-node layout versions must be thesame.";
// 1. For each data directory calculate its state and
// check whether all is consistent before transitioning.
// Format and recover.
this.storageID = "";
this.storageDirs = new ArrayList<StorageDirectory>(dataDirs.size());
ArrayList<StorageState> dataDirStates = new ArrayList<StorageState>(dataDirs.size());
for(Iterator<File> it =dataDirs.iterator(); it.hasNext();) {
File dataDir = it.next();
StorageDirectory sd = new StorageDirectory(dataDir);
StorageState curState;
try {
//数据目录状态分析,是否存在,权限分析、是否需要升级
curState = sd.analyzeStorage(startOpt);
// 根据检测后的状态分别执行不同操作,正常启动、格式化、恢复
switch(curState) {
case NORMAL:
break;
case NON_EXISTENT:
// 数据目录不存在,则直接忽略
LOG.info("Storage directory " + dataDir + " does not exist.");
it.remove();
continue;
case NOT_FORMATTED: // format
LOG.info("Storage directory " + dataDir + " is not formatted.");
LOG.info("Formatting ...");
//数据目录格式化
format(sd, nsInfo);
break;
default: // 从上一次升级或回滚的失败中恢复
sd.doRecover(curState);
}
} catch (IOException ioe) {
sd.unlock();
throw ioe;
}
// add to the storage list
addStorageDir(sd);
dataDirStates.add(curState);
}
if (dataDirs.size() == 0) // none of the data dirs exist
throw new IOException(
"All specified directories are notaccessible or do not exist.");
// 2.执行真正的升级或回滚操作
for(int idx = 0; idx < getNumStorageDirs(); idx++) {
doTransition(getStorageDir(idx), nsInfo,startOpt);
assert this.getLayoutVersion() == nsInfo.getLayoutVersion() :
"Data-node and name-node layoutversions must be the same.";
assert this.getCTime() == nsInfo.getCTime() :
"Data-node and name-node CTimes mustbe the same.";
}
// 3. 更新所有目录的版本文件信息
this.writeAll();
}
现在来看数据目录是如何分析的,分析得到的状态决定了后两步的操作。
public StorageState analyzeStorage(StartupOptionstartOpt) throws IOException {
assert root != null : "rootis null";
String rootPath = root.getCanonicalPath();
try { // 是否存在
if (!root.exists()) {
// storage directory does not exist
if (startOpt != StartupOption.FORMAT) {
LOG.info("Storage directory " + rootPath + " does not exist.");
return StorageState.NON_EXISTENT;
}
LOG.info(rootPath + " does not exist. Creating ...");
if (!root.mkdirs())
throw new IOException("Cannotcreate directory " + rootPath);
}
// 是否为一个目录
if (!root.isDirectory()) {
LOG.info(rootPath + "is not a directory.");
return StorageState.NON_EXISTENT;
}
//是否有些权限
if (!root.canWrite()) {
LOG.info("Cannot access storage directory" + rootPath);
return StorageState.NON_EXISTENT;
}
} catch(SecurityException ex) {
LOG.info("Cannot access storage directory" + rootPath, ex);
return StorageState.NON_EXISTENT;
}
this.lock(); // 对数据目录加锁,防止并发访问
if (startOpt == HdfsConstants.StartupOption.FORMAT)
return StorageState.NOT_FORMATTED;
if (startOpt != HdfsConstants.StartupOption.IMPORT) {
//make sure no conversion is required
checkConversionNeeded(this);
}
// 获得版本文件
File versionFile = getVersionFile();
boolean hasCurrent = versionFile.exists();
// 一系列的临时文件校验,如果这些临时目录存在,则说明这个存储是不正常的,下面会看到这些校验
boolean hasPrevious = getPreviousDir().exists();
boolean hasPreviousTmp = getPreviousTmp().exists();
boolean hasRemovedTmp = getRemovedTmp().exists();
boolean hasFinalizedTmp =getFinalizedTmp().exists();
boolean hasCheckpointTmp = getLastCheckpointTmp().exists();
if (!(hasPreviousTmp || hasRemovedTmp
|| hasFinalizedTmp ||hasCheckpointTmp)) {
// no temp dirs - no recovery
if (hasCurrent)
return StorageState.NORMAL;
if (hasPrevious)
throw new InconsistentFSStateException(root,
"version file in current directory ismissing.");
return StorageState.NOT_FORMATTED;
}
if ((hasPreviousTmp?1:0) + (hasRemovedTmp?1:0)
+ (hasFinalizedTmp?1:0) +(hasCheckpointTmp?1:0) > 1)
// more than one temp dirs
throw new InconsistentFSStateException(root,
"too many temporary directories.");
// # of temp dirs == 1 should eitherrecover or complete a transition
if (hasCheckpointTmp) {
return hasCurrent ? StorageState.COMPLETE_CHECKPOINT
: StorageState.RECOVER_CHECKPOINT;
}
if (hasFinalizedTmp) {
if (hasPrevious)
throw new InconsistentFSStateException(root,
STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_FINALIZED
+ "cannotexist together.");
return StorageState.COMPLETE_FINALIZE;
}
if (hasPreviousTmp) {
if (hasPrevious)
throw new InconsistentFSStateException(root,
STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_PREVIOUS
+ "cannot exist together.");
if (hasCurrent)
return StorageState.COMPLETE_UPGRADE;
return StorageState.RECOVER_UPGRADE;
}
assert hasRemovedTmp : "hasRemovedTmp must be true";
if (!(hasCurrent ^ hasPrevious))
throw new InconsistentFSStateException(root,
"one and only one directory " + STORAGE_DIR_CURRENT
+ " or" + STORAGE_DIR_PREVIOUS
+ "must be present when " + STORAGE_TMP_REMOVED
+ "exists.");
if (hasCurrent)
return StorageState.COMPLETE_ROLLBACK;
return StorageState.RECOVER_ROLLBACK;
}
第二步执行升级或回滚操作:如果启动参数为ROLLBACK,则先尝试回滚;随后读取版本文件并做校验,如果layout版本和cTime都与NameNode一致(即正常启动),则直接返回,否则根据比较结果执行升级或报错退出。
private void doTransition( StorageDirectory sd,
NamespaceInfonsInfo,
StartupOptionstartOpt
) throws IOException {
//是否需要回滚
if (startOpt == StartupOption.ROLLBACK)
doRollback(sd, nsInfo); // rollback if applicable
//读取版本文件信息
sd.read();
//检测版本文件
checkVersionUpgradable(this.layoutVersion);
assert this.layoutVersion >= FSConstants.LAYOUT_VERSION :
"Future version is not allowed";
//namespaceid校验
if (getNamespaceID() !=nsInfo.getNamespaceID())
throw new IOException(
"Incompatible namespaceIDs in " + sd.getRoot().getCanonicalPath()
+ ": namenode namespaceID = " + nsInfo.getNamespaceID()
+ "; datanode namespaceID = " + getNamespaceID());
//layout版本校验
if (this.layoutVersion == FSConstants.LAYOUT_VERSION
&& this.cTime == nsInfo.getCTime())
return; //regular startup
// verify necessity of a distributed upgrade
verifyDistributedUpgradeProgress(nsInfo);
if (this.layoutVersion > FSConstants.LAYOUT_VERSION
|| this.cTime < nsInfo.getCTime()) {
//执行升级操作
doUpgrade(sd, nsInfo); // upgrade
return;
}
// layoutVersion == LAYOUT_VERSION && this.cTime> nsInfo.cTime
// must shutdown
throw new IOException("Datanodestate: LV = " + this.getLayoutVersion()
+ " CTime = " + this.getCTime()
+ " is newer than the namespace state:LV = "
+nsInfo.getLayoutVersion()
+ " CTime = " + nsInfo.getCTime());
}
至此,如果数据目录校验成功,DataNode会继续执行后续的启动流程。需要注意的是,升级和回滚阶段的操作还是比较复杂的,而正常启动则比较简单。下一篇讲DN内部数据结构的初始化。