1. 我们采用场景驱动的方式,首先我们编写一段代码
package org.apache.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
/**
* Copyright (c) 2019 leyou ALL Rights Reserved
* Project: hadoop-main
* Package: org.apache.hadoop
* Version: 1.0
*
* @author qingzhi.wu
* @date 2020/7/6 20:05
*/
public class TestHDFS {
public static void main(String[] args) throws IOException {
Configuration configuration = new Configuration();
FileSystem fileSystem = FileSystem.newInstance(configuration);
//场景驱动的方式(元数据的更新流程)
fileSystem.mkdirs(new Path("/user/hive/warehouse/test/my"));
/**
* 我们可以提前猜测一下,根据我们之前看的源码来说,这个mkdirs应该是执行在哪里呢,
* 肯定是在rpc的服务端,那么毫无疑问的就是我们的 NamenodeRpcServer
* 那么为什么不直接
* NameNode namenode = new NameNode(conf)
*
* namenode.mkdirs(new path("xx"))
*
* 这么说这个FileSystem里面肯定会获取代理
*
*/
}
}
然后我们是不是就可以一直点,看看他到底怎么做的
2. HDFS创建目录到底发生了什么?
2.1 DistributedFileSystem的mkdirs方法
/**
* Create a directory and its parent directories.
*
* See {@link FsPermission#applyUMask(FsPermission)} for details of how
* the permission is applied.
*
* @param f The path to create
* @param permission The permission. See FsPermission#applyUMask for
* details about how this is used to calculate the
* effective permission.
*/
@Override
public boolean mkdirs(Path f, FsPermission permission) throws IOException {
return mkdirsInternal(f, permission, true);
}
2.2 mkdirsInternal
private boolean mkdirsInternal(Path f, final FsPermission permission,
final boolean createParent) throws IOException {
statistics.incrementWriteOps(1);
Path absF = fixRelativePart(f);
return new FileSystemLinkResolver<Boolean>() {
@Override
public Boolean doCall(final Path p)
throws IOException, UnresolvedLinkException {
//TODO 重要代码
return dfs.mkdirs(getPathName(p), permission, createParent);
}
@Override
public Boolean next(final FileSystem fs, final Path p)
throws IOException {
// FileSystem doesn't have a non-recursive mkdir() method
// Best we can do is error out
if (!createParent) {
throw new IOException("FileSystem does not support non-recursive"
+ "mkdir");
}
return fs.mkdirs(p, permission);
}
}.resolve(this, absF);
}
2.3 DFSClient的prinitiveMkdir
/**
* Same {{@link #mkdirs(String, FsPermission, boolean)} except
* that the permissions has already been masked against umask.
*/
public boolean primitiveMkdir(String src, FsPermission absPermission,
boolean createParent)
throws IOException {
checkOpen();
if (absPermission == null) {
absPermission =
FsPermission.getDefault().applyUMask(dfsClientConf.uMask);
}
if(LOG.isDebugEnabled()) {
LOG.debug(src + ": masked=" + absPermission);
}
TraceScope scope = Trace.startSpan("mkdir", traceSampler);
try {
//TODO 走的HadoopRPC ,调用服务端的方法
return namenode.mkdirs(src, absPermission, createParent);
} catch(RemoteException re) {
throw re.unwrapRemoteException(AccessControlException.class,
InvalidPathException.class,
FileAlreadyExistsException.class,
FileNotFoundException.class,
ParentNotDirectoryException.class,
SafeModeException.class,
NSQuotaExceededException.class,
DSQuotaExceededException.class,
UnresolvedPathException.class,
SnapshotAccessControlException.class);
} finally {
scope.close();
}
}
2.4 NameNodeRpcServer中的mkdirs方法
@Override // ClientProtocol
public boolean mkdirs(String src, FsPermission masked, boolean createParent)
throws IOException {
checkNNStartup();
if(stateChangeLog.isDebugEnabled()) {
stateChangeLog.debug("*DIR* NameNode.mkdirs: " + src);
}
if (!checkPathLength(src)) {
throw new IOException("mkdirs: Pathname too long. Limit "
+ MAX_PATH_LENGTH + " characters, " + MAX_PATH_DEPTH + " levels.");
}
//TODO 通过FSNamesystem来创建目录
return namesystem.mkdirs(src,
new PermissionStatus(getRemoteUser().getShortUserName(),
null, masked), createParent);
}
2.5 FSNamesystem中的mkdirs方法
/**
* Create all the necessary directories
*/
boolean mkdirs(String src, PermissionStatus permissions,
boolean createParent) throws IOException {
HdfsFileStatus auditStat = null;
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
//TODO 校验是否是安全模式
checkNameNodeSafeMode("Cannot create directory " + src);
//TODO hadoop采用了面向对象的思想,将每一步操作都进行了封装成了对象
auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent);
} catch (AccessControlException e) {
logAuditEvent(false, "mkdirs", src);
throw e;
} finally {
writeUnlock();
}
//TODO 重点 元数据日志持久化
getEditLog().logSync();
logAuditEvent(true, "mkdirs", src, null, auditStat);
return true;
}
2.6 FSDirMkdirOp.mkdirs
static HdfsFileStatus mkdirs(FSNamesystem fsn, String src,
PermissionStatus permissions, boolean createParent) throws IOException {
//TODO 获取目录树
/**
* FSDirectory 目录树
*
* hadoop fs -ls /
*
* hdfs dfs -ls /
*
*/
FSDirectory fsd = fsn.getFSDirectory();
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
}
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException(src);
}
FSPermissionChecker pc = fsd.getPermissionChecker();
byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
fsd.writeLock();
try {
//TODO 解析要创建目录的路径 /user/hive/warehouse/data/mytable
/**
* hadoop fs -mkdir /user/hive/warehouse/data/mytable
*/
src = fsd.resolvePath(pc, src, pathComponents);
INodesInPath iip = fsd.getINodesInPath4Write(src);
if (fsd.isPermissionEnabled()) {
fsd.checkTraverse(pc, iip);
}
/**
* 比如我们现在已经存在的目录是:/user/hive/warehouse
* 我们需要创建的目录是:/user/hive/warehouse/data/mytable
* 他首先找到最后一个INode,其实就是warehouse 这个INode
*/
final INode lastINode = iip.getLastINode();
if (lastINode != null && lastINode.isFile()) {
throw new FileAlreadyExistsException("Path is not a directory: " + src);
}
INodesInPath existing = lastINode != null ? iip : iip.getExistingINodes();
if (lastINode == null) {
if (fsd.isPermissionEnabled()) {
fsd.checkAncestorAccess(pc, iip, FsAction.WRITE);
}
if (!createParent) {
fsd.verifyParentDir(iip, src);
}
// validate that we have enough inodes. This is, at best, a
// heuristic because the mkdirs() operation might need to
// create multiple inodes.
fsn.checkFsObjectLimit();
/**
* 已存在: /user/hive/warehouse
* 要创建: /user/hive/warehouse/data/mytable
* 需要创建的目录 /data/mytable
*/
List<String> nonExisting = iip.getPath(existing.length(),
iip.length() - existing.length());
int length = nonExisting.size();
//TODO 需要创建多级目录走这里
if (length > 1) {
List<String> ancestors = nonExisting.subList(0, length - 1);
// Ensure that the user can traversal the path by adding implicit
// u+wx permission to all ancestor directories
existing = createChildrenDirectories(fsd, existing, ancestors,
addImplicitUwx(permissions, permissions));
if (existing == null) {
throw new IOException("Failed to create directory: " + src);
}
}
//TODO 需要创建单目录走这里 /data
if ((existing = createChildrenDirectories(fsd, existing,
nonExisting.subList(length - 1, length), permissions)) == null) {
throw new IOException("Failed to create directory: " + src);
}
}
return fsd.getAuditFileInfo(existing);
} finally {
fsd.writeUnlock();
}
}
2.7 FSDirectory类的注释和重要变量
/**
* Both FSDirectory and FSNamesystem manage the state of the namespace.
* FSDirectory is a pure in-memory data structure, all of whose operations
* happen entirely in memory. In contrast, FSNamesystem persists the operations
* to the disk.
*
* FSDirectory 和FSNamesystem 都是管理元数据的状态的
*
* 1. FSDirectory是一个一直在内存里面的数据结构,其实就是内存目录树
*
* 2. FSNamesystem是把我们的元数据记录信息持久化到内存上面的(先写到内存,再写到磁盘 也就是双缓冲机制)
*
* @see org.apache.hadoop.hdfs.server.namenode.FSNamesystem
**/
//我们的根目录也就是 我们的 /
INodeDirectory rootDir;
/**构造方法*/
private static INodeDirectory createRoot(FSNamesystem namesystem) {
final INodeDirectory r = new INodeDirectory(
INodeId.ROOT_INODE_ID,
INodeDirectory.ROOT_NAME,//为空,因为根目录没有名称
namesystem.createFsOwnerPermissions(new FsPermission((short) 0755)),
0L);
r.addDirectoryWithQuotaFeature(
new DirectoryWithQuotaFeature.Builder().
nameSpaceQuota(DirectoryWithQuotaFeature.DEFAULT_NAMESPACE_QUOTA).
storageSpaceQuota(DirectoryWithQuotaFeature.DEFAULT_STORAGE_SPACE_QUOTA).
build());
r.addSnapshottableFeature();
r.setSnapshotQuota(0);
return r;
}
2.8 createChildrenDirectories
/**
* Create the directory {@code parent} / {@code children} and all ancestors
* along the path.
*
* @param fsd FSDirectory
* @param existing The INodesInPath instance containing all the existing
* ancestral INodes
* @param children The relative path from the parent towards children,
* starting with "/"
* @param perm the permission of the directory. Note that all ancestors
* created along the path has implicit {@code u+wx} permissions.
*
* @return {@link INodesInPath} which contains all inodes to the
* target directory, After the execution parentPath points to the path of
* the returned INodesInPath. The function return null if the operation has
* failed.
*/
private static INodesInPath createChildrenDirectories(FSDirectory fsd,
INodesInPath existing, List<String> children, PermissionStatus perm)
throws IOException {
assert fsd.hasWriteLock();
for (String component : children) {
//TODO 一个目录一个目录的去创建
//TODO 如果只创建单目录,暗恶魔这个循环只会运行一次
existing = createSingleDirectory(fsd, existing, component, perm);
if (existing == null) {
return null;
}
}
return existing;
}
2.9 createSingleDirectory
private static INodesInPath createSingleDirectory(FSDirectory fsd,
INodesInPath existing, String localName, PermissionStatus perm)
throws IOException {
assert fsd.hasWriteLock();
//TODO 更新目录树,这颗目录树是存在内存中的
//更新内存里面的数据
existing = unprotectedMkdir(fsd, fsd.allocateNewInodeId(), existing,
localName.getBytes(Charsets.UTF_8), perm, null, now());
if (existing == null) {
return null;
}
final INode newNode = existing.getLastINode();
// Directory creation also count towards FilesCreated
// to match count of FilesDeleted metric.
NameNode.getNameNodeMetrics().incrFilesCreated();
String cur = existing.getPath();
//TODO 把元数据信息记录到磁盘上(但是一开始先写到内存)
//往磁盘上面记录元数据日志
fsd.getEditLog().logMkDir(cur, newNode);
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("mkdirs: created directory " + cur);
}
return existing;
}
2.10 unprotectedMkdir
/**
* create a directory at path specified by parent
*/
private static INodesInPath unprotectedMkdir(FSDirectory fsd, long inodeId,
INodesInPath parent, byte[] name, PermissionStatus permission,
List<AclEntry> aclEntries, long timestamp)
throws QuotaExceededException, AclException, FileAlreadyExistsException {
assert fsd.hasWriteLock();
assert parent.getLastINode() != null;
if (!parent.getLastINode().isDirectory()) {
throw new FileAlreadyExistsException("Parent path is not a directory: " +
parent.getPath() + " " + DFSUtil.bytes2String(name));
}
/**
* FSDirectory 文件目录树 / 是根目录
* INodeDirectory代表目录
* INodeFile代表文件
*/
//TODO 封装成一个目录
final INodeDirectory dir = new INodeDirectory(inodeId, name, permission,
timestamp);
//TODO 往文件目录树 该添加目录的地方添加节点
INodesInPath iip = fsd.addLastINode(parent, dir, true);
if (iip != null && aclEntries != null) {
AclStorage.updateINodeAcl(dir, aclEntries, Snapshot.CURRENT_STATE_ID);
}
return iip;
}
}
2.11 addLastINode
/**
* Add a child to the end of the path specified by INodesInPath.
* @return an INodesInPath instance containing the new INode
*/
@VisibleForTesting
public INodesInPath addLastINode(INodesInPath existing, INode inode,
boolean checkQuota) throws QuotaExceededException {
assert existing.getLastINode() != null &&
existing.getLastINode().isDirectory();
final int pos = existing.length();
// Disallow creation of /.reserved. This may be created when loading
// editlog/fsimage during upgrade since /.reserved was a valid name in older
// release. This may also be called when a user tries to create a file
// or directory /.reserved.
if (pos == 1 && existing.getINode(0) == rootDir && isReservedName(inode)) {
throw new HadoopIllegalArgumentException(
"File name \"" + inode.getLocalName() + "\" is reserved and cannot "
+ "be created. If this is during upgrade change the name of the "
+ "existing file or directory to another name before upgrading "
+ "to the new release.");
}
//TODO 获取父目录 warehouse
final INodeDirectory parent = existing.getINode(pos - 1).asDirectory();
// The filesystem limits are not really quotas, so this check may appear
// odd. It's because a rename operation deletes the src, tries to add
// to the dest, if that fails, re-adds the src from whence it came.
// The rename code disables the quota when it's restoring to the
// original location because a quota violation would cause the the item
// to go "poof". The fs limits must be bypassed for the same reason.
if (checkQuota) {
final String parentPath = existing.getPath(pos - 1);
verifyMaxComponentLength(inode.getLocalNameBytes(), parentPath);
verifyMaxDirItems(parent, parentPath);
}
// always verify inode name
verifyINodeName(inode.getLocalNameBytes());
final QuotaCounts counts = inode.computeQuotaUsage(getBlockStoragePolicySuite());
updateCount(existing, pos, counts, checkQuota);
boolean isRename = (inode.getParent() != null);
boolean added;
try {
//TODO 在父目录下添加
added = parent.addChild(inode, true, existing.getLatestSnapshotId());
} catch (QuotaExceededException e) {
updateCountNoQuotaCheck(existing, pos, counts.negation());
throw e;
}
if (!added) {
updateCountNoQuotaCheck(existing, pos, counts.negation());
return null;
} else {
if (!isRename) {
AclStorage.copyINodeDefaultAcl(inode);
}
addToInodeMap(inode);
}
return INodesInPath.append(existing, inode, inode.getLocalNameBytes());
}
2.12 parent.addChild
/**
* Add a child inode to the directory.
*
* @param node INode to insert
* @param setModTime set modification time for the parent node
* not needed when replaying the addition and
* the parent already has the proper mod time
* @return false if the child with this name already exists;
* otherwise, return true;
*/
public boolean addChild(INode node, final boolean setModTime,
final int latestSnapshotId) throws QuotaExceededException {
final int low = searchChildren(node.getLocalNameBytes());
if (low >= 0) {
return false;
}
if (isInLatestSnapshot(latestSnapshotId)) {
// create snapshot feature if necessary
DirectoryWithSnapshotFeature sf = this.getDirectoryWithSnapshotFeature();
if (sf == null) {
sf = this.addSnapshotFeature(null);
}
return sf.addChild(this, node, setModTime, latestSnapshotId);
}
//TODO 添加子节点
addChild(node, low);
if (setModTime) {
// update modification time of the parent directory
updateModificationTime(node.getModificationTime(), latestSnapshotId);
}
return true;
}
2.13 addChild
/**
* Add the node to the children list at the given insertion point.
* The basic add method which actually calls children.add(..).
*/
private void addChild(final INode node, final int insertionPoint) {
if (children == null) {
children = new ArrayList<INode>(DEFAULT_FILES_PER_DIRECTORY);
}
node.setParent(this);
//TODO 子节点列表里面再添加一个INode
children.add(-insertionPoint - 1, node);
if (node.getGroupName() == null) {
node.setGroup(getGroupName());
}
}
2.14 INode的重要子类INodeDriectory的属性
/**
* 这是一个重要的属性
* 往这个里面存储子节点
* 可以是目录,也可以是文件
* 模仿的linux
* INodeDirectory 存储目录
* INodeFile 存储文件
*/
private List<INode> children = null;
/**
* Add the node to the children list at the given insertion point.
* The basic add method which actually calls children.add(..).
*/
private void addChild(final INode node, final int insertionPoint) {
if (children == null) {
children = new ArrayList<INode>(DEFAULT_FILES_PER_DIRECTORY);
}
node.setParent(this);
//TODO 子节点列表里面再添加一个INode
children.add(-insertionPoint - 1, node);
if (node.getGroupName() == null) {
node.setGroup(getGroupName());
}
}
上面可以看到,我们的目录树其实就是一个INode的ArrayList集合
3. 总结
- 当我们用java客户端在hdfs创建目录的时候,首先会调用FileSystem中的mkdirs方法
- 其实使用的是其子类DistributedFileSystem的方法
- 然后通过DFSClient获取到NameNodeRpcServer的代理,调用了我们创建目录的方法
- 通过FSNamesystem元数据管理类,先校验我们的NameNode是不是处于安全模式,我们上一次已经知道什么时候处于安全模式了(100M,0.999f,datanode的个数)
- hdfs将我们的操作都封装成了一些对象,例如FSDirMkdirOp等
- 获取目录树
- 截取已经存在的父目录
- 将我们要创建的每级目录封装为一个INodeDirectory(文件是INodeFile)
- 然后将我们创建好的目录依次挂接到我们的父目录上
- 刷写元数据(后面再讲)