1. 前言
本文将从代码层面来介绍文件系统的实现原理,有关文件系统的基础知识请访问 Linux 知:文件系统 了解更多。
2. 流程
下面先从一些基本流程开始介绍文件系统,让读者可以有一些直观的感受,避免直接罗列各种函数和结构体让人望而生畏。
2.1. 打开文件流程
思路:根据文件路径名,从左到右匹配最合适的挂载点,找到挂载点对应的文件系统,然后用文件系统的 open 接口操作剩余的路径名。
比如:根据 /mnt/d/projects/linux/hello.c 匹配到最合适的挂载点 /mnt/d,找到挂载点对应的文件系统 drvfs,然后用该文件系统的 open 接口操作剩余的路径 /projects/linux/hello.c。
Linux 操作系统中 open() 接口的调用链大致如下所示:
// 用户态:
const char *filename = "/mnt/d/projects/linux/hello.c";
open(filename, ..) // 用户态封装的系统调用,一般由 c 库提供
----------------------------------------------------------
// 内核态:
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) [fs/open.c]
| | do_sys_open(AT_FDCWD, filename, flags, mode)
| | | do_sys_openat2(.., filename, ..)
| | | | struct filename *tmp;
| | | | tmp = getname(filename);
| | | | | tmp->name = filename // tmp->name = "/mnt/d/projects/linux/hello.c"
| | | | fd = get_unused_fd_flags
| | | | struct file *f;
| | | | f = do_filp_open(dfd, tmp, &op) [fs/namei.c]
| | | | | struct filename *pathname = tmp;
| | | | | struct nameidata nd;
| | | | | set_nameidata(&nd, dfd, pathname);
| | | | | | nd.name = pathname // nd.name.name = "/mnt/d/projects/linux/hello.c"
| | | | | | current->nameidata = &nd
| | | | | struct file *f;
| | | | | f = path_openat(&nd, op, flags);
| | | | | | const char *s = path_init(nd, flags) // s = "/mnt/d/projects/linux/hello.c"
| | | | | | | nd_jump_root(struct nameidata *nd)
| | | | | | | | nd->path = nd->root
| | | | | | link_path_walk(s, nd)
| | | | | | | parent->d_op->d_hash(parent, &this)
| | | | | | | nd->last.name = s; // 每处理一个路径分量更新一次,返回时 nd->last 为最后一个分量 "hello.c"
| | | | | | s = open_last_lookups(nd, file, op)
| | | | | | | dentry = lookup_open(nd, file, op, got_write);
| | | | | | | nd->path.dentry = dentry;
| | | | | | do_open(nd, file, ..); // open()的最后一步
| | | | | | | vfs_open(const struct path *path, struct file *file)
| | | | | | | | dentry = path->dentry
| | | | | | | | inode = d_backing_inode(dentry)
| | | | | | | | | inode = dentry->d_inode
| | | | | | | | do_dentry_open
| | | | | | | | | open = inode->i_fop->open // 最终的 open 接口
| | | | fd_install(fd, f);
备注:由于 Linux 文件系统的设计过于复杂,目前作者还没有清晰地整理出来,待完善。
2.2. 目录树初始化流程
文件系统要能够链接到目录树才能被我们使用,将文件系统与目录树结合的动作我们称为挂载。
Linux 操作系统目录树的大概初始化流程如下所示:
start_kernel [init/main.c]
| | vfs_caches_init [fs/dcache.c]
| | | mnt_init [fs/namespace.c]
| | | | sysfs_init [fs/sysfs/mount.c]
| | | | | register_filesystem(&sysfs_fs_type) [fs/filesystems.c]
| | | | | | find_filesystem
| | | | init_rootfs [init/do_mounts.c]
| | | | init_mount_tree [fs/namespace.c]
| | | | | struct vfsmount *mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL)
| | | | | struct mnt_namespace *ns = alloc_mnt_ns(&init_user_ns, false)
| | | | | struct mount *m = real_mount(mnt)
| | | | | m->mnt_ns = ns
| | | | | ns->root = m
| | | | | ns->mounts = 1
| | | | | list_add(&m->mnt_list, &ns->list)
| | | | | struct path root
| | | | | root.mnt = mnt
| | | | | root.dentry = mnt->mnt_root
| | | | | set_fs_pwd(current->fs, &root)
| | | | | set_fs_root(current->fs, &root)
初始化后,数据结构关系如下所示:
m [struct mount *]
mnt_devname [const char *]
current [struct task_struct *] | = "rootfs"
fs [struct fs_struct *] ,---|-> mnt [struct vfsmount *]
root [struct path] / | mnt_sb [struct super_block *] |
mnt [struct vfsmount *] | = fc->root->d_sb ---|--> fc [fs_context *]
dentry [struct dentry *]| ,--> mnt_root [struct dentry *] | fs_type [struct file_system_type *]
pwd [struct path] `---|-' = dget(fc->root) | = &rootfs_fs_type
| mnt_mountpoint [struct dentry *] | source [const char *]
| = m->mnt.mnt_root | = "rootfs"
| mnt_parent [struct mount *] |
| = m |
| mnt_ns [struct mnt_namespace *] |
| = ns ---|--> ns [struct mnt_namespace *]
| mnt_list [struct list_head] | root [struct mount *]
| \ list_add = m
| `-----<<------------------ list [struct list_head]
2.3. 文件系统注册流程
文件系统挂载前,需要先进行注册,这样在挂载时才能找到文件系统。以 ext2 文件系统为例介绍如下:
// fs/ext2/super.c
/*
 * Filesystem type descriptor for ext2. init_ext2_fs() passes this object
 * to register_filesystem(), which adds it to the kernel's list of file
 * systems under the name given here.
 */
static struct file_system_type ext2_fs_type = {
	.name		= "ext2",
	.owner		= THIS_MODULE,
	.fs_flags	= FS_REQUIRES_DEV,
	.mount		= ext2_mount,
	.kill_sb	= kill_block_super,
};
MODULE_ALIAS_FS("ext2");
/*
 * Module entry point for ext2: set up the inode cache, then register
 * ext2_fs_type. If registration fails, the inode cache is destroyed
 * again before the error is returned.
 *
 * Returns 0 on success, otherwise the error code of the failing step.
 */
static int __init init_ext2_fs(void)
{
	int ret = init_inodecache();

	if (ret)
		return ret;

	ret = register_filesystem(&ext2_fs_type);
	if (ret)
		destroy_inodecache();	/* undo init_inodecache() */

	return ret;
}
module_init(init_ext2_fs)
// fs/filesystems.c
/**
 * register_filesystem - register a new filesystem
 * @fs: the file system structure
 *
 * Links @fs into the list of file systems the kernel knows about, so
 * that mount and other syscalls can use it.
 *
 * The &struct file_system_type passed in becomes part of the kernel's
 * structures and must not be freed until the file system has been
 * unregistered.
 *
 * Return: 0 on success; -EINVAL if @fs->parameters is set and
 * fs_validate_description() rejects it; -EBUSY if @fs->next is already
 * set or find_filesystem() yields an occupied slot for @fs->name.
 */
int register_filesystem(struct file_system_type * fs)
{
	struct file_system_type **slot;
	int err;

	if (fs->parameters &&
	    !fs_validate_description(fs->name, fs->parameters))
		return -EINVAL;

	/* A '.' in the name trips BUG_ON(). */
	BUG_ON(strchr(fs->name, '.'));

	/* Presumably a non-NULL ->next means @fs is already on a list. */
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	slot = find_filesystem(fs->name, strlen(fs->name));
	if (*slot) {
		err = -EBUSY;
	} else {
		*slot = fs;
		err = 0;
	}
	write_unlock(&file_systems_lock);

	return err;
}
文件系统是作为内核模块存在的,模块初始化函数中会完成文件系统的注册。
注册是将文件系统添加到内核的文件系统列表中,供 mount 或其它系统调用使用。
2.4. 挂载流程
挂载点一定是目录,该目录是进入该文件系统的入口。因此,并不是有了文件系统就能直接使用,必须先将其挂载到目录树的某个目录上,才能使用该文件系统。
mount(...)
----------------------------------------------------------
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, ...) [fs/namespace.c]
| | do_mount(kernel_dev, dir_name, kernel_type, flags, options) [fs/namespace.c]
| | | struct path path;
| | | user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path) [include/linux/namei.h]
| | | | user_path_at_empty [fs/namei.c]
| | | | | filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root)
| | | | | | path_lookupat
| | | path_mount(dev_name, &path, kernel_type, flags, options) [fs/namespace.c]
| | | | do_new_mount(path, kernel_type, sb_flags, mnt_flags, dev_name, options)
| | | | | | struct file_system_type *type;
| | | | | | struct fs_context *fc;
| | | | | | type = get_fs_type(kernel_type) [fs/filesystems.c]
| | | | | | | __get_fs_type
| | | | | | | | find_filesystem(const char *name, unsigned len) // 查找文件系统
| | | | | | fc = fs_context_for_mount(type, sb_flags) [fs/fs_context.c]
| | | | | | | alloc_fs_context
| | | | | | do_new_mount_fc(fc, path, mnt_flags) [fs/namespace.c]
| | | | | | | struct vfsmount *mnt;
| | | | | | | struct mountpoint *mp;
| | | | | | | mnt = vfs_create_mount(fc);
| | | | | | | mp = lock_mount(path);
| | | | | | | | lookup_mnt
| | | | | | | do_add_mount(real_mount(mnt), mp, path, mnt_flags) [fs/namespace.c]
| | | | | | | | struct mount *newmnt = real_mount(mnt)
| | | | | | | | struct mount *parent = real_mount(path->mnt)
| | | | | | | | graft_tree(newmnt, parent, mp)
| | | | | | | | | attach_recursive_mnt(struct mount *source_mnt, struct mount *dest_mnt, struct mountpoint *dest_mp, ..)
待完善