The code in this article is based on Linux 5.10.
On Linux, a mount is issued either through the mount command or through the mount APIs; this article walks through how the mount call is implemented inside the kernel.
Data structures
fs_context
fs_context is the central data structure of the mount flow. It is defined as follows:
include/linux/fs_context.h
struct fs_context {
const struct fs_context_operations *ops;
struct mutex uapi_mutex; /* Userspace access mutex */
struct file_system_type *fs_type;
void *fs_private; /* The filesystem's context */
void *sget_key;
struct dentry *root; /* The root and superblock */
struct user_namespace *user_ns; /* The user namespace for this mount */
struct net *net_ns; /* The network namespace for this mount */
const struct cred *cred; /* The mounter's credentials */
struct p_log log; /* Logging buffer */
const char *source; /* The source name (eg. dev path) */
void *security; /* Linux S&M options */
void *s_fs_info; /* Proposed s_fs_info */
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
unsigned int sb_flags_mask; /* Superblock flags that were changed */
unsigned int s_iflags; /* OR'd with sb->s_iflags */
unsigned int lsm_flags; /* Information flags from the fs to the LSM */
enum fs_context_purpose purpose:8;
enum fs_context_phase phase:8; /* The phase the context is in */
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
bool oldapi:1; /* Coming from mount(2) */
};
The kernel's own comment on this structure reads:
/*
* Filesystem context for holding the parameters used in the creation or
* reconfiguration of a superblock.
*
* Superblock creation fills in ->root whereas reconfiguration begins with this
* already set.
*
* See Documentation/filesystems/mount_api.rst
*/
My understanding is that this structure is the bridge between file_system_type and super_block, and it drives the whole mount flow.
fs_type: the file_system_type being mounted
ops: the important one; it points to a fs_context_operations table and is normally assigned in the filesystem's init_fs_context callback (a minimal sketch follows the struct below)
struct fs_context_operations {
void (*free)(struct fs_context *fc);
int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
int (*parse_monolithic)(struct fs_context *fc, void *data);
int (*get_tree)(struct fs_context *fc);
int (*reconfigure)(struct fs_context *fc);
};
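A minimal sketch of such an init_fs_context hook, assuming a made-up filesystem (all example_* names are invented; exfat's real hook in fs/exfat/super.c has the same shape, allocating its exfat_sb_info and installing its own ops table):

#include <linux/fs_context.h>
#include <linux/slab.h>

/* Hypothetical per-mount state and hooks, assumed to be implemented elsewhere. */
struct example_fs_info { unsigned int opts; };
static int example_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int example_get_tree(struct fs_context *fc);
static void example_free_fc(struct fs_context *fc);

static const struct fs_context_operations example_context_ops = {
	.parse_param	= example_parse_param,
	.get_tree	= example_get_tree,
	.free		= example_free_fc,
};

/* Wired up via struct file_system_type .init_fs_context */
static int example_init_fs_context(struct fs_context *fc)
{
	struct example_fs_info *info;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	fc->s_fs_info = info;		/* later handed over to sb->s_fs_info */
	fc->ops = &example_context_ops;	/* drives the rest of the mount flow */
	return 0;
}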
Processing flow
The overall call stack of a mount looks like this; we will now go through it frame by frame:
#0 exfat_fill_super (sb=0xffff888004865000, fc=0xffff888003053d80) at fs/exfat/super.c:599
#1 0xffffffff8120a2e9 in get_tree_bdev (fc=0xffff888003053d80, fill_super=0xffffffff813232e5 <exfat_fill_super>) at fs/super.c:1344
#2 0xffffffff813236eb in exfat_get_tree (fc=0xffff888003053d80) at fs/exfat/super.c:696
#3 0xffffffff8120915c in vfs_get_tree (fc=fc@entry=0xffff888003053d80) at fs/super.c:1549
#4 0xffffffff8122a997 in do_new_mount (data=0x0 <fixed_percpu_data>, name=0xffff8880032794a0 "/dev/loop0", mnt_flags=32, sb_flags=<optimized out>, fstype=0x20 <fixed_percpu_data+32> <error: Cannot access memory at address 0x20>, path=0xffffc90000183ec8) at fs/namespace.c:2875
#5 path_mount (dev_name=dev_name@entry=0xffff8880032794a0 "/dev/loop0", path=path@entry=0xffffc90000183ec8, type_page=type_page@entry=0xffff8880032d1c78 "exfat", flags=<optimized out>, flags@entry=32768, data_page=data_page@entry=0x0 <fixed_percpu_data>) at fs/namespace.c:3205
#6 0xffffffff8122ae10 in do_mount (dev_name=dev_name@entry=0xffff8880032794a0 "/dev/loop0", dir_name=dir_name@entry=0x7ffd4ff80f31 "/mnt", type_page=type_page@entry=0xffff8880032d1c78 "exfat", flags=flags@entry=32768, data_page=data_page@entry=0x0 <fixed_percpu_data>) at fs/namespace.c:3218
#7 0xffffffff8122b246 in __do_sys_mount (data=<optimized out>, flags=32768, type=<optimized out>, dir_name=0x7ffd4ff80f31 "/mnt", dev_name=<optimized out>) at fs/namespace.c:3426
#8 __se_sys_mount (data=<optimized out>, flags=32768, type=<optimized out>, dir_name=140725945110321, dev_name=<optimized out>) at fs/namespace.c:3403
#9 __x64_sys_mount (regs=<optimized out>) at fs/namespace.c:3403
#10 0xffffffff819bf903 in do_syscall_64 (nr=<optimized out>, regs=0xffffc90000183f58) at arch/x86/entry/common.c:46
#11 0xffffffff81a0007c in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120
#12 0x0000000000000000 in ?? ()
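For context, the stack above was captured while mounting an exfat image attached to /dev/loop0 onto /mnt. In userspace that corresponds roughly to the snippet below (the losetup step and image name are assumptions; the device, target, fstype and the flags value 32768, i.e. MS_SILENT, are taken from the trace):

/*
 * Roughly what `mount -t exfat /dev/loop0 /mnt` boils down to, after
 * something like `losetup /dev/loop0 exfat.img` (assumed setup).
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* MS_SILENT == 32768, matching flags@entry=32768 in frame #5 */
	if (mount("/dev/loop0", "/mnt", "exfat", MS_SILENT, NULL) == -1) {
		perror("mount");
		return 1;
	}
	return 0;
}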
Entry function
Below is the definition of the Linux mount system call; every mount(2) call lands here, and the heavy lifting is delegated to do_mount.
fs/namespace.c
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
int ret;
char *kernel_type;
char *kernel_dev;
void *options;
kernel_type = copy_mount_string(type);
ret = PTR_ERR(kernel_type);
if (IS_ERR(kernel_type))
goto out_type;
kernel_dev = copy_mount_string(dev_name);
ret = PTR_ERR(kernel_dev);
if (IS_ERR(kernel_dev))
goto out_dev;
options = copy_mount_options(data);
ret = PTR_ERR(options);
if (IS_ERR(options))
goto out_data;
ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
kfree(options);
out_data:
kfree(kernel_dev);
out_dev:
kfree(kernel_type);
out_type:
return ret;
}
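The copy helpers are thin wrappers: copy_mount_string() is essentially a strndup_user() capped at PATH_MAX (passing NULL through unchanged), while copy_mount_options() copies at most one page of the option blob into kernel memory. The former, quoted from fs/namespace.c in 5.10:

static char *copy_mount_string(const void __user *data)
{
	return data ? strndup_user(data, PATH_MAX) : NULL;
}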
do_mount essentially just calls path_mount; path_mount derives sb_flags and mnt_flags from the userspace flags and then calls do_new_mount.
fs/namespace.c
int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page)
{
unsigned int mnt_flags = 0, sb_flags;
int ret;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
if (flags & MS_NOUSER)
return -EINVAL;
ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
if (ret)
return ret;
if (!may_mount())
return -EPERM;
if ((flags & SB_MANDLOCK) && !may_mandlock())
return -EPERM;
/* Default to relatime unless overriden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
/* Separate the per-mountpoint flags */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
if (flags & MS_NOSYMFOLLOW)
mnt_flags |= MNT_NOSYMFOLLOW;
/* The default atime for remount is preservation */
if ((flags & MS_REMOUNT) &&
((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
MS_STRICTATIME)) == 0)) {
mnt_flags &= ~MNT_ATIME_MASK;
mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
}
sb_flags = flags & (SB_RDONLY |
SB_SYNCHRONOUS |
SB_MANDLOCK |
SB_DIRSYNC |
SB_SILENT |
SB_POSIXACL |
SB_LAZYTIME |
SB_I_VERSION);
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
return do_reconfigure_mnt(path, mnt_flags);
if (flags & MS_REMOUNT)
return do_remount(path, flags, sb_flags, mnt_flags, data_page);
if (flags & MS_BIND)
return do_loopback(path, dev_name, flags & MS_REC);
if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
return do_change_type(path, flags);
if (flags & MS_MOVE)
return do_move_mount_old(path, dev_name);
return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
data_page);
}
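A quick cross-check against the trace above: that mount came in with flags = 32768, which is just MS_SILENT. No atime bit is set, so mnt_flags becomes MNT_RELATIME (0x20, matching mnt_flags=32 in frame #4), sb_flags keeps only SB_SILENT, and since none of the MS_REMOUNT/MS_BIND/MS_MOVE/propagation bits are set, control falls through to do_new_mount().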
Allocating the fs_context
do_new_mount is the key function of this stage: this is where the fs_context is allocated.
fs/namespace.c
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
*/
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
struct fs_context *fc;
const char *subtype = NULL;
int err = 0;
if (!fstype)
return -EINVAL;
type = get_fs_type(fstype); /* 1 */
if (!type)
return -ENODEV;
if (type->fs_flags & FS_HAS_SUBTYPE) {
subtype = strchr(fstype, '.');
if (subtype) {
subtype++;
if (!*subtype) {
put_filesystem(type);
return -EINVAL;
}
}
}
fc = fs_context_for_mount(type, sb_flags); /* 2 */
put_filesystem(type);
if (IS_ERR(fc))
return PTR_ERR(fc);
if (subtype)
err = vfs_parse_fs_string(fc, "subtype",
subtype, strlen(subtype));
if (!err && name)
err = vfs_parse_fs_string(fc, "source", name, strlen(name));
if (!err)
err = parse_monolithic_mount_data(fc, data);
if (!err && !mount_capable(fc))
err = -EPERM;
if (!err)
err = vfs_get_tree(fc); /* 3 */
if (!err)
err = do_new_mount_fc(fc, path, mnt_flags);
put_fs_context(fc);
return err;
}
- (1) Look up the file_system_type that matches fstype.
- (2) Initialise the fc structure. fs_context_for_mount() boils down to alloc_fs_context(), which calls the filesystem's own init_fs_context callback; if fc->fs_type->init_fs_context is not defined, legacy_init_fs_context is used instead, which sets fc->ops = &legacy_fs_context_ops, whose legacy_get_tree in turn calls fc->fs_type->mount (the old mount API).
- (3) Call vfs_get_tree, which ends up in fc->ops->get_tree. For exfat this is exfat_get_tree (see below), which simply calls get_tree_bdev and passes exfat_fill_super as the callback that fills in the super_block.
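Two small pieces of code make steps (2) and (3) concrete: the fallback to the legacy ops is chosen in alloc_fs_context() (fs/fs_context.c, abridged to the relevant lines), and exfat's get_tree hook is a one-liner that forwards to get_tree_bdev(), as frames #1/#2 of the call stack already showed.

/* abridged from alloc_fs_context(), fs/fs_context.c */
	init_fs_context = fc->fs_type->init_fs_context;
	if (!init_fs_context)
		init_fs_context = legacy_init_fs_context;
	ret = init_fs_context(fc);

/* fs/exfat/super.c */
static int exfat_get_tree(struct fs_context *fc)
{
	return get_tree_bdev(fc, exfat_fill_super);
}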
Allocating the super_block
The super_block is obtained inside get_tree_bdev; the main flow is as follows:
/**
* get_tree_bdev - Get a superblock based on a single block device
* @fc: The filesystem context holding the parameters
* @fill_super: Helper to initialise a new superblock
*/
int get_tree_bdev(struct fs_context *fc,
int (*fill_super)(struct super_block *,
struct fs_context *))
{
struct block_device *bdev;
struct super_block *s;
fmode_t mode = FMODE_READ | FMODE_EXCL;
int error = 0;
if (!(fc->sb_flags & SB_RDONLY))
mode |= FMODE_WRITE;
if (!fc->source)
return invalf(fc, "No source specified");
bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type);
if (IS_ERR(bdev)) {
errorf(fc, "%s: Can't open blockdev", fc->source);
return PTR_ERR(bdev);
}
/* sget_fc() below runs under bd_fsfreeze_mutex so we don't mount a frozen device */
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
blkdev_put(bdev, mode);
return -EBUSY;
}
fc->sb_flags |= SB_NOSEC;
fc->sget_key = bdev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); /* 1 */
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (IS_ERR(s)) {
blkdev_put(bdev, mode);
return PTR_ERR(s);
}
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
warnf(fc, "%pg: Can't mount, would change RO state", bdev);
deactivate_locked_super(s);
blkdev_put(bdev, mode);
return -EBUSY;
}
/*
* s_umount nests inside bd_mutex during
* __invalidate_device(). blkdev_put() acquires
* bd_mutex and can't be called under s_umount. Drop
* s_umount temporarily. This is safe as we're
* holding an active reference.
*/
up_write(&s->s_umount);
blkdev_put(bdev, mode);
down_write(&s->s_umount);
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, fc); /* 2 */
if (error) {
deactivate_locked_super(s);
return error;
}
s->s_flags |= SB_ACTIVE;
bdev->bd_super = s;
}
BUG_ON(fc->root);
fc->root = dget(s->s_root);
return 0;
}
(1) sget_fc() either finds an existing super_block already bound to this block device or allocates and initialises a new one.
(2) For a freshly allocated superblock, the fill_super callback passed in by the filesystem is invoked. This is where the filesystem-specific work happens: typically the on-disk metadata is parsed and stored in the filesystem's private structures.
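For completeness, the test/set pair handed to sget_fc() is tiny (fs/super.c); set_bdev_super() behind the wrapper records the device in s->s_bdev and s->s_dev:

static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
	return s->s_bdev == fc->sget_key;
}

static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
	return set_bdev_super(s, fc->sget_key);
}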
Filling in the super_block via the callback
exfat's fill_super implementation is shown below:
fs/exfat/super.c
static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct exfat_sb_info *sbi = sb->s_fs_info;
struct exfat_mount_options *opts = &sbi->options;
struct inode *root_inode;
int err;
if (opts->allow_utime == (unsigned short)-1)
opts->allow_utime = ~opts->fs_dmask & 0022;
if (opts->discard) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
if (!blk_queue_discard(q)) {
exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
opts->discard = 0;
}
}
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = EXFAT_SUPER_MAGIC;
sb->s_op = &exfat_sops;
sb->s_time_gran = 10 * NSEC_PER_MSEC;
sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
err = __exfat_fill_super(sb); /* 1 */
if (err) {
exfat_err(sb, "failed to recognize exfat type");
goto check_nls_io;
}
/* set up enough so that it can read an inode */
exfat_hash_init(sb);
if (!strcmp(sbi->options.iocharset, "utf8"))
opts->utf8 = 1;
else {
sbi->nls_io = load_nls(sbi->options.iocharset);
if (!sbi->nls_io) {
exfat_err(sb, "IO charset %s not found",
sbi->options.iocharset);
err = -EINVAL;
goto free_table;
}
}
if (sbi->options.utf8)
sb->s_d_op = &exfat_utf8_dentry_ops;
else
sb->s_d_op = &exfat_dentry_ops;
root_inode = new_inode(sb);
if (!root_inode) {
exfat_err(sb, "failed to allocate root inode");
err = -ENOMEM;
goto free_table;
}
root_inode->i_ino = EXFAT_ROOT_INO;
inode_set_iversion(root_inode, 1);
err = exfat_read_root(root_inode);
if (err) {
exfat_err(sb, "failed to initialize root inode");
goto put_inode;
}
exfat_hash_inode(root_inode, EXFAT_I(root_inode)->i_pos);
insert_inode_hash(root_inode);
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
exfat_err(sb, "failed to get the root dentry");
err = -ENOMEM;
goto put_inode;
}
return 0;
put_inode:
iput(root_inode);
sb->s_root = NULL;
free_table:
exfat_free_upcase_table(sbi);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
check_nls_io:
unload_nls(sbi->nls_io);
exfat_free_iocharset(sbi);
sb->s_fs_info = NULL;
kfree(sbi);
return err;
}
It does two main things:
(1) Read the exfat on-disk metadata, parse it, and store the result in the exfat_sb_info structure (__exfat_fill_super).
(2) Set up the important super_block fields, such as s_op, s_root and s_d_op.
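The same pattern (fill the generic super_block fields, install s_op, build the root dentry) appears in every fill_super. A distilled, hypothetical sketch for a made-up "examplefs" that leans on the kernel's libfs helpers instead of real on-disk parsing:

#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/pagemap.h>

/* examplefs and its magic number are invented; simple_*, generic_delete_inode
 * and d_make_root are real VFS/libfs helpers. */
static const struct super_operations examplefs_sops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,
};

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct inode *root;

	/* (1) generic super_block fields */
	sb->s_magic		= 0x6578616d;	/* made-up magic */
	sb->s_blocksize		= PAGE_SIZE;
	sb->s_blocksize_bits	= PAGE_SHIFT;
	sb->s_op		= &examplefs_sops;
	sb->s_d_op		= &simple_dentry_operations;
	sb->s_time_gran		= 1;

	/* (2) root inode + root dentry */
	root = new_inode(sb);
	if (!root)
		return -ENOMEM;
	root->i_ino = 1;
	inode_init_owner(root, NULL, S_IFDIR | 0755);
	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
	root->i_op  = &simple_dir_inode_operations;
	root->i_fop = &simple_dir_operations;

	sb->s_root = d_make_root(root);	/* drops the inode on failure */
	if (!sb->s_root)
		return -ENOMEM;
	return 0;
}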
Attaching to the global mount tree
Once vfs_get_tree has produced the superblock, do_new_mount calls do_new_mount_fc to add the new mount instance to the system.
The relevant part is the tail of do_new_mount (fs/namespace.c), which was already shown in full above:
if (!err)
err = vfs_get_tree(fc); /* 3 */
if (!err)
err = do_new_mount_fc(fc, path, mnt_flags); /* 4 */
put_fs_context(fc);
return err;
The work is done by do_new_mount_fc, which creates the new mount instance and attaches it to the system. Quite a few data structures are involved here (struct mount, struct mountpoint, the mount namespace), and their relationships are rather tangled; I have not fully sorted them out yet.
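For reference, this is roughly what do_new_mount_fc() looks like in 5.10 (fs/namespace.c): vfs_create_mount() turns the configured fs_context into a struct mount, lock_mount() resolves and pins the mountpoint, and do_add_mount() grafts the new mount onto the namespace's mount tree.

static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
			   unsigned int mnt_flags)
{
	struct vfsmount *mnt;
	struct mountpoint *mp;
	struct super_block *sb = fc->root->d_sb;
	int error;

	error = security_sb_kern_mount(sb);
	if (!error && mount_too_revealing(sb, &mnt_flags))
		error = -EPERM;

	if (unlikely(error)) {
		fc_drop_locked(fc);
		return error;
	}

	up_write(&sb->s_umount);

	mnt = vfs_create_mount(fc);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	mnt_warn_timestamp_expiry(mountpoint, mnt);

	mp = lock_mount(mountpoint);
	if (IS_ERR(mp)) {
		mntput(mnt);
		return PTR_ERR(mp);
	}
	error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
	unlock_mount(mp);
	if (error < 0)
		mntput(mnt);
	return error;
}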