文件系统操作
首先,我们从标准库用来与内核通信的系统调用来研究。尽管文件操作对所有应用程序来说都属于标准功能,但对文件系统的操作只限于几个系统程序,即用于装载和卸载文件系统的mount
和umount
程序。
注册文件系统
注册到内核的文件系统,既可以是编译为模块的,也可以是持久编译到内核中的。fs/super.c
中的register_filesystem
用来向内核注册文件系统。一个文件系统不能注册两次。
用于描述文件系统的结构源码如下
/*
 * Describes one filesystem type; this is the structure handed to
 * register_filesystem().
 */
struct file_system_type
{
const char *name;// name of the filesystem
int fs_flags; // flag bits for this filesystem type (presumably the FS_* values defined below - confirm)
// the filesystem must reside on a physical device
#define FS_REQUIRES_DEV 1
// the filesystem takes binary mount data rather than a string; NFS uses this kind of mount data
#define FS_BINARY_MOUNTDATA 2
// the filesystem has subtypes; the most common case is FUSE, where different filesystems are implemented through the FUSE interface and are told apart by subtype
#define FS_HAS_SUBTYPE 4
// the next three flags concern mounts made from a user namespace (see the per-flag notes)
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */
#define FS_USERNS_VISIBLE 32 /* FS must already be visible */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);// callback that is eventually invoked when a user mounts a filesystem of this type through sys_mount
// NOTE(review): looks like a variant of mount() that also receives the vfsmount - confirm
struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int,
const char *, void *);
void *(*alloc_mnt_data) (void);
void (*kill_sb) (struct super_block *);// tears down the in-memory superblock; used when the filesystem is unmounted
struct module *owner; // module that implements this filesystem, normally the THIS_MODULE macro
struct file_system_type * next; // next filesystem type in the list of filesystem types
struct hlist_head fs_supers; // head of the list linking every superblock of this filesystem type
// lock class keys (NOTE(review): presumably consumed by lockdep - confirm)
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
};
register_filesystem函数
/**
 * register_filesystem - register a new filesystem
 * @fs: the file system structure
 *
 * Links @fs into the kernel's list of known filesystem types so that it
 * can be used by mount and other syscalls.
 *
 * Return: 0 on success, or -EBUSY when @fs is already linked into a list
 * (fs->next is set) or when the slot that find_filesystem() returns for
 * fs->name is already occupied.
 *
 * The &struct file_system_type that is passed is linked into the kernel
 * structures and must not be freed until the file system has been
 * unregistered.
 */
int register_filesystem(struct file_system_type * fs)
{
	struct file_system_type **slot;
	int err = -EBUSY;

	/* A '.' in the name is treated as a fatal programming error. */
	BUG_ON(strchr(fs->name, '.'));

	/* A non-NULL next pointer means fs already sits on some list. */
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	slot = find_filesystem(fs->name, strlen(fs->name));
	if (!*slot) {
		/* Slot is free: link the new type in. */
		*slot = fs;
		err = 0;
	}
	write_unlock(&file_systems_lock);

	return err;
}
装载和卸载
目录树的装载和卸载比仅仅注册文件系统复杂得多:后者只需要向一个链表中添加对象,而前者需要对内核的内部数据结构执行很多操作。文件系统的装载由mount系统调用发起
,我们需要阐明在现存目录树中装载新的文件系统必须执行的任务,还需要了解用于描述装载点的数据结构。
- vfsmount结构:采用一种单一的
文件系统层次结构
,新的文件系统可以集成到其中,使用mount可查询目录树中各文件系统的装载情况如下:
vfsmount结构(文件系统层次结构,包含各种文件系统类型)
将文件系统装载到一个目录时,装载点的内容被替换为即将装载的文件系统的内容,但并没有丢失或损坏,只是无法访问到,只要卸载掉装载的文件系统就可以重新访问,这就相当于挂载一个U盘或者sd卡一样。
vfsmount结构描述一个独立文件系统的挂载信息,每个不同挂载点对应一个独立的vfsmount结构,属于同一文件系统的所有目录和文件隶属于同一个vfsmount,该vfsmount结构对应于该文件系统的顶层目录,即挂载目录。
mount源码分析如下:
/*
 * Describes one mounted filesystem instance: where it is mounted, its
 * parent and children, and its membership in the various mount lists.
 */
struct mount {
struct hlist_node mnt_hash;
struct mount *mnt_parent; // parent filesystem that contains the mount point
struct dentry *mnt_mountpoint; // dentry of the mount point inside the parent filesystem
struct vfsmount mnt;
union {
struct rcu_head mnt_rcu;
struct llist_node mnt_llist;
};
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
#else
int mnt_count;
int mnt_writers;
#endif
// list of child filesystems
struct list_head mnt_mounts; /* list of children, anchored here */
// list element used on the parent filesystem's mnt_mounts list
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
// device name, e.g. /dev/dsk/hda1
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
// list element used on a filesystem-specific expiry list
struct list_head mnt_expire; /* link in fs-specific expiry list */
// list element used on the circular list of shared mounts
struct list_head mnt_share; /* circular list of shared mounts */
// list of slave mounts
struct list_head mnt_slave_list;/* list of slave mounts */
// list element used on a list of slave mounts
struct list_head mnt_slave; /* slave list entry */
// points to the master mount; the slave sits on master->mnt_slave_list
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
// namespace this mount belongs to
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct hlist_head mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct fs_pin mnt_umount;
struct dentry *mnt_ex_mountpoint;
};
文件系统之间的父子关系由上面讲述的两个成员所实现的链表表示:mnt_mounts表头是子文件系统链表的起点,而mnt_child字段则用作该链表的元素。
系统当中的每个vfsmount实例,通过两种途径标识:
1.一个命名空间的所有装载的文件系统都保存在namespace->list
链表中;
2.使用vfsmount
的mnt_list
成员作为链表元素。
超级块管理
在装载新的文件系统时,vfsmount并不是唯一需要在内存中创建的结构,装载操作开始于超级块的读取
。
struct super_block {
//将该成员设置于起始位置
struct list_head s_list; /* Keep this first */
//搜索索引,不是kdev_t
dev_t s_dev; /* search index; _not_ kdev_t */
unsigned char s_blocksize_bits;
unsigned long s_blocksize;
loff_t s_maxbytes; /* Max file size 最大文件长度*/
struct file_system_type *s_type;
const struct super_operations *s_op; //指向一个包含函数指针的结构,提供接口用于处理超级块的相关操作,操作的实现必须由底层文件系统的代码实现。
const struct dquot_operations *dq_op;
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
unsigned long s_flags;
unsigned long s_iflags; /* internal SB_I_* flags */
unsigned long s_magic;
struct dentry *s_root;
struct rw_semaphore s_umount;
int s_count;
atomic_t s_active;
#ifdef CONFIG_SECURITY
void *s_security;
#endif
const struct xattr_handler **s_xattr;
const struct fscrypt_operations *s_cop;
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
struct hlist_node s_instances;
unsigned int s_quota_types; /* Bitmask of supported quota types */
struct quota_info s_dquot; /* Diskquota specific options */
struct sb_writers s_writers;
char s_id[32]; /* Informational name */
u8 s_uuid[16]; /* UUID */
void *s_fs_info; /* Filesystem private info 文件系统私有信息*/
unsigned int s_max_links;
fmode_t s_mode;
/* Granularity of c/m/atime in ns.
Cannot be worse than a second */
u32 s_time_gran;
file_system_type
/*
* The next field is for VFS *only*. No filesystems have any business
* even looking at it. You had been warned.
*/
struct mutex s_vfs_rename_mutex; /* Kludge */
/*
* Filesystem subtype. If non-empty the filesystem type field
* in /proc/mounts will be "type.subtype"
*/
char *s_subtype;
/*
* Saved mount options for lazy filesystems using
* generic_show_options()
*/
char __rcu *s_options;
const struct dentry_operations *s_d_op; /* default d_op for dentries */
/*
* Saved pool identifier for cleancache (-1 means none)
*/
int cleancache_poolid;
struct shrinker s_shrink; /* per-sb shrinker handle */
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
/* Being remounted read-only */
int s_readonly_remount;
/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
/*
* Keep the lru lists last in the structure so they always sit on their
* own individual cachelines.
*/
struct list_lru s_dentry_lru ____cacheline_aligned_in_smp;
struct list_lru s_inode_lru ____cacheline_aligned_in_smp;
struct rcu_head rcu;
struct work_struct destroy_work;
struct mutex s_sync_lock; /* sync serialisation lock */
/*
* Indicates how deep in a filesystem stack this SB is
*/
int s_stack_depth;
/* s_inode_list_lock protects s_inodes */
spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
struct list_head s_inodes; /* all inodes 所有的inode的链表*/
};
const struct super_operations *s_op; //指向一个包含函数指针的结构,提供接口用于处理超级块的相关操作,操作的实现必须由底层文件系统的代码实现。
包含读写inode,删除inode等
/*
 * Table of function pointers for operating on a superblock (reading,
 * writing and deleting inodes, among others). The implementations are
 * supplied by the underlying filesystem.
 */
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
// releases an inode (original note: removes it from memory and from the backing store - verify)
void (*destroy_inode)(struct inode *);
// marks the given inode as "dirty", i.e. modified
void (*dirty_inode) (struct inode *, int flags);
int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_super) (struct super_block *);
int (*freeze_fs) (struct super_block *);
int (*thaw_super) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
// NOTE(review): the *2 variants below additionally take the vfsmount - confirm intended use
int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *);
void *(*clone_mnt_data) (void *);
void (*copy_mnt_data) (void *, void *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *);
int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
struct dquot **(*get_dquots)(struct inode *);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
long (*nr_cached_objects)(struct super_block *,
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
};
mount系统调用
mount系统调用的入口点是sys_mount函数,它在完成参数处理后把实际的装载工作交给下面的do_mount函数:
/*
 * do_mount - resolve the mount point, translate the MS_* flags into
 * per-mount MNT_* flags, and dispatch to the handler for the requested
 * kind of mount operation (remount, bind, propagation-type change, move,
 * or a new mount). Returns 0 or a negative error from the failing step.
 *
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
struct path path;
int retval = 0;
int mnt_flags = 0;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
/* Force a terminating NUL into the last byte of the data page. */
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* ... and get the mountpoint */
retval = user_path(dir_name, &path);
if (retval)
return retval;
retval = security_sb_mount(dev_name, &path,
type_page, flags, data_page);
/* Permission check only matters if the security hook did not already fail. */
if (!retval && !may_mount())
retval = -EPERM;
if (retval)
goto dput_out;
/* Default to relatime unless overridden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
/* Separate the per-mountpoint flags */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
/* strictatime clears both relatime and noatime again */
if (flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
/* The default atime for remount is preservation */
if ((flags & MS_REMOUNT) &&
((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
MS_STRICTATIME)) == 0)) {
mnt_flags &= ~MNT_ATIME_MASK;
mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
}
/* Strip these bits from flags before dispatching to the handlers below. */
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME);
if (flags & MS_REMOUNT) // change the options of an already mounted filesystem
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND) // bind mount: mount a filesystem through the loopback path
retval = do_loopback(&path, dev_name, flags & MS_REC);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);// handles shared, private, slave and unbindable mounts: changes the mount flags or sets up the required links between the vfsmount instances involved
else if (flags & MS_MOVE)
retval = do_move_mount(&path, dev_name); // move an already mounted filesystem
else
// ordinary mount of a new filesystem
retval = do_new_mount(&path, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
/* Drop the reference on the mount point path taken by user_path(). */
path_put(&path);
return retval;
}
共享子树(shared subtrees)
共享子树最核心的特征是允许挂载和卸载事件以一种自动的,可控的方式在不同的namespace
间传递(propagation
)。这就意味着,在一个命名空间中挂载光盘时,也会触发其他namespace对同一张光盘的挂载
。
在共享子树中,每个挂载点都存在一个名为传递类型(propagation type
)的标记,该标记决定了一个namespace
中创建或者删除的挂载点是否会传递到其他的namespaces
。
共享子树有四种传递类型:
- MS_SHARED:该挂载点和它的对等组(peer group)中的其他挂载点共享挂载和卸载事件。
- MS_PRIVATE:和共享挂载相反,标记为private的挂载点上的事件不会传递到任何对等组,挂载操作默认使用此标志。
- MS_SLAVE:这个传递类型介于shared和private之间。一个slave mount拥有一个master(一个共享的对等组),master上的挂载和卸载事件会传递给slave mount,但slave mount不能将事件传递给master mount。
- MS_UNBINDABLE:该挂载点是不可绑定的:它和private一样不参与事件传递,并且不能作为绑定挂载(bind mount)的源。