文件系统是对一个存储设备上的数据和元数据进行组织的机制。 Linux 文件系统接口实现为分层的体系结构,从而将用户接口层、文件系统实现和操作存储设备的驱动程序分隔开。另一种看待文件系统的方式是把它看作一个协议。网络协议(比如 IP)规定了互联网上传输的数据流的意义,同样,文件系统会给出特定存储媒体上数据的意义。
Linux 文件系统体系结构是一个对复杂系统进行抽象化的有趣例子。通过使用一组通用的 API 函数,Linux 可以在许多种存储设备上支持许多种文件系统。例如,read
函数调用可以从指定的文件描述符读取一定数量的字节。read
函数不了解文件系统的类型,比如 ext3 或 NFS。它也不了解文件系统所在的存储媒体,比如 ATAPI磁盘、SAS磁盘或 SATA磁盘。但是,当通过调用 read
函数读取一个文件时,数据会正常返回。
尽管大多数文件系统代码在内核中,但是图 1 所示的体系结构显示了用户空间和内核中与文件系统相关的主要组件之间的关系。
用户空间包含一些应用程序(例如,文件系统的使用者)和 GNU C 库(glibc),它们为文件系统调用(打开、读取、写和关闭)提供用户接口。系统调用接口的作用就像是交换器,它将系统调用从用户空间发送到内核空间中的适当端点。
VFS 是底层文件系统的主要接口。这个组件导出一组接口,然后将它们抽象到各个文件系统,各个文件系统的行为可能差异很大。有两个针对文件系统对象的缓存(inode 和 dentry)。它们缓存最近使用过的文件系统对象。
每个文件系统实现(比如 ext2、JFS 等等)导出一组通用接口,供 VFS 使用。缓冲区缓存会缓存文件系统和相关块设备之间的请求。例如,对底层设备驱动程序的读写请求会通过缓冲区缓存来传递。这就允许在其中缓存请求,减少访问物理设备的次数,加快访问速度。以最近使用(LRU)列表的形式管理缓冲区缓存。注意,可以使用 sync
命令将缓冲区缓存中的请求发送到存储媒体(迫使所有未写的数据发送到设备驱动程序,进而发送到存储设备)。
Linux 以一组通用对象的角度看待所有文件系统。这些对象是超级块(superblock)、inode、dentry 和文件。超级块在每个文件系统的根上,超级块描述和维护文件系统的状态。文件系统中管理的每个对象(文件或目录)在 Linux 中表示为一个 inode。inode 包含管理文件系统中的对象所需的所有元数据(包括可以在对象上执行的操作)。另一组结构称为 dentry,它们用来实现名称和 inode 之间的映射,有一个目录缓存用来保存最近使用的 dentry。dentry 还维护目录和文件之间的关系,从而支持在文件系统中移动。最后,VFS 文件表示一个打开的文件(保存打开的文件的状态,比如写偏移量等等)。
VFS 作为文件系统接口的根层。VFS 记录当前支持的文件系统以及当前挂装的文件系统。
在注册新的文件系统时,会把这个文件系统和它的相关信息添加到 file_systems 列表中( linux/include/linux/fs.h)。这个列表定义可以支持的文件系统。在命令行上输入 cat /proc/filesystems
,就可以查看这个列表。
linux/include/linux/fs.h
- struct file_system_type {
- const char *name;
- int fs_flags;
- int (*get_sb) (struct file_system_type *, int,
- const char *, void *, struct vfsmount *);
- struct dentry *(*mount) (struct file_system_type *, int,
- const char *, void *);
- void (*kill_sb) (struct super_block *);
- struct module *owner;
- struct file_system_type * next;//形成列表
- struct list_head fs_supers;
- struct lock_class_key s_lock_key;
- struct lock_class_key s_umount_key;
- struct lock_class_key s_vfs_rename_key;
- struct lock_class_key i_lock_key;
- struct lock_class_key i_mutex_key;
- struct lock_class_key i_mutex_dir_key;
- struct lock_class_key i_alloc_sem_key;
- };
struct file_system_type {
const char *name;
int fs_flags;
int (*get_sb) (struct file_system_type *, int,
const char *, void *, struct vfsmount *);
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next;//形成列表
struct list_head fs_supers;
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
struct lock_class_key i_alloc_sem_key;
};
VFS 中维护的另一个结构是挂装的文件系统(见下 )。这个结构提供当前挂装的文件系统(见 linux/include/linux/mount.h)。它链接又超级块结构。
- struct vfsmount {
- struct list_head mnt_hash;
- struct vfsmount *mnt_parent; /* fs we are mounted on */
- struct dentry *mnt_mountpoint; /* dentry of mountpoint */
- struct dentry *mnt_root; /* root of the mounted tree */
- struct super_block *mnt_sb; /* pointer to superblock超级块 */
- #ifdef CONFIG_SMP
- struct mnt_pcp __percpu *mnt_pcp;
- atomic_t mnt_longterm; /* how many of the refs are longterm */
- #else
- int mnt_count;
- int mnt_writers;
- #endif
- struct list_head mnt_mounts; /* list of children, anchored here */
- struct list_head mnt_child; /* and going through their mnt_child */
- int mnt_flags;
- /* 4 bytes hole on 64bits arches without fsnotify */
- #ifdef CONFIG_FSNOTIFY
- __u32 mnt_fsnotify_mask;
- struct hlist_head mnt_fsnotify_marks;
- #endif
- const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
- struct list_head mnt_list;
- struct list_head mnt_expire; /* link in fs-specific expiry list */
- struct list_head mnt_share; /* circular list of shared mounts */
- struct list_head mnt_slave_list;/* list of slave mounts */
- struct list_head mnt_slave; /* slave list entry */
- struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
- struct mnt_namespace *mnt_ns; /* containing namespace */
- int mnt_id; /* mount identifier */
- int mnt_group_id; /* peer group identifier */
- int mnt_expiry_mark; /* true if marked for expiry */
- int mnt_pinned;
- int mnt_ghosts;
- };
struct vfsmount {
struct list_head mnt_hash;
struct vfsmount *mnt_parent; /* fs we are mounted on */
struct dentry *mnt_mountpoint; /* dentry of mountpoint */
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock超级块 */
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
atomic_t mnt_longterm; /* how many of the refs are longterm */
#else
int mnt_count;
int mnt_writers;
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
int mnt_flags;
/* 4 bytes hole on 64bits arches without fsnotify */
#ifdef CONFIG_FSNOTIFY
__u32 mnt_fsnotify_mask;
struct hlist_head mnt_fsnotify_marks;
#endif
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */
struct list_head mnt_slave; /* slave list entry */
struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
};
超级块结构表示一个文件系统。它包含管理文件系统所需的信息,包括文件系统名称(比如 ext2)、文件系统的大小和状态、块设备的引用和元数据信息(比如空闲列表等等)。超级块通常存储在存储媒体上,但是如果超级块不存在,也可以实时创建它。
/include/linux/fs.h
- struct super_block {
- struct list_head s_list; /* Keep this first */
- dev_t s_dev; /* search index; _not_ kdev_t */
- unsigned char s_dirt;
- unsigned char s_blocksize_bits;
- unsigned long s_blocksize;
- loff_t s_maxbytes; /* Max file size */
- struct file_system_type *s_type;
- const struct super_operations *s_op;//超级块操作,地位很重要
- const struct dquot_operations *dq_op;
- const struct quotactl_ops *s_qcop;
- const struct export_operations *s_export_op;
- unsigned long s_flags;
- unsigned long s_magic;
- struct dentry *s_root;
- struct rw_semaphore s_umount;
- struct mutex s_lock;
- int s_count;
- atomic_t s_active;
- #ifdef CONFIG_SECURITY
- void *s_security;
- #endif
- const struct xattr_handler **s_xattr;
- struct list_head s_inodes; /* all inodes */
- struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
- #ifdef CONFIG_SMP
- struct list_head __percpu *s_files;
- #else
- struct list_head s_files;
- #endif
- /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
- struct list_head s_dentry_lru; /* unused dentry lru */
- int s_nr_dentry_unused; /* # of dentry on lru */
- struct block_device *s_bdev;
- struct backing_dev_info *s_bdi;
- struct mtd_info *s_mtd;
- struct list_head s_instances;
- struct quota_info s_dquot; /* Diskquota specific options */
- int s_frozen;
- wait_queue_head_t s_wait_unfrozen;
- char s_id[32]; /* Informational name */
- void *s_fs_info; /* Filesystem private info */
- fmode_t s_mode;
- /* Granularity of c/m/atime in ns.
- Cannot be worse than a second */
- u32 s_time_gran;
- /*
- * The next field is for VFS *only*. No filesystems have any business
- * even looking at it. You had been warned.
- */
- struct mutex s_vfs_rename_mutex; /* Kludge */
- /*
- * Filesystem subtype. If non-empty the filesystem type field
- * in /proc/mounts will be "type.subtype"
- */
- char *s_subtype;
- /*
- * Saved mount options for lazy filesystems using
- * generic_show_options()
- */
- char __rcu *s_options;
- const struct dentry_operations *s_d_op; /* default d_op for dentries */
- };
struct super_block {
struct list_head s_list; /* Keep this first */
dev_t s_dev; /* search index; _not_ kdev_t */
unsigned char s_dirt;
unsigned char s_blocksize_bits;
unsigned long s_blocksize;
loff_t s_maxbytes; /* Max file size */
struct file_system_type *s_type;
const struct super_operations *s_op;//超级块操作,地位很重要
const struct dquot_operations *dq_op;
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
unsigned long s_flags;
unsigned long s_magic;
struct dentry *s_root;
struct rw_semaphore s_umount;
struct mutex s_lock;
int s_count;
atomic_t s_active;
#ifdef CONFIG_SECURITY
void *s_security;
#endif
const struct xattr_handler **s_xattr;
struct list_head s_inodes; /* all inodes */
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
#ifdef CONFIG_SMP
struct list_head __percpu *s_files;
#else
struct list_head s_files;
#endif
/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
struct list_head s_dentry_lru; /* unused dentry lru */
int s_nr_dentry_unused; /* # of dentry on lru */
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
struct list_head s_instances;
struct quota_info s_dquot; /* Diskquota specific options */
int s_frozen;
wait_queue_head_t s_wait_unfrozen;
char s_id[32]; /* Informational name */
void *s_fs_info; /* Filesystem private info */
fmode_t s_mode;
/* Granularity of c/m/atime in ns.
Cannot be worse than a second */
u32 s_time_gran;
/*
* The next field is for VFS *only*. No filesystems have any business
* even looking at it. You had been warned.
*/
struct mutex s_vfs_rename_mutex; /* Kludge */
/*
* Filesystem subtype. If non-empty the filesystem type field
* in /proc/mounts will be "type.subtype"
*/
char *s_subtype;
/*
* Saved mount options for lazy filesystems using
* generic_show_options()
*/
char __rcu *s_options;
const struct dentry_operations *s_d_op; /* default d_op for dentries */
};
超级块中的一个重要元素是超级块操作的定义。这个结构定义一组用来管理这个文件系统中的 inode 的函数。例如,可以用 alloc_inode
分配 inode,用 destroy_inode
删除 inode。可以用 read_inode
和 write_inode
读写 inode,用 sync_fs
执行文件系统同步。可以在 ./linux/include/linux/fs.h 中找到 super_operations
结构。每个文件系统提供自己的 inode 方法,这些方法实现操作并向 VFS 层提供通用的抽象。
- struct super_operations {
- struct inode *(*alloc_inode)(struct super_block *sb);//分配inode
- void (*destroy_inode)(struct inode *);//销毁inode
- void (*dirty_inode) (struct inode *);
- int (*write_inode) (struct inode *, struct writeback_control *wbc);
- int (*drop_inode) (struct inode *);
- void (*evict_inode) (struct inode *);
- void (*put_super) (struct super_block *);
- void (*write_super) (struct super_block *);
- int (*sync_fs)(struct super_block *sb, int wait);
- int (*freeze_fs) (struct super_block *);
- int (*unfreeze_fs) (struct super_block *);
- int (*statfs) (struct dentry *, struct kstatfs *);
- int (*remount_fs) (struct super_block *, int *, char *);
- void (*umount_begin) (struct super_block *);
- int (*show_options)(struct seq_file *, struct vfsmount *);
- int (*show_stats)(struct seq_file *, struct vfsmount *);
- #ifdef CONFIG_QUOTA
- ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
- ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
- #endif
- int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
- };
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);//分配inode
void (*destroy_inode)(struct inode *);//销毁inode
void (*dirty_inode) (struct inode *);
int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
void (*write_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_fs) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct vfsmount *);
int (*show_stats)(struct seq_file *, struct vfsmount *);
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
};
inode 表示文件系统中的一个对象,它具有惟一标识符。各个文件系统提供将文件名映射为惟一 inode 标识符和 inode 引用的方法。图 显示 inode 结构的一部分以及两个相关结构。请特别注意
inode_operations
和 file_operations
。这些结构表示可以在这个 inode 上执行的操作。inode_operations
定义直接在 inode 上执行的操作,而 file_operations
定义与文件和目录相关的方法(标准系统调用)。
inode结构体:
- struct inode {
- /* RCU path lookup touches following: */
- umode_t i_mode;
- uid_t i_uid;
- gid_t i_gid;
- const struct inode_operations *i_op;//inode操作实现方法
- struct super_block *i_sb;
- spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
- unsigned int i_flags;
- struct mutex i_mutex;
- unsigned long i_state;
- unsigned long dirtied_when; /* jiffies of first dirtying */
- struct hlist_node i_hash;
- struct list_head i_wb_list; /* backing dev IO list */
- struct list_head i_lru; /* inode LRU list */
- struct list_head i_sb_list;
- union {
- struct list_head i_dentry;
- struct rcu_head i_rcu;
- };
- unsigned long i_ino;
- atomic_t i_count;
- unsigned int i_nlink;
- dev_t i_rdev;
- unsigned int i_blkbits;
- u64 i_version;
- loff_t i_size;
- #ifdef __NEED_I_SIZE_ORDERED
- seqcount_t i_size_seqcount;
- #endif
- struct timespec i_atime;
- struct timespec i_mtime;
- struct timespec i_ctime;
- blkcnt_t i_blocks;
- unsigned short i_bytes;
- struct rw_semaphore i_alloc_sem;
- const struct file_operations *i_fop; /* former ->i_op->default_file_ops 文件操作实现方法*/
- struct file_lock *i_flock;
- struct address_space *i_mapping;
- struct address_space i_data;
- #ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
- #endif
- struct list_head i_devices;
- union {
- struct pipe_inode_info *i_pipe;
- struct block_device *i_bdev;
- struct cdev *i_cdev;
- };
- __u32 i_generation;
- #ifdef CONFIG_FSNOTIFY
- __u32 i_fsnotify_mask; /* all events this inode cares about */
- struct hlist_head i_fsnotify_marks;
- #endif
- #ifdef CONFIG_IMA
- /* protected by i_lock */
- unsigned int i_readcount; /* struct files open RO */
- #endif
- atomic_t i_writecount;
- #ifdef CONFIG_SECURITY
- void *i_security;
- #endif
- #ifdef CONFIG_FS_POSIX_ACL
- struct posix_acl *i_acl;
- struct posix_acl *i_default_acl;
- #endif
- void *i_private; /* fs or device private pointer */
- };
struct inode {
/* RCU path lookup touches following: */
umode_t i_mode;
uid_t i_uid;
gid_t i_gid;
const struct inode_operations *i_op;//inode操作实现方法
struct super_block *i_sb;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned int i_flags;
struct mutex i_mutex;
unsigned long i_state;
unsigned long dirtied_when; /* jiffies of first dirtying */
struct hlist_node i_hash;
struct list_head i_wb_list; /* backing dev IO list */
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
union {
struct list_head i_dentry;
struct rcu_head i_rcu;
};
unsigned long i_ino;
atomic_t i_count;
unsigned int i_nlink;
dev_t i_rdev;
unsigned int i_blkbits;
u64 i_version;
loff_t i_size;
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
struct timespec i_atime;
struct timespec i_mtime;
struct timespec i_ctime;
blkcnt_t i_blocks;
unsigned short i_bytes;
struct rw_semaphore i_alloc_sem;
const struct file_operations *i_fop; /* former ->i_op->default_file_ops 文件操作实现方法*/
struct file_lock *i_flock;
struct address_space *i_mapping;
struct address_space i_data;
#ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
#endif
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct cdev *i_cdev;
};
__u32 i_generation;
#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct hlist_head i_fsnotify_marks;
#endif
#ifdef CONFIG_IMA
/* protected by i_lock */
unsigned int i_readcount; /* struct files open RO */
#endif
atomic_t i_writecount;
#ifdef CONFIG_SECURITY
void *i_security;
#endif
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
void *i_private; /* fs or device private pointer */
};
inode 和目录缓存分别保存最近使用的 inode 和 dentry。注意,对于 inode 缓存中的每个 inode,在目录缓存中都有一个对应的 dentry。可以在 ./linux/include/linux/fs.h 中找到
inode
和 dentry
结构。除了各个文件系统实现(可以在 ./linux/fs 中找到)之外,文件系统层的底部是缓冲区缓存。这个组件跟踪来自文件系统实现和物理设备(通过设备驱动程序)的读写请求。为了提高效率,Linux 对请求进行缓存,避免将所有请求发送到物理设备。缓存中缓存最近使用的缓冲区(页面),这些缓冲区可以快速提供给各个文件系统。