这次的篇幅稍短一些。
伪文件系统存在于内存中,通常不占用硬盘空间,它以文件的形式,向用户提供了访问系统内核数据的接口。用户和应用程序可以通过访问这些数据接口,得到系统的信息,而且内核允许用户修改内核的某些参数。由于这些文件系统没有写盘,也叫作无持久存储文件系统。
这里介绍一个常见的伪文件系统proc。
proc文件系统(process filesystem),它使得内核可以生成与系统状态和配置有关的信息,该信息可以由用户和系统程序从普通文件读取,而无需专门的工具与内核通信。在某些情况下,一个简单的cat命令就足够了。数据不仅可以从内核读取,还可以通过向proc文件系统的文件写入字符串,来向内核发送数据。
使用方法
在/proc目录下
这些以数字命名的目录,每一个都对应一个进程,目录名就是该进程的pid,里面包含了该进程的相关信息
# N is the pid of the process being inspected
cat /proc/N/stat # show the status of the process
cat /proc/N/statm # show the status of the memory used by the process
cat /proc/N/status # show process status information; more readable than stat/statm
除此之外,还有一些系统信息
cat /proc/interrupts # per-CPU hardware interrupt (IRQ) counters; softirq counters are in /proc/softirqs
cat /proc/kallsyms # kernel symbol table (addresses and names of kernel symbols)
ls /proc/net # /proc/net is a directory of networking status files, so list it instead of cat
cat /proc/scsi/scsi # list the SCSI devices known to the kernel
cat /proc/tty/drivers # list the registered tty drivers
cat /proc/net/dev # show the network adapters and their statistics
cat /proc/vmstat # show virtual memory statistics
举一个完整的例子,查看并修改系统可以分配的文件句柄的最大数目(这是系统全局的上限,而不是单个用户的上限)。
# Lines starting with '>' are the output printed by the preceding command.
cat /proc/sys/fs/file-max # read the current value
>4096
echo 8192 > /proc/sys/fs/file-max # write a new value into the same file
cat /proc/sys/fs/file-max # read it back to confirm the change
>8192
可以看出,这些系统设置也可以通过修改/proc中相应文件来设置。
代码实现
数据结构
与文件系统相关的数据结构
实现一个文件系统,主要扣住七个结构体。
- file_system_type:
mount -t proc proc /mnt
中-t后面的第一个proc就是它的名字(文件系统类型名)。把文件系统加入内核,首先要准备的就是这个结构体,注册方法是register_filesystem()。
- super_block:每mount一次就有一个对应的超级块,是文件系统的根,很多东西都依赖它。
- inode/inode_operations与file/file_operations:这一下就是4个结构体了。这里区分一下file与inode:file更侧重于文件里存储的内容,inode更强调文件的元数据(对磁盘文件系统来说,这些元数据存储在磁盘上)。可以理解为file就是用户能看到的文件,inode是元数据;inode结构体本身就含有一个struct file_operations*指针(i_fop)。
- dentry:关于路径的结构体。
由于之前没有贴过inode_operations和file_operations的代码,这里贴一下,也方便对比。
/*
 * file_operations - table of callbacks that act on a struct file.
 *
 * Most members take a struct file * and implement an operation on the
 * file's contents (llseek, read, write, mmap, fsync, ...), in contrast to
 * struct inode_operations, whose callbacks act on inodes and dentries.
 * Member order is part of the structure layout and must not be changed.
 */
struct file_operations {
struct module *owner;
/* Positioning and plain read/write through a user-space buffer. */
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
/* Read/write variants driven by a kiocb and an iov_iter. */
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
/* Directory iteration through a dir_context. */
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
/* open and release are the only callbacks here that also receive the inode. */
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
/* Transfers between a file and a pipe. */
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
/* This member exists only when the kernel is built without CONFIG_MMU. */
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
/* Operations that involve two files (source and destination). */
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int);
int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
u64);
ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
u64);
};
/*
 * inode_operations - table of callbacks that act on inodes and dentries.
 *
 * The members cover name-space and attribute operations (lookup, create,
 * link/unlink, mkdir/rmdir, rename, setattr/getattr, ACLs, ...) rather than
 * operations on file contents, which live in struct file_operations.
 * Member order is part of the structure layout and must not be changed.
 */
struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
int (*permission) (struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
/* Creation and removal of directory entries. */
int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
int (*mkdir) (struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
/* Attribute access. */
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
u64 len);
int (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;
可以看出,file_operations针对的是read、write等对于文件内容本身的操作,而inode_operations针对的是对文件属性的操作。
proc数据结构
再看两个在proc中出现的数据结构。
proc文件系统中的每个数据项都由proc_dir_entry 的一个实例描述
/*
 * proc_dir_entry - describes one data item (file or directory) of the proc
 * filesystem.
 *
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 */
struct proc_dir_entry {
unsigned int low_ino;
umode_t mode; /* file access permissions */
nlink_t nlink; /* link count; per the article, the number of subdirectories and symlinks in a directory */
kuid_t uid;
kgid_t gid;
loff_t size; /* length of the file data */
const struct inode_operations *proc_iops; /* inode operations for this entry */
const struct file_operations *proc_fops; /* file operations for this entry */
struct proc_dir_entry *parent; /* pointer to the parent directory entry */
struct rb_root subdir; /* rb tree of child entries (empty for non-directories) */
struct rb_node subdir_node; /* this entry's node in the parent's "subdir" tree */
void *data;
atomic_t count; /* use count */
atomic_t in_use; /* number of callers into module in progress; */
/* negative -> it's going away RSN */
struct completion *pde_unload_completion;
struct list_head pde_openers; /* who did ->open, but not ->release */
spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
u8 namelen; /* length of the file name */
char name[]; /* file name string (flexible array member, must stay last) */
};
内核提供了一个数据结构,称之为 proc_inode ,支持以面向 inode 的方式来查看 proc 文件系统的数据项。
/*
 * proc_inode - ties proc-specific data to the VFS-level inode.
 *
 * The VFS inode is embedded as the last member, so every proc inode
 * carries the proc-private fields next to it.
 * NOTE(review): presumably the proc_inode is recovered from a struct inode
 * pointer via container_of on vfs_inode - confirm against the accessor.
 */
struct proc_inode {
struct pid *pid;
unsigned int fd;
union proc_op op;
struct proc_dir_entry *pde; /* points to the proc_dir_entry instance associated with this proc item */
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
struct hlist_node sysctl_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode; /* the embedded VFS inode */
} ;
其中,联合体proc_op
/*
 * proc_op - per-inode callback stored in proc_inode.op.
 *
 * Being a union, it holds exactly one of the two callbacks at a time:
 * a link resolver that fills in a struct path, or a show routine that
 * writes information about a task into a seq_file.
 */
union proc_op {
int (*proc_get_link)(struct dentry *, struct path *);
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
};
执行流程
初始化流程
linux启动,会调用start_kernel,该函数会执行proc_root_init()函数。
/*
 * proc_root_init - boot-time initialisation of the proc filesystem.
 *
 * Sets up the proc inode cache, registers proc_fs_type with the kernel and
 * then creates the fixed entries that live at the top of the proc tree.
 * If registration fails the function returns early and none of the
 * entries are created.
 */
void __init proc_root_init(void)
{
	/* Allocate the inode cache used by proc. */
	proc_init_inodecache();
	set_proc_pid_nlink();

	/* Register the proc filesystem with the kernel; give up on failure. */
	if (register_filesystem(&proc_fs_type) != 0)
		return;

	/* Reserve the inode number for "self". */
	proc_self_init();
	/* Set up /proc/thread-self, which describes the current thread. */
	proc_thread_self_init();
	/* "mounts" is a symlink to the calling process's own mount list. */
	proc_symlink("mounts", NULL, "self/mounts");

	proc_net_init();

	/* Create the remaining fixed entries of the proc tree. */
#ifdef CONFIG_SYSVIPC
	proc_mkdir("sysvipc", NULL);
#endif
	proc_mkdir("fs", NULL);
	proc_mkdir("driver", NULL);
	/* somewhere for the nfsd filesystem to be mounted */
	proc_create_mount_point("fs/nfsd");
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
	/* just give it a mountpoint */
	proc_create_mount_point("openprom");
#endif
	proc_tty_init();
	proc_mkdir("bus", NULL);

	/* Create the /proc/sys directory and initialise sysctl. */
	proc_sys_init();
}
register_filesystem()会遍历内核中已注册文件系统类型的链表,该链表的表头是一个全局变量。传给它的参数proc_fs_type结构体定义如下:
/*
 * Filesystem type descriptor for proc; this is the object passed to
 * register_filesystem() in proc_root_init().
 */
static struct file_system_type proc_fs_type = {
.name = "proc", /* the type name given to "mount -t" */
.mount = proc_mount, /* mount callback */
.kill_sb = proc_kill_sb, /* in effect the umount operation */
.fs_flags = FS_USERNS_MOUNT,
};
其中proc_mount()函数
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
struct pid_namespace *ns;
if (flags & MS_KERNMOUNT) {
ns = data;
data = NULL;
} else {
ns = task_active_pid_ns(current);
}
//内核实现了很多种mount方法,这是其中一个
return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_uper);
}
该函数主要是调用proc_fill_super()函数填充super_block。
int proc_fill_super(struct super_block *s, void *data, int silent)
{
struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
struct inode *root_inode;
int ret;
if (!proc_parse_options(data, ns))
return -EINVAL;
/* User space would break if executables or devices appear on proc */
s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
s->s_time_gran = 1;
/*
* procfs isn't actually a stacking filesystem; however, there is
* too much magic going on inside it to permit stacking things on
* top of it
*/
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
if (!root_inode) {
pr_err("proc_fill_super: get root inode failed\n");
return -ENOMEM;
}
s->s_root = d_make_root(root_inode);
if (!s->s_root) {
pr_err("proc_fill_super: allocate dentry failed\n");
return -ENOMEM;
}
ret = proc_setup_self(s);
if (ret) {
return ret;
}
return proc_setup_thread_self(s);
}
插个话,这里super_block是在挂载时产生的,而不是在insmod时产生的,为什么呢?因为多个设备可能使用同一种文件系统,mount有可能执行多次,每次都会产生一个super_block;如果在insmod时产生,就只会有一个。可以这么理解:super_block之于文件系统类型,就像成员变量之于结构体——每个实例各有一份,而不是像静态变量那样全局只有一份。
这个函数是在挂载时用到的。
挂载流程
当执行mount命令时,会执行系统调用
/*
 * mount system call entry point.
 *
 * Copies the filesystem type, device name and option data from user space
 * into kernel memory, calls do_mount() with the copies and frees them
 * again. dir_name is passed to do_mount() as a user pointer.
 *
 * Returns the result of do_mount(), or the error from whichever copy
 * failed. The labels unwind in reverse order of allocation, so each
 * failure frees only what was already copied.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	char *fstype;
	char *devname;
	void *opts;
	int rc;

	fstype = copy_mount_string(type);
	if (IS_ERR(fstype)) {
		rc = PTR_ERR(fstype);
		goto out_type;
	}

	devname = copy_mount_string(dev_name);
	if (IS_ERR(devname)) {
		rc = PTR_ERR(devname);
		goto out_dev;
	}

	opts = copy_mount_options(data);
	if (IS_ERR(opts)) {
		rc = PTR_ERR(opts);
		goto out_data;
	}

	rc = do_mount(devname, dir_name, fstype, flags, opts);

	kfree(opts);
out_data:
	kfree(devname);
out_dev:
	kfree(fstype);
out_type:
	return rc;
}
跟do_mount()函数
/*
 * do_mount - validate a mount request and dispatch it to the matching
 * operation (remount, bind, propagation change, move or new mount).
 *
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * Returns 0 or a negative error code taken from the failing step.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
struct path path;
int retval = 0;
int mnt_flags = 0;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks: force NUL termination within the option page */
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* ... and get the mountpoint */
retval = user_path(dir_name, &path);
if (retval)
return retval;
/*
 * Security check on the mount request (the article describes it as
 * checking whether the corresponding superblock is safe).
 * From here on every exit must go through dput_out to drop "path".
 */
retval = security_sb_mount(dev_name, &path,
type_page, flags, data_page);
if (!retval && !may_mount())
retval = -EPERM;
if (!retval && (flags & MS_MANDLOCK) && !may_mandlock())
retval = -EPERM;
if (retval)
goto dput_out;
/* Default to relatime unless overridden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
/* Separate the per-mountpoint flags: translate MS_* bits into MNT_* bits */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
/* Must come after the atime bits above: strictatime clears them again */
if (flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
/* The default atime for remount is preservation */
if ((flags & MS_REMOUNT) &&
((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
MS_STRICTATIME)) == 0)) {
mnt_flags &= ~MNT_ATIME_MASK;
mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
}
/* Strip the bits already folded into mnt_flags, plus internal-only bits */
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
/* Dispatch: the first matching flag decides which operation runs */
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&path, dev_name, flags & MS_REC);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&path, dev_name);
else
/* No special flag: mount a new filesystem of type type_page */
retval = do_new_mount(&path, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
/* Release the reference on the mountpoint path taken by user_path() */
path_put(&path);
return retval;
}
继续往下跟函数,代码就不贴了。调用链是do_mount()->do_new_mount()->vfs_kern_mount()->mount_fs()->type->mount(),最后一步是各文件系统在file_system_type中注册的自定义挂载回调,对proc来说就是proc_mount()。
与sysfs的比较
再来对比一下同为伪文件系统的sysfs。
proc文件系统主要用来调试内核,可以在内核运行时查看内核中一些重要数据结构的值,一般读多写少。
proc文件系统出现得比sysfs早。proc文件系统的目录结构比较乱,在proc文件系统下面有很多文件夹,比如一个进程就有一个文件夹;现在内核越来越复杂,支持的设备类型也越来越多,proc就显得很混乱。于是又开发出了sysfs,sysfs可以说是proc的升级,将来用sysfs会是主流。
proc文件系统和sysfs都是虚拟文件系统,并且有对应关系,比如"/proc/misc"对应于"/sys/class/misc"下面的设备,都是描述misc类设备的。
总结
如果要实现一个文件系统,主要实现三个流程:
- 编译生成.ko文件,并通过insmod插入到内核中。如果确认没有异常,可以直接编译进内核代码中。
- 执行mount命令进行挂载。
- 读写数据等相关文件操作接口,实现cat、echo、touch等操作。