概述VFS文件对象

最新推荐文章于 2025-05-31 03:41:52 发布

tanglinux

最新推荐文章于 2025-05-31 03:41:52 发布

阅读量1.3k

点赞数

CC 4.0 BY-SA版权

分类专栏： # 文件系统

本文链接：https://blog.csdn.net/npy_lp/article/details/79083644

文件系统专栏收录该内容

6 篇文章

订阅专栏

内核源码：linux-2.6.38.8.tar.bz2

目标平台：ARM体系结构

文件对象表示进程中已打开的文件，在内存中由open、creat和openat等系统调用创建，并在其引用计数（f_count）降为零时销毁，这通常发生在最后一次close系统调用之后。由于多个进程可以同时打开和操作同一文件（同一进程也可以多次打开同一文件），所以同一文件可能存在多个文件对象。文件对象在Linux内核中由struct file结构体来表示，源代码如下所示：

/* include/linux/fs.h */
/*
 * struct file - one open file as seen by a process (the "file object").
 *
 * The per-member notes are translated from the article author's annotations;
 * notes marked "per the article" are not confirmed by code quoted here.
 */
struct file {
	union {
		struct list_head	fu_list;	// links the file into the s_files list of its superblock
		struct rcu_head 	fu_rcuhead; // RCU callback head: file_free() passes it to call_rcu() once fu_list is no longer in use
	} f_u;
	struct path		f_path;
#define f_dentry	f_path.dentry	// the dentry this file refers to
#define f_vfsmnt	f_path.mnt		// information about the mounted filesystem holding the file
	const struct file_operations	*f_op;	// functions that operate on the file contents
	spinlock_t		f_lock;			// spinlock
#ifdef CONFIG_SMP
	int			f_sb_list_cpu;		// SMP only: a CPU ID (per the article, the CPU current when the file joins the superblock list)
#endif
	atomic_long_t		f_count;	// reference count
	unsigned int 		f_flags;	// access flags passed to open() and related system calls
	fmode_t			f_mode;			// access mode for the file contents (FMODE_* bits)
	loff_t			f_pos;			// file offset
	struct fown_struct	f_owner;	// used to send SIGIO and SIGURG signals to a process (per the article)
	const struct cred	*f_cred;	// credentials taken from current_cred() in get_empty_filp()
	struct file_ra_state	f_ra;	// file readahead state

	u64			f_version;			// filesystem-specific: a version number or similar
#ifdef CONFIG_SECURITY
	void			*f_security;	// Linux security module data, set up by security_file_alloc()
						// per the article, SELinux, AppArmor and Smack implement that hook
#endif
	
	void			*private_data;	// private data attached to the file

#ifdef CONFIG_EPOLL
	struct list_head	f_ep_links;	// every epoll instance watching this file is linked on this list
						// per the article, only meaningful when f_op->poll is non-NULL
#endif /* #ifdef CONFIG_EPOLL */
	struct address_space	*f_mapping;	// address space holding the file contents (inode->i_mapping in __dentry_open())
#ifdef CONFIG_DEBUG_WRITECOUNT
	unsigned long f_mnt_write_state;	// debug aid for the writer count of the containing mount
						// one of 0, FILE_MNT_WRITE_TAKEN or FILE_MNT_WRITE_RELEASED
#endif
};

1、文件对象相关的初始化由files_init函数完成，源代码如下所示：

/* include/linux/fs.h */
/* Initial value of files_stat.max_files and the floor applied to it in files_init(). */
#define NR_FILE  8192

/* include/linux/kernel.h */
/*
 * max_t - larger of @x and @y after both have been converted to @type.
 * Each argument is evaluated exactly once; the result has type @type.
 */
#define max_t(type, x, y) ({			\
	type __lhs = (x);			\
	type __rhs = (y);			\
	__rhs < __lhs ? __lhs : __rhs; })

/* arch/arm/include/asm/cache.h */
#define L1_CACHE_SHIFT		CONFIG_ARM_L1_CACHE_SHIFT	// shift count taken from a CONFIG_* build option
#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)	// 2^L1_CACHE_SHIFT; presumably the L1 cache line size in bytes

/* include/linux/cache.h */
/* Default SMP_CACHE_BYTES to L1_CACHE_BYTES when nothing has defined it earlier. */
#ifndef SMP_CACHE_BYTES
#define SMP_CACHE_BYTES L1_CACHE_BYTES
#endif

/*
 * __cacheline_aligned - align the annotated variable to SMP_CACHE_BYTES and
 * store it in the .data..cacheline_aligned section of the image.
 *
 * The explanation lives up here on purpose: a backslash only continues a
 * line when it is the last character on it, so a comment written after the
 * backslash would end the macro definition early.
 */
#ifndef __cacheline_aligned
#define __cacheline_aligned					\
  __attribute__((__aligned__(SMP_CACHE_BYTES),			\
		 __section__(".data..cacheline_aligned")))
#endif /* __cacheline_aligned */

/*
 * __cacheline_aligned_in_smp - same as __cacheline_aligned when CONFIG_SMP is
 * set; expands to nothing otherwise.
 */
#ifndef __cacheline_aligned_in_smp
#ifdef CONFIG_SMP // taken when CONFIG_SMP is enabled
#define __cacheline_aligned_in_smp __cacheline_aligned
#else
#define __cacheline_aligned_in_smp
#endif /* CONFIG_SMP */
#endif

/* include/linux/compiler.h */
# define __percpu	__attribute__((noderef, address_space(3))) // for the sparse checker: a pointer so marked must not be dereferenced directly and is treated as belonging to address space 3

/* include/linux/percpu_counter.h */
/* Counter made of a central total plus one s32 slot per CPU. */
struct percpu_counter {
	spinlock_t lock;	// spinlock
	s64 count;			// overall count
#ifdef CONFIG_HOTPLUG_CPU
	struct list_head list; // links the counter into the global percpu_counters list
#endif
	s32 __percpu *counters; // per-CPU counts
};

/* fs/file_table.c */
/* File-table statistics; max_files starts at NR_FILE and is recomputed in files_init(). */
struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

DECLARE_LGLOCK(files_lglock); // declares the functions associated with the files_lglock lock
DEFINE_LGLOCK(files_lglock); // defines the files_lglock lock and its associated functions

static struct kmem_cache *filp_cachep __read_mostly; // cache for struct file objects, created in files_init()

static struct percpu_counter nr_files __cacheline_aligned_in_smp;  // file-object counter (incremented in get_empty_filp(), decremented in file_free()); annotated with __cacheline_aligned_in_smp

/*
 * files_init - boot-time initialisation of the file-object infrastructure.
 * @mempages: number of memory pages
 *
 * Creates the cache for struct file, derives files_stat.max_files from
 * @mempages, then initialises the deferred-free lists, files_lglock and the
 * nr_files counter.
 */
void __init files_init(unsigned long mempages) // mempages: number of memory pages
{ 
	unsigned long n;

	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, // create the cache from which file objects are allocated
			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
			
	// PAGE_SIZE is the size of a memory page; a file object is very roughly estimated at 1024 bytes,
	// so n is one tenth of (number of pages * PAGE_SIZE / 1024)
	n = (mempages * (PAGE_SIZE / 1024)) / 10; 
	files_stat.max_files = max_t(unsigned long, n, NR_FILE); // the larger of n and NR_FILE
	files_defer_init();
	lg_lock_init(files_lglock); // initialise the files_lglock lock
	percpu_counter_init(&nr_files, 0); // initialise nr_files to 0
}

1.1、files_defer_init函数主要用来初始化系统中每个处理器各自的per-cpu变量fdtable_defer_list，并计算sysctl_nr_open_max的值，源代码如下所示：

/* arch/arm/include/asm/posix_types.h */
typedef unsigned int		__kernel_size_t;	// size type for this ARM target: unsigned int

/* include/linux/types.h */
/* size_t is the architecture's __kernel_size_t unless _SIZE_T is already defined. */
#ifndef _SIZE_T
#define _SIZE_T
typedef __kernel_size_t		size_t;
#endif

/* arch/arm/include/asm/types.h */
#define BITS_PER_LONG 32

/* include/linux/kernel.h */
#define INT_MAX		((int)(~0U>>1)) // 0x7fffffff when unsigned int is 32 bits wide

/*
 * min - smaller of @x and @y; each argument is evaluated exactly once.
 *
 * The note inside the body is a block comment placed before the continuation
 * backslash. A backslash only continues a line when it is the last character
 * on it, so a comment written after it would end the macro definition early.
 */
#define min(x, y) ({				\
	typeof(x) _min1 = (x);			\
	typeof(y) _min2 = (y);			\
	/* type check: comparing pointers to different types makes the compiler warn */ \
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })


/* fs/file.c */
/* Upper bound for the nr_open sysctl; recomputed by files_defer_init(). */
int sysctl_nr_open_max = 1024 * 1024;

/*
 * files_defer_init - initialise the per-CPU deferred-free lists and compute
 * sysctl_nr_open_max.
 *
 * The limit is the smaller of INT_MAX and the number of pointers that fit in
 * the address range of size_t, rounded down to a multiple of BITS_PER_LONG.
 * With the 32-bit definitions quoted above and 4-byte pointers (assumed for
 * this ARM target) that is min(0x7fffffff, 0x3fffffff) & ~31 = 0x3fffffe0.
 */
void __init files_defer_init(void)
{
	int cpu;
	size_t int_limit = (size_t)INT_MAX;
	size_t ptr_limit = ~(size_t)0 / sizeof(void *);

	for_each_possible_cpu(cpu) { // visit every possible CPU, single-core or multi-core
		fdtable_defer_list_init(cpu);
	}

	sysctl_nr_open_max = min(int_limit, ptr_limit) & -BITS_PER_LONG;
}

其中，struct fdtable_defer类型的per-cpu变量fdtable_defer_list由DEFINE_PER_CPU宏定义，fdtable_defer_list_init函数则负责初始化指定处理器上的变量副本，源代码如下所示：

/* fs/file.c */
/*
 * struct fdtable_defer - per-CPU queue of fdtables waiting to be released
 * from the work item.
 */
struct fdtable_defer {
	spinlock_t lock; // spinlock taken around updates of next
	struct work_struct wq; // work item; its handler is free_fdtable_work()
	// singly linked list of struct fdtable; per the article every process owns
	// one such object (embedded in struct files_struct)
	struct fdtable *next; 
};

/*
 * Defined after the struct so that the type is complete at the point where
 * the per-CPU variable is instantiated.
 */
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); // per-CPU variable

/*
 * fdtable_defer_list_init - initialise one CPU's fdtable_defer_list instance.
 * @cpu: CPU whose copy of the per-CPU variable is set up
 */
static void __devinit fdtable_defer_list_init(int cpu)
{
	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);  // this CPU's copy of the variable
	spin_lock_init(&fddef->lock); // initialise the spinlock
	// set up the work item; once it is submitted to the work queue,
	// free_fdtable_work() runs at some later time
	INIT_WORK(&fddef->wq, free_fdtable_work); 
	fddef->next = NULL;
}

/* free_fdmem - release @ptr with the routine that matches its kind of memory. */
static void free_fdmem(void *ptr)
{
	if (is_vmalloc_addr(ptr))
		vfree(ptr);	// vmalloc address
	else
		kfree(ptr);	// everything else goes through kfree
}

/* __free_fdtable - release the fd array, the bitmap memory and @fdt itself. */
static void __free_fdtable(struct fdtable *fdt)
{
	free_fdmem(fdt->fd);
	// per the article, close_on_exec and open_fds share a single allocation made by
	// alloc_fdmem() and pointed to by open_fds, so free_fdmem(fdt->close_on_exec)
	// is not needed: the statement below releases the memory of both.
	free_fdmem(fdt->open_fds);  
	kfree(fdt);
}

/*
 * free_fdtable_work - work handler that releases every fdtable queued on the
 * fdtable_defer containing @work.
 *
 * The pending list is detached while holding the lock; the tables are then
 * freed after the lock has been dropped.
 */
static void free_fdtable_work(struct work_struct *work)
{
	struct fdtable_defer *defer = container_of(work, struct fdtable_defer, wq);
	struct fdtable *pending;
	struct fdtable *following;

	spin_lock_bh(&defer->lock);
	pending = defer->next; // take over the whole list
	defer->next = NULL;
	spin_unlock_bh(&defer->lock);

	for (; pending; pending = following) {
		following = pending->next; // read the link before the node is freed
		__free_fdtable(pending);
	}
}

变量fdtable_defer_list中的工作队列函数free_fdtable_work并不是由free_fdtable函数直接触发的：free_fdtable函数只注册RCU回调free_fdtable_rcu，只有当fd数组或位图使用vmalloc内存时，该回调才会把struct fdtable实例挂入链表并提交工作。源代码如下所示：

/* include/linux/fdtable.h */
/* free_fdtable - hand @fdt to RCU; free_fdtable_rcu() performs the actual release. */
static inline void free_fdtable(struct fdtable *fdt)
{
	call_rcu(&fdt->rcu, free_fdtable_rcu); // register the RCU callback
}

/* fs/file.c */
/*
 * free_fdtable_rcu - RCU callback that releases a struct fdtable.
 * @rcu: the rcu member embedded in the fdtable being released
 *
 * Three cases are handled: a table with max_fds <= NR_OPEN_DEFAULT is freed
 * together with its containing files_struct; a table whose fd and open_fds
 * are both non-vmalloc addresses is freed directly with kfree(); any other
 * table is queued on this CPU's fdtable_defer_list and released later by
 * free_fdtable_work().
 */
void free_fdtable_rcu(struct rcu_head *rcu)
{
	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
	struct fdtable_defer *fddef;

	BUG_ON(!fdt);

	if (fdt->max_fds <= NR_OPEN_DEFAULT) { // the struct fdtable is embedded directly in a struct files_struct
		kmem_cache_free(files_cachep,
				container_of(fdt, struct files_struct, fdtab));
		return;
	}
	if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {  // neither buffer is vmalloc memory
		kfree(fdt->fd);
		kfree(fdt->open_fds);
		kfree(fdt);
	} else {
		// NOTE(review): presumably deferred because vmalloc memory cannot be released from this context - confirm
		fddef = &get_cpu_var(fdtable_defer_list);  // this CPU's fdtable_defer_list, with preemption disabled
		spin_lock(&fddef->lock);
		// the next two statements push this struct fdtable onto the current CPU's fdtable_defer_list
		fdt->next = fddef->next;
		fddef->next = fdt;
		
		schedule_work(&fddef->wq); // submit the work item
		spin_unlock(&fddef->lock);
		put_cpu_var(fdtable_defer_list);  // re-enable preemption
	}
}

1.2、percpu_counter_init函数用来将struct percpu_counter类型的变量nr_files的值初始化为零，在配置CONFIG_SMP的情况下，它的定义如下所示：

/* include/linux/percpu_counter.h */
/*
 * percpu_counter_init - initialise the counter @fbc to @value.
 *
 * Every expansion gets its own static lock_class_key, which is passed on to
 * __percpu_counter_init(). The expression evaluates to that function's
 * return value.
 *
 * The note inside the body is a block comment placed before the continuation
 * backslash. A backslash only continues a line when it is the last character
 * on it, so a comment written after it would end the macro definition early.
 */
#define percpu_counter_init(fbc, value)					\
	({								\
		static struct lock_class_key __key; /* key for the deadlock detector (lockdep) */ \
									\
		__percpu_counter_init(fbc, value, &__key);		\
	})

/* lib/percpu_counter.c */
static LIST_HEAD(percpu_counters); // global list of counters, filled below when CONFIG_HOTPLUG_CPU is set
static DEFINE_MUTEX(percpu_counters_lock); // taken around insertions into percpu_counters

/*
 * __percpu_counter_init - initialise a per-CPU counter.
 * @fbc:    counter to initialise
 * @amount: initial value of the overall count
 * @key:    lock class key attached to the counter's spinlock
 *
 * Returns 0 on success, or -ENOMEM when the per-CPU storage cannot be
 * allocated.
 */
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
			  struct lock_class_key *key)
{
	spin_lock_init(&fbc->lock);
	lockdep_set_class(&fbc->lock, key); // set the lock class for the deadlock detector
	fbc->count = amount;
	fbc->counters = alloc_percpu(s32); // allocate the per-CPU storage
	if (!fbc->counters)
		return -ENOMEM;

	debug_percpu_counter_activate(fbc); // debugging hook

#ifdef CONFIG_HOTPLUG_CPU
	INIT_LIST_HEAD(&fbc->list);
	mutex_lock(&percpu_counters_lock);
	list_add(&fbc->list, &percpu_counters); // insert into the global list
	mutex_unlock(&percpu_counters_lock);
#endif
	return 0;
}

2、文件对象由get_empty_filp函数创建（并初始化它的一些成员），源代码如下所示：

/* fs/file_table.c */
/*
 * get_empty_filp - allocate a file object and initialise some of its members.
 *
 * Returns the new struct file with f_count set to 1, or NULL when the file
 * limit has been reached, the allocation fails, or security_file_alloc()
 * reports an error.
 */
struct file *get_empty_filp(void)
{
	const struct cred *cred = current_cred(); // credentials of the current process (current->cred, per the article)
	static long old_max;
	struct file * f;
	
	// per the article, get_nr_files() returns percpu_counter_read_positive(&nr_files)
	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { // limit reached and caller lacks CAP_SYS_ADMIN
		// confirm against percpu_counter_sum_positive() before refusing
		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
			goto over;
	}

	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); // allocate zeroed memory for the file object
	if (f == NULL)
		goto fail;

	percpu_counter_inc(&nr_files); // nr_files += 1
	f->f_cred = get_cred(cred); // takes a reference (increments cred->usage, per the article)
	if (security_file_alloc(f)) // initialises the f_security member
		goto fail_sec;

	INIT_LIST_HEAD(&f->f_u.fu_list);
	atomic_long_set(&f->f_count, 1);
	rwlock_init(&f->f_owner.lock);
	spin_lock_init(&f->f_lock);
	eventpoll_init_file(f); // initialises the f_ep_links member
	/* f->f_version: 0 */
	return f;

over:
	// report the limit only when the count exceeds the value recorded at the last report
	if (get_nr_files() > old_max) {
		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
		old_max = get_nr_files();
	}
	goto fail;

fail_sec:
	file_free(f); // decrements nr_files again and frees f through RCU
fail:
	return NULL;
}

剩余的一些成员一般由__dentry_open函数初始化，源代码如下所示：

/* fs/open.c */
/*
 * __dentry_open - fill in the remaining members of @f for @dentry on @mnt
 * and call the open method.
 * @dentry: dentry of the file being opened
 * @mnt:    mount the dentry belongs to
 * @f:      file object whose f_flags are already set
 * @open:   open method to call; when NULL, f->f_op->open is used if present
 * @cred:   credentials passed to security_dentry_open()
 *
 * Returns @f on success or an ERR_PTR() value on failure. The error paths
 * below drop @f, @dentry and @mnt.
 */
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
					struct file *f,
					int (*open)(struct inode *, struct file *),
					const struct cred *cred)
{
	struct inode *inode;
	int error;
	
	// OPEN_FMODE converts the open flags O_RDONLY, O_WRONLY or O_RDWR into FMODE_READ, FMODE_WRITE or (FMODE_READ|FMODE_WRITE) respectively
	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
				FMODE_PREAD | FMODE_PWRITE;
	inode = dentry->d_inode; // inode behind the file
	if (f->f_mode & FMODE_WRITE) {
		error = __get_file_write_access(inode, mnt);	// per the article: for non-special files, checks that the containing filesystem may be written
		if (error)
			goto cleanup_file;
		if (!special_file(inode->i_mode))
			file_take_write(f);	// sets f->f_mnt_write_state to FILE_MNT_WRITE_TAKEN (per the article)
	}

	f->f_mapping = inode->i_mapping;	// address space holding the file contents
	f->f_path.dentry = dentry; // dentry of the file
	f->f_path.mnt = mnt; // information about the containing mount
	f->f_pos = 0;	// file offset
	f->f_op = fops_get(inode->i_fop); // the operations come from the inode of the specific filesystem
	file_sb_list_add(f, inode->i_sb);	// per the article: links the file through f->f_u.fu_list into the inode->i_sb->s_files list of the superblock
						// on SMP f->f_sb_list_cpu is set to the current CPU ID, otherwise to zero

	error = security_dentry_open(f, cred);	// security module check
	if (error)
		goto cleanup_all;

	if (!open && f->f_op)
		open = f->f_op->open;	// fall back to the open method supplied by the filesystem
	if (open) {
		error = open(inode, f);
		if (error)
			goto cleanup_all;
	}
	ima_counts_get(f);	// applies to regular files (per the article)

	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);	// clear open flags that are not needed after the open

	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);	// per the article: initialises f->f_ra.ra_pages and f->f_ra.prev_pos

	// with O_DIRECT the mapping must have a_ops providing at least one of
	// direct_IO or get_xip_mem; otherwise the file is dropped and -EINVAL returned
	if (f->f_flags & O_DIRECT) {
		if (!f->f_mapping->a_ops ||
		    ((!f->f_mapping->a_ops->direct_IO) &&
		    (!f->f_mapping->a_ops->get_xip_mem))) {
			fput(f);
			f = ERR_PTR(-EINVAL);
		}
	}

	return f; // normal return

cleanup_all:
	fops_put(f->f_op);
	if (f->f_mode & FMODE_WRITE) {
		put_write_access(inode); // decrements inode->i_writecount (per the article)
		if (!special_file(inode->i_mode)) {
			file_reset_write(f); // resets f->f_mnt_write_state to zero (per the article)
			mnt_drop_write(mnt); // decrements mnt->mnt_writers or mnt->mnt_pcp->mnt_writers (per the article)
		}
	}
	file_sb_list_del(f); // remove the file object from the file list of its superblock
	f->f_path.dentry = NULL;
	f->f_path.mnt = NULL;
cleanup_file:
	put_filp(f);
	dput(dentry);
	mntput(mnt);
	return ERR_PTR(error);
}

3、文件对象由file_free函数销毁，源代码如下所示：

/* fs/file_table.c */
/*
 * file_free_rcu - RCU callback that releases a file object.
 * @head: the f_u.fu_rcuhead member embedded in the file being freed
 */
static inline void file_free_rcu(struct rcu_head *head)
{
	struct file *f = container_of(head, struct file, f_u.fu_rcuhead);

	put_cred(f->f_cred); // drop the credential reference; per the article the credentials are freed once the use count reaches zero
	kmem_cache_free(filp_cachep, f); // return the memory to the filp cache
}

/*
 * file_free - account for and release a file object.
 * @f: file object to destroy; the memory is freed later by file_free_rcu()
 */
static inline void file_free(struct file *f)
{
	percpu_counter_dec(&nr_files); // nr_files -= 1
	file_check_state(f); // checks the value of f->f_mnt_write_state (per the article)
	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);  // register file_free_rcu as the RCU callback
}

4、文件对象的操作函数统一由struct file_operations结构体来声明。其中的成员会根据文件系统和文件类型的不同选择性地来实现（也可能一个也不须要实现），未实现的成员置为空指针。源代码如下所示：

/* include/linux/fs.h */
/*
 * struct file_operations - methods that act on a file object.
 *
 * The per-member notes are translated from the article author's annotations;
 * they summarise each method's purpose and are not confirmed by code quoted
 * here.
 */
struct file_operations {
	struct module *owner; // meaningful when the filesystem is loaded as a kernel module
	loff_t (*llseek) (struct file *, loff_t, int); // change the file offset
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); // read from the file
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); // write to the file
	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); // asynchronous I/O read
	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); // asynchronous I/O write
	int (*readdir) (struct file *, void *, filldir_t); // iterate over the entries of a directory
	unsigned int (*poll) (struct file *, struct poll_table_struct *); // I/O multiplexing
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); // ioctl operation
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long); // ioctl path offered by 64-bit systems for 32-bit compatibility
	int (*mmap) (struct file *, struct vm_area_struct *); // map the file into memory
	int (*open) (struct inode *, struct file *); // open operation
	int (*flush) (struct file *, fl_owner_t id); // flush operation
	int (*release) (struct inode *, struct file *); // the counterpart of open
	int (*fsync) (struct file *, int datasync); // synchronise data
	int (*aio_fsync) (struct kiocb *, int datasync); // data synchronisation for asynchronous I/O
	int (*fasync) (int, struct file *, int); // enable or disable the asynchronous I/O notification signal
	int (*lock) (struct file *, int, struct file_lock *); // POSIX locks
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); // copy data between files inside the kernel
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); // find an unused address range for mapping the file
	int (*check_flags)(int); // check that flags are valid
	int (*flock) (struct file *, int, struct file_lock *); // advisory (cooperative) locks
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); // pipe write: the pipe comes first in the signature, the file second
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); // pipe read: the file comes first in the signature, the pipe second
	int (*setlease)(struct file *, long, struct file_lock **); // lock leases
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len); // operate directly on the file's disk space
};