内核源码:linux-2.6.38.8.tar.bz2
目标平台:ARM体系结构
文件对象表示进程中已打开的文件,在内存中由open、creat和openat等系统调用创建,由close系统调用销毁。由于多个进程可以同时打开和操作同一文件,所以同一文件可能存在多个文件对象。文件对象在Linux内核中由struct file结构体来表示,源代码如下所示:
/* include/linux/fs.h */
// struct file: the in-kernel object describing one open file.
// Instances are allocated by get_empty_filp() and released through file_free().
struct file {
union {
struct list_head fu_list; // links this file into the s_files list of its superblock
struct rcu_head fu_rcuhead; // by the time file_free() runs fu_list is no longer used; this member is handed to call_rcu() instead
} f_u;
struct path f_path;
#define f_dentry f_path.dentry // the dentry this file refers to
#define f_vfsmnt f_path.mnt // information about the mounted filesystem the file lives on
const struct file_operations *f_op; // functions that operate on the file's contents
spinlock_t f_lock; // spinlock
#ifdef CONFIG_SMP
int f_sb_list_cpu; // CPU id recorded on multiprocessor builds
#endif
atomic_long_t f_count; // reference count
unsigned int f_flags; // open flags passed to open() and friends
fmode_t f_mode; // access mode for the file's contents (FMODE_* bits)
loff_t f_pos; // current file offset
struct fown_struct f_owner; // used to deliver SIGIO and SIGURG to a process
const struct cred *f_cred; // credentials taken from the opening task (current->cred)
struct file_ra_state f_ra; // read-ahead state
u64 f_version; // filesystem-specific; a version number or other use
#ifdef CONFIG_SECURITY
void *f_security; // Linux Security Module data, set up by security_file_alloc()
// (the original note lists SELinux, AppArmor and Smack as the frameworks implementing that hook)
#endif
void *private_data; // private data of the file's user
#ifdef CONFIG_EPOLL
struct list_head f_ep_links; // every epoll instance monitoring this file is linked on this list
// only meaningful when f_op->poll is non-NULL
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping; // address space holding the file's contents
#ifdef CONFIG_DEBUG_WRITECOUNT
unsigned long f_mnt_write_state; // debug aid for the mount's writer accounting
// one of 0, FILE_MNT_WRITE_TAKEN or FILE_MNT_WRITE_RELEASED
#endif
};
1、文件对象相关的初始化由files_init函数完成,源代码如下所示:
/* include/linux/fs.h */
#define NR_FILE 8192 // initial value of files_stat.max_files and the floor applied by files_init()
/* include/linux/kernel.h */
/*
 * max_t - larger of two values after converting both to @type.
 * Each argument is evaluated exactly once; when the two compare equal
 * the second one is the result.
 */
#define max_t(type, x, y) ({				\
	type __max_lhs = (x);				\
	type __max_rhs = (y);				\
	__max_lhs > __max_rhs ? __max_lhs : __max_rhs;	\
})
/* arch/arm/include/asm/cache.h */
#define L1_CACHE_SHIFT CONFIG_ARM_L1_CACHE_SHIFT // shift count taken from the kernel configuration
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) // 2^L1_CACHE_SHIFT bytes
/* include/linux/cache.h */
// SMP_CACHE_BYTES falls back to L1_CACHE_BYTES unless it was defined earlier.
#ifndef SMP_CACHE_BYTES
#define SMP_CACHE_BYTES L1_CACHE_BYTES
#endif
#ifndef __cacheline_aligned
// __cacheline_aligned: aligns the annotated variable to SMP_CACHE_BYTES and
// places it in the .data..cacheline_aligned section of the image.
// (Comments are kept off the continuation lines: text after a trailing
// backslash would break the macro definition.)
#define __cacheline_aligned \
__attribute__((__aligned__(SMP_CACHE_BYTES), \
__section__(".data..cacheline_aligned")))
#endif /* __cacheline_aligned */
// __cacheline_aligned_in_smp: same as __cacheline_aligned on SMP builds,
// expands to nothing otherwise.
#ifndef __cacheline_aligned_in_smp
#ifdef CONFIG_SMP // CONFIG_SMP is enabled
#define __cacheline_aligned_in_smp __cacheline_aligned
#else
#define __cacheline_aligned_in_smp
#endif /* CONFIG_SMP */
#endif
/* include/linux/compiler.h */
# define __percpu __attribute__((noderef, address_space(3))) // annotation for the sparse checker: the pointer must not be dereferenced directly and belongs to address space 3
/* include/linux/percpu_counter.h */
// struct percpu_counter: a counter made of a shared total plus one s32 slot per CPU.
struct percpu_counter {
spinlock_t lock; // spinlock
s64 count; // overall count
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; // links this counter into the global percpu_counters list
#endif
s32 __percpu *counters; // per-CPU counts
};
/* fs/file_table.c */
// File-table statistics; max_files starts at NR_FILE and is recomputed by files_init().
struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
DECLARE_LGLOCK(files_lglock); // declares the functions associated with the files_lglock lock
DEFINE_LGLOCK(files_lglock); // defines the files_lglock lock and its associated functions
static struct kmem_cache *filp_cachep __read_mostly; // slab cache for struct file, created in files_init()
static struct percpu_counter nr_files __cacheline_aligned_in_smp; // count of file objects; annotated with __cacheline_aligned_in_smp
/*
 * files_init - one-time setup of the file-object machinery.
 * @mempages: number of memory pages
 *
 * Creates the "filp" slab cache, derives files_stat.max_files from the
 * amount of memory, then initialises the deferred fdtable lists, the
 * files_lglock lock and the nr_files counter.
 */
void __init files_init(unsigned long mempages)
{
	unsigned long limit;

	/* Cache from which every struct file is allocated. */
	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
					SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/*
	 * A file object is budgeted at roughly 1024 bytes, and file objects
	 * may take about a tenth of memory: pages * (PAGE_SIZE / 1024) / 10.
	 * The result is never allowed to drop below NR_FILE.
	 */
	limit = (mempages * (PAGE_SIZE / 1024)) / 10;
	files_stat.max_files = max_t(unsigned long, limit, NR_FILE);

	files_defer_init();
	lg_lock_init(files_lglock);		/* set up the files_lglock lock */
	percpu_counter_init(&nr_files, 0);	/* nr_files starts at zero */
}
1.1、files_defer_init函数主要用来初始化系统中每个处理器各自的per-cpu变量fdtable_defer_list的副本,并计算sysctl_nr_open_max的值,源代码如下所示:
/* arch/arm/include/asm/posix_types.h */
typedef unsigned int __kernel_size_t; // the kernel's size type is unsigned int on ARM
/* include/linux/types.h */
// size_t is defined once, as an alias of __kernel_size_t.
#ifndef _SIZE_T
#define _SIZE_T
typedef __kernel_size_t size_t;
#endif
/* arch/arm/include/asm/types.h */
#define BITS_PER_LONG 32 // width of long in bits on this platform
/* include/linux/kernel.h */
#define INT_MAX ((int)(~0U>>1)) // 0x7fffffff on a 32-bit platform
// min(): smaller of x and y, each evaluated exactly once.
// (Comments inside the macro are placed before the backslash: text after a
// trailing backslash would break the line continuation.)
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); /* type check: the compiler warns when x and y differ in type */ \
_min1 < _min2 ? _min1 : _min2; })
/* fs/file.c */
int sysctl_nr_open_max = 1024 * 1024; // build-time default; overwritten by files_defer_init()
/*
 * files_defer_init - prepare the per-CPU deferred-free lists and compute
 * sysctl_nr_open_max.
 *
 * The limit is the smaller of INT_MAX and the number of pointers that fit
 * in the size_t range, rounded down to a multiple of BITS_PER_LONG.
 * With the 32-bit size_t shown here and 4-byte pointers (assumed for this
 * ARM target) that is min(0x7fffffff, 0x3fffffff) & ~31 = 0x3fffffe0.
 */
void __init files_defer_init(void)
{
	int cpu;
	size_t ceiling;

	/* Walk every possible CPU, whether the system has one core or many. */
	for_each_possible_cpu(cpu) {
		fdtable_defer_list_init(cpu);
	}

	ceiling = min((size_t)INT_MAX, ~(size_t)0 / sizeof(void *));
	sysctl_nr_open_max = ceiling & -BITS_PER_LONG;
}
其中,fdtable_defer_list_init函数用来初始化指定处理器上struct fdtable_defer类型的per-cpu变量fdtable_defer_list(该变量本身由DEFINE_PER_CPU宏定义),源代码如下所示:
/* fs/file.c */
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); // per-CPU variable
// struct fdtable_defer: per-CPU list of fdtables waiting to be freed from a work item.
struct fdtable_defer {
spinlock_t lock; // spinlock; taken around every access to `next`
struct work_struct wq; // work item, bound to free_fdtable_work()
// singly linked list of struct fdtable; every process has one such object (embedded in struct files_struct)
struct fdtable *next;
};
/*
 * fdtable_defer_list_init - initialise one CPU's fdtable_defer_list.
 * @cpu: processor whose copy of the per-CPU variable is set up
 */
static void __devinit fdtable_defer_list_init(int cpu)
{
	/* This CPU's copy of the per-CPU variable. */
	struct fdtable_defer *slot = &per_cpu(fdtable_defer_list, cpu);

	spin_lock_init(&slot->lock);

	/*
	 * Once this work item is submitted, free_fdtable_work() will run at
	 * some later point.
	 */
	INIT_WORK(&slot->wq, free_fdtable_work);

	/* Nothing is queued yet. */
	slot->next = NULL;
}
/*
 * free_fdmem - release a buffer with the routine matching its origin:
 * vfree() for vmalloc addresses, kfree() for everything else.
 */
static void free_fdmem(void *ptr)
{
	if (is_vmalloc_addr(ptr)) {
		vfree(ptr);
		return;
	}

	kfree(ptr);
}
/*
 * __free_fdtable - release an fdtable together with its fd array and bitmaps.
 *
 * Per the original annotation, close_on_exec and open_fds share a single
 * allocation made by alloc_fdmem() whose start is open_fds, so freeing
 * open_fds releases both and close_on_exec needs no separate call.
 */
static void __free_fdtable(struct fdtable *fdt)
{
	void *fd_array = fdt->fd;
	void *bitmaps = fdt->open_fds;

	free_fdmem(fd_array);
	free_fdmem(bitmaps);
	kfree(fdt);
}
/*
 * free_fdtable_work - work handler that frees every fdtable queued on the
 * fdtable_defer instance containing @work.
 *
 * The whole list is detached under the lock; the tables are then freed
 * with the lock already released.
 */
static void free_fdtable_work(struct work_struct *work)
{
	struct fdtable_defer *defer =
		container_of(work, struct fdtable_defer, wq);
	struct fdtable *pending;
	struct fdtable *following;

	/* Take ownership of the queued list and leave an empty one behind. */
	spin_lock_bh(&defer->lock);
	pending = defer->next;
	defer->next = NULL;
	spin_unlock_bh(&defer->lock);

	/* Read the successor before freeing the current entry. */
	for (; pending; pending = following) {
		following = pending->next;
		__free_fdtable(pending);
	}
}
变量fdtable_defer_list中的工作队列函数free_fdtable_work在释放struct fdtable实例时由free_fdtable函数间接触发:free_fdtable通过call_rcu注册回调函数free_fdtable_rcu,后者在fd数组或位图使用vmalloc内存时调用schedule_work提交该工作,源代码如下所示:
/* include/linux/fdtable.h */
// free_fdtable: request release of an fdtable by registering free_fdtable_rcu()
// as an RCU callback on the table's embedded rcu head.
static inline void free_fdtable(struct fdtable *fdt)
{
call_rcu(&fdt->rcu, free_fdtable_rcu); // register the RCU callback
}
/* fs/file.c */
// free_fdtable_rcu: RCU callback that releases an fdtable.
// Three cases are handled:
//   1. max_fds <= NR_OPEN_DEFAULT: the table is embedded in a files_struct,
//      so the enclosing files_struct is returned to files_cachep;
//   2. neither fd nor open_fds is a vmalloc address: everything is kfree()d here;
//   3. otherwise: the table is queued on this CPU's fdtable_defer_list and the
//      work item is scheduled, so free_fdtable_work() frees it later.
void free_fdtable_rcu(struct rcu_head *rcu)
{
struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
struct fdtable_defer *fddef;
BUG_ON(!fdt);
if (fdt->max_fds <= NR_OPEN_DEFAULT) { // here the struct fdtable is embedded directly in a struct files_struct
kmem_cache_free(files_cachep,
container_of(fdt, struct files_struct, fdtab));
return;
}
if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { // no vmalloc memory involved
kfree(fdt->fd);
kfree(fdt->open_fds);
kfree(fdt);
} else {
fddef = &get_cpu_var(fdtable_defer_list); // current CPU's fdtable_defer_list; preemption is disabled
spin_lock(&fddef->lock);
// the next two statements push this fdtable onto the current CPU's fdtable_defer_list
fdt->next = fddef->next;
fddef->next = fdt;
schedule_work(&fddef->wq); // submit the work item
spin_unlock(&fddef->lock);
put_cpu_var(fdtable_defer_list); // re-enable preemption
}
}
1.2、percpu_counter_init函数用来将struct percpu_counter类型的变量nr_files的值初始化为零,在配置CONFIG_SMP的情况下,它的定义如下所示:
/* include/linux/percpu_counter.h */
// percpu_counter_init(): wraps __percpu_counter_init(), supplying a static
// lock_class_key unique to each expansion site.
// (The comment inside the macro sits before the backslash: text after a
// trailing backslash would break the line continuation.)
#define percpu_counter_init(fbc, value) \
({ \
static struct lock_class_key __key; /* key for the lock-dependency (deadlock detection) checker */ \
\
__percpu_counter_init(fbc, value, &__key); \
})
/* lib/percpu_counter.c */
static LIST_HEAD(percpu_counters); // global list of counters, filled by __percpu_counter_init() under CONFIG_HOTPLUG_CPU
static DEFINE_MUTEX(percpu_counters_lock); // held while percpu_counters is modified
/*
 * __percpu_counter_init - set up a per-CPU counter.
 * @fbc:    counter to initialise
 * @amount: starting value of the overall count
 * @key:    lock class key for the counter's spinlock
 *
 * Returns 0 on success, or -ENOMEM when the per-CPU storage cannot be
 * allocated.
 */
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
			  struct lock_class_key *key)
{
	s32 __percpu *slots;

	spin_lock_init(&fbc->lock);
	lockdep_set_class(&fbc->lock, key);	/* register with the deadlock checker */
	fbc->count = amount;

	/* One s32 slot per CPU. */
	slots = alloc_percpu(s32);
	fbc->counters = slots;
	if (slots == NULL)
		return -ENOMEM;

	debug_percpu_counter_activate(fbc);	/* debugging hook */

#ifdef CONFIG_HOTPLUG_CPU
	/* Make the counter reachable from the global percpu_counters list. */
	INIT_LIST_HEAD(&fbc->list);
	mutex_lock(&percpu_counters_lock);
	list_add(&fbc->list, &percpu_counters);
	mutex_unlock(&percpu_counters_lock);
#endif
	return 0;
}
2、文件对象由get_empty_filp函数创建(并初始化它的一些成员),源代码如下所示:
/* fs/file_table.c */
// get_empty_filp: allocate a zeroed file object and initialise its basic members.
// Returns the new file with f_count set to 1, or NULL when the file limit is
// reached (for callers without CAP_SYS_ADMIN), when allocation fails, or when
// security_file_alloc() reports an error.
struct file *get_empty_filp(void)
{
const struct cred *cred = current_cred(); // current->cred: credentials of the current process
static long old_max; // file count at which the limit message was last printed
struct file * f;
// get_nr_files() returns the value of percpu_counter_read_positive(&nr_files)
if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { // over the file limit and without CAP_SYS_ADMIN
// check again with percpu_counter_sum_positive() before refusing
if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
goto over;
}
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); // allocate the file object, zero-filled
if (f == NULL)
goto fail;
percpu_counter_inc(&nr_files); // nr_files += 1
f->f_cred = get_cred(cred); // takes a reference (increments cred->usage)
if (security_file_alloc(f)) // initialises f_security
goto fail_sec;
INIT_LIST_HEAD(&f->f_u.fu_list);
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
eventpoll_init_file(f); // initialises f_ep_links
/* f->f_version: 0 */
return f;
over:
// report the limit only when the count exceeds the previously reported value
if (get_nr_files() > old_max) {
pr_info("VFS: file-max limit %lu reached\n", get_max_files());
old_max = get_nr_files();
}
goto fail;
fail_sec:
// file_free() decrements nr_files and frees f through an RCU callback
file_free(f);
fail:
return NULL;
}
剩余的一些成员一般由__dentry_open函数初始化,源代码如下所示:
/* fs/open.c */
// __dentry_open: finish initialising file object f for the given dentry/mount pair.
// @open: open routine to call; when NULL, f->f_op->open is used if available.
// @cred: credentials passed to security_dentry_open().
// Returns f on success. On failure returns ERR_PTR(error); the error paths at
// cleanup_file also drop the dentry and mount references via dput()/mntput().
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
struct file *f,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
struct inode *inode;
int error;
// OPEN_FMODE converts the open flag O_RDONLY, O_WRONLY or O_RDWR into FMODE_READ, FMODE_WRITE or (FMODE_READ|FMODE_WRITE)
f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode; // inode backing the file
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, mnt); // per the original note: for non-special files, checks that the filesystem permits writing
if (error)
goto cleanup_file;
if (!special_file(inode->i_mode))
file_take_write(f); // sets f->f_mnt_write_state to FILE_MNT_WRITE_TAKEN
}
f->f_mapping = inode->i_mapping; // points to the file's contents
f->f_path.dentry = dentry; // dentry of the file
f->f_path.mnt = mnt; // information about the filesystem the file is on
f->f_pos = 0; // file offset
f->f_op = fops_get(inode->i_fop); // operations come from the filesystem-specific inode
file_sb_list_add(f, inode->i_sb); // adds the file to the superblock's inode->i_sb->s_files list through f->f_u.fu_list
// on SMP, f->f_sb_list_cpu is set to the current CPU id; otherwise it is zero
error = security_dentry_open(f, cred); // security module check
if (error)
goto cleanup_all;
if (!open && f->f_op)
open = f->f_op->open; // fall back to the open routine from the file operations
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}
ima_counts_get(f); // per the original note: applies to regular files
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); // clear open flags that are no longer needed
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); // initialises the read-ahead members f->f_ra.ra_pages and f->f_ra.prev_pos
// with O_DIRECT, the mapping must have a_ops providing at least one of direct_IO / get_xip_mem;
// otherwise the file is released with fput() and -EINVAL is returned
if (f->f_flags & O_DIRECT) {
if (!f->f_mapping->a_ops ||
((!f->f_mapping->a_ops->direct_IO) &&
(!f->f_mapping->a_ops->get_xip_mem))) {
fput(f);
f = ERR_PTR(-EINVAL);
}
}
return f; // success (or the ERR_PTR set by the O_DIRECT check above)
cleanup_all:
// undo, in turn, what was set up after write access was taken
fops_put(f->f_op);
if (f->f_mode & FMODE_WRITE) {
put_write_access(inode); // decrements inode->i_writecount
if (!special_file(inode->i_mode)) {
file_reset_write(f); // resets f->f_mnt_write_state to zero
mnt_drop_write(mnt); // decrements mnt->mnt_writers or mnt->mnt_pcp->mnt_writers
}
}
file_sb_list_del(f); // removes the file from its superblock's file list
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
cleanup_file:
put_filp(f);
dput(dentry);
mntput(mnt);
return ERR_PTR(error);
}
3、文件对象由file_free函数销毁,源代码如下所示:
/* fs/file_table.c */
/*
 * file_free_rcu - RCU callback that performs the final release of a file.
 * @head: the f_u.fu_rcuhead member embedded in the file being freed
 */
static inline void file_free_rcu(struct rcu_head *head)
{
	struct file *filp = container_of(head, struct file, f_u.fu_rcuhead);

	/* Drop the credential reference; the cred is freed when unused. */
	put_cred(filp->f_cred);

	/* Hand the object's memory back to the filp cache. */
	kmem_cache_free(filp_cachep, filp);
}
// file_free: destroy a file object. The counter is decremented immediately;
// the memory itself is released later by the RCU callback file_free_rcu().
static inline void file_free(struct file *f)
{
percpu_counter_dec(&nr_files); // nr_files -= 1
file_check_state(f); // checks the value of f->f_mnt_write_state
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); // register file_free_rcu as the RCU callback
}
4、文件对象的操作函数统一由struct file_operations结构体来声明。其中的成员会根据文件系统和文件类型的不同选择性地来实现(也可能一个也不须要实现),未实现的成员置为空指针。源代码如下所示:
/* include/linux/fs.h */
// struct file_operations: table of operations available on a file object.
// Members a filesystem or file type does not implement are left as NULL pointers.
struct file_operations {
struct module *owner; // meaningful when the filesystem is loaded as a kernel module
loff_t (*llseek) (struct file *, loff_t, int); // reposition the file offset
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); // read from the file
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); // write to the file
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); // asynchronous I/O read
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); // asynchronous I/O write
int (*readdir) (struct file *, void *, filldir_t); // iterate over the directory
unsigned int (*poll) (struct file *, struct poll_table_struct *); // I/O multiplexing
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); // ioctl operation
long (*compat_ioctl) (struct file *, unsigned int, unsigned long); // 32-bit compatibility ioctl offered on 64-bit systems
int (*mmap) (struct file *, struct vm_area_struct *); // map the file into memory
int (*open) (struct inode *, struct file *); // open operation
int (*flush) (struct file *, fl_owner_t id); // flush operation
int (*release) (struct inode *, struct file *); // counterpart of open
int (*fsync) (struct file *, int datasync); // synchronise data
int (*aio_fsync) (struct kiocb *, int datasync); // data synchronisation for asynchronous I/O
int (*fasync) (int, struct file *, int); // enable or disable signal notification for asynchronous I/O
int (*lock) (struct file *, int, struct file_lock *); // POSIX locks
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); // in-kernel copy of data between files
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); // find unused address space in which to map the file
int (*check_flags)(int); // validate flags
int (*flock) (struct file *, int, struct file_lock *); // advisory (flock) locks
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); // splice from a pipe into the file
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); // splice from the file into a pipe
int (*setlease)(struct file *, long, struct file_lock **); // file leases
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len); // manipulate the file's on-disk space directly
};