到目前为止,文章主要都是从理论上来讲述VFS的运行机制;接下来我们将深入源代码层中,通过阐述两个具有代表性的系统 调用sys_open()和sys_read()来更好地理解VFS向具体文件系统提供的接口机制。由于本文更关注的是文件操作的整个流程体制,所以我 们在追踪源代码时,对一些细节性的处理不予关心。又由于篇幅所限,只列出相关代码。本文中的源代码来自于linux-4.9.92内核版本。
在深入sys_open()和sys_read()之前,我们先概览下调用sys_read()的上下文。下图。描述了从用户空间的read()调用到数据从 磁盘读出的整个流程。当在用户应用程序调用文件I/O read()操作时,系统调用sys_read()被激发,sys_read()找到文件所在的具体文件 系统,把控制权传给该文件系统,最后由具体文件系统与物理介质交互,从介质中读出数据。
1 、sys_open()
sys_open()系统调用打开或创建一个文件,成功返回该文件的文件描述符。下面是sys_open()实现代码中主要的函数调用关系图。
sys_open函数调用关系图
sys_open()
do_sys_open()
getname()
get_unused_fd_flags()
do_filp_open()
set_nameidata()
path_openat()
get_empty_filp()
path_init()
link_path_walk()
do_last()
lookup_open()
atomic_open()
error = dir->i_op->atomic_open()
may_open()
vfs_open()
do_dentry_open()
f->f_op = fops_get(inode->i_fop);
open = f->f_op->open;
error = open(inode, f);
restore_nameidata()
fsnotify_open()
fd_install()
由于sys_open()的代码量大,函数调用关系复杂,以下主要是对该函数做整体的解析;而对其中的一些关键点,则列出其关键代码。
(整个open细节,在这个部分完成后,有时间我也会努力分析,写出来)
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
a 、 从sys_open()的函数代码可以看到,sys_open()在做了一些简单的参数填充后,就把接力棒传给do_sys_open():
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_flags op;
int fd = build_open_flags(flags, mode, &op);
struct filename *tmp;
if (fd)
return fd;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(flags);
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
fsnotify_open(f);
fd_install(fd, f);
}
}
putname(tmp);
return fd;
}
1)、首先,getname()把用户空间传过来的pathname,复制到内核空间;下面需要用到这个pathname解析找到inode;
2)、接着,get_unused_fd_flags()得到一个可用的文件描述符;通过该函数,可知文件描述符实质是进程打开文件列表中对应某个文件对象的索引值;
3)、然后,do_filp_open()打开文件,返回一个file对象,代表由该进程打开的一个文件;进程通过这样的一个数据结构对物理文件进行读写操作。
4)、接下来,fsnotify_open()通知文件被打开,fd_install()建立文件描述符与file对象的联系,以后进程对文件的读写都是通过操纵该文件描述符而进行。
5)、最后,putname()把第一步传过来的复制到内核空间的pathname销毁;
b. do_filp_open()用于打开文件,返回一个file对象;而打开之前需要先找到该文件:
struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;
set_nameidata(&nd, dfd, pathname);
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags);
if (unlikely(filp == ERR_PTR(-ESTALE)))
filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
restore_nameidata();
return filp;
}
1)、open_namei()用于根据文件路径名查找文件,借助一个持有路径信息的数据结构nameidata而进行;
2)、这个函数结束后将,nameidata这个数据结构将会马上被释放。
3)、path_openat()用于,寻找目标节点,打开直至调用驱动的open
c . path_openat用于正式开始处理路径和找到对应的dentry和inode,进而执行驱动中的open
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
const char *s;
struct file *file;
int opened = 0;
int error;
file = get_empty_filp();
if (IS_ERR(file))
return file;
file->f_flags = op->open_flag;
if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(nd, flags, op, file, &opened);
goto out2;
}
if (unlikely(file->f_flags & O_PATH)) {
error = do_o_path(nd, flags, file);
if (!error)
opened |= FILE_OPENED;
goto out2;
}
s = path_init(nd, flags);
if (IS_ERR(s)) {
put_filp(file);
return ERR_CAST(s);
}
while (!(error = link_path_walk(s, nd)) &&
(error = do_last(nd, file, op, &opened)) > 0) {
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
s = trailing_symlink(nd);
if (IS_ERR(s)) {
error = PTR_ERR(s);
break;
}
}
terminate_walk(nd);
out2:
if (!(opened & FILE_OPENED)) {
BUG_ON(!error);
put_filp(file);
}
if (unlikely(error)) {
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
else
error = -ESTALE;
}
file = ERR_PTR(error);
}
return file;
}
1)、get_empty_filp()得到一个空的file结构体(默认静态分配了一些,如果打开的文件超过默认的数量,则需要动态申请)
2)、path_init()根据路径名查找目标节点的第一步,确定path lookup的起始位置。分三种情形: 【从"/"开始】、 【从当前路径开始】、 【从进程某个已经打开的目录路径开始】。
3)、link_path_walk(),用walk_component()函数,该函数作用是根据分量名找打对应的path,如果是符合链接则调用nested_symlink()函数处理符合链,walk_component()首先跳过‘.’和‘..’,如果是‘.’则直接跳过,如果是‘..’则要考虑父目录是否是另外一个挂载点,如果是的话可能要切换文件系统。
4)、do_last()处理剩下的解析,和open函数调用流程。
注:O_PATH标志,指示文件系统树中的位置和 执行纯粹在文件描述符级别执行的操作。文件本身未打开。因为不打开文件,所以我们就不分析这个。
__O_TMPFILE表示创建一个不应允许访问(读或写)的文件。
d: open系统调用的最后一步
/*
* Handle the last step of open()
*/
static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op,
int *opened)
{
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool will_truncate = (open_flag & O_TRUNC) != 0;
bool got_write = false;
int acc_mode = op->acc_mode;
unsigned seq;
struct inode *inode;
struct path path;
int error;
nd->flags &= ~LOOKUP_PARENT;
nd->flags |= op->intent;
if (nd->last_type != LAST_NORM) {
error = handle_dots(nd, nd->last_type);
if (unlikely(error))
return error;
goto finish_open;
}
if (!(open_flag & O_CREAT)) {
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
error = lookup_fast(nd, &path, &inode, &seq);
if (likely(error > 0))
goto finish_lookup;
if (error < 0)
return error;
BUG_ON(nd->inode != dir->d_inode);
BUG_ON(nd->flags & LOOKUP_RCU);
} else {
/* create side of things */
/*
* This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
* has been cleared when we got to the last component we are
* about to look up
*/
error = complete_walk(nd);
if (error)
return error;
audit_inode(nd->name, dir, LOOKUP_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
return -EISDIR;
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
error = mnt_want_write(nd->path.mnt);
if (!error)
got_write = true;
/*
* do _not_ fail yet - we might not need that or fail with
* a different error; let lookup_open() decide; we'll be
* dropping this one anyway.
*/
}
if (open_flag & O_CREAT)
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
error = lookup_open(nd, &path, file, op, got_write, opened);
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
if (error <= 0) {
if (error)
goto out;
if ((*opened & FILE_CREATED) ||
!S_ISREG(file_inode(file)->i_mode))
will_truncate = false;
audit_inode(nd->name, file->f_path.dentry, 0);
goto opened;
}
if (*opened & FILE_CREATED) {
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
will_truncate = false;
acc_mode = 0;
path_to_nameidata(&path, nd);
goto finish_open_created;
}
/*
* If atomic_open() acquired write access it is dropped now due to
* possible mount and symlink following (this might be optimized away if
* necessary...)
*/
if (got_write) {
mnt_drop_write(nd->path.mnt);
got_write = false;
}
error = follow_managed(&path, nd);
if (unlikely(error < 0))
return error;
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
/*
* create/update audit record if it already exists.
*/
audit_inode(nd->name, path.dentry, 0);
if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
path_to_nameidata(&path, nd);
return -EEXIST;
}
seq = 0; /* out of RCU mode, so the value doesn't matter */
inode = d_backing_inode(path.dentry);
finish_lookup:
if (nd->depth)
put_link(nd);
error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
inode, seq);
if (unlikely(error))
return error;
path_to_nameidata(&path, nd);
nd->inode = inode;
nd->seq = seq;
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
error = complete_walk(nd);
if (error)
return error;
audit_inode(nd->name, nd->path.dentry, 0);
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
error = -ENOTDIR;
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
goto out;
if (!d_is_reg(nd->path.dentry))
will_truncate = false;
if (will_truncate) {
error = mnt_want_write(nd->path.mnt);
if (error)
goto out;
got_write = true;
}
finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
if (error)
goto out;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
if (error)
goto out;
*opened |= FILE_OPENED;
opened:
error = open_check_o_direct(file);
if (!error)
error = ima_file_check(file, op->acc_mode, *opened);
if (!error && will_truncate)
error = handle_truncate(file);
out:
if (unlikely(error) && (*opened & FILE_OPENED))
fput(file);
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
}
if (got_write)
mnt_drop_write(nd->path.mnt);
return error;
}
1)该函数比较长,先根基传入的flag,处理了一些特殊情况
2)接下来,调用lookup_open,首先调用__d_lookup()函数,在dentry_hashtable,以父dentry实例地址和name(要解析的分量)的hash为key查找对应的dentry实例,如果在缓存中找到相应的实例,主要调用__d_lookup()函数,即使找到,也不保证它是最新的,必须调用底层文件系统的dentry_operations中的d_revalidate()函数来检测缓存项是否任然有效,如果有效则作为缓存搜索结果返回。如果在dcache缓存中没有找到要找的dentry实例,则重新加载,调用lookup_hash()函数,lookup_hash()函数首先调用lookup_dcache()函数在缓存中生成新的dentry实例,然后调用lookup_real()函数,它是具体的文件系统相关函数,读取磁盘的inode节点信息,并将inode节点和目录项对象相关联,在iget索引节点时,将索引节点加入inode cache
3)may_open()检查该inode的权限
4)vfs_open()调用open函数
e 调用open函数
/**
* vfs_open - open the file at the given path
* @path: path to open
* @file: newly allocated file with f_flag initialized
* @cred: credentials to use
*/
int vfs_open(const struct path *path, struct file *file,
const struct cred *cred)
{
struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
file->f_path = *path;
return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
}
1)d_real根据flag看是否需要复制一份dentry
2)打开这个dentey,传入了dentry->inode里面的
f
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
static const struct file_operations empty_fops = {};
int error;
f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH;
f->f_op = &empty_fops;
return 0;
}
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
error = __mnt_want_write(f->f_path.mnt);
if (unlikely(error)) {
put_write_access(inode);
goto cleanup_file;
}
f->f_mode |= FMODE_WRITER;
}
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;
f->f_op = fops_get(inode->i_fop);
if (unlikely(WARN_ON(!f->f_op))) {
error = -ENODEV;
goto cleanup_all;
}
error = security_file_open(f, cred);
if (error)
goto cleanup_all;
error = break_lease(locks_inode(f), f->f_flags);
if (error)
goto cleanup_all;
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
return 0;
cleanup_all:
fops_put(f->f_op);
if (f->f_mode & FMODE_WRITER) {
put_write_access(inode);
__mnt_drop_write(f->f_path.mnt);
}
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
f->f_path.dentry = NULL;
f->f_inode = NULL;
return error;
}
1)填充好file结构体,这个是最终,最终要返回给最开始调用的do_filp_open函数的,它里面填充好了相关参数,包括inode,f_mapping ,mode,file_operation操作等
2)f->f_op = fops_get(inode->i_fop);从inode中得到操作方法。(这里会是一个和文件系统相关的默认的操作方法。)
3)
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
我们上面传入的open是NULL,所以会拿到inode里面指向的f_op中的open函数,如果inode中即驱动中有这个,则执行这个真正的open函数。
参考博文:
http://blog.chinaunix.net/uid-20522771-id-4419678.html
https://blog.csdn.net/u013686805/article/details/37660083
https://www.ibm.com/developerworks/cn/linux/l-cn-vfs/#icomments