自我理解open的设计思想
前言
只是看源码感觉只是了解这要干什么,但不理解为什么要这么干,所以先找资料学习一下open的设计思想
设计思想
在Linux中,有一句话是“一切皆文件”,目录,设备,套接字,普通文件等等都是文件,即使他看起来不像文件,Linux也会以文件的形式来呈现,这样就给我们用户提供了一个统一的文件视图,我们可以用一套标准来操纵Linux几乎所有资源。
所以在我理解中,文件就是计算机的资源,而我们将这些资源抽象成文件,在提供一套统一的标准来操纵这些资源,使用户可以简单的操作计算机
open故名思意,打开,打开什么?
打开文件,怎么打开文件?凭什么能打开文件?所以必须有文件的名字和你打开它的权限,然后计算机会给你一个非负整数,即所谓的文件描述符;用户态后续对文件的读写操作等都是通过fd来完成的。
在内核中,内核将对访问该文件的进程创建一个file结构。
内核使用进程描述符task_struct来描述一个进程,而该进程所有已打开文件对应的file结构将形成一个数组files(其为files_struct结构),内核向用户返回的fd便是该数组中具体file结构的索引。
函数分析
入口函数
在入口函数中主要调用了do_sys_open函数用来进行文件的打开;
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
do_sys_open函数
你可以看到该函数最后返回了一个文件描述符fd,目的就是将文件和文件描述符关联起来,其中最重要的是do_filp_open
函数,它正好就是创建了一个file
结构体,而它将执行文件的打开操作
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_how how = build_open_how(flags, mode);
return do_sys_openat2(dfd, filename, &how);
}
static long do_sys_openat2(int dfd, const char __user *filename,
struct open_how *how)
{
struct open_flags op;
int fd = build_open_flags(how, &op);
struct filename *tmp;
if (fd)
return fd;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(how->flags);
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
fsnotify_open(f);
fd_install(fd, f);
}
}
putname(tmp);
return fd;
}
do_file_open函数
这个函数用了nameidata
结构体来表示要打开文件的路径信息,并通过set_nameidata
函数将 nd 结构体初始化为要打开文件的路径信息,重点是path_openat
函数,它将打开文件并返回一个file
结构体
但你能看到path_openat
被调用了三次,原因是在RCU模式(rcu-walk)下进行文件打开操作;如果在此方式下打开失败,则进入普通模式(ref-walk)。第三次调用比较少用。
struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;
set_nameidata(&nd, dfd, pathname, NULL);
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags);
if (unlikely(filp == ERR_PTR(-ESTALE)))
filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
restore_nameidata();
return filp;
}
path_openat函数
可以看出下面这个函数是对一个file结构进行操作,这个file结构由函数alloc_empty_file
产生。
后续是对file标志位的判断,在后面的代码分析有提到。如果没有出现错误,函数会正常返回文件对象;如果出现错误,函数会返回一个错误指针,以便调用者知道发生了什么问题。
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
struct file *file;
int error;
file = alloc_empty_file(op->open_flag, current_cred());
if (IS_ERR(file))
return file;
if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(nd, flags, op, file);
} else if (unlikely(file->f_flags & O_PATH)) {
error = do_o_path(nd, flags, file);
} else {
const char *s = path_init(nd, flags);
while (!(error = link_path_walk(s, nd)) &&
(s = open_last_lookups(nd, file, op)) != NULL)
;
if (!error)
error = do_open(nd, file, op);
terminate_walk(nd);
}
if (likely(!error)) {
if (likely(file->f_mode & FMODE_OPENED))
return file;
WARN_ON(1);
error = -EINVAL;
}
fput(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
else
error = -ESTALE;
}
return ERR_PTR(error);
}
link_path_walk函数
link_path_walk 就像你的导航,帮助你找到正确的路径,处理不同情况,最终告诉你如何到达目的地。如果一切顺利,它会返回0,表示成功找到路径,否则会返回相应的错误代码,就像导航器会告诉你如果遇到问题该怎么办
static int link_path_walk(const char *name, struct nameidata *nd)
{
int depth = 0; // depth <= nd->depth
int err;
nd->last_type = LAST_ROOT;
nd->flags |= LOOKUP_PARENT;
if (IS_ERR(name))
return PTR_ERR(name);
while (*name=='/')
name++;
if (!*name) {
nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
return 0;
}
/* At this point we know we have a real path component. */
for(;;) {
struct user_namespace *mnt_userns;
const char *link;
u64 hash_len;
int type;
mnt_userns = mnt_user_ns(nd->path.mnt);
err = may_lookup(mnt_userns, nd);
if (err)
return err;
hash_len = hash_name(nd->path.dentry, name);
type = LAST_NORM;
if (name[0] == '.') switch (hashlen_len(hash_len)) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
nd->state |= ND_JUMPED;
}
break;
case 1:
type = LAST_DOT;
}
if (likely(type == LAST_NORM)) {
struct dentry *parent = nd->path.dentry;
nd->state &= ~ND_JUMPED;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
struct qstr this = { { .hash_len = hash_len }, .name = name };
err = parent->d_op->d_hash(parent, &this);
if (err < 0)
return err;
hash_len = this.hash_len;
name = this.name;
}
}
nd->last.hash_len = hash_len;
nd->last.name = name;
nd->last_type = type;
name += hashlen_len(hash_len);
if (!*name)
goto OK;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
name++;
} while (unlikely(*name == '/'));
if (unlikely(!*name)) {
OK:
/* pathname or trailing symlink, done */
if (!depth) {
nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
return 0;
}
/* last component of nested symlink */
name = nd->stack[--depth].name;
link = walk_component(nd, 0);
} else {
/* not the last component */
link = walk_component(nd, WALK_MORE);
}
if (unlikely(link)) {
if (IS_ERR(link))
return PTR_ERR(link);
/* a symlink to follow */
nd->stack[depth++].name = name;
name = link;
continue;
}
if (unlikely(!d_can_lookup(nd->path.dentry))) {
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return -ECHILD;
}
return -ENOTDIR;
}
}
}
do_tmpfile函数
该函数的进入条件是如果文件是临时文件,则进入该函数的处理过程
该函数用于创建一个临时文件,确保创建过程的合法性,然后返回结果,告诉你是否成功创建了文件。如果一切正常,它返回0,表示创建成功;否则,返回适当的错误代码。
static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
struct file *file)
{
struct user_namespace *mnt_userns;
struct dentry *child;
struct path path;
int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
if (unlikely(error))
return error;
error = mnt_want_write(path.mnt);
if (unlikely(error))
goto out;
mnt_userns = mnt_user_ns(path.mnt);
child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
error = PTR_ERR(child);
if (IS_ERR(child))
goto out2;
dput(path.dentry);
path.dentry = child;
audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
error = may_open(mnt_userns, &path, 0, op->open_flag);
if (!error)
error = vfs_open(&path, file);
out2:
mnt_drop_write(path.mnt);
out:
path_put(&path);
return error;
}
do_o_path函数
进入该函数的前提是标志位是O_PATH
该标志用于表示打开文件的目的是为了路径操作而不是读取或写入文件内容。路径操作可能包括获取文件的元数据、权限检查等,而不是直接操作文件内容。
执行路径查找操作,通过传入的路径描述 nd 和操作标志 flags,查找文件的位置信息,并将结果存储在 path 结构中。如果查找成功,error 变量被设置为0
记录有关路径操作的审计信息,就像在实际操作中可能需要记录相关信息一样。
使用 vfs_open 函数来打开找到的文件,就像在现实生活中,如果我们找到了要去的地方,就需要打开门。打开的文件对象将存储在 file 中。
释放之前分配的资源,并返回 error,它的值可能是0或其他错误码
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
struct path path;
int error = path_lookupat(nd, flags, &path);
if (!error) {
audit_inode(nd->name, path.dentry, 0);
error = vfs_open(&path, file);
path_put(&path);
}
return error;
}
path_init函数
如果文件不是临时文件或者仅仅为路径操作的文件,那我们将进入该函数的处理过程。
该函数准备文件查找的初始状态,包括路径字符串、权限检查和起始点的设置。它会根据传递的参数和标志位进行相应的初始化。
它准备和初始化了一个用于查找文件的工具箱,就像你在旅行前要准备好导航地图和工具一样
static const char *path_init(struct nameidata *nd, unsigned flags)
{
int error;
const char *s = nd->name->name;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
return ERR_PTR(-EAGAIN);
if (!*s)
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
rcu_read_lock();
nd->flags = flags;
nd->state |= ND_JUMPED;
nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
smp_rmb();
if (nd->state & ND_ROOT_PRESET) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
if (*s && unlikely(!d_can_lookup(root)))
return ERR_PTR(-ENOTDIR);
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
nd->root_seq = nd->seq;
} else {
path_get(&nd->path);
}
return s;
}
nd->root.mnt = NULL;
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
error = nd_jump_root(nd);
if (unlikely(error))
return ERR_PTR(error);
return s;
}
/* Relative pathname -- get the starting-point it is relative to. */
if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
}
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
struct dentry *dentry;
if (!f.file)
return ERR_PTR(-EBADF);
dentry = f.file->f_path.dentry;
if (*s && unlikely(!d_can_lookup(dentry))) {
fdput(f);
return ERR_PTR(-ENOTDIR);
}
nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
nd->inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
fdput(f);
}
/* For scoped-lookups we need to set the root to the dirfd as well. */
if (flags & LOOKUP_IS_SCOPED) {
nd->root = nd->path;
if (flags & LOOKUP_RCU) {
nd->root_seq = nd->seq;
} else {
path_get(&nd->root);
nd->state |= ND_ROOT_GRABBED;
}
}
return s;
}
open_last_lookups函数
用于执行文件路径查找和打开文件操作,根据查找和打开的情况,可能返回文件对象或错误码
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool got_write = false;
unsigned seq;
struct inode *inode;
struct dentry *dentry;
const char *res;
nd->flags |= op->intent;
if (nd->last_type != LAST_NORM) {
if (nd->depth)
put_link(nd);
return handle_dots(nd, nd->last_type);
}
if (!(open_flag & O_CREAT)) {
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
dentry = lookup_fast(nd, &inode, &seq);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (likely(dentry))
goto finish_lookup;
BUG_ON(nd->flags & LOOKUP_RCU);
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
return ERR_PTR(-EISDIR);
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
got_write = !mnt_want_write(nd->path.mnt);
/*
* do _not_ fail yet - we might not need that or fail with
* a different error; let lookup_open() decide; we'll be
* dropping this one anyway.
*/
}
if (open_flag & O_CREAT)
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
dentry = lookup_open(nd, file, op, got_write);
if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
fsnotify_create(dir->d_inode, dentry);
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
if (got_write)
mnt_drop_write(nd->path.mnt);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
dput(nd->path.dentry);
nd->path.dentry = dentry;
return NULL;
}
finish_lookup:
if (nd->depth)
put_link(nd);
res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
if (unlikely(res))
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
return res;
}
do_open函数
经过link_path_walk
和open_last_lookups
函数的审查,我们将进入最后的打开阶段
link_path_walk 就像是查找地图上的具体位置,它告诉你目标位置的坐标,并且返回了是否找到的信息。它的主要任务是找到路径的每个部分,并获取相关的地点信息。
open_last_lookups 则类似于你到达目标地点后,你需要执行的具体任务。它接受之前查找得到的信息,可能会有一些额外的工作,比如打开文件或者检查特殊情况。它的返回值告诉你接下来需要去哪里,或者是否已经完成了任务。
负责打开文件,并在打开文件前进行各种权限检查、安全检查以及可能的截断操作。如果一切正常,返回0;否则,返回适当的错误代码
do_open 就像你在打开文件之前需要检查文件的状态、权限和内容,以确保你能够顺利打开文件并进行相关操作。如果一切正常,它会让你成功打开文件;否则,会告诉你出现了问题。
static int do_open(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct user_namespace *mnt_userns;
int open_flag = op->open_flag;
bool do_truncate;
int acc_mode;
int error;
if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
error = complete_walk(nd);
if (error)
return error;
}
if (!(file->f_mode & FMODE_CREATED))
audit_inode(nd->name, nd->path.dentry, 0);
mnt_userns = mnt_user_ns(nd->path.mnt);
if (open_flag & O_CREAT) {
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
if (d_is_dir(nd->path.dentry))
return -EISDIR;
error = may_create_in_sticky(mnt_userns, nd,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
return error;
}
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
return -ENOTDIR;
do_truncate = false;
acc_mode = op->acc_mode;
if (file->f_mode & FMODE_CREATED) {
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
acc_mode = 0;
} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
error = mnt_want_write(nd->path.mnt);
if (error)
return error;
do_truncate = true;
}
error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
if (!error && !(file->f_mode & FMODE_OPENED))
error = vfs_open(&nd->path, file);
if (!error)
error = ima_file_check(file, op->acc_mode);
if (!error && do_truncate)
error = handle_truncate(mnt_userns, file);
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
}
if (do_truncate)
mnt_drop_write(nd->path.mnt);
return error;
}
vfs_open 函数
该函数主要调用了do_dentry_open
函数
int vfs_open(const struct path *path, struct file *file)
{
file->f_path = *path;
return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}
do_dentry_open函数
对于该函数我们知道一件事就行了,那就是将我们前面所创建的file结构体 和inode连接到一起。
file 结构体是用于跟踪和管理文件操作的数据结构,而 inode 结构体用于存储文件的元数据信息。
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
{
static const struct file_operations empty_fops = {};
int error;
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH | FMODE_OPENED;
f->f_op = &empty_fops;
return 0;
}
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_inc(inode);
} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
error = __mnt_want_write(f->f_path.mnt);
if (unlikely(error)) {
put_write_access(inode);
goto cleanup_file;
}
f->f_mode |= FMODE_WRITER;
}
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;
f->f_op = fops_get(inode->i_fop);
if (WARN_ON(!f->f_op)) {
error = -ENODEV;
goto cleanup_all;
}
error = security_file_open(f);
if (error)
goto cleanup_all;
error = break_lease(locks_inode(f), f->f_flags);
if (error)
goto cleanup_all;
/* normally all 3 are set; ->open() can clear them if needed */
f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
/* NB: we're sure to have correct a_ops only after f_op->open */
if (f->f_flags & O_DIRECT) {
if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
return -EINVAL;
}
/*
* XXX: Huge page cache doesn't support writing yet. Drop all page
* cache for this file before processing writes.
*/
if (f->f_mode & FMODE_WRITE) {
/*
* Paired with smp_mb() in collapse_file() to ensure nr_thps
* is up to date and the update to i_writecount by
* get_write_access() is visible. Ensures subsequent insertion
* of THPs into the page cache will fail.
*/
smp_mb();
if (filemap_nr_thps(inode->i_mapping)) {
struct address_space *mapping = inode->i_mapping;
filemap_invalidate_lock(inode->i_mapping);
/*
* unmap_mapping_range just need to be called once
* here, because the private pages is not need to be
* unmapped mapping (e.g. data segment of dynamic
* shared libraries here).
*/
unmap_mapping_range(mapping, 0, 0, 0);
truncate_inode_pages(mapping, 0);
filemap_invalidate_unlock(inode->i_mapping);
}
}
return 0;
cleanup_all:
if (WARN_ON_ONCE(error > 0))
error = -EINVAL;
fops_put(f->f_op);
put_file_access(f);
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
f->f_path.dentry = NULL;
f->f_inode = NULL;
return error;
}
流程图
心得体会
根据源码的分析,我们大致搞清inode和file结构体之间的区别和关系,file结构体记录打开文件的操作,inode记录的是文件的信息。
举个例子,文件相当于一本书,file是你目前再看的页面,你可以对该页进行一些操作,inode相当于书的目录,你可以根据目录找到你要的文件在那一页
open是对文件的操作,所以使用file结构体记录,最后将其和inode相联系,类似可以猜测wirte系统调用也是同样的道理,我们将open调用创建的那个和inode相关联的file结构体进行操作,最终会使文件的inode产生变化,我们的信息也就写入了文件中。read同理。
文件系统完全用一种统一的视图来操作文件,将不同的信息分成不同的结构体管理,inode是文件的数据,file是对文件的操作,dentry也可以想到是为了提高查找的效率所设计的一种结构体。