linux文件系统--VFS与文件系统层的关系

最新推荐文章于 2024-01-19 15:02:03 发布

月下看鸟

最新推荐文章于 2024-01-19 15:02:03 发布

阅读量1.2k

点赞数

分类专栏： linux 文章标签： linux 文件系统

linux 专栏收录该内容

14 篇文章 0 订阅

订阅专栏

VFS之下是各具体文件系统特有的层，这一层把VFS的文件I/O转换成页和块。

在深入sys_open()和sys_read()之前，我们先概览下调用sys_read()的上下文。下图描述了从用户空间的read()调用到数据从磁盘读出的整个流程。当在用户应用程序调用文件I/O read()操作时，系统调用sys_read()被激发，sys_read()找到文件所在的具体文件系统，把控制权传给该文件系统，最后由具体文件系统与物理介质交互，从介质中读出数据。

图片示例_从物理介质读数据的过程

1.sys_open()

sys_open()系统调用打开或创建一个文件，成功返回该文件的文件描述符。图8是sys_open()实现代码中主要的函数调用关系图。

图片示例_sys_open函数调用关系图

a. 从sys_open()的函数调用关系图可以看到，sys_open()在做了一些简单的参数检验后，就把接力棒传给do_sys_open()：

/* fs/open.c */
/*
 * open(2) system-call entry point.
 *
 * Adds O_LARGEFILE to the caller's flags when force_o_largefile() says so,
 * then delegates to do_sys_open() with AT_FDCWD as the base directory.
 * Returns whatever do_sys_open() returns (a descriptor or a negative errno).
 */
asmlinkage long sys_open(const char __user *filename, int flags, int mode)
{
	const int open_flags = force_o_largefile() ? (flags | O_LARGEFILE) : flags;
	long result = do_sys_open(AT_FDCWD, filename, open_flags, mode);

	/* avoid REGPARM breakage on x86: */
	prevent_tail_call(result);
	return result;
}
真正的打开函数是do_sys_open:
/* fs/open.c */
/*
 * Common implementation behind open(2); sys_open() passes AT_FDCWD as dfd.
 *
 * Steps: copy the user-space path into kernel memory, reserve an unused
 * descriptor, open the file, and bind the resulting struct file to the
 * descriptor.
 *
 * Returns the new descriptor on success, or a negative errno taken from
 * whichever step failed.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	struct file *filp;
	/* kernel-side copy of the user-supplied path name */
	char *kname = getname(filename);
	int fd = PTR_ERR(kname);

	if (IS_ERR(kname))
		return fd;

	/* Reserve a free descriptor slot; no inode is involved yet. */
	fd = get_unused_fd();
	if (fd < 0)
		goto out_putname;

	/* Turn the path name into an open struct file. */
	filp = do_filp_open(dfd, kname, flags, mode);
	if (IS_ERR(filp)) {
		/* Give the reserved descriptor back and report the error. */
		put_unused_fd(fd);
		fd = PTR_ERR(filp);
		goto out_putname;
	}

	fsnotify_open(filp->f_dentry);
	/* Publish the file under the reserved descriptor. */
	fd_install(fd, filp);

out_putname:
	putname(kname);
	return fd;
}

1）、首先，get_unused_fd()得到一个可用的文件描述符；并标记为忙。

2）、接着，do_filp_open()打开文件，返回一个file对象，代表由该进程打开的一个文件；进程通过这样的一个数据结构对物理文件进行读写操作。

3）、最后，fd_install()建立文件描述符与file对象的联系，以后进程对文件的读写都是通过操纵该文件描述符而进行。

b. do_filp_open()用于打开文件，返回一个file对象；而打开之前需要先找到该文件：

1）、open_namei()用于根据文件路径名查找文件，借助一个持有路径信息的数据结构nameidata而进行；

/*
 * Working state of a path-name lookup. The lookup routines fill it in as
 * they walk the path; on success dentry/mnt identify the resolved object.
 */
struct nameidata {
	struct dentry	*dentry;	/* current directory entry */
	struct vfsmount *mnt;		/* mount that the current dentry belongs to */
	struct qstr	last;		/* last component of the path name */
	unsigned int	flags;		/* lookup flags (LOOKUP_*) */
	int		last_type;	/* kind of the last component (LAST_NORM, LAST_ROOT, LAST_DOT, ...) */
	unsigned	depth;		/* current symlink nesting depth */
	char *saved_names[MAX_NESTED_LINKS + 1];/* path names saved for nested symlinks; presumably one slot per depth level - confirm */


	/* Intent data */
	union {
		struct open_intent open;/* open intent: target file, open flags, create mode */
	} intent;
};

2）、查找结束后将填充有路径信息的nameidata返回给接下来的函数nameidata_to_filp()从而得到最终的file对象；当达到目的后，nameidata这个数据结构将会马上被释放。

/*
 * Open the file named by @filename and return its struct file.
 *
 * The path is resolved by open_namei() into an on-stack nameidata, which
 * nameidata_to_filp() then converts into the final file object.
 *
 * Returns the struct file on success or an ERR_PTR()-encoded errno.
 */
static struct file *do_filp_open(int dfd, const char *filename, int flags,
		int mode)
{
	/* A full structure on the stack, not a pointer. */
	struct nameidata nd;
	int lookup_flags = flags;
	int err;

	/*
	 * Bump the low access-mode bits by one, unless that would wrap them
	 * to zero, so they match the encoding open_namei() expects
	 * (01 read, 10 write, 11 read/write).
	 */
	if ((lookup_flags + 1) & O_ACCMODE)
		lookup_flags++;

	/* Resolve the name; the result is stored in nd. */
	err = open_namei(dfd, filename, lookup_flags, mode, &nd);
	if (err)
		return ERR_PTR(err);

	/* Success: convert the nameidata into a struct file. */
	return nameidata_to_filp(&nd, flags);
}

在do_filp_open()中，函数调用open_namei()实现路径名的查找，生成相关的nameidata结构并得到相应的索引节点。

/* fs/namei.c */
/*
 * open_namei()
 *
 * namei for open - this is in fact almost the whole open-routine.
 *
 * Note that the low bits of "flag" aren't the same as in the open
 * system call - they are 00 - no permissions needed
 *     01 - read permission needed
 *     10 - write permission needed
 *     11 - read/write permissions needed
 * which is a lot more logical, and also allows the "no perm" needed
 * for symlinks (where the permissions are checked later).
 * SMP-safe
 */
/*
 * Parameters:
 *   dfd      - base directory descriptor, or AT_FDCWD
 *   pathname - kernel-space path name to resolve
 *   flag     - open flags, with the low bits re-encoded as described above
 *   mode     - creation mode, passed on when O_CREAT is set
 *   nd       - lookup state filled in for the caller
 *
 * Returns 0 on success (nd then describes the object to open) or a
 * negative errno. On the error paths that reach "exit", the open intent
 * and the path held in nd are released before returning.
 */
int open_namei(int dfd, const char *pathname, int flag,
  int mode, struct nameidata *nd)
{
 int acc_mode, error;
 struct path path;
 struct dentry *dir;
 int count = 0;
 // Access-permission mask (MAY_*) derived from the low bits of flag.
 // The article quotes ACC_MODE(x) as a lookup into a small string table
 // indexed by (x)&O_ACCMODE - confirm against the kernel headers.
 acc_mode = ACC_MODE(flag);
 /* O_TRUNC implies we need access checks for write permissions */
 // Truncation therefore adds MAY_WRITE to the required permissions.
 if (flag & O_TRUNC)
  acc_mode |= MAY_WRITE;
 /* Allow the LSM permission hook to distinguish append 
    access from general write access. */
 // O_APPEND adds MAY_APPEND to the required permissions.
 if (flag & O_APPEND)
  acc_mode |= MAY_APPEND;
 /*
  * The simplest case - just a plain lookup.
  */
 // No O_CREAT: the file must already exist.
 if (!(flag & O_CREAT)) {
  // Resolve pathname directly; the result (dentry and mount) lands in nd.
  error = path_lookup_open(dfd, pathname, lookup_flags(flag),
      nd, flag);
  if (error)
   return error;
  goto ok;
 }
 /*
  * Create - we need to know the parent.
  */
 // With LOOKUP_PARENT the walk stops at the parent directory and records
 // the final component in nd->last.
 error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
 if (error)
  return error;
 /*
  * We have the parent and last component. First of all, check
  * that we are not asked to creat(2) an obvious directory - that
  * will not do.
  */
 error = -EISDIR;
 // The last component must be a normal name with nothing following it
 // (a non-NUL byte at last.name[last.len] - presumably a trailing slash).
 if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
  goto exit;
 // dir is the parent directory found by the lookup above.
 dir = nd->dentry;
 // Drop the "stop at parent" flag for the remaining steps.
 nd->flags &= ~LOOKUP_PARENT;
 mutex_lock(&dir->d_inode->i_mutex);
 // Look up the final component and store it in path. Presumably
 // lookup_hash() searches for nd->last inside nd->dentry, so path.dentry
 // is the target entry while dir remains its parent - TODO confirm.
 path.dentry = lookup_hash(nd);
 path.mnt = nd->mnt;
do_last:
 // Bail out if the lookup of the final component failed.
 error = PTR_ERR(path.dentry);
 if (IS_ERR(path.dentry)) {
  mutex_unlock(&dir->d_inode->i_mutex);
  goto exit;
 }
 // nd->intent.open.file is the file object that will eventually be
 // returned; an error stored there aborts the open.
 if (IS_ERR(nd->intent.open.file)) {
  mutex_unlock(&dir->d_inode->i_mutex);
  error = PTR_ERR(nd->intent.open.file);
  goto exit_dput;
 }
 /* Negative dentry, just create the file */
 if (!path.dentry->d_inode) {
  // Create the new file and return. The directory mutex is not unlocked
  // here; presumably open_namei_create() releases it - confirm.
  error = open_namei_create(nd, &path, flag, mode);
  if (error)
   goto exit;
  return 0;
 }
 // From here on the target already exists.
 /*
  * It already exists.
  */
 mutex_unlock(&dir->d_inode->i_mutex);
 audit_inode_update(path.dentry->d_inode);
 error = -EEXIST;
 // O_EXCL demands that the file not exist yet; it does, so fail.
 if (flag & O_EXCL)
  goto exit_dput;
 // If a mount point was crossed and O_NOFOLLOW is set, fail with -ELOOP.
 if (__follow_mount(&path)) {
  error = -ELOOP;
  if (flag & O_NOFOLLOW)
   goto exit_dput;
 }
 error = -ENOENT;
 if (!path.dentry->d_inode)
  goto exit_dput;
 // An inode that provides ->follow_link is handled as a symbolic link.
 if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
  goto do_link;
 // Copy the dentry and mount from path into nd.
 path_to_nameidata(&path, nd);
 error = -EISDIR;
 // An existing directory cannot be the target of a create-open.
 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
  goto exit;
ok:
 // Final pre-open checks on nd's dentry/inode against acc_mode and flag.
 error = may_open(nd, acc_mode, flag);
 if (error)
  goto exit;
 return 0;
 // Error handling: release what was acquired and return the error.
exit_dput:
 dput_path(&path, nd);
exit:
 if (!IS_ERR(nd->intent.open.file))
  release_open_intent(nd);
 path_release(nd);
 return error;
 // Symbolic link: resolve it to its parent + last component, then loop back
 // to do_last. The count variable bounds the number of iterations.
do_link:
 error = -ELOOP;
 if (flag & O_NOFOLLOW)
  goto exit_dput;
 /*
  * This is subtle. Instead of calling do_follow_link() we do the
  * thing by hands. The reason is that this way we have zero link_count
  * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
  * After that we have the parent and last component, i.e.
  * we are in the same situation as after the first path_walk().
  * Well, almost - if the last component is normal we get its copy
  * stored in nd->last.name and we will have to putname() it when we
  * are done. Procfs-like symlinks just set LAST_BIND.
  */
 // Ask the walk to stop at the parent again.
 nd->flags |= LOOKUP_PARENT;
 error = security_inode_follow_link(path.dentry, nd);
 if (error)
  goto exit_dput;
 // Follow the link.
 error = __do_follow_link(&path, nd);
 if (error) {
  /* Does someone understand code flow here? Or it is only
   * me so stupid? Anathema to whoever designed this non-sense
   * with "intent.open".
   */
  release_open_intent(nd);
  return error;
 }
 nd->flags &= ~LOOKUP_PARENT;
 // Inspect what kind of last component the link resolved to.
 if (nd->last_type == LAST_BIND)
  goto ok;
 error = -EISDIR;
 if (nd->last_type != LAST_NORM)
  goto exit;
 // Something follows the last name: release the copied name and fail.
 if (nd->last.name[nd->last.len]) {
  __putname(nd->last.name);
  goto exit;
 }
 error = -ELOOP;
 // Give up once 32 links have already been followed in this call.
 if (count++==32) {
  __putname(nd->last.name);
  goto exit;
 }
 dir = nd->dentry;
 mutex_lock(&dir->d_inode->i_mutex);
 // Look up the new final component under the new parent.
 path.dentry = lookup_hash(nd);
 path.mnt = nd->mnt;
 __putname(nd->last.name);
 goto do_last;
}

在open_namei()中，就是利用path_lookup_open和path_lookup_create这两个函数找到路径名对应的挂接点和dentry结构, 赋值到nameidata结构中, create时如果文件不存在, 建立新文件。

/**
 * path_lookup_open - resolve a path with the intent of opening it
 * @dfd: base directory descriptor, or AT_FDCWD
 * @name: path name to resolve
 * @lookup_flags: LOOKUP_* flags for the walk
 * @nd: nameidata that receives the result
 * @open_flags: flags recorded in the open intent
 *
 * Thin wrapper around __path_lookup_intent_open() with a create mode of 0.
 * Returns that function's result.
 */
int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
		struct nameidata *nd, int open_flags)
{
	/* A plain open carries no creation mode. */
	const int no_create_mode = 0;

	return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
					 open_flags, no_create_mode);
}

/**
 * path_lookup_create - resolve a path with the intent of opening and creating
 * @dfd: base directory descriptor, or AT_FDCWD
 * @name: path name to resolve
 * @lookup_flags: LOOKUP_* flags for the walk; LOOKUP_CREATE is added here
 * @nd: nameidata that receives the result
 * @open_flags: flags recorded in the open intent
 * @create_mode: creation mode recorded in the open intent
 *
 * Thin wrapper around __path_lookup_intent_open(). Returns its result.
 */
static int path_lookup_create(int dfd, const char *name,
			      unsigned int lookup_flags, struct nameidata *nd,
			      int open_flags, int create_mode)
{
	const unsigned int create_lookup_flags = lookup_flags | LOOKUP_CREATE;

	return __path_lookup_intent_open(dfd, name, create_lookup_flags,
					 nd, open_flags, create_mode);
}

这两个函数都是调用__path_lookup_intent_open, 只是参数不同,create中加入了LOOKUP_CREATE标志和create_mode:

/*
 * Shared worker for path_lookup_open() and path_lookup_create().
 *
 * Grabs an empty struct file, records it together with the open flags and
 * create mode in nd->intent.open, then walks the path with LOOKUP_OPEN added.
 *
 * Returns 0 on success, -ENFILE when no empty file object is available, or
 * the error from the walk / from the intent's file pointer.
 */
static int __path_lookup_intent_open(int dfd, const char *name,
		unsigned int lookup_flags, struct nameidata *nd,
		int open_flags, int create_mode)
{
	struct file *empty = get_empty_filp();
	int rc;

	/* No file object could be obtained. */
	if (!empty)
		return -ENFILE;

	/* Record the open intent; this file is what ultimately gets returned. */
	nd->intent.open.file = empty;
	nd->intent.open.flags = open_flags;
	nd->intent.open.create_mode = create_mode;

	/* The actual path walk. */
	rc = do_path_lookup(dfd, name, lookup_flags | LOOKUP_OPEN, nd);

	/* The intent's file pointer is inspected before rc. */
	if (!IS_ERR(nd->intent.open.file)) {
		/* File pointer is fine; on a failed walk drop the intent. */
		if (rc)
			release_open_intent(nd);
		return rc;
	}

	/* File pointer holds an error and the walk failed too: report the walk. */
	if (rc)
		return rc;

	/* Walk succeeded but the file pointer is bad: release the path, return its error. */
	rc = PTR_ERR(nd->intent.open.file);
	path_release(nd);
	return rc;
}

// Path lookup: choose the starting directory for the walk, then resolve
// name relative to it via link_path_walk().
//   dfd   - base directory descriptor, or AT_FDCWD for the current directory
//   name  - kernel-space path name
//   flags - LOOKUP_* flags stored into nd->flags
//   nd    - lookup state; holds the result on success
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int fastcall do_path_lookup(int dfd, const char *name,
    unsigned int flags, struct nameidata *nd)
{
 int retval = 0;
 int fput_needed;
 struct file *file;
 // Filesystem context (root, cwd, ...) of the current task.
 struct fs_struct *fs = current->fs;
 // Default the last-component type to LAST_ROOT; per the original note this
 // covers a path that consists only of slashes.
 nd->last_type = LAST_ROOT; /* if there are only slashes... */
 nd->flags = flags;
 nd->depth = 0;
 // Each branch below selects the start point (nd->mnt, nd->dentry) and takes
 // a reference on both with mntget()/dget().
 if (*name=='/') {
  // Absolute path.
  read_lock(&fs->lock);
  if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
   // An alternate root is set and LOOKUP_NOALT was not requested:
   // try resolving the name under the alternate root first.
   nd->mnt = mntget(fs->altrootmnt);
   nd->dentry = dget(fs->altroot);
   read_unlock(&fs->lock);
   if (__emul_lookup_dentry(name,nd))
    goto out; /* found in altroot */
   read_lock(&fs->lock);
  }
  // Start from the task's root mount ...
  nd->mnt = mntget(fs->rootmnt);
  // ... and root dentry.
  nd->dentry = dget(fs->root);
  read_unlock(&fs->lock);
 } else if (dfd == AT_FDCWD) {
  // Relative path based on the current working directory; this is the
  // branch taken for relative names coming from sys_open().
  read_lock(&fs->lock);
  // Start from the cwd's mount ...
  nd->mnt = mntget(fs->pwdmnt);
  // ... and the cwd's dentry.
  nd->dentry = dget(fs->pwd);
  read_unlock(&fs->lock);
 } else {
  struct dentry *dentry;
  // Relative path based on an open directory descriptor. fget_light()
  // resolves dfd to its struct file; fput_needed is handed back to
  // fput_light() later (presumably it says whether a reference was
  // actually taken - confirm).
  file = fget_light(dfd, &fput_needed);
  retval = -EBADF;
  if (!file)
   goto out_fail;
  dentry = file->f_dentry;
  retval = -ENOTDIR;
  if (!S_ISDIR(dentry->d_inode->i_mode))
   goto fput_fail;
  // The base directory must grant MAY_EXEC permission.
  retval = file_permission(file, MAY_EXEC);
  if (retval)
   goto fput_fail;
  // Start from the descriptor's mount and dentry.
  nd->mnt = mntget(file->f_vfsmnt);
  nd->dentry = dget(dentry);
  // Release the file obtained by fget_light().
  fput_light(file, fput_needed);
 }
 // Reset the task's running count of followed links.
 current->total_link_count = 0;
 // Walk the path components: this is the core of the lookup.
 retval = link_path_walk(name, nd);
out:
 if (likely(retval == 0)) {
  // Success path: report the resolved inode to the audit subsystem when
  // auditing is active and a valid inode was found.
  // NOTE: the call below belongs to the inner if despite its indentation.
  if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
    nd->dentry->d_inode))
  audit_inode(name, nd->dentry->d_inode);
 }
out_fail:
 return retval;
fput_fail:
 fput_light(file, fput_needed);
 goto out_fail;
}

do_path_lookup调用的核心函数是link_path_walk:

/*
 * Retry wrapper around __link_path_walk().
 *
 * If the first walk fails with -ESTALE, the starting nameidata is restored
 * from a saved copy and the whole path is walked once more with
 * LOOKUP_REVAL set, forcing real lookup requests instead of relying on
 * the dcache.
 *
 * Returns the result of the last walk performed.
 */
int fastcall link_path_walk(const char *name, struct nameidata *nd)
{
	/* Snapshot of the starting point, kept for a possible retry. */
	struct nameidata saved = *nd;
	int err;

	/* make sure the stuff we saved doesn't go away */
	dget(saved.dentry);
	mntget(saved.mnt);

	err = __link_path_walk(name, nd);
	if (err != -ESTALE)
		goto drop_saved;

	/* Stale handle: restart from the snapshot and force revalidation. */
	*nd = saved;
	dget(nd->dentry);
	mntget(nd->mnt);
	nd->flags |= LOOKUP_REVAL;
	err = __link_path_walk(name, nd);

drop_saved:
	dput(saved.dentry);
	mntput(saved.mnt);
	return err;
}

真正的名称解析函数__link_path_walk:
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
/*
 * The loop consumes one '/'-separated component per iteration. Intermediate
 * components are looked up and entered; the final component is handled at
 * last_component / last_with_slashes, or only recorded in nd->last when
 * LOOKUP_PARENT is set (lookup_parent).
 */
static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
{
 struct path next;
 struct inode *inode;
 int err;
 unsigned int lookup_flags = nd->flags;
 // Skip leading slashes; any number of them is accepted.
 while (*name=='/')
  name++;
 // Nothing left after the slashes: no component to resolve.
 if (!*name)
  goto return_reval;
 // Inode of the directory the walk starts in.
 inode = nd->dentry->d_inode;
 if (nd->depth)
  lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
 /* At this point we know we have a real path component. */
 for(;;) {
  // One iteration per path component.
  unsigned long hash;
  struct qstr this;
  unsigned int c;
  nd->flags |= LOOKUP_CONTINUE;
  // Permission check on the directory being traversed; 0 means allowed.
  err = exec_permission_lite(inode, nd);
  if (err == -EAGAIN)
   // -EAGAIN from the lite check: fall back to the full MAY_EXEC check
   // (presumably the lite check could not decide - confirm).
   err = vfs_permission(nd, MAY_EXEC);
  // Any remaining error ends the walk.
  // NOTE: despite its indentation, the if below is NOT part of the
  // -EAGAIN branch; it tests the result of whichever check ran.
   if (err)
   break;
  // Start building the qstr for this component.
  this.name = name;
  // First byte of the component.
  c = *(const unsigned char *)name;
  // Hash the component's bytes, stopping at '/' or end of string.
  hash = init_name_hash();
  do {
   name++;
   hash = partial_name_hash(c, hash);
   c = *(const unsigned char *)name;
  } while (c && (c != '/'));
  // Length of the component just scanned.
  this.len = name - (const char *) this.name;
  // Finalized hash value.
  this.hash = end_name_hash(hash);
  /* remove trailing slashes? */
  // c == 0: end of string reached, so this is the final component.
  if (!c)
   goto last_component;
  // Skip the run of slashes after the component.
  while (*++name == '/');
  // Only slashes followed: final component with trailing slash(es).
  if (!*name)
   goto last_with_slashes;
  /*
   * "." and ".." are special - ".." especially so because it has
   * to be able to know about the current root directory and
   * parent relationships.
   */
  // Component starts with '.': distinguish ".", ".." and ordinary names.
  if (this.name[0] == '.') switch (this.len) {
   default:
    // Ordinary name that merely begins with '.'.
    break;
   case 2: 
    // Two characters but the second is not '.': ordinary name.
    if (this.name[1] != '.')
     break;
    // "..": move nd to the parent directory and refresh inode.
    follow_dotdot(nd);
    inode = nd->dentry->d_inode;
    /* fallthrough */
   case 1:
    // "." (or ".." after the step above): nothing to look up, next component.
    continue;
  }
  /*
   * See if the low-level filesystem might want
   * to use its own hash..
   */
  // The filesystem supplies its own d_hash: let it (re)compute the hash.
  if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
   err = nd->dentry->d_op->d_hash(nd->dentry, &this);
   if (err < 0)
    break;
  }
  /* This does the actual lookups.. */
  // Look the component up inside the current directory.
  err = do_lookup(nd, &this, &next);
  if (err)
   break;
  err = -ENOENT;
  // inode now refers to the component just found.
  inode = next.dentry->d_inode;
  // Negative dentry (no inode): the component does not exist.
  if (!inode)
   goto out_dput;
  err = -ENOTDIR; 
  if (!inode->i_op)
   goto out_dput;
  if (inode->i_op->follow_link) {
   // Symbolic link in the middle of the path: follow it.
   err = do_follow_link(&next, nd);
   if (err)
    goto return_err;
   err = -ENOENT;
   // Continue from the inode the link resolved to.
   inode = nd->dentry->d_inode;
   if (!inode)
    break;
   err = -ENOTDIR; 
   if (!inode->i_op)
    break;
  } else
   // Not a link: step nd into the component just found.
   path_to_nameidata(&next, nd);
  err = -ENOTDIR; 
  // An intermediate component must support ->lookup (be a directory).
  if (!inode->i_op->lookup)
   break;
  // On to the next component.
  continue;
  /* here ends the main loop */
  // Final component; handling mirrors the intermediate case above.
last_with_slashes:
  // Trailing slash: the target must be a directory and links are followed.
  lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
  /* Clear LOOKUP_CONTINUE iff it was previously unset */
  nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
  if (lookup_flags & LOOKUP_PARENT)
   goto lookup_parent;
  if (this.name[0] == '.') switch (this.len) {
   default:
    break;
   case 2: 
    // Not "..": ordinary name, continue below.
    if (this.name[1] != '.')
     break;
    // "..": move to the parent directory.
    follow_dotdot(nd);
    inode = nd->dentry->d_inode;
    /* fallthrough */
   case 1:
    // "." (or ".." after the step above): finish via return_reval.
    goto return_reval;
  }
  // Ordinary final name.
  // The filesystem supplies its own d_hash: let it (re)compute the hash.
  if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
   err = nd->dentry->d_op->d_hash(nd->dentry, &this);
   if (err < 0)
    break;
  }
  // Look up the final component.
  err = do_lookup(nd, &this, &next);
  if (err)
   break;
  inode = next.dentry->d_inode;
  if ((lookup_flags & LOOKUP_FOLLOW)
      && inode && inode->i_op && inode->i_op->follow_link) {
   err = do_follow_link(&next, nd);
   if (err)
    goto return_err;
   inode = nd->dentry->d_inode;
  } else
   // Store the found mount and dentry in nd.
   path_to_nameidata(&next, nd);
  err = -ENOENT;
  if (!inode)
   break;
  if (lookup_flags & LOOKUP_DIRECTORY) {
   err = -ENOTDIR; 
   if (!inode->i_op || !inode->i_op->lookup)
    break;
  }
  goto return_base;
lookup_parent:
  // LOOKUP_PARENT: do not resolve the final component; record it in
  // nd->last and classify it as LAST_NORM, LAST_DOT or LAST_DOTDOT.
  nd->last = this;
  nd->last_type = LAST_NORM;
  if (this.name[0] != '.')
   goto return_base;
  if (this.len == 1)
   nd->last_type = LAST_DOT;
  else if (this.len == 2 && this.name[1] == '.')
   nd->last_type = LAST_DOTDOT;
  else
   goto return_base;
return_reval:
  // Reached for empty paths, "." and ".." (see the gotos above).
  /*
   * We bypassed the ordinary revalidation routines.
   * We may need to check the cached dentry for staleness.
   */
  if (nd->dentry && nd->dentry->d_sb &&
      (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
   err = -ESTALE;
   /* Note: we do not d_invalidate() */
   if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
    break;
  }
return_base:
  return 0;
out_dput:
  dput_path(&next, nd);
  break;
 }
 // Every break above lands here: an error occurred, release nd's path.
 path_release(nd);
return_err:
 return err;
}
 
/*
 * Look up one path component.
 *
 * Tries the dentry cache first (lock-free __d_lookup()). A cached entry
 * whose filesystem provides ->d_revalidate is revalidated; a cache miss,
 * or an entry dropped by revalidation, falls back to real_lookup().
 * The fast path is kept as straight as possible - it is time-critical.
 *
 * On success fills @path with the mount and dentry, follows any mount
 * stacked on it, and returns 0. On failure returns the errno encoded in
 * the failing dentry pointer.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
		     struct path *path)
{
	struct vfsmount *mnt = nd->mnt;
	/* Search the dcache for a child of nd->dentry called @name. */
	struct dentry *found = __d_lookup(nd->dentry, name);

	/* Cached entry that needs revalidation before it can be used. */
	if (found && found->d_op && found->d_op->d_revalidate) {
		found = do_revalidate(found, nd);
		if (IS_ERR(found))
			return PTR_ERR(found);
	}

	/*
	 * Not cached, or revalidation returned NULL: do the real lookup.
	 * real_lookup() itself re-checks the cache before calling into the
	 * filesystem.
	 */
	if (!found) {
		found = real_lookup(nd->dentry, name, nd);
		if (IS_ERR(found))
			return PTR_ERR(found);
	}

	/* Report the result: mount and directory entry. */
	path->mnt = mnt;
	path->dentry = found;
	__follow_mount(path);
	return 0;
}

/*
 * Slow-path lookup, used when the lock-free cache lookup did not produce
 * a usable dentry and the low-level filesystem has to be asked.
 *
 * The parent directory's i_mutex is taken, and the cache is checked again
 * in case the entry was added while we waited for the mutex.
 * SMP-safe.
 *
 * Returns the dentry on success or an ERR_PTR()-encoded errno.
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
{
	struct inode *dir = parent->d_inode;
	struct dentry *cached;
	struct dentry *fresh;
	struct dentry *answer;

	mutex_lock(&dir->i_mutex);

	/*
	 * Re-do the cached lookup first, in case the entry was created while
	 * we waited for the directory mutex.
	 *
	 * FIXME (carried over from the original): version numbering or
	 * similar could avoid unnecessary cache lookups.
	 *
	 * d_lookup() (with seqlock) is used here rather than the lock-free
	 * __d_lookup(): per the original comment, this walk must not return
	 * a false negative caused by a concurrent rename.
	 */
	cached = d_lookup(parent, name);
	if (cached) {
		/*
		 * Nasty case: the cache was re-populated while we waited on
		 * the mutex. The entry must be revalidated if the filesystem
		 * asks for that.
		 */
		mutex_unlock(&dir->i_mutex);
		if (!cached->d_op || !cached->d_op->d_revalidate)
			return cached;
		cached = do_revalidate(cached, nd);
		return cached ? cached : ERR_PTR(-ENOENT);
	}

	/* Still not cached: allocate a new dentry and ask the filesystem. */
	fresh = d_alloc(parent, name);
	if (!fresh) {
		mutex_unlock(&dir->i_mutex);
		return ERR_PTR(-ENOMEM);
	}

	/* Filesystem-specific lookup through the directory inode's operations. */
	answer = dir->i_op->lookup(dir, fresh, nd);
	if (answer)
		/* A non-NULL return is handed back as-is; drop our allocation. */
		dput(fresh);
	else
		/* NULL return: the dentry we allocated is the result. */
		answer = fresh;

	mutex_unlock(&dir->i_mutex);
	return answer;
}

小结一下函数调用顺序:
path_lookup_open    path_lookup_create
     |                     |
     V                     V
   __path_lookup_intent_open
               |
               V
        do_path_lookup
               |
               V
        link_path_walk
               |
               V
      __link_path_walk
               |
               V
           do_lookup
               |
               V
          real_lookup

fs/open.c
844  struct file *dentry_open_it(struct dentry *dentry, struct
845   vfsmount *mnt, int flags, struct lookup_intent *it)
846  {
847   struct file * f;
848   struct inode *inode;
849   int error;
850
851   error = -ENFILE;
852   f = get_empty_filp();
...
855   f->f_flags = flags;
856   f->f_mode = (flags+1) & O_ACCMODE;
857   f->f_it = it;
858   inode = dentry->d_inode;
859   if (f->f_mode & FMODE_WRITE) {
860    error = get_write_access(inode);
861    if (error)
862      goto cleanup_file;
863   }
...
866   f->f_dentry = dentry;
867   f->f_vfsmnt = mnt;
868   f->f_pos = 0;
869   f->f_op = fops_get(inode->i_fop);
870   file_move(f, &inode->i_sb->s_files);
871
872   if (f->f_op && f->f_op->open) {
873     error = f->f_op->open(inode,f);
874     if (error)
875      goto cleanup_all;
876     intent_release(it);
877   }
...
891  return f;
...
907  }
-----------------------------------------------------------------------

1）、path_lookup_open()实现文件的查找功能；若打开时设置了O_CREAT标志（即文件不存在时需要新建），则改为调用path_lookup_create()。后者和前者封装的是同一个实际的路径查找函数__path_lookup_intent_open()，只是参数不一样（多了LOOKUP_CREATE标志和create_mode），使它们在处理细节上有所差别；

2）、当是以新建文件的方式打开文件时，即设置了O_CREAT标识时需要创建一个新的索引节点，代表创建一个文件。在vfs_create()里的一句核心语句dir->i_op->create(dir, dentry, mode, nd)可知它调用了具体的文件系统所提供的创建索引节点的方法。注意：这边的索引节点的概念，还只是位于内存之中，它和磁盘上的物理的索引节点的关系就像位于内存中和位于磁盘中的文件一样。此时新建的索引节点还不能完全标志一个物理文件的成功创建，只有当把索引节点回写到磁盘上才是一个物理文件的真正创建。想想我们以新建的方式打开一个文件，对其读写但最终没有保存而关闭，则位于内存中的索引节点会经历从新建到消失的过程，而磁盘却始终不知道有人曾经想过创建一个文件，这是因为索引节点没有回写的缘故。

3）、path_to_nameidata()填充nameidata数据结构；

4）、may_open()检查是否可以打开该文件；一些文件如符号链接文件和以写方式打开的目录是不能被打开的，先检查nd->dentry->d_inode所指的文件是否是这一类文件，是的话则错误返回。还有一些文件是不能以TRUNC的方式打开的，若nd->dentry->d_inode所指的文件属于这一类，则显式地关闭TRUNC标志位。接着如果有以TRUNC方式打开文件的，则更新nd->dentry->d_inode的信息。