linux 文件系统(三)mount 源码分析

源码基于linux 3.10版本。

1 关键数据结构struct mount和struct vfsmount

struct mount代表一个mount实例(每一次真正的挂载对应一个mount实例),其中类型为struct vfsmount的mnt成员是它最核心的部分。过去没有struct mount,所有成员都放在vfsmount里;现在内核把原来的vfsmount改名为mount结构体,只把mnt_root、mnt_sb、mnt_flags三个成员留在新的vfsmount结构体中。这样vfsmount的内容更加精简,在很多情况下只需要传递vfsmount即可。

struct vfsmount的定义:

/*
 * The slimmed-down, publicly passed part of a mount instance; it is
 * embedded in struct mount as the `mnt` member.
 */
struct vfsmount {
        struct dentry *mnt_root;        /* root dentry of the mounted filesystem */
        struct super_block *mnt_sb;     /* super block of the mounted filesystem */
        int mnt_flags;                  /* mount flags of this mount (MNT_*) */
};

struct mount的定义:

/*
 * One mount instance (one per actual mount operation).
 *
 * NOTE(review): this field list looks like it comes from a kernel newer than
 * the functions quoted elsewhere in this file: those treat mnt_hash as a
 * struct list_head (list_entry()/list_add_tail()), while it is declared as
 * an hlist_node here. Confirm which kernel version is intended.
 */
struct mount {
        struct hlist_node mnt_hash;    /* links this mount into the global mount hash table */
        struct mount *mnt_parent;      /* mount that contains our mount point, i.e. the parent filesystem */
        struct dentry *mnt_mountpoint; /* dentry of the mount point this filesystem is mounted on */
        struct vfsmount mnt;           /* embedded vfsmount (root dentry, super block, flags); not a pointer */
        union {
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
        };
#ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
#else
        int mnt_count;
        int mnt_writers;
#endif
        struct list_head mnt_mounts;    /* head of the list of child mounts; entries are linked via their mnt_child */
        struct list_head mnt_child;     /* links into the parent mount's mnt_mounts list */
        struct list_head mnt_instance;  /* links into sb->s_mounts */
        const char *mnt_devname;        /* device name, e.g. /dev/sdb1 */
        struct list_head mnt_list;      /* links into the mount namespace's list (head is mnt_namespace.list) */
        struct list_head mnt_expire;    /* link in a filesystem-specific expiry list (e.g. NFS, CIFS) */
        struct list_head mnt_share;     /* link in the circular list of shared mounts */
        struct list_head mnt_slave_list;/* head of the list of this mount's slave mounts */
        struct list_head mnt_slave;     /* link in the master's mnt_slave_list */
        struct mount *mnt_master;       /* master of this mount; slave is on master->mnt_slave_list */
        struct mnt_namespace *mnt_ns;   /* mount namespace that contains this mount */
        struct mountpoint *mnt_mp;      /* where is it mounted */
        struct hlist_node mnt_mp_list;  /* list mounts with the same mountpoint */
        struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
        __u32 mnt_fsnotify_mask;
#endif
        int mnt_id;                     /* mount identifier */
        int mnt_group_id;               /* peer group identifier */
        int mnt_expiry_mark;            /* true if marked for expiry */
        struct hlist_head mnt_pins;
        struct fs_pin mnt_umount;
        struct dentry *mnt_ex_mountpoint;
};

一个文件系统可以挂载到多个不同的挂载点,所以文件系统树中的一个位置要由<mount, dentry>二元组(或者说<vfsmount, dentry>)来确定。

2 源码分析

在应用层使用mount系统调用以后,就会进入该函数:

/*
 * mount(2) system call entry point.
 *
 * Copies the user-space arguments (filesystem type, mount point path,
 * device name and option data) into kernel memory, passes them to
 * do_mount(), and then releases the copies in reverse order of
 * acquisition.
 *
 * Returns do_mount()'s result, or the error of whichever copy step failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	struct filename *kernel_dir;
	char *kernel_dev;
	unsigned long data_page;

	ret = copy_mount_string(type, &kernel_type);
	if (ret < 0)
		goto out_type;

	kernel_dir = getname(dir_name);
	if (IS_ERR(kernel_dir)) {
		ret = PTR_ERR(kernel_dir);
		goto out_dir;
	}

	ret = copy_mount_string(dev_name, &kernel_dev);
	if (ret < 0)
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
		(void *) data_page);

	/*
	 * Unwind: each label releases what had been acquired before the
	 * step that jumps to it, so a failure frees only the earlier copies.
	 */
	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	putname(kernel_dir);
out_dir:
	kfree(kernel_type);
out_type:
	return ret;
}

上面的函数很简单,把用户层的数据拷贝到内核中,然后调用do_mount:

/*
 * Kernel-side worker for mount(2).
 *
 * Resolves dir_name to a struct path, translates the MS_* bits in flags
 * into per-mount MNT_* bits (mnt_flags), and dispatches to the handler
 * selected by flags: remount, bind, propagation-type change, move, or a
 * brand new mount.
 *
 * Returns 0 or a negative errno. The reference held in path is dropped
 * before returning on every route that gets past kern_path().
 */
long do_mount(const char *dev_name, const char *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	/* dir_name must be non-empty and NUL-terminated within PAGE_SIZE bytes */
	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;

	/* force NUL termination of the option page */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	/* Walk dir_name; on success path holds the mount point's <vfsmount, dentry>. */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
	if (retval)
		return retval;
	/* Security hook; not covered by this walk-through. */
	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (!retval && !may_mount())
		retval = -EPERM;
	if (retval)
		goto dput_out;

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	/* These bits have been consumed above (or are internal); strip them. */
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
		   MS_STRICTATIME);

	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else	/* regular mount: the branch this walk-through follows */
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}

核心函数是kern_path和do_new_mount。

2.1 kern_path

int kern_path(const char *name, unsigned int flags, struct path *path)
{
	struct nameidata nd;
	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
	if (!res)
		*path = nd.path;
	return res;
}

kern_path

      --------->do_path_lookup

             ------------>filename_lookup

                    --------------->path_lookupat

分析path_lookupat:

/*
 * Resolve name to its final <vfsmount, dentry>, leaving the result in
 * nd->path.
 *
 * Steps: path_init() picks the starting point, link_path_walk() walks
 * every component except the last, and lookup_last() resolves the last
 * one (repeating while it turns out to be a symlink that must be
 * followed). The last-component step is skipped when the caller passed
 * LOOKUP_PARENT.
 *
 * Returns 0 on success or a negative errno.
 */
static int path_lookupat(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	struct file *base = NULL;
	struct path path;
	int err;

	/*
	 * Path walking is largely split up into 2 different synchronisation
	 * schemes, rcu-walk and ref-walk (explained in
	 * Documentation/filesystems/path-lookup.txt). These share much of the
	 * path walk code, but some things particularly setup, cleanup, and
	 * following mounts are sufficiently divergent that functions are
	 * duplicated. Typically there is a function foo(), and its RCU
	 * analogue, foo_rcu().
	 *
	 * -ECHILD is the error number of choice (just to avoid clashes) that
	 * is returned if some aspect of an rcu-walk fails. Such an error must
	 * be handled by restarting a traditional ref-walk (which will always
	 * be able to complete).
	 */
	/*
	 * Initialise nd->path with the starting point of the walk (root, cwd
	 * or the directory behind dfd). LOOKUP_PARENT is set for the
	 * intermediate components; lookup_last() clears it again.
	 */
	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);

	if (unlikely(err))
		return err;

	current->total_link_count = 0;
	err = link_path_walk(name, nd);

	if (!err && !(flags & LOOKUP_PARENT)) {
		err = lookup_last(nd, &path);
		/* err > 0: the last component is a symlink that has to be followed */
		while (err > 0) {
			void *cookie;
			struct path link = path;
			err = may_follow_link(&link, nd);
			if (unlikely(err))
				break;
			nd->flags |= LOOKUP_PARENT;
			err = follow_link(&link, nd, &cookie);
			if (err)
				break;
			err = lookup_last(nd, &path);
			put_link(nd, &link, cookie);
		}
	}

	if (!err)
		err = complete_walk(nd);

	/* LOOKUP_DIRECTORY: the result must be something we can look up in */
	if (!err && nd->flags & LOOKUP_DIRECTORY) {
		if (!can_lookup(nd->inode)) {
			path_put(&nd->path);
			err = -ENOTDIR;
		}
	}

	/* file reference handed back by path_init() for a dfd-relative RCU walk */
	if (base)
		fput(base);

	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		path_put(&nd->root);
		nd->root.mnt = NULL;
	}
	return err;
}

 path_init函数:

path_lookupat

     ------------>path_init

/*
 * Choose the starting point of a path walk and store it in nd->path /
 * nd->inode.
 *
 * Four cases, in order:
 *   - LOOKUP_ROOT: start from the caller-supplied nd->root;
 *   - name begins with '/': start from the root set by set_root*();
 *   - dfd == AT_FDCWD: start from the current working directory;
 *   - otherwise: start from the directory referred to by descriptor dfd.
 *
 * In the dfd case under LOOKUP_RCU, *fp receives the struct file when
 * f.need_put is set, so that the caller can drop it later.
 *
 * Returns 0 on success or a negative errno.
 */
static int path_init(int dfd, const char *name, unsigned int flags,
		     struct nameidata *nd, struct file **fp)
{
	int retval = 0;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
	nd->flags = flags | LOOKUP_JUMPED;
	nd->depth = 0;
	if (flags & LOOKUP_ROOT) {   /* not taken in the mount(2) walk-through: do_mount() does not pass LOOKUP_ROOT */
		struct inode *inode = nd->root.dentry->d_inode;
		if (*name) {
			if (!can_lookup(inode))
				return -ENOTDIR;
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
				return retval;
		}
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			lock_rcu_walk();
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
		}
		return 0;
	}

	nd->root.mnt = NULL;

	if (*name=='/') {   /* absolute path: the walk starts at the root directory */
		if (flags & LOOKUP_RCU) { /* rcu-walk variant (the one this walk-through discusses) */
			lock_rcu_walk();
			set_root_rcu(nd);  /* presumably fills nd->root with the process root - see set_root_rcu() */
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
	} else if (dfd == AT_FDCWD) {  /* relative path with AT_FDCWD: start from the current working directory */
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			lock_rcu_walk();

			/* retry until pwd was read without a concurrent change of fs */
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd; /* current working directory of the process */
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
	} else {
		/* Caller must check execute permissions on the starting path component */
		struct fd f = fdget_raw(dfd);
		struct dentry *dentry;

		if (!f.file)
			return -EBADF;

		dentry = f.file->f_path.dentry;

		if (*name) {
			if (!can_lookup(dentry->d_inode)) {
				fdput(f);
				return -ENOTDIR;
			}
		}

		nd->path = f.file->f_path;
		if (flags & LOOKUP_RCU) {
			if (f.need_put)
				*fp = f.file;
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			lock_rcu_walk();
		} else {
			path_get(&nd->path);
			fdput(f);
		}
	}

	nd->inode = nd->path.dentry->d_inode;
	return 0;
}

上面函数主要看两种情况,即从/目录开始查找,或者从当前目录开始查找,分别初始化path为不同的值,这边只分析一下/目录开始的情况。

比如说我们想查找的路径是/mnt/a,通过上面path_init函数以后,path会被初始化为根文件系统的path,即第一个/ 符号,看一下path的结构:

/* A location in the mount tree, identified by a <vfsmount, dentry> pair. */
struct path {
	struct vfsmount *mnt;	/* the mount the location belongs to */
	struct dentry *dentry;	/* the directory entry within that mount */
};

存放的是文件系统根目录的挂载点vfsmount 和该文件系统根目录的dentry。

初始化完nameidata(nd)中的path以后:

path_lookupat

     ----------->link_path_walk

/*
 * Walk every component of name except the last one.
 *
 * On a 0 return, nd->path is the directory reached so far and nd->last /
 * nd->last_type describe the final component, which has NOT been looked
 * up yet (that is left to the caller). For "/mnt/a" this walks "mnt" and
 * returns with nd->last describing "a".
 *
 * Returns 0 on success or a negative errno; on the error paths that break
 * out of the loop the walk is terminated via terminate_walk().
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	int err;
	
	while (*name=='/')
		name++;  /* skip leading slashes (any number of them) */
	if (!*name)
		return 0;

	/* At this point we know we have a real path component. */
	for(;;) {
		struct qstr this;
		long len;
		int type;

		err = may_lookup(nd);
 		if (err)
			break;

		len = hash_name(name, &this.hash); /* hash the current component; for "mnt/a" the first pass hashes "mnt" */
		this.name = name; /* points at the current component: "mnt/a" on the first pass, "a" on the second */
		this.len = len; /* length of the component that was just hashed */

		type = LAST_NORM;
		if (name[0] == '.') switch (len) {
			case 2:
				if (name[1] == '.') {  /* the component is exactly "..": handled later by handle_dots() */
					type = LAST_DOTDOT;
					nd->flags |= LOOKUP_JUMPED;
				}
				break;
			case 1:
				type = LAST_DOT;	/* the component is exactly "." */
		}
		if (likely(type == LAST_NORM)) {  /* ordinary component, neither "." nor ".." */
			struct dentry *parent = nd->path.dentry;
			nd->flags &= ~LOOKUP_JUMPED;
			/* the parent's filesystem supplies its own name hash */
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 
				err = parent->d_op->d_hash(parent, nd->inode,
							   &this);
				if (err < 0)
					break;
			}
		}

		nd->last = this;
		nd->last_type = type;

		/* end of string: this was the last component, leave it to the caller */
		if (!name[len])
			return 0;
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			len++;    
		} while (unlikely(name[len] == '/'));
		if (!name[len]) /* only trailing slashes remained (e.g. "mnt/a/"): this was still the last component, so it is not looked up here */
			return 0;

		name += len;  /* advance past the component and its slashes: "mnt/a" becomes "a" */

		err = walk_component(nd, &next, LOOKUP_FOLLOW); /* look up the component recorded in nd->last */
		if (err < 0)
			return err;

		/* err > 0: the component is a symlink that has to be followed */
		if (err) {
			err = nested_symlink(&next, nd);
			if (err)
				return err;
		}
		/* an intermediate component must be something we can look up in */
		if (!can_lookup(nd->inode)) {
			err = -ENOTDIR; 
			break;
		}
	}
	terminate_walk(nd);
	return err;
}

link_path_walk 函数的作用是依次解析路径name中的各个分量(最后一个分量不做查找,只记录到nd->last中就直接返回)。比如对于/mnt/a路径,先解析mnt,找到该目录的dentry结构;a分量不在这里处理,函数直接结束。所以link_path_walk只负责走到最后一个分量的父目录为止,最后一个分量需要由其他函数(lookup_last)再做处理。

path_lookupat

     ----------->link_path_walk

           ------------>walk_component

/*
 * Look up the component described by nd->last in the directory nd->path
 * and step into it.
 *
 * Returns 0 after advancing nd->path / nd->inode to the component,
 * 1 if the component is a symlink that should be followed (the link is
 * left in *path and nd is not advanced), or a negative errno, in which
 * case the walk has been terminated.
 */
static inline int walk_component(struct nameidata *nd, struct path *path,
		int follow)
{
	struct inode *inode;
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) /* the component is "." or ".." (or LAST_ROOT) */
		return handle_dots(nd, nd->last_type);
	err = lookup_fast(nd, path, &inode);  /* dcache lookup; the function analysed next */
	if (unlikely(err)) {
		if (err < 0)
			goto out_err;

		/* err > 0: not found by lookup_fast(), fall back to the slow lookup */
		err = lookup_slow(nd, path);
		if (err < 0)
			goto out_err;

		inode = path->dentry->d_inode;
	}
	err = -ENOENT;
	if (!inode)
		goto out_path_put;

	if (should_follow_link(inode, follow)) {
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
				err = -ECHILD;
				goto out_err;
			}
		}
		BUG_ON(inode != path->dentry->d_inode);
		return 1;
	}
	path_to_nameidata(path, nd);
	nd->inode = inode;
	return 0;

out_path_put:
	path_to_nameidata(path, nd);
out_err:
	terminate_walk(nd);
	return err;
}

path_lookupat

     ----------->link_path_walk

           ------------>walk_component

                  -------------->lookup_fast

/*
 * Try to find the dentry for nd->last under nd->path.dentry using only
 * cached dentries (__d_lookup_rcu() in rcu-walk, __d_lookup() in
 * ref-walk), crossing any mount found on it.
 *
 * Returns 0 with *path and *inode filled in, 1 if no usable dentry was
 * found and the caller has to do the slow lookup, or a negative errno
 * (-ECHILD when an rcu-walk check fails).
 */
static int lookup_fast(struct nameidata *nd,
		       struct path *path, struct inode **inode)
{
	struct vfsmount *mnt = nd->path.mnt;
	struct dentry *dentry, *parent = nd->path.dentry;
	int need_reval = 1;
	int status = 1;
	int err;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, we're going to
	 * do the non-racy lookup, below.
	 */
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
		dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); /* look the component up in the dentry hash */

		if (!dentry)
			goto unlazy;

		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
		*inode = dentry->d_inode;
		if (read_seqcount_retry(&dentry->d_seq, seq))
			return -ECHILD;

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return -ECHILD;
		nd->seq = seq;

		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
			status = d_revalidate(dentry, nd->flags);
			if (unlikely(status <= 0)) {
				/* keep the verdict; only -ECHILD is revalidated again below */
				if (status != -ECHILD)
					need_reval = 0;
				goto unlazy;
			}
		}
		/* only the temporary *path is updated here; nd is advanced by the caller */
		path->mnt = mnt;
		path->dentry = dentry;
		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
			goto unlazy;
		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
			goto unlazy;
		return 0;
unlazy:
		/* leave rcu-walk and continue in ref-walk mode below */
		if (unlazy_walk(nd, dentry))
			return -ECHILD;
	} else {
		dentry = __d_lookup(parent, &nd->last);
	}

	if (unlikely(!dentry))
		goto need_lookup;

	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
		status = d_revalidate(dentry, nd->flags);
	if (unlikely(status <= 0)) {
		if (status < 0) {
			dput(dentry);
			return status;
		}
		if (!d_invalidate(dentry)) {
			dput(dentry);
			goto need_lookup;
		}
	}

	path->mnt = mnt;
	path->dentry = dentry;
	err = follow_managed(path, nd->flags);
	if (unlikely(err < 0)) {
		path_put_conditional(path, nd);
		return err;
	}
	if (err)
		nd->flags |= LOOKUP_JUMPED;
	*inode = path->dentry->d_inode;
	return 0;

need_lookup:
	return 1;
}

调用__d_lookup_rcu函数在哈希桶中找到对应的dentry(这是哪个dentry还记得吗?这个是name指向的那个目录分量的dentry,即对于/mnt/a这样的路径搜索,第一次返回mnt的dentry)。如果没找到就跳转到unlazy标记处(切换到ref-walk模式继续查找)。找到以后,根据这个dentry得到对应的inode,并进行一系列序列号检查,以确保在读取期间没有其他进程修改这些结构(rcu-walk模式并没有加锁)。检查通过后只更新临时变量path,这时候不能直接修改nd变量,因为还不能确定这个分量是不是目录,而nd记录的信息必须是目录,然后结束。
其中有很多个跳转到unlazy标志的语句,我们前面说了,跳到unlazy标志表示rcu模式查找失败,用ref模式进行查找。ref模式的fast查找还是在内核缓冲区查找相应的dentry,和上述过程类似,这就不深入讲了。找到当前子目录分量的dentry以后,接着下面有一个__follow_mount_rcu的函数:

path_lookupat

     ----------->link_path_walk

           ------------>walk_component

                  -------------->lookup_fast

                           -------------->__follow_mount_rcu

/*
 * rcu-walk: while path->dentry is a mount point, replace *path with the
 * root of the filesystem mounted on it, repeating for stacked mounts.
 * *inode and nd->seq are updated to match the new dentry.
 *
 * Returns false if a managed dentry may block the transit, true otherwise
 * (whether or not a mount was crossed).
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
			       struct inode **inode)
{
	for (;;) {
		struct mount *mounted;
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
		if (unlikely(managed_dentry_might_block(path->dentry)))
			return false;

		if (!d_mountpoint(path->dentry)) /* nothing is mounted on this dentry: done */
			break;

		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
		if (!mounted)
			break;
		path->mnt = &mounted->mnt; /* switch to the mount that sits on this dentry */
		path->dentry = mounted->mnt.mnt_root; /* and continue from that filesystem's root dentry */
		nd->flags |= LOOKUP_JUMPED;
		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode; 
	}
	return true;
}

这边这个函数要特别注意一下:此函数根据当前目录所属文件系统的mount实例和当前目录的dentry,去查找另一个mount实例,为什么要这样做呢,前面一篇文章已经介绍过了,如果当前目录作为另一个文件系统的挂载点,那么如果要继续往下搜寻下级目录的dentry,需要切换到下级文件系统的dentry上面,所以这边就是在查找下级文件系统的root dentry和mount实例,实现跳转。看一下__lookup_mnt:

/*
 * Find a mount whose parent is mnt and whose mount point is dentry.
 *
 * The <mnt, dentry> pair selects a bucket of mount_hashtable; the bucket
 * is scanned forwards (dir != 0) or backwards (dir == 0) and the first
 * matching mount is returned, or NULL if the bucket holds none.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
			      int dir)
{
	struct list_head *bucket = mount_hashtable + hash(mnt, dentry);
	struct list_head *pos;

	for (pos = dir ? bucket->next : bucket->prev;
	     pos != bucket;
	     pos = dir ? pos->next : pos->prev) {
		struct mount *cand = list_entry(pos, struct mount, mnt_hash);

		if (&cand->mnt_parent->mnt == mnt &&
		    cand->mnt_mountpoint == dentry)
			return cand;
	}
	return NULL;
}

该函数根据<vfsmount,dentry>二元组找到mount实例的hash 链表,查找该链表上面有没有符合情况的mount 实例。

link_path_walk执行完结束以后,找到的是上级父目录的dentry和inode结构,即/mnt目录的结构,所以还需进一步查找找到最下层子节点的dentry和inode。在path_lookupat下面,接着往下执行:

/* Excerpt from path_lookupat(): resolving the last component. */
err = link_path_walk(name, nd);

	/* all parent components are walked; now resolve the last one unless only the parent was requested */
	if (!err && !(flags & LOOKUP_PARENT)) {
		err = lookup_last(nd, &path);
		/* err > 0: the last component is a symlink that has to be followed */
		while (err > 0) {
			void *cookie;
			struct path link = path;
			err = may_follow_link(&link, nd);
			if (unlikely(err))
				break;
			nd->flags |= LOOKUP_PARENT;
			err = follow_link(&link, nd, &cookie);
			if (err)
				break;
			err = lookup_last(nd, &path);
			put_link(nd, &link, cookie);
		}
	}

在lookup_last函数里面完成最下层节点的搜索:

/*
 * Resolve the final path component recorded in nd->last.
 *
 * If a normal component is followed by more characters in the path
 * string (i.e. trailing slashes), it must be followed and must be a
 * directory. LOOKUP_PARENT is cleared before handing the component to
 * walk_component(), whose result is returned unchanged.
 */
static inline int lookup_last(struct nameidata *nd, struct path *path)
{
	int follow;

	if (nd->last_type == LAST_NORM &&
	    nd->last.name[nd->last.len] != '\0')
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	nd->flags &= ~LOOKUP_PARENT;

	follow = nd->flags & LOOKUP_FOLLOW;
	return walk_component(nd, path, follow);
}

可以看到里面调用了walk_component,该函数的原理上面已经分析过,所以最终找到a目录的dentry和inode,并返回。至此安装节点已经找到。

所以path_lookupat函数的最终执行成果就是返回path结构,该结构包含着安装目录的dentry结构以及安装目录所在文件系统的mount实例。

2.2 do_new_mount

通过上面已经找到了安装点的dentry结构以及安装 目录所在文件系统的mount 实例,这个时候可以把新的文件系统安装在此目录上了。

/*
 * Create a new mount of filesystem type fstype and attach it at path.
 *
 * Looks up the registered file_system_type, lets vfs_kern_mount() build
 * the mount, and then adds it to the mount tree with do_add_mount().
 *
 * Returns 0 on success or a negative errno; if do_add_mount() fails the
 * freshly created mount is released again.
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct file_system_type *type;
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct vfsmount *mnt;
	int err;

	if (!fstype)
		return -EINVAL;
	/*
	 * Find the registered file_system_type by name. Its important part
	 * for this walk-through is the filesystem's mount callback.
	 */
	type = get_fs_type(fstype);
	if (!type)
		return -ENODEV;

	if (user_ns != &init_user_ns) {
		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
			put_filesystem(type);
			return -EPERM;
		}
		/* Only in special cases allow devices from mounts
		 * created outside the initial user namespace.
		 */
		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			flags |= MS_NODEV;
			mnt_flags |= MNT_NODEV;
		}
	}
	/*
	 * Key step: vfs_kern_mount() invokes the filesystem type's mount
	 * callback and returns a populated vfsmount (analysed separately).
	 */
	mnt = vfs_kern_mount(type, flags, name, data);
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);

	put_filesystem(type);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);
	/* Attach the new mount to the global mount tree at path. */
	err = do_add_mount(real_mount(mnt), path, mnt_flags);
	if (err)
		mntput(mnt);
	return err;
}

do_new_mount

        --------->vfs_kern_mount 

/*
 * Allocate a struct mount and populate it by mounting a filesystem of the
 * given type.
 *
 * The new mount is not attached anywhere yet: it is its own parent and
 * its mount point is its own root. It is linked into the super block's
 * s_mounts list.
 *
 * Returns the embedded vfsmount, or an ERR_PTR() on failure.
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name); /* allocate and initialise a struct mount */
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data); /* calls the filesystem's mount callback; yields the root dentry (and through it the super block) */
	if (IS_ERR(root)) {
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root; /* remember the root dentry of the mounted filesystem */
	mnt->mnt.mnt_sb = root->d_sb;
	/* not attached yet: mount point is our own root, parent is ourselves */
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	br_write_lock(&vfsmount_lock);
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	br_write_unlock(&vfsmount_lock);
	return &mnt->mnt;
}

mount_fs 中是具体的文件系统的挂载函数,该函数在后面的文章中讲具体的文件系统的挂载的时候再来分析。

do_new_mount

        --------->do_add_mount

/*
 * Attach newmnt to the mount tree at path.
 *
 * Locks the mount point (which may move path to the top of a stack of
 * mounts), validates the request, stores mnt_flags in the new mount and
 * grafts it onto the parent with graft_tree().
 *
 * Returns 0 on success or a negative errno. The mount point is unlocked
 * on every route past lock_mount().
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
	struct mountpoint *mp;
	struct mount *parent;
	int err;

	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
	/*
	 * More than just taking locks: if filesystems are already mounted
	 * on path, lock_mount() moves path to the root of the most recently
	 * mounted one, which is the real mount point for the new mount.
	 * According to the original author, older kernels open-coded this as
	 *     while (d_mountpoint(path->dentry) && follow_down(path));
	 * i.e. an empty-bodied loop that keeps following down. That logic,
	 * together with the locking, now lives in lock_mount().
	 */
	mp = lock_mount(path);
	if (IS_ERR(mp))
		return PTR_ERR(mp);
	/* real_mount() maps a vfsmount to the struct mount it belongs to */
	parent = real_mount(path->mnt);
	err = -EINVAL;
	if (unlikely(!check_mnt(parent))) {
		/* that's acceptable only for automounts done in private ns */
		if (!(mnt_flags & MNT_SHRINKABLE))
			goto unlock;
		/* ... and for those we'd better have mountpoint still alive */
		if (!parent->mnt_ns)
			goto unlock;
	}

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
	/* the root inode of the new mount must not be a symbolic link */
	if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt.mnt_flags = mnt_flags;
	/* finally graft newmnt into the global mount tree */
	err = graft_tree(newmnt, parent, mp);

unlock:
	unlock_mount(mp);
	return err;
}

看一下lock_mount

do_new_mount

        --------->do_add_mount

             ------------->lock_mount

/*
 * Lock the mount point described by path and return its struct mountpoint.
 *
 * If something is already mounted on path, path is moved to the root of
 * that mount and the procedure is retried, so the new mount ends up on
 * top of the existing stack.
 *
 * On success, returns with the dentry's i_mutex and the namespace lock
 * still held. On failure, returns an ERR_PTR() with both released.
 */
static struct mountpoint *lock_mount(struct path *path)
{
	struct vfsmount *mnt;
	struct dentry *dentry = path->dentry;
retry:
	mutex_lock(&dentry->d_inode->i_mutex);
	if (unlikely(cant_mount(dentry))) {
		mutex_unlock(&dentry->d_inode->i_mutex);
		return ERR_PTR(-ENOENT);
	}
	namespace_lock();
	/*
	 * Example: for `mount /dev/sdc1 /mnt`, path is the /mnt dentry in
	 * the root filesystem. lookup_mnt(path) checks whether a filesystem
	 * is mounted on that dentry and, if so, returns that child mount.
	 * The path walk already crossed mount points once
	 * (__follow_mount_rcu(), called from lookup_fast()), so NULL is the
	 * expected result here.
	 */
	mnt = lookup_mnt(path);
	/* NULL means nothing is mounted on the dentry held in path */

	if (likely(!mnt)) {
		struct mountpoint *mp = new_mountpoint(dentry);
		if (IS_ERR(mp)) {
			namespace_unlock();
			mutex_unlock(&dentry->d_inode->i_mutex);
			return mp;
		}
		/* success: both locks stay held for the caller */
		return mp;
	}
	namespace_unlock();
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	path_put(path);
	/*
	 * Something is mounted here: step onto that child mount (its
	 * vfsmount and its root dentry) and try again from there.
	 */
	path->mnt = mnt;
	dentry = path->dentry = dget(mnt->mnt_root);
	goto retry;
}

lock_mount函数就是对当前挂载目录是否已经挂载过其他文件系统进行检查,然后返回一个mountpoint 结构,看一下new_mountpoint函数:

/*
 * Get the struct mountpoint for dentry, creating it if necessary.
 *
 * If an entry for dentry already exists in mountpoint_hashtable, its
 * m_count is incremented and it is returned. Otherwise a new entry is
 * allocated with m_count 1, the dentry is flagged DCACHE_MOUNTED, and the
 * entry is added to the hash chain.
 *
 * Returns the mountpoint, ERR_PTR(-ENOENT) if the dentry is unlinked, or
 * ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
	struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
	struct mountpoint *mp;

	/* reuse an existing mountpoint for this dentry if there is one */
	list_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}

	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	spin_lock(&dentry->d_lock);
	if (d_unlinked(dentry)) {
		spin_unlock(&dentry->d_lock);
		kfree(mp);
		return ERR_PTR(-ENOENT);
	}
	dentry->d_flags |= DCACHE_MOUNTED;  /* key step: flag the target directory's dentry as a mount point */
	spin_unlock(&dentry->d_lock);
	mp->m_dentry = dentry;
	mp->m_count = 1;
	list_add(&mp->m_hash, chain);
	return mp;
}

上面函数比较重要的一个操作是这个

dentry->d_flags |= DCACHE_MOUNTED;

标记该挂载目录被其他文件系统挂载上。

最后调用graft_tree完成挂载的收尾工作。现在mp、parent都有了,我们要做的就是把newmnt和parent、mp关联到一起:让newmnt的mnt_parent指向parent,让newmnt的mnt_mountpoint指向mp对应的dentry(mp->m_dentry),让newmnt的mnt_mp指向mp,再用parent和挂载点dentry计算hash值,把newmnt加入到mount_hashtable中。当然还有很多inode、dentry、super_block等结构之间关系的构建。

do_new_mount

        --------->do_add_mount

             ------------->graft_tree

                    ------------->attach_recursive_mnt

/*
 * Attach source_mnt to dest_mnt at dest_mp.
 *
 * Propagates the mount via propagate_mnt(), which collects further
 * mounts in tree_list. Then, under vfsmount_lock, it either moves
 * source_mnt (parent_path != NULL) or sets up a new attachment
 * (parent_path == NULL) and commits it together with everything in
 * tree_list.
 *
 * Returns 0 on success or a negative errno.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
			struct mount *dest_mnt,
			struct mountpoint *dest_mp,
			struct path *parent_path)
{
	LIST_HEAD(tree_list);
	struct mount *child, *p;
	int err;

	if (IS_MNT_SHARED(dest_mnt)) {
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
	}
	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
	if (err)
		goto out_cleanup_ids;

	br_write_lock(&vfsmount_lock);

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}
	if (parent_path) {  /* NULL on the new-mount path traced in this walk-through */
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, dest_mnt, dest_mp);
		touch_mnt_namespace(source_mnt->mnt_ns);
	} else {
		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); /* the part of interest: link child to parent and mount point */
		commit_tree(source_mnt);
	}

	/* commit the mounts that propagate_mnt() queued on tree_list */
	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
		commit_tree(child);
	}
	br_write_unlock(&vfsmount_lock);

	return 0;

 out_cleanup_ids:
	if (IS_MNT_SHARED(dest_mnt))
		cleanup_group_ids(source_mnt, NULL);
 out:
	return err;
}

mnt_set_mountpoint,设置子挂载点和挂载目录之间的关系:

/*
 * Record that child_mnt is mounted on mountpoint mp inside parent mount
 * mnt. Takes a reference on the mountpoint (m_count), on the parent mount
 * and on the mount point's dentry.
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry); /* mount point = dentry of the directory being mounted on */
	child_mnt->mnt_parent = mnt; /* parent = mount instance of the filesystem that contains that directory */
	child_mnt->mnt_mp = mp;
}

commit_tree:

/*
 * Make mnt visible in the mount tree. mnt->mnt_parent and
 * mnt->mnt_mountpoint must already be set (see mnt_set_mountpoint()).
 *
 * Assigns the parent's namespace to every mount on mnt's mnt_list,
 * splices that list into the namespace's list, hashes mnt into
 * mount_hashtable and links it into the parent's list of children.
 */
static void commit_tree(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	/* thread a temporary head into mnt's mnt_list so the whole list can be iterated */
	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	list_add_tail(&mnt->mnt_hash, mount_hashtable +
				hash(&parent->mnt, mnt->mnt_mountpoint)); /* hashed by <parent vfsmount, mount point dentry>, the same key __lookup_mnt() uses, so a walk reaching the mount point can find this mount */
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	touch_mnt_namespace(n);
}

最后挂载工作完成。通过数据结构之间的相互关系,可以从挂载目录找到下级子目录,也可以从下级子目录返回上级挂载父目录,整条路径连成一条线。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值