Linux内核中的overlay文件系统

一、简介

Docker 内核实现容器的功能用了linux 内核中的三个特性 Namespace、Cgroup、UnionFs,今天我们来说一下UnionFs。

linux UnionFs 实现的是overlay 文件系统

OverlayFs 文件系统分为三层,

lower 是只读层

Upper 是可读写

Merged 是 lower 和Upper 合并的目录

挂载方式可以使用mount 命令挂载:

mount -t overlay overlay -o lowerdir=lower1:lower2,upperdir=upper,workdir=work merged

二、源码分析

1.挂载overlay 设备初始化

当我们使用

mount -t overlay overlay -o lowerdir=lower1:lower2,upperdir=upper,workdir=work merged

linux 内核层,overlay 结构体声明类型

static struct file_system_type ovl_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "overlay",
	.fs_flags	= FS_USERNS_MOUNT,
	.mount		= ovl_mount,
	.kill_sb	= kill_anon_super,
};

当我们使用overlay设备的时候,会触发结构体上挂载的mount函数指针,这个函数触发linux内核中的ovl_mount

static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
				const char *dev_name, void *raw_data)
{
	return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
}

核心是使用ovl_fill_super,填充overlay 文件系统的超级块,申请一个ovl_fs,然后填充到

sb->s_fs_info = ofs;

详细代码:

static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
	struct path upperpath = { };
	struct dentry *root_dentry;
	struct ovl_entry *oe;
	struct ovl_fs *ofs;
	struct ovl_layer *layers;
	struct cred *cred;
	char *splitlower = NULL;
	unsigned int numlower;
	int err;

    // 如果当前用户的namespace不是超级块的ns那么返回错误 -EIO
	err = -EIO;
	if (WARN_ON(sb->s_user_ns != current_user_ns()))
		goto out;

    // 目录操作结构体赋值
	sb->s_d_op = &ovl_dentry_operations;

	err = -ENOMEM;
    // 申请ovl_fs,并且对ovl_fs进行填充
	ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
	if (!ofs)
		goto out;

	err = -ENOMEM;
	ofs->creator_cred = cred = prepare_creds();
	if (!cred)
		goto out_err;

	/* Is there a reason anyone would want not to share whiteouts? */
	ofs->share_whiteout = true;

	ofs->config.index = ovl_index_def;
	ofs->config.uuid = true;
	ofs->config.nfs_export = ovl_nfs_export_def;
	ofs->config.xino = ovl_xino_def();
	ofs->config.metacopy = ovl_metacopy_def;
    // 装载选项
	err = ovl_parse_opt((char *) data, &ofs->config);
	if (err)
		goto out_err;

	err = -EINVAL;
	if (!ofs->config.lowerdir) {
		if (!silent)
			pr_err("missing 'lowerdir'\n");
		goto out_err;
	}

	err = -ENOMEM;
	splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
	if (!splitlower)
		goto out_err;

	err = -EINVAL;
	numlower = ovl_split_lowerdirs(splitlower);
	if (numlower > OVL_MAX_STACK) {
		pr_err("too many lower directories, limit is %d\n",
		       OVL_MAX_STACK);
		goto out_err;
	}

	err = -ENOMEM;
	layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
	if (!layers)
		goto out_err;

	ofs->layers = layers;
	/* Layer 0 is reserved for upper even if there's no upper */
	ofs->numlayer = 1;

	sb->s_stack_depth = 0;
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	atomic_long_set(&ofs->last_ino, 1);
	/* Assume underlying fs uses 32bit inodes unless proven otherwise */
	if (ofs->config.xino != OVL_XINO_OFF) {
		ofs->xino_mode = BITS_PER_LONG - 32;
		if (!ofs->xino_mode) {
			pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n");
			ofs->config.xino = OVL_XINO_OFF;
		}
	}

	/* alloc/destroy_inode needed for setting up traps in inode cache */
	sb->s_op = &ovl_super_operations;

	if (ofs->config.upperdir) {
		struct super_block *upper_sb;

		err = -EINVAL;
		if (!ofs->config.workdir) {
			pr_err("missing 'workdir'\n");
			goto out_err;
		}

		err = ovl_get_upper(sb, ofs, &layers[0], &upperpath);
		if (err)
			goto out_err;

		upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
		if (!ovl_should_sync(ofs)) {
			ofs->errseq = errseq_sample(&upper_sb->s_wb_err);
			if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
				err = -EIO;
				pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
				goto out_err;
			}
		}

		err = ovl_get_workdir(sb, ofs, &upperpath);
		if (err)
			goto out_err;

		if (!ofs->workdir)
			sb->s_flags |= SB_RDONLY;

		sb->s_stack_depth = upper_sb->s_stack_depth;
		sb->s_time_gran = upper_sb->s_time_gran;
	}
	oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
	err = PTR_ERR(oe);
	if (IS_ERR(oe))
		goto out_err;

	/* If the upper fs is nonexistent, we mark overlayfs r/o too */
	if (!ovl_upper_mnt(ofs))
		sb->s_flags |= SB_RDONLY;

	if (!ofs->config.uuid && ofs->numfs > 1) {
		pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n");
		ofs->config.uuid = true;
	}

	if (!ovl_force_readonly(ofs) && ofs->config.index) {
		err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
		if (err)
			goto out_free_oe;

		/* Force r/o mount with no index dir */
		if (!ofs->indexdir)
			sb->s_flags |= SB_RDONLY;
	}

	err = ovl_check_overlapping_layers(sb, ofs);
	if (err)
		goto out_free_oe;

	/* Show index=off in /proc/mounts for forced r/o mount */
	if (!ofs->indexdir) {
		ofs->config.index = false;
		if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) {
			pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
			ofs->config.nfs_export = false;
		}
	}

	if (ofs->config.metacopy && ofs->config.nfs_export) {
		pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n");
		ofs->config.nfs_export = false;
	}

	if (ofs->config.nfs_export)
		sb->s_export_op = &ovl_export_operations;

	/* Never override disk quota limits or use reserved space */
	cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);

	sb->s_magic = OVERLAYFS_SUPER_MAGIC;
	sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
		ovl_trusted_xattr_handlers;
	sb->s_fs_info = ofs;
	sb->s_flags |= SB_POSIXACL;
	sb->s_iflags |= SB_I_SKIP_SYNC;

    // 把 overlay 文件系统的根目录设置到 upperDir里
	err = -ENOMEM;
    // 创建root的inode并且指向新建的inode对象root_inode
	root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
	if (!root_dentry)
		goto out_free_oe;

	mntput(upperpath.mnt);
	kfree(splitlower);

	sb->s_root = root_dentry;

	return 0;

out_free_oe:
	ovl_entry_stack_free(oe);
	kfree(oe);
out_err:
	kfree(splitlower);
	path_put(&upperpath);
	ovl_free_fs(ofs);
out:
	return err;
}

操作overlay 文件系统的目录操作结构体实现:

static const struct dentry_operations ovl_dentry_operations = {
	.d_release = ovl_dentry_release,
	.d_real = ovl_d_real,
	.d_revalidate = ovl_dentry_revalidate,
	.d_weak_revalidate = ovl_dentry_weak_revalidate,
};

数据结构图:

参考网址:

Linux源码剖析——OverlayFS 源码分析_linux overlay-CSDN博客

2、描述符操作结构体 

如果你做过kernel module ,读过linux设计实现.就很容易理解了

描述符操作结构体定义:

const struct file_operations ovl_dir_operations = {
	.read		= generic_read_dir,
	.open		= ovl_dir_open,
	.iterate	= ovl_iterate,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
	.release	= ovl_dir_release,
};

当我们使用linux 系统调用打开overlay 设备文件的时候会触发操作结构体的函数,

open 函数:

static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;
	struct ovl_dir_file *od;
	enum ovl_path_type type;

	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_dir_open_realfile(file, &realpath);
	if (IS_ERR(realfile)) {
		kfree(od);
		return PTR_ERR(realfile);
	}
	od->realfile = realfile;
	od->is_real = ovl_dir_is_real(file->f_path.dentry);
	od->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = od;

	return 0;
}

struct ovl_dir_file {
	bool is_real; // 是否需要合并
	bool is_upper; // 是否需要从upper读取
	struct ovl_dir_cache *cache; // 缓存目录
	struct list_head *cursor; // 遍历游标
	struct file *realfile; // 真实文件
	struct file *upperfile; // overlay 里 在upper目录所在位置
};

这里主要做的操作是初始化ovl_dir_file,并且把他挂载到万能指针private_data中。

读的操作是通过getdents,我们看迭代器:

static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *p;
	const struct cred *old_cred;
	int err;

	old_cred = ovl_override_creds(dentry->d_sb);
	if (!ctx->pos)
		ovl_dir_reset(file);

    //是否需要读取真实路径
	if (od->is_real) {
        // 不需要合并直接读取真实路径
		/*
		 * If parent is merge, then need to adjust d_ino for '..', if
		 * dir is impure then need to adjust d_ino for copied up
		 * entries.
		 */
		if (ovl_xino_bits(dentry->d_sb) ||
		    (ovl_same_fs(dentry->d_sb) &&
		     (ovl_is_impure_dir(file) ||
		      OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
			err = ovl_iterate_real(file, ctx);
		} else {
			err = iterate_dir(od->realfile, ctx);
		}
		goto out;
	}

    // 创建目录缓存
	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		err = PTR_ERR(cache);
		if (IS_ERR(cache))
			goto out;

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

    // 直接把合并后的目录缓存,遍历返回用户层
	while (od->cursor != &od->cache->entries) {
		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		if (!p->is_whiteout) {
			if (!p->ino) {
				err = ovl_cache_update_ino(&file->f_path, p);
				if (err)
					goto out;
			}
		}
		/* ovl_cache_update_ino() sets is_whiteout on stale entry */
		if (!p->is_whiteout) {
			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
				break;
		}
		od->cursor = p->l_node.next;
		ctx->pos++;
	}
	err = 0;
out:
	revert_creds(old_cred);
	return err;
}

  • 8
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值