linux cgroup (2)----mount

严阵以待戒备森严

于 2023-11-05 12:23:02 发布

阅读量99

点赞数 2

分类专栏： cgroup 文章标签： linux 运维

本文链接：https://blog.csdn.net/qq_40079927/article/details/134224165

版权

cgroup 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

kernel: 5.0
以下mount流程为初始化完成之后的第一次mount

cgroup_mount

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct dentry *dentry;
	int ret;

	get_cgroup_ns(ns);

	/* Check if the caller has permission to mount. */
	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
		put_cgroup_ns(ns);
		return ERR_PTR(-EPERM);
	}

	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();//

	if (fs_type == &cgroup2_fs_type) {
		unsigned int root_flags;

		ret = parse_cgroup_root_flags(data, &root_flags);
		if (ret) {
			put_cgroup_ns(ns);
			return ERR_PTR(ret);
		}

		cgrp_dfl_visible = true;
		cgroup_get_live(&cgrp_dfl_root.cgrp);

		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
					 CGROUP2_SUPER_MAGIC, ns);
		if (!IS_ERR(dentry))
			apply_cgroup_root_flags(root_flags);
	} else {
		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
				       CGROUP_SUPER_MAGIC, ns);
	}

	put_cgroup_ns(ns);
	return dentry;
}

cgroup_enable_task_cg_lists

只在第一次挂载时发挥作用，用于将此时的所有task以cg_list为连接件接入所处css_set的tasks链表，后续新fork的task则是在创建时就接入父进程cgroups指向的css_set的tasks链表（见copy_process -->cgroup_post_fork)

static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;

	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread(). Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
	spin_lock_irq(&css_set_lock);

	if (use_task_css_set_links)
		goto out_unlock;

	use_task_css_set_links = true;

	do_each_thread(g, p) {
		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			     task_css_set(p) != &init_css_set);

		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 * Do it while holding siglock so that we don't end up
		 * racing against cgroup_exit().
		 *
		 * Interrupts were already disabled while acquiring
		 * the css_set_lock, so we do not need to disable it
		 * again when acquiring the sighand->siglock here.
		 */
		spin_lock(&p->sighand->siglock);
		if (!(p->flags & PF_EXITING)) {
			struct css_set *cset = task_css_set(p);
			/*fork一个进程时，新进程的cgroups先指向init_css_set(copy_process --> cgroup_fork)
			 *如果已经完成初次挂载（use_task_css_set_links为true)则指向父进程的cgroups
			 */
			if (!css_set_populated(cset))
				css_set_update_populated(cset, true); //设置cset关联的所有cgroup的nr_populaed_csets成员，
													//表明该cgroup已有task关联
			list_add_tail(&p->cg_list, &cset->tasks);//css_set的tasks链表维护正常状态下的task，另有mg_tasks维护迁移状态，
			get_css_set(cset); //增加引用计数
			cset->nr_tasks++;
		}
		spin_unlock(&p->sighand->siglock);
	} while_each_thread(g, p);
out_unlock:
	spin_unlock_irq(&css_set_lock);
	read_unlock(&tasklist_lock);
}

cgroup1_mount(&cgroup_fs_type, flags, data,CGROUP_SUPER_MAGIC, ns);

struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
			     void *data, unsigned long magic,
			     struct cgroup_namespace *ns)
{
	struct super_block *pinned_sb = NULL;
	struct cgroup_sb_opts opts;
	struct cgroup_root *root;
	struct cgroup_subsys *ss;
	struct dentry *dentry;
	int i, ret;
	bool new_root = false;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);//解析用户空间传入的数据，包括子系统名等
	if (ret)
		goto out_unlock;

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(opts.subsys_mask & (1 << i)) || //此时所有子系统的root都指向cgrp_dfl_root，所以这个循环跳过
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {  //每个cgroup_root都会以root_list为连接件接入cgroup_roots链表，此时仅有cgrp_dfl_root
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but sybsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}

		if (root->flags ^ opts.flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		/*
		 * We want to reuse @root whose lifetime is governed by its
		 * ->cgrp.  Let's check whether @root is alive and keep it
		 * that way.  As cgroup_kill_sb() can happen anytime, we
		 * want to block it by pinning the sb so that @root doesn't
		 * get killed before mount is complete.
		 *
		 * With the sb pinned, tryget_live can reliably indicate
		 * whether @root can be reused.  If it's being killed,
		 * drain it.  We can use wait_queue for the wait but this
		 * path is super cold.  Let's just sleep a bit and retry.
		 */
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		if (IS_ERR(pinned_sb) ||
		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			if (!IS_ERR_OR_NULL(pinned_sb))
				deactivate_super(pinned_sb);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}
  //第一次挂载时，上面两个循环啥也没做
	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Hierarchies may only be created in the initial cgroup namespace. */
	if (ns != &init_cgroup_ns) {
		ret = -EPERM;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	new_root = true;

	init_cgroup_root(root, &opts); //现在系统中有了第二个cgroup_root,第二个cgroup,第二个cgoup_subsys_state，他们彼此引用。

	ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret)
		return ERR_PTR(ret);
//调用kernfs_mount完成实际mount，
	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
				 CGROUP_SUPER_MAGIC, ns);

	/*
	 * There's a race window after we release cgroup_mutex and before
	 * allocating a superblock. Make sure a concurrent process won't
	 * be able to re-use the root during this window by delaying the
	 * initialization of root refcnt.
	 */
	if (new_root) {
		mutex_lock(&cgroup_mutex);
		percpu_ref_reinit(&root->cgrp.self.refcnt);
		mutex_unlock(&cgroup_mutex);
	}

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb)
		deactivate_super(pinned_sb);

	return dentry;
}

cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);

和初始化调用该函数不同，此时subsys_mask不为0，会在rebind_subsystem中

int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;
	root_cgrp->ancestor_ids[0] = ret;

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
			      ref_flags, GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * We're accessing css_set_count without locking css_set_lock here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us.  Later rebinding may disable
	 * controllers on the default hierarchy and thus create new csets,
	 * which can't be more than the existing ones.  Allocate 2x.
	 */
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	kf_sops = root == &cgrp_dfl_root ?
		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED |
					   KERNFS_ROOT_SUPPORT_EXPORTOP,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	ret = cgroup_bpf_inherit(root_cgrp);
	WARN_ON_ONCE(ret);

	trace_cgroup_setup_root(root);

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

rebind_dubsystem

完成子系统在不同cgroup_root间的移动，两次调用cgroup_apply_control，因为原cgroup中移出指定子系统，新cgroup中加入指定子系统。


int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * If @ss has non-root csses attached to it, can't move.
		 * If @ss is an implicit controller, it is exempt from this
		 * rule and can be stolen.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* disable from the source */
		//将该子系统从cgrp_dfl_root中移出，cgroup_root的subsys_mask表明其subsys数组中被哪些子系统的css填充
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0); //从cgrp_dfl_root->cgrp及其后代中移出指定子系统的属性文件

		/* rebind */
		//表明指定子系统加入了新的cgroup_root
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

cgroup_apply_control

static int cgroup_apply_control(struct cgroup *cgrp)
{
	int ret;

	cgroup_propagate_control(cgrp); //将cgroup所有后代的子系统相关与cgrp同步

	ret = cgroup_apply_control_enable(cgrp);//创建cgroup及所有后代的cftype节点
	if (ret)
		return ret;

	/*
	 * At this point, cgroup_e_css_by_mask() results reflect the new csses
	 * making the following cgroup_update_dfl_csses() properly update
	 * css associations of all tasks in the subtree.
	 */
	ret = cgroup_update_dfl_csses(cgrp);//进程迁移
	if (ret)
		return ret;

	return 0;
}