kernel: 5.0
以下mount流程为初始化完成之后的第一次mount
cgroup_mount
/*
 * cgroup_mount - mount entry point for both cgroup filesystem types
 * @fs_type: filesystem type being mounted; compared against &cgroup2_fs_type
 * @flags: mount flags, forwarded to the v1/v2 mount helpers
 * @unused_dev_name: not referenced
 * @data: mount option data, parsed by the v1 or v2 option parser
 *
 * Holds a reference on the caller's cgroup namespace for the duration of
 * the call. Returns the dentry produced by the selected mount helper, or
 * an ERR_PTR() (-EPERM if the caller lacks CAP_SYS_ADMIN in the
 * namespace's user_ns, or the option parser's error for cgroup2).
 */
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
				   int flags, const char *unused_dev_name,
				   void *data)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct dentry *dentry;
	unsigned int root_flags;
	int err;

	get_cgroup_ns(ns);

	/* The caller needs CAP_SYS_ADMIN in the namespace's user_ns. */
	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
		dentry = ERR_PTR(-EPERM);
		goto out_put_ns;
	}

	/*
	 * On the very first cgroup mount, turn on the css_set <-> task
	 * list linkage and fix up all tasks that already exist.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	/* Anything that is not cgroup2 takes the v1 path. */
	if (fs_type != &cgroup2_fs_type) {
		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
				       CGROUP_SUPER_MAGIC, ns);
		goto out_put_ns;
	}

	err = parse_cgroup_root_flags(data, &root_flags);
	if (err) {
		dentry = ERR_PTR(err);
		goto out_put_ns;
	}

	cgrp_dfl_visible = true;
	cgroup_get_live(&cgrp_dfl_root.cgrp);

	dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
				 CGROUP2_SUPER_MAGIC, ns);
	/* Root flags are applied only once the mount itself succeeded. */
	if (!IS_ERR(dentry))
		apply_cgroup_root_flags(root_flags);

out_put_ns:
	put_cgroup_ns(ns);
	return dentry;
}
cgroup_enable_task_cg_lists
只在第一次挂载时发挥作用,用于将此时的所有task以cg_list为连接件接入所处css_set的tasks链表,后续新fork的task则是在创建时就接入父进程cgroups指向的css_set的tasks链表(见copy_process -->cgroup_post_fork)
/*
 * cgroup_enable_task_cg_lists - link every existing task to its css_set
 *
 * With tasklist_lock and css_set_lock held, sets use_task_css_set_links
 * and walks all threads. Each task that is not PF_EXITING has its cg_list
 * appended to the tasks list of its css_set, a css_set reference is taken
 * and the css_set's nr_tasks is incremented. If use_task_css_set_links is
 * already set once the locks are held, nothing is done.
 */
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
* cgroup_post_fork() without seeing use_task_css_set_links = 1
* is not guaranteed to have its child immediately visible in the
* tasklist if we walk through it with RCU.
*/
read_lock(&tasklist_lock);
spin_lock_irq(&css_set_lock);
/* Re-check the flag now that the locks are held. */
if (use_task_css_set_links)
goto out_unlock;
use_task_css_set_links = true;
do_each_thread(g, p) {
WARN_ON_ONCE(!list_empty(&p->cg_list) ||
task_css_set(p) != &init_css_set);
/*
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
* Do it while holding siglock so that we don't end up
* racing against cgroup_exit().
*
* Interrupts were already disabled while acquiring
* the css_set_lock, so we do not need to disable it
* again when acquiring the sighand->siglock here.
*/
spin_lock(&p->sighand->siglock);
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
/*
* Author's note (the fork path is not visible here): when a process
* is forked, the new process's cgroups pointer first points at
* init_css_set (copy_process --> cgroup_fork); if the first mount
* has already completed (use_task_css_set_links is true) it points
* at the parent's cgroups instead.
*/
if (!css_set_populated(cset))
css_set_update_populated(cset, true); // author's note: updates the nr_populated_csets member of every cgroup associated with cset,
// marking that the cgroup now has tasks attached (helper not shown here)
list_add_tail(&p->cg_list, &cset->tasks);// author's note: the css_set tasks list tracks tasks in the normal state; a separate mg_tasks list tracks migrating ones
get_css_set(cset); // take a reference on the css_set
cset->nr_tasks++;
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
out_unlock:
/* Locks are released in the reverse order of acquisition. */
spin_unlock_irq(&css_set_lock);
read_unlock(&tasklist_lock);
}
cgroup1_mount(&cgroup_fs_type, flags, data,CGROUP_SUPER_MAGIC, ns);
/*
 * cgroup1_mount - mount a cgroup v1 hierarchy
 * @fs_type: not referenced in this body (&cgroup_fs_type is used directly)
 * @flags: mount flags, forwarded to cgroup_do_mount()
 * @data: mount option data handed to parse_cgroupfs_options()
 * @magic: not referenced in this body (CGROUP_SUPER_MAGIC is used directly)
 * @ns: cgroup namespace of the mounter; a new root is only created when
 *      this is &init_cgroup_ns
 *
 * Looks for an existing root matching the parsed options and reuses it;
 * otherwise allocates and sets up a new root. The chosen root is then
 * handed to cgroup_do_mount(). Returns the dentry from cgroup_do_mount()
 * or an ERR_PTR() carrying the error of the step that failed.
 */
struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
void *data, unsigned long magic,
struct cgroup_namespace *ns)
{
struct super_block *pinned_sb = NULL;
struct cgroup_sb_opts opts;
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct dentry *dentry;
int i, ret;
bool new_root = false;
/* NOTE(review): presumably returns with cgroup_mutex held, since every exit below unlocks it - confirm */
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);// parse the data passed in from user space, including subsystem names etc.
if (ret)
goto out_unlock;
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
* dying subsystems. We just need to ensure that the ones
* unmounted previously finish dying and don't care about new ones
* starting. Testing ref liveliness is good enough.
*/
for_each_subsys(ss, i) {
if (!(opts.subsys_mask & (1 << i)) || // author's note: on the first mount every subsystem's root still points at cgrp_dfl_root, so this loop skips them all
ss->root == &cgrp_dfl_root)
continue;
if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
/* cgroup_mutex is dropped here, hence the jump to out_free rather than out_unlock */
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
goto out_free;
}
cgroup_put(&ss->root->cgrp);
}
for_each_root(root) { // author's note: every cgroup_root is linked into the cgroup_roots list through root_list; on the first mount only cgrp_dfl_root exists
bool name_match = false;
if (root == &cgrp_dfl_root)
continue;
/*
* If we asked for a name then it must match. Also, if
* name matches but subsys_mask doesn't, we should fail.
* Remember whether name matched.
*/
if (opts.name) {
if (strcmp(opts.name, root->name))
continue;
name_match = true;
}
/*
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
*/
if ((opts.subsys_mask || opts.none) &&
(opts.subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
ret = -EBUSY;
goto out_unlock;
}
if (root->flags ^ opts.flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
/*
* We want to reuse @root whose lifetime is governed by its
* ->cgrp. Let's check whether @root is alive and keep it
* that way. As cgroup_kill_sb() can happen anytime, we
* want to block it by pinning the sb so that @root doesn't
* get killed before mount is complete.
*
* With the sb pinned, tryget_live can reliably indicate
* whether @root can be reused. If it's being killed,
* drain it. We can use wait_queue for the wait but this
* path is super cold. Let's just sleep a bit and retry.
*/
pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
if (IS_ERR(pinned_sb) ||
!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
/* cgroup_mutex is dropped here as well, so exit through out_free */
mutex_unlock(&cgroup_mutex);
if (!IS_ERR_OR_NULL(pinned_sb))
deactivate_super(pinned_sb);
msleep(10);
ret = restart_syscall();
goto out_free;
}
/* Existing root will be reused; @root keeps pointing at it for the mount below. */
ret = 0;
goto out_unlock;
}
// author's note: on the first mount, the two loops above did nothing
/*
* No such thing, create a new one. name= matching without subsys
* specification is allowed for already existing hierarchies but we
* can't create new one without subsys specification.
*/
if (!opts.subsys_mask && !opts.none) {
ret = -EINVAL;
goto out_unlock;
}
/* Hierarchies may only be created in the initial cgroup namespace. */
if (ns != &init_cgroup_ns) {
ret = -EPERM;
goto out_unlock;
}
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
ret = -ENOMEM;
goto out_unlock;
}
new_root = true;
init_cgroup_root(root, &opts); // author's note: the system now has a second cgroup_root, a second cgroup and a second cgroup_subsys_state, all referencing each other
/* The refcnt is set up with PERCPU_REF_INIT_DEAD and re-initialised after the mount; see the race comment below. */
ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
if (ret)
cgroup_free_root(root);
out_unlock:
mutex_unlock(&cgroup_mutex);
out_free:
/* The parsed option strings are freed on every path, success included. */
kfree(opts.release_agent);
kfree(opts.name);
if (ret)
return ERR_PTR(ret);
// author's note: this calls kernfs_mount to perform the actual mount (cgroup_do_mount() is not shown here)
dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
CGROUP_SUPER_MAGIC, ns);
/*
* There's a race window after we release cgroup_mutex and before
* allocating a superblock. Make sure a concurrent process won't
* be able to re-use the root during this window by delaying the
* initialization of root refcnt.
*/
if (new_root) {
mutex_lock(&cgroup_mutex);
percpu_ref_reinit(&root->cgrp.self.refcnt);
mutex_unlock(&cgroup_mutex);
}
/*
* If @pinned_sb, we're reusing an existing root and holding an
* extra ref on its sb. Mount is complete. Put the extra ref.
*/
if (pinned_sb)
deactivate_super(pinned_sb);
return dentry;
}
cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
和初始化时调用该函数不同,此时subsys_mask不为0,会在rebind_subsystems中完成指定子系统在cgroup_root之间的迁移
/*
 * cgroup_setup_root - set up @root and bind the subsystems in @ss_mask to it
 * @root: the root to set up
 * @ss_mask: bitmask of subsystems, handed to rebind_subsystems()
 * @ref_flags: flags passed to percpu_ref_init() for the root cgroup's refcnt
 *
 * Must be called with cgroup_mutex held (asserted below). Returns 0 on
 * success, otherwise the error value of the step that failed; the steps
 * already completed are undone through the labels at the end of the function.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
struct kernfs_syscall_ops *kf_sops;
struct css_set *cset;
int i, ret;
lockdep_assert_held(&cgroup_mutex);
/* The allocated id is stored both as the cgroup id and as its first ancestor id. */
ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
if (ret < 0)
goto out;
root_cgrp->id = ret;
root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
ref_flags, GFP_KERNEL);
if (ret)
goto out;
/*
* We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. Later rebinding may disable
* controllers on the default hierarchy and thus create new csets,
* which can't be more than the existing ones. Allocate 2x.
*/
ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
if (ret)
goto cancel_ref;
ret = cgroup_init_root_id(root);
if (ret)
goto cancel_ref;
/* The default root and every other root use different kernfs syscall ops. */
kf_sops = root == &cgrp_dfl_root ?
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
root_cgrp->kn = root->kf_root->kn;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
ret = rebind_subsystems(root, ss_mask);
if (ret)
goto destroy_root;
/* A failure here only triggers a warning; setup continues. */
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
trace_cgroup_setup_root(root);
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
list_add(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
spin_unlock_irq(&css_set_lock);
/* Sanity checks: the root cgroup has no children and is the only cgroup counted in this root. */
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
/* The kernfs root was created with KERNFS_ROOT_CREATE_DEACTIVATED; activate it now. */
kernfs_activate(root_cgrp->kn);
ret = 0;
goto out;
/* Error unwinding: each label undoes one earlier step and falls through to the next. */
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
percpu_ref_exit(&root_cgrp->self.refcnt);
out:
/* Reached on success and failure alike. */
free_cgrp_cset_links(&tmp_links);
return ret;
}
rebind_subsystems
完成子系统在不同cgroup_root间的移动,两次调用cgroup_apply_control,因为原cgroup中移出指定子系统,新cgroup中加入指定子系统。
/*
 * rebind_subsystems - move the subsystems in @ss_mask to @dst_root
 * @dst_root: destination root
 * @ss_mask: bitmask of subsystems to move
 *
 * Must be called with cgroup_mutex held (asserted below). A first pass
 * validates every requested subsystem and returns -EBUSY before anything
 * is changed; a second pass performs the move. Returns 0 once the second
 * pass has run, even if cgroup_apply_control() on the destination reported
 * an error (that case only prints a warning).
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
lockdep_assert_held(&cgroup_mutex);
/* Pass 1: validation only, nothing is modified. */
do_each_subsys_mask(ss, ssid, ss_mask) {
/*
* If @ss has non-root csses attached to it, can't move.
* If @ss is an implicit controller, it is exempt from this
* rule and can be stolen.
*/
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
!ss->implicit_on_dfl)
return -EBUSY;
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
} while_each_subsys_mask();
/* Pass 2: detach each subsystem from its current root and attach it to @dst_root. */
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
struct css_set *cset;
WARN_ON(!css || cgroup_css(dcgrp, ss));
/* disable from the source */
// author's note: remove the subsystem from the source root (cgrp_dfl_root on the first mount); a cgroup_root's subsys_mask indicates which subsystems' csses populate its subsys array
src_root->subsys_mask &= ~(1 << ssid);
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0); // author's note: removes the subsystem's attribute files from cgrp_dfl_root->cgrp and its descendants
/* rebind */
// record that the subsystem now belongs to the new cgroup_root
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
css->cgroup = dcgrp;
/* Move every css_set's e_cset_node for this subsystem onto the destination cgroup's e_csets list. */
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
spin_unlock_irq(&css_set_lock);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else {
dcgrp->subtree_control |= 1 << ssid;
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
/* A failure on the destination side is logged but does not abort the rebind. */
ret = cgroup_apply_control(dcgrp);
if (ret)
pr_warn("partial failure to rebind %s controller (err=%d)\n",
ss->name, ret);
/* The bind callback is optional. */
if (ss->bind)
ss->bind(css);
} while_each_subsys_mask();
kernfs_activate(dcgrp->kn);
return 0;
}
cgroup_apply_control
/*
 * cgroup_apply_control - apply the control configuration of @cgrp
 * @cgrp: cgroup whose configuration is applied
 *
 * Runs three steps in order: propagate, enable, then update the default
 * csses. Returns 0 on success, or the non-zero value of the first step
 * that failed (the later step is then skipped).
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int err;

	/* Author's note: syncs subsystem-related state of all descendants with @cgrp. */
	cgroup_propagate_control(cgrp);

	/* Author's note: creates the cftype nodes for the cgroup and all descendants. */
	err = cgroup_apply_control_enable(cgrp);

	/*
	 * At this point, cgroup_e_css_by_mask() results reflect the new csses
	 * making the following cgroup_update_dfl_csses() properly update
	 * css associations of all tasks in the subtree (author's note: this
	 * is the task migration step).
	 */
	if (!err)
		err = cgroup_update_dfl_csses(cgrp);

	return err;
}
总结
cgroup的mount首先创建一个cgroup_root,然后将指定子系统从源cgroup_root迁移到新创建的cgroup_root