前面说过,对进程分组管控,本质上就是分组中的进程拥有独立于其它分组进程的子系统状态(struct cgroup_subsys_state),系统根据进程对应的子系统状态为进程分配资源。内核用结构体struct css_set表示进程与子系统状态的关系。css_set(set of cgroup_subsys_state objects)顾名思义,就是子系统状态的集合。进程在每个子系统都有对应的状态,例如一个进程有自己的cpuset状态、blkio状态等,这些状态放在指针数组subsys[CGROUP_SUBSYS_COUNT]中。不同的进程也可能拥有相同的子系统状态的集合,例如所有前台程序的cpuset状态、blkio状态都相同,具有相同子系统状态集合的进程链接在css_set的成员list_head tasks中。
/*
 * Excerpt of the process control block: only the cgroup-related members
 * are shown, the rest of the structure is elided.
 */
struct task_struct {
#ifdef CONFIG_CGROUPS
/* RCU-annotated pointer to the css_set this task currently uses. */
struct css_set __rcu *cgroups;
/* List node that chains this task onto its css_set's tasks list. */
struct list_head cg_list;
#endif
.......................................
};
/*
 * A set of cgroup_subsys_state objects shared by a group of tasks
 * (excerpt; unrelated members are elided).
 */
struct css_set {
/* Tasks that use this css_set. */
struct list_head tasks;
/* Anchor for the cgrp_cset_link objects that tie this css_set to cgroups. */
struct list_head cgrp_links;
/* Subsystem states, one slot per subsystem. */
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
........................................................
struct cgroup *mg_src_cgrp; /* migration source cgroup */
struct cgroup *mg_dst_cgrp; /* migration destination cgroup */
struct css_set *mg_dst_cset; /* migration destination css_set */
..........................................................
};
进程控制块中有两个成员与struct css_set相关,css_set __rcu *cgroups指向进程对应的css_set,list_head cg_list链接到对应css_set的成员list_head tasks中。进程、css_set、 cgroup_subsys_state的关系如下。
结构体struct cgrp_cset_link用于链接css_set和cgroup。cgrp_cset_link的成员cgroup *cgrp指向链接的cgroup,css_set *cset指向链接的css_set。
/* Link object that associates one cgroup with one css_set. */
struct cgrp_cset_link {
struct cgroup *cgrp; /* the cgroup this link refers to */
struct css_set *cset; /* the css_set this link refers to */
/* Node on the cgroup's cset_links list (see link_css_set()). */
struct list_head cset_link;
/* Node on the css_set's cgrp_links list (see link_css_set()). */
struct list_head cgrp_link;
};
1.init_css_set初始化
全局变量init_css_set是系统最初始的css_set,保存了所有子系统的默认状态。在cgroup_init_subsys()中,ss->css_alloc()这一步的入参为空;入参为空时,子系统的css_alloc()接口返回初始状态,例如cpuset返回的是top_cpuset.css。随后通过init_css_set.subsys[ss->id] = css,把初始状态放到init_css_set保管子系统状态数组的对应位置。
/*
 * Boot-time initialisation of one cgroup subsystem (excerpt).
 * Allocates the subsystem's root state via ss->css_alloc(), links it to the
 * default root cgroup and records it in init_css_set.
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
idr_init(&ss->css_idr);
INIT_LIST_HEAD(&ss->cfts);
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
/*
 * Allocate the cgroup_subsys_state through css_alloc().
 * NOTE(review): the parent css passed here is expected to be NULL at this
 * point (cgroup_css() on the default root) - confirm against cgroup_css().
 */
css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
................................................................
init_css_set.subsys[ss->id] = css; /* store it in init_css_set's state array, at this subsystem's id */
................................................................
}
/*
 * cpuset's css_alloc callback (excerpt).
 * With a NULL parent_css it returns the css embedded in top_cpuset;
 * the non-NULL path is elided.
 */
static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuset *cs;
/* No parent: hand back the css of the top-level cpuset. */
if (!parent_css)
return &top_cpuset.css;
...............................................................
}
2.mount子系统
mount子系统时,在cgroup_setup_root()阶段会调用link_css_set(),建立cgroup_root的根cgroup与每个已存在的css_set之间的联系。mount各个子系统后,css_set与cgroup的关系如下所示。
/*
 * Set up a cgroup hierarchy root at mount time (excerpt).
 * The part shown walks every css_set in css_set_table and links it to the
 * root cgroup of the new hierarchy.
 */
static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
struct css_set *cset;
int i, ret;
.................................................................
/* The walk over the css_set hash table is done under css_set_lock. */
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp); /* link the css_set to the root cgroup */
/* A populated css_set marks the root cgroup as populated too. */
if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
spin_unlock_irq(&css_set_lock);
.................................................................
}
/*
 * Associate @cset with @cgrp using a link object taken from @tmp_links.
 *
 * @tmp_links: list of cgrp_cset_link objects; must not be empty (BUG otherwise)
 * @cset:      css_set to link
 * @cgrp:      cgroup to link
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgroup *cgrp)
{
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
/* Remember the cgroup in the css_set when cgroup_on_dfl() holds for it. */
if (cgroup_on_dfl(cgrp))
cset->dfl_cgrp = cgrp;
/* Consume the first link on tmp_links and fill in both ends. */
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
/*
 * Always add links to the tail of the lists so that the lists are
 * in chronological order.
 */
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
/* Take a reference on the cgroup only when it has a parent. */
if (cgroup_parent(cgrp))
cgroup_get(cgrp);
}
3.进程迁移
创建分组时并不会创建css_set,当向分组迁移进程时,发现没有css_set保存分组状态才会创建。进程迁移操作,就是把需要迁移的进程的pid写入到对应分组的tasks节点中。该节点对应的write操作接口为cgroup_tasks_write()。cgroup_tasks_write()-->__cgroup_procs_write()通过pid从当前进程所在的pid namespace中找到struct task_struct,进程迁移的本质就是将task->cgroups指向目的css_set,并把task->cg_list链接到目的css_set。
/*
 * Table of cftype entries (excerpt; only the "tasks" entry is shown and the
 * remainder of the array, including its terminator, is elided).
 */
static struct cftype cgroup_legacy_base_files[] = {
..................................................
{
/* The "tasks" file: reads use the pidlist seq callbacks, writes go to cgroup_tasks_write(). */
.name = "tasks",
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
.write = cgroup_tasks_write,
},
...................................................
/*
 * Write handler that moves a task into a cgroup (excerpt).
 * The part shown resolves the target task from a pid, checks permission and
 * then calls cgroup_attach_task(); parsing of @buf and the unlock/cleanup
 * labels are elided.
 */
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
struct cgroup_subsys *ss;
struct cgroup *cgrp;
pid_t pid;
int ssid, ret;
......................................................
percpu_down_write(&cgroup_threadgroup_rwsem);
rcu_read_lock();
if (pid) {
/* Look the task up by pid via find_task_by_vpid(). */
tsk = find_task_by_vpid(pid);
if (!tsk) {
ret = -ESRCH;
goto out_unlock_rcu;
}
} else {
/* A pid of 0 means the calling task itself. */
tsk = current;
}
......................................................
ret = cgroup_procs_write_permission(tsk, cgrp, of);
if (!ret) /* permission check returned 0: perform the migration */
ret = cgroup_attach_task(cgrp, tsk, threadgroup);
......................................................
}
/*
 * Attach @leader (or, when @threadgroup is true, every thread visited by
 * the while_each_thread() loop) to @dst_cgrp (excerpt).
 *
 * Steps shown: collect the source css_sets, prepare the destination
 * css_sets, then call cgroup_migrate().
 */
static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *leader, bool threadgroup)
{
LIST_HEAD(preloaded_csets);
struct task_struct *task;
int ret;
...........................................................
rcu_read_lock();
task = leader;
do {
/* Record source and destination cgroup on the task's current css_set. */
cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
&preloaded_csets);
/* Single-task migration: stop after the leader. */
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
/* NOTE(review): the matching lock of css_set_lock is presumably in the elided part above. */
spin_unlock_irq(&css_set_lock);
/* Prepare the destination css_sets. */
ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (!ret)// preparation succeeded: perform the migration
ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
...........................................................
}
cgroup_attach_task()首先调用cgroup_migrate_add_src(),准备好源cgroup(src_cset->mg_src_cgrp)和目的cgroup(src_cset->mg_dst_cgrp)。
/*
 * Register @src_cset as a migration source towards @dst_cgrp (excerpt).
 * Must be called with cgroup_mutex and css_set_lock held (lockdep asserts).
 * Takes a reference on @src_cset and adds it to @preloaded_csets.
 */
static void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *dst_cgrp,
struct list_head *preloaded_csets)
{
struct cgroup *src_cgrp;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
/* Nothing to do for a css_set already marked dead. */
if (src_cset->dead)
return;
/* The cgroup that src_cset maps to in the destination's hierarchy. */
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
/* Already on a preload list: do not add it twice. */
if (!list_empty(&src_cset->mg_preload_node))
return;
..........................................................................................................
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
list_add(&src_cset->mg_preload_node, preloaded_csets);
}
cgroup_attach_task()然后调用cgroup_migrate_prepare_dst(),准备好迁移目的css_set(src_cset->mg_dst_cset)。如果find_css_set()中没有找到合适的css_set,就会创建一个新的css_set。
/*
 * For every source css_set on @preloaded_csets, find or create the
 * destination css_set and store it in src_cset->mg_dst_cset (excerpt;
 * the error path and return are elided).
 */
static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
{
LIST_HEAD(csets);
struct css_set *src_cset, *tmp_cset;
..........................................................................
list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
struct css_set *dst_cset;
/* Destination css_set for src_cset once it sits in mg_dst_cgrp. */
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
goto err;
..........................................................................
src_cset->mg_dst_cset = dst_cset;
/*
 * Queue dst_cset on the local list only once; if it is already on a
 * preload list, drop the reference instead.
 */
if (list_empty(&dst_cset->mg_preload_node))
list_add(&dst_cset->mg_preload_node, &csets);
else
put_css_set(dst_cset);
}
.......................................................................
}
/*
 * Return the css_set that results from taking @old_cset and placing it in
 * @cgrp (excerpt; initialisation of a newly allocated css_set is elided).
 *
 * If a matching css_set already exists, a reference is taken on it and it
 * is returned. Otherwise a zeroed css_set is allocated. Returns NULL when
 * the allocation fails. Must be called with cgroup_mutex held.
 */
static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp)
{
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
struct cgroup_subsys *ss;
unsigned long key;
int ssid;
lockdep_assert_held(&cgroup_mutex);
/*
 * Reuse an existing css_set if there is one. The lookup and the
 * reference grab are done under css_set_lock, which pairs with the
 * unlock below.
 */
spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (cset)
return cset;
/* No match: allocate a new, zero-filled css_set. */
cset = kzalloc(sizeof(*cset), GFP_KERNEL);
if (!cset)
return NULL;
............................................................
}
cgroup_attach_task()最后调用cgroup_migrate()把进程迁移到目的css_set,核心操作就是将目的css_set赋值给task->cgroups,将task->cg_list链接到目的css_set。
4.子进程继承父进程css_set
在cgroup_init_early ()阶段将init_css_set设置为init对应的css_set。创建子进程时会先后调用cgroup_fork()和cgroup_post_fork()。cgroup_fork()中将init_css_set设置为子进程默认的css_set。cgroup_post_fork()判断是否已经mount过子系统,如果没有mount过子系统,子进程就用cgroup_fork()中设置的init_css_set,这时子进程并未链接到对应css_set中。
/*
 * Early cgroup initialisation (excerpt).
 * The part shown initialises the default root and points init_task at
 * init_css_set.
 */
int __init cgroup_init_early(void)
{
static struct cgroup_sb_opts __initdata opts;
struct cgroup_subsys *ss;
int i;
init_cgroup_root(&cgrp_dfl_root, &opts);
cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
/* init_task starts out using init_css_set. */
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
...................................................................
}
/*
 * Process creation (excerpt): only the two cgroup hooks are shown,
 * everything else is elided.
 */
static __latent_entropy struct task_struct *copy_process(
unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls,
int node)
{
int retval;
struct task_struct *p;
unsigned long sig[_NSIG_WORDS];
unsigned long shared_sig[_NSIG_WORDS];
int i;
.............................................................
cgroup_fork(p);// first hook for the new task; cgroup_post_fork() follows later
.............................................................
cgroup_post_fork(p);
.............................................................
}
/*
 * Give a newly created task its initial cgroup state: it points at
 * init_css_set and its cg_list node is initialised as an empty list head.
 */
void cgroup_fork(struct task_struct *child)
{
RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
}
/*
 * Second cgroup hook of process creation (excerpt).
 * When use_task_css_set_links is set, the child is moved onto the css_set
 * of the current (parent) task.
 */
void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
..................................................................
/* Only link tasks once task<->css_set linking is enabled (set on first mount, per the accompanying text). */
if (use_task_css_set_links) {
struct css_set *cset;
spin_lock_irq(&css_set_lock);
cset = task_css_set(current); /* css_set of the parent (current) task */
/* Skip if the child is already linked somewhere. */
if (list_empty(&child->cg_list)) {
get_css_set(cset);
/* Put the child on the parent's css_set. */
css_set_move_task(child, NULL, cset, false);
}
spin_unlock_irq(&css_set_lock);
}
..................................................................
}
没有mount子系统之前,所有进程的task->cgroups指向init_css_set。
第一次mount子系统时,调用cgroup_enable_task_cg_lists()将进程链接到它所在的css_set。mount过子系统后,再有子进程创建时cgroup_post_fork()会进入if (use_task_css_set_links)分支,用父进程的css_set作为子进程的css_set,并把子进程链接到该css_set中。
/*
 * Mount handler for the cgroup filesystems (excerpt).
 * The part shown enables task<->css_set linking if it is not enabled yet.
 */
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
int i;
bool new_sb;
get_cgroup_ns(ns);
..................................................................
/* Link all existing tasks to their css_sets if that has not been done yet. */
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
..................................................................
}
/*
 * Walk every thread in the system and link it onto the tasks list of its
 * css_set (excerpt; the early part, including the code that jumps to
 * out_unlock, is elided).
 */
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
.................................................................
read_lock(&tasklist_lock);
do_each_thread(g, p) {
/* At this point no task should be linked yet, and all should use init_css_set. */
WARN_ON_ONCE(!list_empty(&p->cg_list) ||
task_css_set(p) != &init_css_set);
............................................................
spin_lock(&p->sighand->siglock);
/* Tasks that are already exiting are not linked. */
if (!(p->flags & PF_EXITING)) {
/* The css_set this task currently uses. */
struct css_set *cset = task_css_set(p);
if (!css_set_populated(cset))
css_set_update_populated(cset, true);
/* Chain the task onto its css_set and take a reference for it. */
list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
out_unlock:
/* NOTE(review): the matching lock of css_set_lock is presumably in the elided part above. */
spin_unlock_irq(&css_set_lock);
}