[Android Cgroup层次关系] 四、Cgroup与进程关系

        前面说过,对进程分组管控,本质上就是分组中的进程拥有独立于其它分组进程的子系统状态(struct cgroup_subsys_state),系统根据进程对应的子系统状态为进程分配资源。内核用结构体struct css_set表示进程与子系统状态的关系。css_set(set of cgroup_subsys_state objects)顾名思义,就是子系统状态的集合。进程在每个子系统都有对应的状态,例如一个进程有自己的cpuset状态、blkio状态等,这些状态放在指针数组subsys[CGROUP_SUBSYS_COUNT]中。不同的进程也可能拥有相同的子系统状态的集合,例如所有前台程序的cpuset状态、blkio状态都相同,具有相同子系统状态集合的进程链接在css_set的成员list_head tasks中。

/*
 * Excerpt of the process control block: only the cgroup-related members
 * (compiled in when CONFIG_CGROUPS is set) are shown; the rest is elided.
 */
struct task_struct {
#ifdef CONFIG_CGROUPS
	/* RCU-annotated pointer to the css_set this task belongs to */
	struct css_set __rcu		*cgroups;
	/* list node used to link this task into its css_set's tasks list */
	struct list_head		cg_list;
#endif
    .......................................
}
/*
 * Excerpt of struct css_set: a set of cgroup_subsys_state pointers (one
 * per subsystem) shared by every task linked on ->tasks. Members not
 * relevant to the discussion are elided.
 */
struct css_set {
    /* tasks that use this css_set (linked through task_struct.cg_list) */
	struct list_head tasks;
    /* cgrp_cset_link objects are linked here */
	struct list_head cgrp_links;
    /* per-subsystem states, indexed by subsystem id */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
    ........................................................
	struct cgroup *mg_src_cgrp;    // migration source cgroup
	struct cgroup *mg_dst_cgrp;    // migration destination cgroup
	struct css_set *mg_dst_cset;   // migration destination css_set
    ..........................................................
};

        进程控制块中有两个成员与struct css_set相关,css_set __rcu *cgroups指向进程对应的css_set,list_head cg_list链接到对应css_set的成员list_head tasks中。进程、css_set、cgroup_subsys_state的关系如下。

         结构体struct cgrp_cset_link用于链接css_set和cgroup。cgrp_cset_link的成员cgroup *cgrp指向链接的cgroup,css_set *cset指向链接的css_set。

/*
 * Link object connecting one css_set with one cgroup. Each link sits on
 * two lists at once: one anchored in the cgroup and one anchored in the
 * css_set (see link_css_set()).
 */
struct cgrp_cset_link {
	struct cgroup		*cgrp;     // the linked cgroup
	struct css_set		*cset;     // the linked css_set
	/* node on the cgroup's cset_links list */
	struct list_head	cset_link;
	/* node on the css_set's cgrp_links list */
	struct list_head	cgrp_link;
};

 1.init_css_set初始化

        全局变量init_css_set是系统最初始的css_set,保存了所有子系统的默认状态。在cgroup_init_subsys()中,ss->css_alloc()这一步的入参为空,入参为空时,子系统css_alloc()接口返回初始状态,例如cpuset返回的是top_cpuset.css。在init_css_set.subsys[ss->id] = css时,把初始状态放到init_css_set保管子系统状态数组的对应位置。

/*
 * Excerpt: initialize one cgroup subsystem. With cgroup_mutex held, the
 * subsystem is attached to the default root (cgrp_dfl_root), its root
 * css is allocated through ss->css_alloc(), and that css is recorded in
 * init_css_set.subsys[ss->id]. Elided parts are marked with dots.
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	pr_debug("Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	/* Create the root cgroup state for this subsystem */
	ss->root = &cgrp_dfl_root;
    /*
     * Allocate the cgroup_subsys_state via css_alloc().
     * NOTE(review): the parent css passed here is presumably NULL at
     * this stage (nothing registered for ss on the root yet) -- confirm.
     */
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));  
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
    ................................................................
	init_css_set.subsys[ss->id] = css;     // store it in this subsystem's slot of init_css_set
    ................................................................
}

/*
 * Excerpt of cpuset's css_alloc callback. Only the NULL-parent case is
 * shown: it returns the css embedded in the static top_cpuset instead of
 * allocating a new cpuset.
 */
static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuset *cs;
    /* no parent css: hand back the top-level cpuset's css */
	if (!parent_css)
		return &top_cpuset.css;
    ...............................................................
}

2.mount子系统 

        mount子系统时,cgroup_setup_root()阶段,会调用link_css_set()创建cgroup_root与每个已存在css_set的联系。mount各个子系统后,css_set与cgroup的关系如下所示。

/*
 * Excerpt: set up a cgroup root at mount time. The part shown walks
 * every css_set in css_set_table under css_set_lock and links each one
 * to the new root's cgroup; if a css_set is populated, the root cgroup
 * is marked populated as well.
 */
static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct css_set *cset;
	int i, ret;

    .................................................................
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp); // link the css_set with the root cgroup
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);
    .................................................................
}

/*
 * Link @cset with @cgrp using a pre-allocated cgrp_cset_link.
 *
 * @tmp_links: list of spare cgrp_cset_link objects; must not be empty
 *             (BUG_ON otherwise). The first entry is consumed.
 * @cset:      css_set to link.
 * @cgrp:      cgroup to link.
 *
 * The consumed link is moved onto @cgrp->cset_links and added to
 * @cset->cgrp_links. When @cgrp is on the default hierarchy it is also
 * recorded as @cset->dfl_cgrp.
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	/* take the first spare link and point it at both objects */
	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	/* a reference is taken only for cgroups that have a parent */
	if (cgroup_parent(cgrp))
		cgroup_get(cgrp);
}

3.进程迁移

        创建分组时并不会创建css_set,当向分组迁移进程时发现没有css_set保存分组状态才会创建。进程迁移操作,就是把需要迁移进程的pid,写入到对应分组的tasks节点中。该节点对应的write操作接口为cgroup_tasks_write()。cgroup_tasks_write()-->__cgroup_procs_write()通过pid从当前进程所在pid namespace中找到struct task_struct,进程迁移的本质就是将task->cgroups指向目的css_set,task->cg_list链接到目的css_set。

/*
 * Excerpt of the legacy-hierarchy base file table. Only the "tasks"
 * entry is shown: reads go through the cgroup_pidlist_* seq callbacks
 * and writes are handled by cgroup_tasks_write(). The remaining entries
 * and the end of the initializer are elided.
 */
static struct cftype cgroup_legacy_base_files[] = {
    ..................................................
	{
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write = cgroup_tasks_write,
	},
	...................................................

/*
 * Excerpt: resolve the target task and attach it to a cgroup.
 * In the part shown, a non-zero pid is resolved with
 * find_task_by_vpid() (-ESRCH when no such task exists), while pid 0
 * selects the calling task. If the permission check succeeds, the task
 * is migrated with cgroup_attach_task(). How pid and cgrp are obtained
 * is in the elided parts.
 */
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
				    size_t nbytes, loff_t off, bool threadgroup)
{
	struct task_struct *tsk;
	struct cgroup_subsys *ss;
	struct cgroup *cgrp;
	pid_t pid;
	int ssid, ret;
    ......................................................
	percpu_down_write(&cgroup_threadgroup_rwsem);
	rcu_read_lock();
	if (pid) {
		/* look up the task_struct by pid in the current pid namespace */
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			ret = -ESRCH;
			goto out_unlock_rcu;
		}
	} else {
		tsk = current;
	}
    ......................................................
	ret = cgroup_procs_write_permission(tsk, cgrp, of);
	if (!ret) // permission granted: migrate
		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
    ......................................................
}

/*
 * Excerpt: attach @leader (and, when @threadgroup is true, every thread
 * in its group) to @dst_cgrp. Three steps are visible:
 *   1. cgroup_migrate_add_src() records source/destination cgroups on
 *      each task's current css_set and collects them on preloaded_csets;
 *   2. cgroup_migrate_prepare_dst() resolves the destination css_sets;
 *   3. cgroup_migrate() performs the move if step 2 succeeded.
 */
static int cgroup_attach_task(struct cgroup *dst_cgrp,
			      struct task_struct *leader, bool threadgroup)
{
	LIST_HEAD(preloaded_csets);
	struct task_struct *task;
	int ret;
    ...........................................................
	rcu_read_lock();
	task = leader;
	do {
		/* record the source and destination cgroups */
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	/* NOTE(review): the matching lock is not visible; presumably taken in the elided part above */
	spin_unlock_irq(&css_set_lock);

	/* prepare the destination css_sets */
	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
	if (!ret)// migrate
		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);

    ...........................................................
}

        cgroup_attach_task()首先调用cgroup_migrate_add_src(),准备好源cgroup(src_cset->mg_src_cgrp)和目的cgroup(src_cset->mg_dst_cgrp)。

/*
 * Excerpt: register @src_cset as a migration source towards @dst_cgrp.
 * Requires cgroup_mutex and css_set_lock to be held. Dead css_sets and
 * css_sets whose mg_preload_node is already on a list are skipped.
 * Otherwise the source cgroup (looked up in @dst_cgrp's root) and the
 * destination cgroup are stored in @src_cset, a reference is taken, and
 * @src_cset is added to @preloaded_csets.
 */
static void cgroup_migrate_add_src(struct css_set *src_cset,
				   struct cgroup *dst_cgrp,
				   struct list_head *preloaded_csets)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (src_cset->dead)
		return;

	/* the cgroup @src_cset is associated with in the destination's hierarchy */
	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	/* already queued for preloading: nothing more to do */
	if (!list_empty(&src_cset->mg_preload_node))
		return;
         ..........................................................................................................
	src_cset->mg_src_cgrp = src_cgrp;
	src_cset->mg_dst_cgrp = dst_cgrp;
	get_css_set(src_cset);
	list_add(&src_cset->mg_preload_node, preloaded_csets);
}

        cgroup_attach_task()然后调用cgroup_migrate_prepare_dst(),准备好迁移目的css_set(src_cset->mg_dst_cset)。如果find_css_set()中没有找到合适的css_set,则会创建一个新的css_set。

/*
 * Excerpt: for every source css_set on @preloaded_csets, look up (or
 * create, via find_css_set()) the destination css_set for its
 * mg_dst_cgrp and store it in src_cset->mg_dst_cset. A NULL result
 * jumps to the err path, which is elided.
 */
static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
{
	LIST_HEAD(csets);
	struct css_set *src_cset, *tmp_cset;
    ..........................................................................
	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
		struct css_set *dst_cset;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			goto err;
                  
       ..........................................................................
		src_cset->mg_dst_cset = dst_cset;

		/*
		 * Queue the destination on the local list the first time it
		 * is seen; otherwise drop the reference just obtained.
		 */
		if (list_empty(&dst_cset->mg_preload_node))
			list_add(&dst_cset->mg_preload_node, &csets);
		else
			put_css_set(dst_cset);
	}
    .......................................................................
}

/*
 * Excerpt: return a css_set for a task that used @old_cset and is moving
 * to @cgrp. An existing matching css_set is returned with an extra
 * reference; otherwise a zeroed css_set is allocated (NULL is returned
 * if that allocation fails) and set up in the elided remainder.
 * Requires cgroup_mutex to be held.
 */
static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * Reuse an existing css_set if one matches; otherwise create one.
	 * NOTE(review): this excerpt shows spin_unlock_irq(&css_set_lock)
	 * below without a matching spin_lock_irq(); the lock call appears
	 * to have been dropped from the quotation -- confirm against the
	 * kernel source.
	 */
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;
    ............................................................
}

        cgroup_attach_task()最后调用cgroup_migrate()把进程迁移到目的css_set,核心操作就是将目的css_set赋值给task->cgroups,将task->cg_list链接到目的css_set。

 4.子进程继承父进程css_set

        在cgroup_init_early()阶段将init_css_set设置为init_task(0号进程)对应的css_set。创建子进程时会先后调用cgroup_fork()和cgroup_post_fork()。cgroup_fork()中将init_css_set设置为子进程默认的css_set。cgroup_post_fork()判断是否已经mount过子系统,如果没有mount过子系统,子进程就用cgroup_fork()中设置的init_css_set,这时子进程并未链接到对应css_set中。

/*
 * Excerpt: early cgroup initialization. The part shown initializes the
 * default root, sets CSS_NO_REF on the root cgroup's own css, and makes
 * init_task use init_css_set.
 */
int __init cgroup_init_early(void)
{
	static struct cgroup_sb_opts __initdata opts;
	struct cgroup_subsys *ss;
	int i;

	init_cgroup_root(&cgrp_dfl_root, &opts);
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
	/* init_task's css_set is init_css_set */
	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
    ...................................................................
}

/*
 * Excerpt of process creation. Everything except the two cgroup hooks is
 * elided; the point illustrated is their order: cgroup_fork() runs
 * first, cgroup_post_fork() later.
 */
static __latent_entropy struct task_struct *copy_process(
					unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace,
					unsigned long tls,
					int node)
{
	int retval;
	struct task_struct *p;
	unsigned long sig[_NSIG_WORDS];
	unsigned long shared_sig[_NSIG_WORDS];
	int i;
    .............................................................
    cgroup_fork(p);// when creating a child, cgroup_fork() is called first, then cgroup_post_fork()
    .............................................................
    cgroup_post_fork(p);
    .............................................................
}

/*
 * Give a newly created task its default cgroup state: @child points at
 * init_css_set and its cg_list is initialized as an empty list, i.e. the
 * child is not yet linked into any css_set's task list.
 */
void cgroup_fork(struct task_struct *child)
{
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

/*
 * Excerpt: second cgroup hook of fork. When use_task_css_set_links is
 * set, @child is moved, under css_set_lock, onto the css_set of the
 * forking task (current), provided the child is not linked anywhere yet.
 */
void cgroup_post_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;
    ..................................................................
    /* only once a subsystem has been mounted (task links enabled) */
	if (use_task_css_set_links) {
		struct css_set *cset;

		spin_lock_irq(&css_set_lock);
		cset = task_css_set(current); // css_set of the parent (the forking task)
		if (list_empty(&child->cg_list)) {
			get_css_set(cset);
			/* add the child to the parent's css_set */
			css_set_move_task(child, NULL, cset, false);
		}
		spin_unlock_irq(&css_set_lock);
	}
    ..................................................................
}

没有mount子系统之前,所有进程的task->cgroups指向init_css_set

        第一次mount子系统时,调用cgroup_enable_task_cg_lists()将进程链接到它所在的css_set。mount过子系统后,再有子进程创建时cgroup_post_fork()会进入if (use_task_css_set_links)分支,用父进程的css_set作为子进程的css_set,并把子进程链接到该css_set中。

/*
 * Excerpt of the cgroup filesystem mount handler. The part shown takes a
 * reference on the caller's cgroup namespace and, if task <-> css_set
 * linking has not been enabled yet (use_task_css_set_links is false),
 * enables it via cgroup_enable_task_cg_lists().
 */
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	bool is_v2 = fs_type == &cgroup2_fs_type;
	struct super_block *pinned_sb = NULL;
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct cgroup_subsys *ss;
	struct cgroup_root *root;
	struct cgroup_sb_opts opts;
	struct dentry *dentry;
	int ret;
	int i;
	bool new_sb;

	get_cgroup_ns(ns);
    ..................................................................
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();
	..................................................................
}
/*
 * Excerpt: link every existing task into the task list of its css_set.
 * Walks all threads under tasklist_lock; each task that is not exiting
 * is appended to its css_set's ->tasks list and a reference on the
 * css_set is taken. A warning fires once if a task is already linked or
 * does not use init_css_set.
 */
static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;
         .................................................................
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			     task_css_set(p) != &init_css_set);
        ............................................................
		spin_lock(&p->sighand->siglock);
		if (!(p->flags & PF_EXITING)) {
			/* the css_set this task currently uses */
			struct css_set *cset = task_css_set(p);

			if (!css_set_populated(cset))
				css_set_update_populated(cset, true);
			/* link the task into its css_set */
			list_add_tail(&p->cg_list, &cset->tasks);
			get_css_set(cset);
		}
		spin_unlock(&p->sighand->siglock);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
out_unlock:
	/* NOTE(review): css_set_lock is presumably taken in the elided part above -- confirm */
	spin_unlock_irq(&css_set_lock);
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值