下面来看各个子系统对cgroup的支持,第一篇先研究blkio子系统
blkio子系统支持三种类型的QoS控制:
blkio.weight, blkio.weight_device:这些是基于设备权重值的控制方式
blkio.throttle.read_bps_device,blkio.throttle.write_bps_device:这些是基于带宽的控制方式
blkio.throttle.read_iops_device,blkio.throttle.write_iops_device:这些是基于iops的控制方式
其中基于权重的控制方式,必须依赖于CFQ调度器,而基于throttle的控制方式则只需要在通用块层实现就可以了
1) 基于blkio的cgroup_subsys的定义如下:
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
.can_attach_task = blkiocg_can_attach_task,
.attach_task = blkiocg_attach_task,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
.subsys_id = blkio_subsys_id,
#endif
.use_id = 1,
.module = THIS_MODULE,
};
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup):初始化一个blkio_cgroup结构,并初始化blkio_cgroup->policy_list, blkio_cgroup->blkg_list
blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup):略过
blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup):初始化好blkio_files里所有的blkio_policy_node对应的cgroup文件系统的文件
blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk):
/*
* We cannot support shared io contexts, as we have no mean to support
* two tasks with the same ioc in two different groups without major rework
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
2) 基于blkio的policy的数据结构定义如下:
struct blkio_policy_node {
struct list_head node;
dev_t dev;
/* This node belongs to max bw policy or porportional weight policy */
enum blkio_policy_id plid;
/* cgroup file to which this rule belongs to */
int fileid;
union {
unsigned int weight;
/*
* Rate read/write in terms of byptes per second
* Whether this rate represents read or write is determined
* by file type "fileid".
*/
u64 bps;
unsigned int iops;
} val;
};
struct blkio_policy_ops {
blkio_unlink_group_fn *blkio_unlink_group_fn;
blkio_update_group_weight_fn *blkio_update_group_weight_fn;
blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
};
enum blkio_policy_id {
BLKIO_POLICY_PROP = 0,/* Proportional Bandwidth division */
BLKIO_POLICY_THROTL,/* Throttling */
};
struct blkio_policy_type {
struct list_head list;
struct blkio_policy_ops ops;
enum blkio_policy_id plid;
};
blkio_policy_node,基本上可以看做一个cgroup文件系统下的一个配置文件对应一个blkio_policy_node,一个cgroup目录的所有的policy_node都会被链在一个blkio_cgroup->policy_list的链表中
blkio_policy_type根据不同的blkio_policy_id有不同的blkio_policy_ops,blkio_policy_register在cfq_init,throtl_init时被调用,这两个初始化函数分别对应基于权重的控制和基于阀值的控制,目前有两个全局的blkio_policy_type的变量:
static struct blkio_policy_type blkio_policy_cfq = {
.ops = {
.blkio_unlink_group_fn =cfq_unlink_blkio_group,
.blkio_update_group_weight_fn =cfq_update_blkio_group_weight,
},
.plid = BLKIO_POLICY_PROP,
};
以及
static struct blkio_policy_type blkio_policy_throtl = {
.ops = {
.blkio_unlink_group_fn = throtl_unlink_blkio_group,
.blkio_update_group_read_bps_fn =
throtl_update_blkio_group_read_bps,
.blkio_update_group_write_bps_fn =
throtl_update_blkio_group_write_bps,
.blkio_update_group_read_iops_fn =
throtl_update_blkio_group_read_iops,
.blkio_update_group_write_iops_fn =
throtl_update_blkio_group_write_iops,
},
.plid = BLKIO_POLICY_THROTL,
};
3) 基于blkio的cgroup文件系统的数据结构如下:
struct cftype blkio_files[] = {
{
.name = "weight_device",
.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
BLKIO_PROP_weight_device),
.read_seq_string = blkiocg_file_read,
.write_string = blkiocg_file_write,
.max_write_len = 256,
},
{
.name = "weight",
.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
BLKIO_PROP_weight),
.read_u64 = blkiocg_file_read_u64,
.write_u64 = blkiocg_file_write_u64,
},
{
.name = "throttle.read_bps_device",
.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
BLKIO_THROTL_read_bps_device),
.read_seq_string = blkiocg_file_read,
.write_string = blkiocg_file_write,
.max_write_len = 256,
},
{
.name = "throttle.write_bps_device",
.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
BLKIO_THROTL_write_bps_device),
.read_seq_string = blkiocg_file_read,
.write_string = blkiocg_file_write,
.max_write_len = 256,
},
{
.name = "throttle.read_iops_device",
.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
BLKIO_THROTL_read_iops_device),
.read_seq_string = blkiocg_file_read,
.write_string = blkiocg_file_write,
.max_write_len = 256,
},
{
.name = "throttle.write_iops_device",
.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
BLKIO_THROTL_write_iops_device),
.read_seq_string = blkiocg_file_read,
.write_string = blkiocg_file_write,
.max_write_len = 256,
},
基本上调用的都是blkiocg_file_read,blkiocg_file_write
blkio_files中的struct cftype有个private成员变量,通过BLKIOFILE_PRIVATE宏来赋值,e.g.
.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_weight_device)
之后可以通过BLKIOFILE_POLICY获取其policy类型:BLKIO_POLICY_THROTL或者BLKIO_POLICY_PROP,通过BLKIOFILE_ATTR获取其文件名,所有的配置文件都有如下定义:
/* cgroup files owned by proportional weight policy */
enum blkcg_file_name_prop {
BLKIO_PROP_weight = 1,
BLKIO_PROP_weight_device,
BLKIO_PROP_io_service_bytes,
BLKIO_PROP_io_serviced,
BLKIO_PROP_time,
BLKIO_PROP_sectors,
BLKIO_PROP_unaccounted_time,
BLKIO_PROP_io_service_time,
BLKIO_PROP_io_wait_time,
BLKIO_PROP_io_merged,
BLKIO_PROP_io_queued,
BLKIO_PROP_avg_queue_size,
BLKIO_PROP_group_wait_time,
BLKIO_PROP_idle_time,
BLKIO_PROP_empty_time,
BLKIO_PROP_dequeue,
};
/* cgroup files owned by throttle policy */
enum blkcg_file_name_throtl {
BLKIO_THROTL_read_bps_device,
BLKIO_THROTL_write_bps_device,
BLKIO_THROTL_read_iops_device,
BLKIO_THROTL_write_iops_device,
BLKIO_THROTL_io_service_bytes,
BLKIO_THROTL_io_serviced,
};
static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, struct seq_file *m):通过cftype得到POLICY_ID, POLICY_FILE_NAME,通过struct cgroup得到struct blkio_cgroup,然后调用blkio_read_policy_node_files,按照一定格式存到一个seq_file里面,可以参考blkio_print_policy_node函数
static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer):先调用blkio_policy_parse_and_set生成一个新的blkio_policy_node,下面的步骤就是先删了已有的policy node,再把新的policy node插入到blkio_cgroup->policy_list里面,最后调用blkio_update_policy_node_blkg,该函数对blkio_cgroup下面的所有blkio_group,都调用blkio_update_blkg_policy,该函数会根据blkio_policy_node的plid, fileid,调用不同的 blkio_update_xxxxx函数,以weight为例,最终调用到blkio_update_group_weight,后者又调用cfq_update_blkio_group_weight(这是跟CFQ紧耦合的一个函数,这里不做介绍了)
4) 几个关键的数据结构blkio_cgroup和blkio_group
struct blkio_cgroup {
struct cgroup_subsys_state css;
unsigned int weight;
spinlock_t lock;
struct hlist_head blkg_list;
struct list_head policy_list; /* list of blkio_policy_node */
};
struct blkio_group {
/* An rcu protected unique identifier for the group */
void *key;
struct hlist_node blkcg_node;
unsigned short blkcg_id;
/* Store cgroup path */
char path[128];
/* The device MKDEV(major, minor), this group has been created for */
dev_t dev;
/* policy which owns this blk group */
enum blkio_policy_id plid;
/* Need to serialize the stats in the case of reset/update */
spinlock_t stats_lock;
struct blkio_group_stats stats;
/* Per cpu stats pointer */
struct blkio_group_stats_cpu __percpu *stats_cpu;
};
blkio_cgroup代表了一个cgroup,但是这个cgroup里的进程有可能会读写多个块设备,所有通过一个cfq_data或者throtl_data的结构作为红黑树的key,把多个blkio_group关联到一个blkio_cgroup中。每个cfq_data或者throtl_data(根据policy的不同)实际上代表了一个块设备