------------------------------------- Linux 内核 2.6.24 组调度简述 (2016-11-1 by aweii)
-----------------一、概述
Linux 的组调度概要而言：创建组时，在每个 cpu 上分配一个组调度实体 se 和一个 cfs_rq 队列，
并令 se->my_q = cfs_rq。组调度实体 se 在该 cpu 的 rq.cfs_rq 中调度，作为具体进程调度实体
的父实体（parent sched_entity）；进程则在 se->my_q 中调度。se->my_q 被加入该 cpu 的 rq 的
leaf_cfs_rq_list（叶子 cfs_rq 列表——调度进程的 cfs 运行队列是最下层的叶子节点）。
以上流程可参考 2.6.24 版内核的 sched_create_group 函数。
----------------二、相关数据结构:
1、cpu运行队列rq
//Per-CPU runqueue: all scheduling state for one CPU (kernel 2.6.24).
struct rq {
spinlock_t lock;
unsigned long nr_running; //number of runnable tasks on this runqueue
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
struct load_weight load; //measure of this CPU's load
unsigned long nr_load_updates;
u64 nr_switches; //number of context switches
struct cfs_rq cfs; //runqueue of the completely fair scheduling policy (red-black tree)
#ifdef CONFIG_FAIR_GROUP_SCHED
struct list_head leaf_cfs_rq_list; //list of leaf cfs_rq structures (see overview)
#endif
struct rt_rq rt; //runqueue of the real-time scheduling policy
unsigned long nr_uninterruptible;
struct task_struct *curr, *idle;
//pointers to the currently running task and the idle task
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock, prev_clock_raw;
s64 clock_max_delta;
unsigned int clock_warps, clock_overflows;
u64 idle_clock;
unsigned int clock_deep_idle_events;
u64 tick_timestamp;
atomic_t nr_iowait;
//fields below are used for load balancing and task migration
#ifdef CONFIG_SMP
struct sched_domain *sd;
int active_balance;
int push_cpu;
int cpu;
struct task_struct *migration_thread;
struct list_head migration_queue;
#endif
//fields below record scheduling statistics
#ifdef CONFIG_SCHEDSTATS
struct sched_info rq_sched_info;
unsigned int yld_exp_empty;
unsigned int yld_act_empty;
unsigned int yld_both_empty;
unsigned int yld_count;
unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
unsigned int ttwu_count;
unsigned int ttwu_local;
unsigned int bkl_count;
#endif
struct lock_class_key rq_lock_key;
};
2、调度实体sched_entity
struct sched_entity {
struct load_weightload;
struct rb_noderun_node;
unsigned inton_rq;
u64exec_start; //开始运行时间
u64sum_exec_runtime; //本次累计运行时间
u64vruntime;
u64prev_sum_exec_runtime; //上次累计运行时间
…………
//以下与组调度有关
#ifdef CONFIG_FAIR_GROUP_SCHED
//父调度实体,本se在parent->my_q中调度运行,比如开启user
group调度或container
group调度的情况下,进程的父se是其所属user的se调度实体
struct sched_entity*parent;
//注释很明白:cfs_rq
指向所属的运行队列(本实体在其中参与调度)
struct cfs_rq*cfs_rq;
struct cfs_rq*my_q; //my_q指向拥有的cfs_rq,本实体的子实体(如进程的调度实体)
在其中参与调度。
#endif
};
3、task_group创建组时用到的结构,存放在各cpu上新建的cfs_rq和组调度实体se的指针
//Per-group bookkeeping created by sched_create_group(): holds the
//per-CPU cfs_rq and group-scheduling-entity pointers for the group.
struct task_group {
#ifdef CONFIG_FAIR_CGROUP_SCHED
struct cgroup_subsys_state css;
#endif
struct sched_entity **se;//array of per-CPU sched_entity pointers
struct cfs_rq **cfs_rq; //array of per-CPU cfs_rq pointers
unsigned long shares; //group weight (the group's share of CPU time)
spinlock_t lock;
struct rcu_head rcu;
};
----------------三、相关宏:
/* Runqueue of the given CPU. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
/* Runqueue the task belongs to (valid with or without group scheduling). */
#define task_rq(p)		cpu_rq(task_cpu(p))
/* Task currently running on the given CPU. */
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
----------------四、相关函数
1、创建组
/*
 * Create a new task group: one cfs_rq and one group scheduling entity
 * (se) per possible CPU, with se->my_q = cfs_rq.  The group se is
 * scheduled on its CPU's top-level rq.cfs; the group's tasks are
 * scheduled on se->my_q.  Returns the new group, or ERR_PTR(-ENOMEM)
 * on allocation failure (all partial allocations are freed).
 */
struct task_group *sched_create_group(void)
{
	struct task_group *tg;
	struct cfs_rq *cfs_rq;
	struct sched_entity *se;
	struct rq *rq;
	int i;

	tg = kzalloc(sizeof(*tg), GFP_KERNEL);	/* the group structure itself */
	if (!tg)
		return ERR_PTR(-ENOMEM);

	/* array of per-CPU cfs_rq pointers */
	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	/* array of per-CPU sched_entity pointers */
	tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
	if (!tg->se)
		goto err;

	/* create one cfs_rq and one sched_entity on every possible CPU */
	for_each_possible_cpu(i) {
		rq = cpu_rq(i);

		cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
							cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
							cpu_to_node(i));
		if (!se)
			goto err;

		memset(cfs_rq, 0, sizeof(struct cfs_rq));
		memset(se, 0, sizeof(struct sched_entity));

		tg->cfs_rq[i] = cfs_rq;		/* record cfs_rq in the array */
		init_cfs_rq(cfs_rq, rq);
		cfs_rq->tg = tg;

		tg->se[i] = se;			/* record se in the array */
		se->cfs_rq = &rq->cfs;		/* group se competes on the CPU's top-level cfs_rq */
		se->my_q = cfs_rq;		/* the group's tasks are scheduled on this queue */
		/*
		 * Initial weight is NICE_0_LOAD, so the virtual clock runs
		 * at the same speed as the real clock.
		 */
		se->load.weight = NICE_0_LOAD;
		/*
		 * Cache the reciprocal of the weight so later divisions
		 * become multiplications:
		 * vruntime = delta_exec * NICE_0_LOAD / se->load.weight
		 */
		se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
		se->parent = NULL;
	}

	/*
	 * Link each new cfs_rq into its rq's leaf_cfs_rq_list.  In 2.6.24
	 * group scheduling is a two-level hierarchy: below a group entity
	 * there are only task entities, so the group's cfs_rq is a leaf.
	 */
	for_each_possible_cpu(i) {
		rq = cpu_rq(i);
		cfs_rq = tg->cfs_rq[i];
		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
	}

	tg->shares = NICE_0_LOAD;
	spin_lock_init(&tg->lock);

	return tg;

err:
	/* free whatever was allocated; kzalloc'd arrays make the NULL checks safe */
	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
		if (tg->se)
			kfree(tg->se[i]);
	}
	kfree(tg->cfs_rq);
	kfree(tg->se);
	kfree(tg);
	return ERR_PTR(-ENOMEM);
}
总结：组实体 se 拥有运行队列 se->my_q，进程实体在其中调度；组实体本身则在 cpu 的
rq.cfs_rq 中调度。rq.cfs_rq 是第一层运行队列，组实体的 se->my_q 是第二层，也是叶子层。
组实体 tg->se[cpu] 是父，进程实体 tsk->se 是子。
2、CFS调度类(CFS调度策略相关函数的汇集类)
static const struct sched_class fair_sched_class =
{
.next= &idle_sched_class,
.enqueue_task= enqueue_task_fair, //进程入队列
.dequeue_task= dequeue_task_fair, //进程出队列
.yield_task= yield_task_fair,
.check_preempt_curr= check_preempt_wakeup,
.pick_next_task= pick_next_task_fair, //取下个被调度的进程
.put_prev_task= put_prev_task_fair, //刚切换出来的进程返回队列
#ifdef CONFIG_SMP
.load_balance= load_balance_fair, //负载均衡
.move_one_task= move_one_task_fair,
#endif
.set_curr_task
=
set_curr_task_fair, //设置当前进程
.task_tick= task_tick_fair,
//被时钟中断中update_process_times->
scheduler_tick调用
.task_new= task_new_fair, //新建进程插入cfs_rq运行队列
};
(1)pick_next_task_fair: 挑拣下个进程
/*
 * Pick the task to run next: walk down the cfs_rq hierarchy from the
 * CPU's top-level queue, choosing the next entity at each level, until
 * a leaf (task) entity is reached.
 */
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct cfs_rq *cfs_rq = &rq->cfs;	/* top-level runqueue */
	struct sched_entity *se;

	if (unlikely(!cfs_rq->nr_running))
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		/* group_cfs_rq() yields se->my_q: NULL for a task entity,
		 * the group's own queue for a group entity */
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	return task_of(se);
}
(2)enqueue_task_fair:进程加入cfs_rq运行队列,和(1)相反,这是个自底向上的过程
/*
 * Enqueue a task onto its cfs_rq: the bottom-up mirror image of
 * pick_next_task_fair().  Typically iterates twice with group
 * scheduling enabled: first the task's own entity, then its parent
 * group entity.  Each entity that is not yet queued is inserted into
 * its cfs_rq's red-black tree; once an entity is found already on a
 * queue, all its ancestors must be queued too, so the walk stops.
 * (The original comment here was mojibake; reconstructed from the code.)
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;		/* already queued; ancestors are queued as well */
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
		wakeup = 1;
	}
}
其它函数说明略。