Linux SMP启动流程学习(三)
4 构建CPU拓扑关系
4.1 创建调度域拓扑关系—sched_init_domains()
在系统启动开始的时候就开始构建CPU的拓扑关系,具体流程如下:
[start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains()]
源码:/kernel/sched/topology.c:1770
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
* For now this just excludes isolated CPUs, but could be used to
* exclude other special cases in the future.
*/
int sched_init_domains(const struct cpumask *cpu_map)
{
int err;
/* Scratch cpumasks used while (re)building the domain hierarchy. */
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
/* Let the architecture refresh its view of the CPU topology first. */
arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms; /* fall back to the static mask on OOM */
/* Domain set 0 covers all requested CPUs minus the isolated ones. */
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
/* build_sched_domains() does the real topology construction. */
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
return err;
}
start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains()
源码:/kernel/sched/topology.c:1636
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
/*
 * __visit_domain_allocation_hell() ends up in __sdt_alloc(), which
 * allocates the per-CPU sched_domain/sched_group/... structures.
 */
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
/* Set up domains for CPUs specified by the cpu_map: */
/* Walk every CPU in cpu_map (== cpu_active_mask at boot). */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
sd = NULL;
/*
 * For each CPU walk all SDTLs: every CPU gets its own chain of
 * per-level sched domains, i.e. a complete set of domains and
 * groups is initialized for each CPU at each topology level.
 */
for_each_sd_topology(tl) {
/* Build the domain for this CPU/level and link it to its child. */
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP)
sd->flags |= SD_OVERLAP;
/* Stop once a level already spans the whole cpu_map. */
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
}
/* Build the groups for the domains */
/* Walk every CPU in cpu_map again. */
for_each_cpu(i, cpu_map) {
/*
 * Walk this CPU's domain hierarchy bottom-up: per_cpu_ptr()
 * yields the lowest-level domain, sd->parent climbs upwards.
 */
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
if (sd->flags & SD_OVERLAP) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
/* Build this domain's circular list of sched groups. */
if (build_sched_groups(sd, i))
goto error;
}
}
}
/* Calculate CPU capacity for physical packages and nodes */
/*
 * Initialize each group's capacity. The kernel normally rates a
 * single CPU at 1024, but architectures may compute it differently.
 */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
}
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
/*
 * cpu_attach_domain() links the domain hierarchy into the
 * runqueue's root_domain. It also degenerates redundant levels:
 * e.g. a domain whose span equals its parent's, or a domain
 * whose span contains only itself, is removed.
 */
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
if (rq && sched_debug_enabled) {
pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
ret = 0;
error:
/* Frees only what was allocated up to alloc_state; no-op on success. */
__free_domain_allocs(&d, alloc_state, cpu_map);
return ret;
}
4.2 申请调度域—__sdt_alloc()
每个SDTL都有一个struct sched_domain_topology_level数据结构来描述,并且内嵌一个struct sd_data数据结构,包含sched_domain、sched_group和sched_group_capacity的二级指针
每个SDTL都分配一个Per-CPU变量的含sched_domain、sched_group和sched_group_capacity数据结构
在每个SDTL中为每个CPU都分配含sched_domain、sched_group和sched_group_capacity数据结构,即每个CPU在每个SDTL中都有对应的调度域和调度组
start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains() -> __visit_domain_allocation_hell() -> __sdt_alloc()
源码:/kernel/sched/topology.c:1496
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
int j;
/*
 * Walk the default CPU topology level array (default_topology, see
 * chapter 2); the traversal order is SMT -> MC -> DIE.
 */
for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
/* Per-CPU pointer array for this level's sched domains (sd). */
sdd->sd = alloc_percpu(struct sched_domain *);
if (!sdd->sd)
return -ENOMEM;
/* Per-CPU pointer array for this level's shared-domain state. */
sdd->sds = alloc_percpu(struct sched_domain_shared *);
if (!sdd->sds)
return -ENOMEM;
/* Per-CPU pointer array for this level's sched groups (sg). */
sdd->sg = alloc_percpu(struct sched_group *);
if (!sdd->sg)
return -ENOMEM;
/* Per-CPU pointer array for this level's group capacities (sgc). */
sdd->sgc = alloc_percpu(struct sched_group_capacity *);
if (!sdd->sgc)
return -ENOMEM;
/*
 * Allocate a sched_domain, sched_domain_shared, sched_group and
 * sched_group_capacity for every CPU and store them in the
 * per-CPU arrays allocated above.
 */
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_domain_shared *sds;
struct sched_group *sg;
struct sched_group_capacity *sgc;
/* The trailing cpumask_size() bytes hold the domain's span mask. */
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
*per_cpu_ptr(sdd->sd, j) = sd;
sds = kzalloc_node(sizeof(struct sched_domain_shared),
GFP_KERNEL, cpu_to_node(j));
if (!sds)
return -ENOMEM;
*per_cpu_ptr(sdd->sds, j) = sds;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sg)
return -ENOMEM;
sg->next = sg; /* each group starts as a singleton circular list */
*per_cpu_ptr(sdd->sg, j) = sg;
sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgc)
return -ENOMEM;
#ifdef CONFIG_SCHED_DEBUG
sgc->id = j;
#endif
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
return 0;
}
4.3 建立调度域—build_sched_domain()
start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains() -> build_sched_domain()
源码:/kernel/sched/topology.c:1601
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
/* Fetch the sched_domain for this (tl, cpu) pair and initialize it. */
struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
/*
 * Because SDTLs are walked SMT -> MC -> DIE, the SMT-level domain
 * acts as the child of the MC-level domain, which in turn is the
 * child of the DIE-level one. The parent and child members of
 * struct sched_domain record this parent/child relationship.
 */
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
/* A child's span must be a subset of its parent's span. */
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
pr_err("     the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
/* Fixup, ensure @sd has at least @child cpus. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
sched_domain_span(child));
}
}
set_domain_attribute(sd, attr);
return sd;
}
4.3.1 初始化调度域—sd_init()
源码:/kernel/sched/topology.c:1085
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
struct sched_domain *child, int cpu)
{
struct sd_data *sdd = &tl->data;
/* Fetch this CPU's sched_domain for the current topology level. */
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
#ifdef CONFIG_NUMA
/*
 * Ugly hack to pass state to sd_numa_mask()...
 */
sched_domains_curr_level = tl->numa_level;
#endif
/* Number of CPUs that are siblings of @cpu at this level. */
sd_weight = cpumask_weight(tl->mask(cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
sd_flags &= ~TOPOLOGY_SD_FLAGS;
/* Common defaults for every domain; tweaked per-level further below. */
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
.cache_nice_tries = 0,
.busy_idx = 0,
.idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| 0*SD_NUMA
| sd_flags
,
.last_balance = jiffies,
.balance_interval = sd_weight,
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
.child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
};
/* span = this level's sibling mask restricted to the CPUs being built. */
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
/*
 * Convert topological properties into behaviour.
 */
if (sd->flags & SD_ASYM_CPUCAPACITY) {
struct sched_domain *t = sd;
for_each_lower_domain(t)
t->flags |= SD_BALANCE_WAKE;
}
if (sd->flags & SD_SHARE_CPUCAPACITY) {
/* SMT level: hardware threads sharing one core. */
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
/* MC level: cores sharing a cache/package. */
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
sd->busy_idx = 3;
sd->idle_idx = 2;
sd->flags |= SD_SERIALIZE;
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
}
#endif
} else {
sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
}
/*
 * For all levels sharing cache; connect a sched_domain_shared
 * instance.
 */
if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
}
sd->private = sdd;
return sd;
}
4.4 建立调度组—build_sched_groups()
源码:/kernel/sched/topology.c:875
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered;
int i;
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
cpumask_clear(covered);
/*
 * Build one group per child-domain span inside @sd and chain the
 * groups together through their next pointers, starting at @cpu.
 */
for_each_cpu_wrap(i, span, cpu) {
struct sched_group *sg;
/* Skip CPUs already covered by a previously built group. */
if (cpumask_test_cpu(i, covered))
continue;
/* Fetch this CPU's group for the level (see get_group()). */
sg = get_group(i, sdd);
cpumask_or(covered, covered, sched_group_span(sg));
if (!first)
first = sg;
if (last)
last->next = sg;
last = sg;
}
/* Close the circular list and anchor it at the domain. */
last->next = first;
sd->groups = first;
return 0;
}
4.4.1 获取调度组—get_group()
/*
* Package topology (also see the load-balance blurb in fair.c)
*
* The scheduler builds a tree structure to represent a number of important
* topology features. By default (default_topology[]) these include:
*
* - Simultaneous multithreading (SMT)
* - Multi-Core Cache (MC)
* - Package (DIE)
*
* Where the last one more or less denotes everything up to a NUMA node.
*
* The tree consists of 3 primary data structures:
*
* sched_domain -> sched_group -> sched_group_capacity
* ^ ^ ^ ^
* `-' `-'
*
* The sched_domains are per-cpu and have a two way link (parent & child) and
* denote the ever growing mask of CPUs belonging to that level of topology.
*
* Each sched_domain has a circular (double) linked list of sched_group's, each
* denoting the domains of the level below (or individual CPUs in case of the
* first domain level). The sched_group linked by a sched_domain includes the
* CPU of that sched_domain [*].
*
* Take for instance a 2 threaded, 2 core, 2 cache cluster part:
*
* CPU 0 1 2 3 4 5 6 7
*
* DIE [ ]
* MC [ ] [ ]
* SMT [ ] [ ] [ ] [ ]
*
* - or -
*
* DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
*
* CPU 0 1 2 3 4 5 6 7
*
* One way to think about it is: sched_domain moves you up and down among these
* topology levels, while sched_group moves you sideways through it, at child
* domain granularity.
*
* sched_group_capacity ensures each unique sched_group has shared storage.
*
* There are two related construction problems, both require a CPU that
* uniquely identify each group (for a given domain):
*
* - The first is the balance_cpu (see should_we_balance() and the
* load-balance blub in fair.c); for each group we only want 1 CPU to
* continue balancing at a higher domain.
*
* - The second is the sched_group_capacity; we want all identical groups
* to share a single sched_group_capacity.
*
* Since these topologies are exclusive by construction. That is, its
* impossible for an SMT thread to belong to multiple cores, and cores to
* be part of multiple caches. There is a very clear and unique location
* for each CPU in the hierarchy.
*
* Therefore computing a unique CPU for each group is trivial (the iteration
* mask is redundant and set all 1s; all CPUs in a group will end up at _that_
* group), we can simply pick the first CPU in each group.
*
*
* [*] in other words, the first group of each domain is its child domain.
*/
static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
struct sched_group *sg;
/*
 * With a child domain, the group is identified by the first CPU of
 * the child's span — so all CPUs in that span share one group.
 */
if (child)
cpu = cpumask_first(sched_domain_span(child));
sg = *per_cpu_ptr(sdd->sg, cpu);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
/* For claim_allocations: */
atomic_inc(&sg->ref);
atomic_inc(&sg->sgc->ref);
if (child) {
/* The group covers exactly the child domain's span. */
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
} else {
/* Lowest level: the group contains just this one CPU. */
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
}
/* Initial capacity: SCHED_CAPACITY_SCALE (1024) per CPU in the group. */
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
return sg;
}
- cpu=1时,get_group()函数首先获取cpu1在DIE级别的调度域sd_die_1,然后通过child指针获取MC级别的调度域sd_mc_1。获取sd_mc_1域里的第一个CPU,为何会是CPU0而不是CPU1呢?我们返回来仔细看一下build_sched_domain()函数,发现sd_mc域的span兄弟位图的设置和tl->mask(cpu)函数相关,同属MC级别的CPUs应该包含同样的范围,也就是对于CPU0来说,它的兄弟位应该是[cpu0|cpu1],同样对于CPU1来说也是同样的道理。
4.5 CPU拓扑示例
-
如上图所示,假设在一个4核处理器中,每个物理CPU核心拥有独立L1 Cache且不支持超线程技术,分成两个簇Cluster0和Cluster1,每个簇包含两个物理CPU核,簇中的CPU核共享L2 Cache。
-
在分析之前先总结Linux内核里构建CPU域和调度组拓扑关系图的一些原则。
根据CPU物理属性分层次,从下到上,由SMT->MC->DIE的递进关系来分层,用数据结构struct sched_domain_topology_level来描述,简称为SDTL
每个SDTL都为调度域和调度组都建立一个Per-CPU变量,并且为每个CPU都分配响应的数据结构
在同一个SDTL中由芯片设计决定哪些CPUs是兄弟关系。调度域中有span成员来描述,调度组有cpumask成员来描述兄弟关系
同一个CPU的不同SDTL的调度域有父子关系。每个调度域里包含了相应的调度组并且这些调度组串联成一个链表,调度域的groups成员是链表头。 -
因为每个CPU核心只有一个执行线程,所以4核处理器没有SMT属性。cluster由两个CPU物理核组成,这两个CPU是MC层级且是兄弟关系。整个处理器可以看做一个DIE级别,因此该处理器只有两个层级,即MC和DIE。根据上述原则,可以标识出上述4核处理器的调度域和调度组的拓扑关系图,如下图所示。
每个SDTL为每个CPU都分配了对应的调度域和调度组,以CPU0为例,在图中,虚线表示管辖。 -
1) 对于DIE级别,CPU0对应的调度域是domain_die_0,该调度域管辖着4个CPU并包含两个调度组,分别为group_die_0和group_die_1。其中
调度组group_die_0管辖着CPU0和CPU1
调度组group_die_1管辖着CPU2和CPU3 -
2) 对于MC级别,CPU0对应的调度域是domain_mc_0,该调度域管辖着CPU0和CPU1并包含两个调度组,分别为group_mc_0和group_mc_1。其中
调度组group_mc_0管辖CPU0
调度组group_mc_1管辖CPU1
为什么DIE级别的所有调度组只有group_die_0和group_die_1呢?
-
因为在建立调度组的函数build_sched_groups()中有一个covered位图,已被某个调度组覆盖的CPU会被跳过(if (cpumask_test_cpu(i, covered)) continue;),这样每个调度组只会被建立一次,DIE层级只会产生两个调度组。注意get_group()函数,它会返回子调度域兄弟关系的第一个CPU。
-
除此之外还有两层关系,一是父子关系,通过struct sched_domain数据结构中的parent和child成员来完成;另外一个关系是同一个SDTL中调度组都链接成一个链表,通过struct sched_domain数据结构中的groups成员来完成,如下图所示。
-
最后再关心一下,SMP是如何均衡负载的呢?在内核中,SMP负载均衡机制从注册软中断开始,每次系统处理调度tick时会检查当前是否需要处理SMP负载均衡。详情可见[start_kernel() -> sched_init() -> init_sched_fair_class()]。
源码:/kernel/sched/fair.c:9592
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
/* run_rebalance_domains() is the core entry point of SMP load balancing. */
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
}
参考:
《Linux SMP启动过程分析报告》
《Device Tree(三):代码分析》
《arm linux启动流程三》