Linux SMP启动流程学习(三)
4 构建CPU拓扑关系
4.1 创建调度域拓扑关系—sched_init_domains()
在系统启动开始的时候就开始构建CPU的拓扑关系,具体流程如下:
[start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains()]
源码:/kernel/sched/topology.c:1770
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
* For now this just excludes isolated CPUs, but could be used to
* exclude other special cases in the future.
*/
int sched_init_domains(const struct cpumask *cpu_map)
{
int err;
/* Scratch cpumasks used while (re)building the domain hierarchy. */
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
/* Let the architecture refresh its view of the CPU topology first. */
arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms; /* fall back to the static mask on OOM */
/* Domain set 0 covers all requested CPUs minus the isolated ones. */
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
/* build_sched_domains() does the real topology construction. */
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
return err;
}
start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains()
源码:/kernel/sched/topology.c:1636
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
/*
 * __visit_domain_allocation_hell() ends up in __sdt_alloc(), which
 * allocates the per-CPU sched_domain/sched_group/... structures.
 */
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
/* Set up domains for CPUs specified by the cpu_map: */
/* Walk every CPU in cpu_map (== cpu_active_mask at boot). */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
sd = NULL;
/*
 * For each CPU walk all SDTLs: every CPU gets its own chain of
 * per-level sched domains, i.e. a complete set of domains and
 * groups is initialized for each CPU at each topology level.
 */
for_each_sd_topology(tl) {
/* Build the domain for this CPU/level and link it to its child. */
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP)
sd->flags |= SD_OVERLAP;
/* Stop once a level already spans the whole cpu_map. */
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
}
/* Build the groups for the domains */
/* Walk every CPU in cpu_map again. */
for_each_cpu(i, cpu_map) {
/*
 * Walk this CPU's domain hierarchy bottom-up: per_cpu_ptr()
 * yields the lowest-level domain, sd->parent climbs upwards.
 */
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
if (sd->flags & SD_OVERLAP) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
/* Build this domain's circular list of sched groups. */
if (build_sched_groups(sd, i))
goto error;
}
}
}
/* Calculate CPU capacity for physical packages and nodes */
/*
 * Initialize each group's capacity. The kernel normally rates a
 * single CPU at 1024, but architectures may compute it differently.
 */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
}
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
/*
 * cpu_attach_domain() links the domain hierarchy into the
 * runqueue's root_domain. It also degenerates redundant levels:
 * e.g. a domain whose span equals its parent's, or a domain
 * whose span contains only itself, is removed.
 */
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
if (rq && sched_debug_enabled) {
pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
ret = 0;
error:
/* Frees only what was allocated up to alloc_state; no-op on success. */
__free_domain_allocs(&d, alloc_state, cpu_map);
return ret;
}
4.2 申请调度域—__sdt_alloc()
每个SDTL都有一个struct sched_domain_topology_level数据结构来描述,并且内嵌一个struct sd_data数据结构,包含sched_domain、sched_group和sched_group_capacity的二级指针
每个SDTL都分配一个Per-CPU变量的含sched_domain、sched_group和sched_group_capacity数据结构
在每个SDTL中为每个CPU都分配含sched_domain、sched_group和sched_group_capacity数据结构,即每个CPU在每个SDTL中都有对应的调度域和调度组
start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains() -> __visit_domain_allocation_hell() -> __sdt_alloc()
源码:/kernel/sched/topology.c:1496
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
int j;
/*
 * Walk the default CPU topology level array (default_topology, see
 * chapter 2); the traversal order is SMT -> MC -> DIE.
 */
for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
/* Per-CPU pointer array for this level's sched domains (sd). */
sdd->sd = alloc_percpu(struct sched_domain *);
if (!sdd->sd)
return -ENOMEM;
/* Per-CPU pointer array for this level's shared-domain state. */
sdd->sds = alloc_percpu(struct sched_domain_shared *);
if (!sdd->sds)
return -ENOMEM;
/* Per-CPU pointer array for this level's sched groups (sg). */
sdd->sg = alloc_percpu(struct sched_group *);
if (!sdd->sg)
return -ENOMEM;
/* Per-CPU pointer array for this level's group capacities (sgc). */
sdd->sgc = alloc_percpu(struct sched_group_capacity *);
if (!sdd->sgc)
return -ENOMEM;
/*
 * Allocate a sched_domain, sched_domain_shared, sched_group and
 * sched_group_capacity for every CPU and store them in the
 * per-CPU arrays allocated above.
 */
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_domain_shared *sds;
struct sched_group *sg;
struct sched_group_capacity *sgc;
/* The trailing cpumask_size() bytes hold the domain's span mask. */
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
*per_cpu_ptr(sdd->sd, j) = sd;
sds = kzalloc_node(sizeof(struct sched_domain_shared),
GFP_KERNEL, cpu_to_node(j));
if (!sds)
return -ENOMEM;
*per_cpu_ptr(sdd->sds, j) = sds;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sg)
return -ENOMEM;
sg->next = sg; /* each group starts as a singleton circular list */
*per_cpu_ptr(sdd->sg, j) = sg;
sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgc)
return -ENOMEM;
#ifdef CONFIG_SCHED_DEBUG
sgc->id = j;
#endif
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
return 0;
}
4.3 建立调度域—build_sched_domain()
start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains() -> build_sched_domain()
源码:/kernel/sched/topology.c:1601
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
/* Fetch the sched_domain for this (tl, cpu) pair and initialize it. */
struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
/*
 * Because SDTLs are walked SMT -> MC -> DIE, the SMT-level domain
 * acts as the child of the MC-level domain, which in turn is the
 * child of the DIE-level one. The parent and child members of
 * struct sched_domain record this parent/child relationship.
 */
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
/* A child's span must be a subset of its parent's span. */
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
pr_err("     the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
/* Fixup, ensure @sd has at least @child cpus. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
sched_domain_span(child));
}
}
set_domain_attribute(sd, attr);
return sd;
}
4.3.1 初始化调度域—sd_init()
源码:/kernel/sched/topology.c:1085
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
struct sched_domain *child, int cpu)
{
struct sd_data *sdd = &tl->data;
/* Fetch this CPU's sched_domain for the current topology level. */
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
#ifdef CONFIG_NUMA
/*
 * Ugly hack to pass state to sd_numa_mask()...
 */
sched_domains_curr_level = tl->numa_level;
#endif
/* Number of CPUs that are siblings of @cpu at this level. */
sd_weight = cpumask_weight(tl->mask(cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
sd_flags &= ~TOPOLOGY_SD_FLAGS;
/* Common defaults for every domain; tweaked per-level further below. */
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
.cache_nice_tries = 0,
.busy_idx = 0,
.idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| 0*SD_NUMA
| sd_flags
,
.last_balance = jiffies,
.balance_interval = sd_weight,
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
.child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
};
/* span = this level's sibling mask restricted to the CPUs being built. */
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sched_domain_span(sd));
/*
 * Convert topological properties into behaviour.
 */
if (sd->flags & SD_ASYM_CPUCAPACITY) {
struct sched_domain *t = sd;
for_each_lower_domain(t)
t->flags |= SD_BALANCE_WAKE;
}
if (sd->flags & SD_SHARE_CPUCAPACITY) {
/* SMT level: hardware threads sharing one core. */
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
/* MC level: cores sharing a cache/package. */
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
sd->busy_idx = 3;
sd->idle_idx = 2;
sd->flags |= SD_SERIALIZE;
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
}
#endif
} else {
sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
}
/*
 * For all levels sharing cache; connect a sched_domain_shared
 * instance.
 */
if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
}
sd->private = sdd;
return sd;
}
4.4 建立调度组—build_sched_groups()
源码:/kernel/sched/topology.c:875
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered;
int i;
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
cpumask_clear(covered);
/*
 * Build one group per child-domain span inside @sd and chain the
 * groups together through their next pointers, starting at @cpu.
 */
for_each_cpu_wrap(i, span, cpu) {
struct sched_group *sg;
/* Skip CPUs already covered by a previously built group. */
if (cpumask_test_cpu(i, covered))
continue;
/* Fetch this CPU's group for the level (see get_group()). */
sg = get_group(i, sdd);
cpumask_or(covered, covered, sched_group_span(sg));
if (!first)
first = sg;
if (last)
last->next = sg;
last = sg;
}
/* Close the circular list and anchor it at the domain. */
last->next = first;
sd->groups = first;
return 0;
}
4.4.1 获取调度组—get_group()
/*
* Package topology (also see the load-balance blurb in fair.c)
*
* The scheduler builds a tree structure to represent a number of important
* topology features. By default (default_topology[]) these include:
*
* - Simultaneous multithreading (SMT)
* - Multi-Core Cache (MC)
* - Package (DIE)
*
* Where the last one more or less denotes everything up to a NUMA node.
*
* The tree consists of 3 primary data structures:
*
* sched_domain -> sched_group -> sched_group_capacity
* ^ ^ ^ ^
* `-' `-'
*
* The sched_domains are per-cpu and have a two way link (parent & child) and
* denote the ever growing mask of CPUs belonging to that level of topology.
*
* Each sched_domain has a circular (double) linked list of sched_group's, each
* denoting the domains of the level below (or individual CPUs in case of the
* first domain level). The sched_group linked by a sched_domain includes the
* CPU of that sched_domain [*].
*
* Take for instance a 2 threaded, 2 core, 2 cache cluster part:
*
* CPU 0 1 2 3 4 5 6 7
*
* DIE [ ]
* MC [ ] [ ]
* SMT [ ] [ ] [ ] [ ]
*
* - or -
*
* DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
*
* CPU 0 1 2 3 4 5 6 7
*
* One way to think about it is: sched_domain moves you up and down among these
* topology levels, while sched_group moves you sideways through it, at child
* domain granularity.
*
* sched_group_capacity ensures each unique sched_group has shared storage.
*
* There are two related construction problems, both require a CPU that
* uniquely identify each group (for a given domain):
*
* - The first is the balance_cpu (see should_we_balance() and the
* load-balance blub in fair.c); for each group we only want 1 CPU to
* continue balancing at a higher domain.
*
* - The second is the sched_group_capacity; we want all identical groups
* to share a single sched_group_capacity.
*
* Since these topologies are exclusive by construction. That is, its
* impossible for an SMT thread to belong to multiple cores, and cores to
* be part of multiple caches. There is a very clear and unique location
* for each CPU in the hierarchy.
*
* Therefore computing a unique CPU for each group is trivial (the iteration
* mask is redundant and set all 1s; all CPUs in a group will end up at _that_
* group), we can simply pick the first CPU in each group.
*
*
* [*] in other words, the first group of each domain is its child domain.
*/
static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
struct sched_group *sg;
/*
 * With a child domain, the group is identified by the first CPU of
 * the child's span — so all CPUs in that span share one group.
 */
if (child)
cpu = cpumask_first(sched_domain_span(child));
sg = *per_cpu_ptr(sdd->sg, cpu);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
/* For claim_allocations: */
atomic_inc(&sg->ref);
atomic_inc(&sg->sgc->ref);
if (child) {
/* The group covers exactly the child domain's span. */
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
} else {
/* Lowest level: the group contains just this one CPU. */
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
}
/* Initial capacity: SCHED_CAPACITY_SCALE (1024) per CPU in the group. */
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
return sg;
}
- cpu=1时,get_group()函数首先获取cpu1在DIE级别的调度域sd_die_1,然后通过child指针获取MC级别的调度域sd_mc_1。获取sd_mc_1域里的第一个CPU,为何会是CPU0而不是CPU1呢?我们返回来仔细看一下build_sched_domain()函数,发现sd_mc域的span兄弟位图的设置和tl->mask(cpu)函数相关,同属MC级别的CPUs应该包含同样的范围,也就是对于CPU0来说,它的兄弟位应该是[cpu0|cpu1],同样对于CPU1来说也是同样的道理。
4.5 CPU拓扑示例
-
如上图所示,假设在一个4核处理器中,每个物理CPU核心拥有独立L1 Cache且不支持超线程技术,分成两个簇Cluster0和Cluster1,每个簇包含两个物理CPU核,簇中的CPU核共享L2 Cache。
-
在分析之前先总结Linux内核里构建CPU域和调度组拓扑关系图的一些原则。
根据CPU物理属性分层次,从下到上,由SMT->MC->DIE的递进关系来分层,用数据结构struct sched_domain_topology_level来描述,简称为SDTL
每个SDTL都为调度域和调度组都建立一个Per-CPU变量,并且为每个CPU都分配响应的数据结构
在同一个SDTL中由芯片设计决定哪些CPUs是兄弟关系。调度域中有span成员来描述,调度组有cpumask成员来描述兄弟关系
同一个CPU的不同SDTL的调度域有父子关系。每个调度域里包含了相应的调度组并且这些调度组串联成一个链表,调度域的groups成员是链表头。 -
因为每个CPU核心只有一个执行线程,所以4核处理器没有SMT属性。cluster由两个CPU物理核组成,这两个CPU是MC层级且是兄弟关系。整个处理器可以看做一个DIE级别,因此该处理器只有两个层级,即MC和DIE。根据上述原则,可以标识出上述4核处理器的调度域和调度组的拓扑关系图,如下图所示。
每个SDTL为每个CPU都分配了对应的调度域和调度组,以CPU0为例,在图中,虚线表示管辖。 -
1) 对于DIE级别,CPU0对应的调度域是domain_die_0,该调度域管辖着4个CPU并包含两个调度组,分别为group_die_0和group_die_1。其中
调度组group_die_0管辖着CPU0和CPU1
调度组group_die_1管辖着CPU2和CPU3 -
2) 对于MC级别,CPU0对应的调度域是domain_mc_0,该调度域管辖着CPU0和CPU1并包含两个调度组,分别为group_mc_0和group_mc_1。其中
调度组group_mc_0管辖CPU0
调度组group_mc_1管辖CPU1
为什么DIE级别的所有调度组只有group_die_0和group_die_1呢?
-
因为在建立调度组的函数build_sched_groups()中有一个covered位图,已被某个调度组覆盖的CPU会被跳过(if (cpumask_test_cpu(i, covered)) continue;),这样每个调度组只会被建立一次,DIE层级只会产生两个调度组。注意get_group()函数,它会返回子调度域兄弟关系的第一个CPU。
-
除此之外还有两层关系,一是父子关系,通过struct sched_domain数据结构中的parent和child成员来完成;另外一个关系是同一个SDTL中调度组都链接成一个链表,通过struct sched_domain数据结构中的groups成员来完成,如下图所示。
-
最后再关心一下,SMP是如何均衡负载的呢?在内核中,SMP负载均衡机制从注册软中断开始,每次系统处理调度tick时会检查当前是否需要处理SMP负载均衡。详情可见[start_kernel() -> sched_init() -> init_sched_fair_class()]。
源码:/kernel/sched/fair.c:9592
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
/* run_rebalance_domains() is the core entry point of SMP load balancing. */
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
}
参考:
《Linux SMP启动过程分析报告》
《Device Tree(三):代码分析》
《arm linux启动流程三》