Linux SMP Boot Process Study (Part 3)

4 Building the CPU Topology

4.1 Creating the scheduling-domain topology — sched_init_domains()

The CPU topology is built early during system startup; the call path is:
[start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains()]
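For context, sched_init_smp() invokes it with cpu_active_mask while holding sched_domains_mutex. An abridged sketch (paraphrased from kernel/sched/core.c of this kernel generation; details vary by version):

void __init sched_init_smp(void)
{
	...
	sched_init_numa();

	/* No userspace yet, so the CPU masks are stable at this point */
	mutex_lock(&sched_domains_mutex);
	sched_init_domains(cpu_active_mask);
	mutex_unlock(&sched_domains_mutex);
	...
}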

Source: /kernel/sched/topology.c:1770

/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated CPUs, but could be used to
 * exclude other special cases in the future.
 */
int sched_init_domains(const struct cpumask *cpu_map)
{
	int err;

	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	arch_update_cpu_topology();
	ndoms_cur = 1;
	doms_cur = alloc_sched_domains(ndoms_cur);
	if (!doms_cur)
		doms_cur = &fallback_doms;
	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
	/* This is where the sched-domain topology is actually built */
	err = build_sched_domains(doms_cur[0], NULL);
	register_sched_domain_sysctl();

	return err;
}

start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains()
Source: /kernel/sched/topology.c:1636

/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs
 */
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
	enum s_alloc alloc_state;
	struct sched_domain *sd;
	struct s_data d;
	struct rq *rq = NULL;
	int i, ret = -ENOMEM;

	/* Eventually calls __sdt_alloc() to allocate the sched_domain and related data structures */
	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;

	/* Set up domains for CPUs specified by the cpu_map: */
	/* Iterate over every CPU in cpu_map (cpu_active_mask) */
	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;

		sd = NULL;
		/*
		 * For each CPU, walk every SDTL: each CPU gets its own sched
		 * domain per SDTL, i.e. a complete hierarchy of domains and
		 * groups is initialized for every CPU.
		 */
		for_each_sd_topology(tl) {
			/* Build this CPU's sched domain at this topology level */
			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
			if (tl == sched_domain_topology)
				*per_cpu_ptr(d.sd, i) = sd;
			if (tl->flags & SDTL_OVERLAP)
				sd->flags |= SD_OVERLAP;
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				break;
		}
	}

	/* Build the groups for the domains */
	/* Iterate over every CPU in cpu_map (cpu_active_mask) */
	for_each_cpu(i, cpu_map) {
		/*
		 * Walk this CPU's sched domains: per_cpu_ptr() returns the
		 * domain of the lowest SDTL, and sd->parent climbs to the
		 * next level up.
		 */
		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			sd->span_weight = cpumask_weight(sched_domain_span(sd));
			if (sd->flags & SD_OVERLAP) {
				if (build_overlap_sched_groups(sd, i))
					goto error;
			} else {
				/* Build the sched groups for this domain */
				if (build_sched_groups(sd, i))
					goto error;
			}
		}
	}

	/* Calculate CPU capacity for physical packages and nodes */
	/*
	 * Set each sched group's capacity. The kernel normally treats a
	 * single CPU's maximum capacity as 1024 (SCHED_CAPACITY_SCALE), but
	 * different architectures may compute the capacity differently.
	 */
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))
			continue;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			claim_allocations(i, sd);
			init_sched_groups_capacity(i, sd);
		}
	}

	/* Attach the domains */
	rcu_read_lock();
	for_each_cpu(i, cpu_map) {
		rq = cpu_rq(i);
		sd = *per_cpu_ptr(d.sd, i);

		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);

		/*
		 * Finally, cpu_attach_domain() attaches the domain hierarchy to
		 * this CPU's runqueue (struct rq) and its root_domain. It also
		 * trims degenerate levels: if a domain's span equals its
		 * parent's span, or a domain spans only this single CPU, the
		 * redundant domain is removed.
		 */
		cpu_attach_domain(sd, d.rd, i);
	}
	rcu_read_unlock();

	if (rq && sched_debug_enabled) {
		pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
	}

	ret = 0;
error:
	__free_domain_allocs(&d, alloc_state, cpu_map);
	return ret;
}

4.2 Allocating the scheduling domains — __sdt_alloc()

  • Each SDTL is described by a struct sched_domain_topology_level, which embeds a struct sd_data holding per-CPU double pointers to sched_domain, sched_group and sched_group_capacity (see the sketch below).
  • For each SDTL, per-CPU variables are allocated for the sched_domain, sched_domain_shared, sched_group and sched_group_capacity pointers.
  • Within each SDTL, a sched_domain, sched_group and sched_group_capacity structure is allocated for every CPU, so every CPU has its own sched domain and sched group at every SDTL.
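For reference, the two structures involved look roughly like this in this kernel generation (from include/linux/sched/topology.h; exact fields vary by version):

struct sd_data {
	struct sched_domain *__percpu *sd;
	struct sched_domain_shared *__percpu *sds;
	struct sched_group *__percpu *sg;
	struct sched_group_capacity *__percpu *sgc;
};

struct sched_domain_topology_level {
	sched_domain_mask_f mask;
	sched_domain_flags_f sd_flags;
	int		    flags;
	int		    numa_level;
	struct sd_data      data;
#ifdef CONFIG_SCHED_DEBUG
	char                *name;
#endif
};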

start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains() -> __visit_domain_allocation_hell() -> __sdt_alloc()

Source: /kernel/sched/topology.c:1496

static int __sdt_alloc(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	/*
	 * Walk the default CPU topology level array default_topology (see
	 * Part 2); the levels are visited in the order SMT -> MC -> DIE.
	 */
	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		/* Allocate the per-CPU pointer array for this SDTL's sched domains (sd) */
		sdd->sd = alloc_percpu(struct sched_domain *);
		if (!sdd->sd)
			return -ENOMEM;
   
		/* Allocate the per-CPU pointer array for the shared domain state (sds) */
		sdd->sds = alloc_percpu(struct sched_domain_shared *);
		if (!sdd->sds)
			return -ENOMEM;

		/* Allocate the per-CPU pointer array for this SDTL's sched groups (sg) */
		sdd->sg = alloc_percpu(struct sched_group *);
		if (!sdd->sg)
			return -ENOMEM;

		/* Allocate the per-CPU pointer array for the sched group capacities (sgc) */
		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
		if (!sdd->sgc)
			return -ENOMEM;

		/*
		 * For every CPU, allocate a sched_domain, sched_domain_shared,
		 * sched_group and sched_group_capacity structure and store them
		 * in the per-CPU variables.
		 */
		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;
			struct sched_domain_shared *sds;
			struct sched_group *sg;
			struct sched_group_capacity *sgc;

			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sd)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sd, j) = sd;

			sds = kzalloc_node(sizeof(struct sched_domain_shared),
					GFP_KERNEL, cpu_to_node(j));
			if (!sds)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sds, j) = sds;

			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sg)
				return -ENOMEM;

			sg->next = sg;

			*per_cpu_ptr(sdd->sg, j) = sg;

			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sgc)
				return -ENOMEM;

#ifdef CONFIG_SCHED_DEBUG
			sgc->id = j;
#endif

			*per_cpu_ptr(sdd->sgc, j) = sgc;
		}
	}

	return 0;
}
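As a reminder from Part 2, the default topology table walked above and the for_each_sd_topology() iterator look roughly like this in kernel/sched/topology.c (version dependent):

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;

#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->mask; tl++)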

4.3 Building a scheduling domain — build_sched_domain()

start_kernel() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> sched_init_domains() -> build_sched_domains() -> build_sched_domain()

Source: /kernel/sched/topology.c:1601

static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int cpu)
{
	/* Use tl and the CPU id to fetch the corresponding struct sched_domain and initialize its fields */
	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);

	/*
	 * The SDTLs are walked from SMT to MC to DIE, so the SMT-level domain
	 * can be seen as the child of the MC-level domain, and the MC-level
	 * domain as its parent: adjacent levels form a parent/child
	 * relationship, recorded in struct sched_domain's parent and child
	 * members.
	 */
	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;

		if (!cpumask_subset(sched_domain_span(child),
				    sched_domain_span(sd))) {
			pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
			pr_err("     the %s domain not a subset of the %s domain\n",
					child->name, sd->name);
#endif
			/* Fixup, ensure @sd has at least @child cpus. */
			cpumask_or(sched_domain_span(sd),
				   sched_domain_span(sd),
				   sched_domain_span(child));
		}

	}
	set_domain_attribute(sd, attr);

	return sd;
}
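For the two-level (MC + DIE) case discussed later in Section 4.5, the SDTL loop in build_sched_domains() therefore links CPU0's domains as follows (illustrative sketch; the names domain_mc_0/domain_die_0 come from that example):

/*
 * CPU0 after for_each_sd_topology():
 *
 *   domain_mc_0  (level 0, span 0-1): child = NULL,        parent = domain_die_0
 *   domain_die_0 (level 1, span 0-3): child = domain_mc_0, parent = NULL
 *
 * *per_cpu_ptr(d.sd, 0) points at domain_mc_0, the lowest level.
 */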
4.3.1 Initializing a scheduling domain — sd_init()

Source: /kernel/sched/topology.c:1085

static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
	const struct cpumask *cpu_map,
	struct sched_domain *child, int cpu)
{
	struct sd_data *sdd = &tl->data;
	/* Fetch this CPU's sched_domain for this SDTL */
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	int sd_id, sd_weight, sd_flags = 0;

#ifdef CONFIG_NUMA
	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
	sched_domains_curr_level = tl->numa_level;
#endif

	sd_weight = cpumask_weight(tl->mask(cpu));

	if (tl->sd_flags)
		sd_flags = (*tl->sd_flags)();
	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
			"wrong sd_flags in topology description\n"))
		sd_flags &= ~TOPOLOGY_SD_FLAGS;

	*sd = (struct sched_domain){
		.min_interval		= sd_weight,
		.max_interval		= 2*sd_weight,
		.busy_factor		= 32,
		.imbalance_pct		= 125,

		.cache_nice_tries	= 0,
		.busy_idx		= 0,
		.idle_idx		= 0,
		.newidle_idx		= 0,
		.wake_idx		= 0,
		.forkexec_idx		= 0,

		.flags			= 1*SD_LOAD_BALANCE
					| 1*SD_BALANCE_NEWIDLE
					| 1*SD_BALANCE_EXEC
					| 1*SD_BALANCE_FORK
					| 0*SD_BALANCE_WAKE
					| 1*SD_WAKE_AFFINE
					| 0*SD_SHARE_CPUCAPACITY
					| 0*SD_SHARE_PKG_RESOURCES
					| 0*SD_SERIALIZE
					| 0*SD_PREFER_SIBLING
					| 0*SD_NUMA
					| sd_flags
					,

		.last_balance		= jiffies,
		.balance_interval	= sd_weight,
		.smt_gain		= 0,
		.max_newidle_lb_cost	= 0,
		.next_decay_max_lb_cost	= jiffies,
		.child			= child,
#ifdef CONFIG_SCHED_DEBUG
		.name			= tl->name,
#endif
	};

	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
	sd_id = cpumask_first(sched_domain_span(sd));

	/*
	 * Convert topological properties into behaviour.
	 */

	if (sd->flags & SD_ASYM_CPUCAPACITY) {
		struct sched_domain *t = sd;

		for_each_lower_domain(t)
			t->flags |= SD_BALANCE_WAKE;
	}

	if (sd->flags & SD_SHARE_CPUCAPACITY) {
		sd->flags |= SD_PREFER_SIBLING;
		sd->imbalance_pct = 110;
		sd->smt_gain = 1178; /* ~15% */

	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->imbalance_pct = 117;
		sd->cache_nice_tries = 1;
		sd->busy_idx = 2;

#ifdef CONFIG_NUMA
	} else if (sd->flags & SD_NUMA) {
		sd->cache_nice_tries = 2;
		sd->busy_idx = 3;
		sd->idle_idx = 2;

		sd->flags |= SD_SERIALIZE;
		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
			sd->flags &= ~(SD_BALANCE_EXEC |
				       SD_BALANCE_FORK |
				       SD_WAKE_AFFINE);
		}

#endif
	} else {
		sd->flags |= SD_PREFER_SIBLING;
		sd->cache_nice_tries = 1;
		sd->busy_idx = 2;
		sd->idle_idx = 1;
	}

	/*
	 * For all levels sharing cache; connect a sched_domain_shared
	 * instance.
	 */
	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
		atomic_inc(&sd->shared->ref);
		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
	}

	sd->private = sdd;

	return sd;
}
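The per-level sd_flags callbacks that sd_init() reads through tl->sd_flags come from the topology table; for the default SMT and MC levels they look roughly like this (from include/linux/sched/topology.h, version dependent):

#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
#endif

#ifdef CONFIG_SCHED_MC
static inline int cpu_core_flags(void)
{
	return SD_SHARE_PKG_RESOURCES;
}
#endif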

4.4 Building the scheduling groups — build_sched_groups()

Source: /kernel/sched/topology.c:875

/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_capacity to 0.
 *
 * Assumes the sched_domain tree is fully constructed
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL;
	struct sd_data *sdd = sd->private;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered;
	int i;

	lockdep_assert_held(&sched_domains_mutex);
	covered = sched_domains_tmpmask;

	cpumask_clear(covered);

	/*
	 * Walk the CPUs of this domain, pick the sched group each one belongs
	 * to, and chain these groups together through the next pointer.
	 */
	for_each_cpu_wrap(i, span, cpu) {
		struct sched_group *sg;

		if (cpumask_test_cpu(i, covered))
			continue;

		sg = get_group(i, sdd); /* fetch this CPU's sched group; the first one becomes sd->groups below */

		cpumask_or(covered, covered, sched_group_span(sg));

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;
	sd->groups = first;

	return 0;
}
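As an illustration, for the DIE-level domain of CPU0 in the 4-CPU example of Section 4.5 (span 0-3, clusters 0-1 and 2-3), the loop above produces a circular list of two groups (sketch):

/*
 * for_each_cpu_wrap(i, span = 0-3, cpu = 0):
 *   i = 0: sg = get_group(0) -> group_die_0 covering 0-1; covered = 0-1
 *   i = 1: already covered, skip
 *   i = 2: sg = get_group(2) -> group_die_1 covering 2-3; covered = 0-3
 *   i = 3: already covered, skip
 *
 * Result: sd->groups = group_die_0 -> group_die_1 -> group_die_0 (circular)
 */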
4.4.1 Getting a scheduling group — get_group()

/*
 * Package topology (also see the load-balance blurb in fair.c)
 *
 * The scheduler builds a tree structure to represent a number of important
 * topology features. By default (default_topology[]) these include:
 *
 *  - Simultaneous multithreading (SMT)
 *  - Multi-Core Cache (MC)
 *  - Package (DIE)
 *
 * Where the last one more or less denotes everything up to a NUMA node.
 *
 * The tree consists of 3 primary data structures:
 *
 *	sched_domain -> sched_group -> sched_group_capacity
 *	    ^ ^             ^ ^
 *          `-'             `-'
 *
 * The sched_domains are per-cpu and have a two way link (parent & child) and
 * denote the ever growing mask of CPUs belonging to that level of topology.
 *
 * Each sched_domain has a circular (double) linked list of sched_group's, each
 * denoting the domains of the level below (or individual CPUs in case of the
 * first domain level). The sched_group linked by a sched_domain includes the
 * CPU of that sched_domain [*].
 *
 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * DIE  [                             ]
 * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
 *
 *  - or -
 *
 * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC	0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * One way to think about it is: sched_domain moves you up and down among these
 * topology levels, while sched_group moves you sideways through it, at child
 * domain granularity.
 *
 * sched_group_capacity ensures each unique sched_group has shared storage.
 *
 * There are two related construction problems, both require a CPU that
 * uniquely identify each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blub in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
 *    to share a single sched_group_capacity.
 *
 * Since these topologies are exclusive by construction. That is, its
 * impossible for an SMT thread to belong to multiple cores, and cores to
 * be part of multiple caches. There is a very clear and unique location
 * for each CPU in the hierarchy.
 *
 * Therefore computing a unique CPU for each group is trivial (the iteration
 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
 * group), we can simply pick the first CPU in each group.
 *
 *
 * [*] in other words, the first group of each domain is its child domain.
 */

static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;
	struct sched_group *sg;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	sg = *per_cpu_ptr(sdd->sg, cpu);
	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

	/* For claim_allocations: */
	atomic_inc(&sg->ref);
	atomic_inc(&sg->sgc->ref);

	if (child) {
		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
	} else {
		cpumask_set_cpu(cpu, sched_group_span(sg));
		cpumask_set_cpu(cpu, group_balance_mask(sg));
	}

	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;

	return sg;
}
  • When cpu = 1, get_group() first fetches CPU1's DIE-level sched domain sd_die_1 and then follows the child pointer to the MC-level domain sd_mc_1. Why is the first CPU of sd_mc_1's span CPU0 rather than CPU1? Looking back at build_sched_domain() (more precisely sd_init()), the span (sibling bitmap) of an MC domain is derived from tl->mask(cpu), and CPUs at the same MC level share the same mask: for CPU0 the siblings are [cpu0|cpu1], and exactly the same holds for CPU1. A short trace is sketched below.
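A minimal trace of that lookup, using the names from this example:

/*
 * get_group(cpu = 1, sdd of the DIE level):
 *   sd    = sd_die_1                      (CPU1's DIE-level domain)
 *   child = sd_mc_1, span(child) = [cpu0|cpu1]
 *   cpu   = cpumask_first(span(child)) = 0
 *   sg    = *per_cpu_ptr(sdd->sg, 0)      (the same group as CPU0's)
 *
 * So CPU0 and CPU1 end up sharing one DIE-level group, group_die_0.
 */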

4.5 CPU topology example

[Figure: a 4-core processor split into two clusters, each cluster of two cores sharing an L2 cache]

  • As shown in the figure above, assume a 4-core processor in which each physical core has a private L1 cache and no hyper-threading support. The cores are split into two clusters, Cluster0 and Cluster1, each containing two physical cores, and the cores within a cluster share an L2 cache.

  • Before the analysis, here are the main principles the Linux kernel follows when building the sched-domain and sched-group topology:
     Levels follow the CPU's physical attributes, from bottom to top in the order SMT -> MC -> DIE; each level is described by a struct sched_domain_topology_level, abbreviated SDTL.
     Each SDTL sets up per-CPU variables for its sched domains and sched groups, and allocates the corresponding data structures for every CPU.
     Within one SDTL, the chip design determines which CPUs are siblings; the sibling relationship is recorded in the domain's span member and the group's cpumask member.
     For the same CPU, the sched domains of different SDTLs form a parent/child hierarchy. Each sched domain contains the corresponding sched groups, which are chained into a list headed by the domain's groups member.

  • Since each core has only one hardware thread, this 4-core processor has no SMT level. A cluster consists of two physical cores, which are siblings at the MC level, and the whole processor forms a single DIE level, so only two levels exist here: MC and DIE. Applying the principles above yields the sched-domain and sched-group topology shown in the figure below; the resulting spans are also written out after the figure.
    [Figure: sched-domain and sched-group topology of the 4-core processor]
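Written in the same style as the kernel comment in Section 4.4.1, the spans for this example are:

/*
 * CPU   0   1   2   3
 *
 * DIE  [               ]
 * MC   [     ] [     ]
 *
 *  - or -
 *
 * DIE  0-3 0-3 0-3 0-3
 * MC   0-1 0-1 2-3 2-3
 */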
    Each SDTL allocates a sched domain and a sched group for every CPU. Take CPU0 as an example; in the figure, the dashed lines indicate which CPUs a domain or group covers.

  • 1) At the DIE level, CPU0's sched domain is domain_die_0. It spans all four CPUs and contains two sched groups, group_die_0 and group_die_1:
     group_die_0 covers CPU0 and CPU1
     group_die_1 covers CPU2 and CPU3

  • 2) At the MC level, CPU0's sched domain is domain_mc_0. It spans CPU0 and CPU1 and contains two sched groups, group_mc_0 and group_mc_1:
     group_mc_0 covers CPU0
     group_mc_1 covers CPU1

Why does the DIE level end up with only two sched groups, group_die_0 and group_die_1?

  • In older kernels, build_sched_groups() contained an explicit check, if (cpu != cpumask_first(span)), so that only the first CPU of a domain's span built the groups. In the version shown here, the covered bitmap plays the same role: once a group's CPUs have been marked covered, the remaining CPUs of that child domain are skipped. Note also that get_group() always returns the per-CPU group of the first CPU in the child domain's span, so CPUs sharing the same child domain share one group.

  • Beyond that there are two further relationships: the parent/child relationship, expressed through the parent and child members of struct sched_domain, and the chaining of all sched groups of one SDTL into a list, reached through the groups member of struct sched_domain, as illustrated in the topology figure above.

  • Finally, how does SMP actually balance load? In the kernel, the SMP load-balancing machinery starts with the registration of a softirq; on every scheduler tick the kernel checks whether SMP load balancing needs to run. See [start_kernel() -> sched_init() -> init_sched_fair_class()].

Source: /kernel/sched/fair.c:9592

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	/* run_rebalance_domains() is the core entry point for load balancing */
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */

}
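The per-tick check mentioned above lives in scheduler_tick() -> trigger_load_balance(); roughly (paraphrased from kernel/sched/fair.c of this kernel generation, nohz handling omitted):

/* Called from scheduler_tick(): raise SCHED_SOFTIRQ when a rebalance is due */
void trigger_load_balance(struct rq *rq)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (unlikely(on_null_domain(rq)))
		return;

	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);

	/* CONFIG_NO_HZ_COMMON: possibly kick an idle CPU to do nohz balancing */
}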

