Linux最新Linux per-cpu_linux percpu，万字Linux运维技术类校招面试题汇总

角度.

已于 2024-05-10 18:11:51 修改

阅读量874

点赞数 21

分类专栏：程序员文章标签：运维 linux 服务器

于 2024-05-10 18:11:50 首次发布

本文链接：https://blog.csdn.net/2301_76224054/article/details/138676433

版权

程序员专栏收录该内容

146 篇文章 0 订阅

订阅专栏

最后的话

最近很多小伙伴找我要Linux学习资料，于是我翻箱倒柜，整理了一些优质资源，涵盖视频、电子书、PPT等共享给大家！

资料预览

给大家整理的视频资料：

给大家整理的电子书资料：

如果本文对你有帮助，欢迎点赞、收藏、转发给朋友，让我有持续创作的动力！

网上学习资料一大堆，但如果学到的知识不成体系，遇到问题时只是浅尝辄止，不再深入研究，那么很难做到真正的技术提升。

需要这份系统化的资料的朋友，可以点击这里获取！

一个人可以走得很快，但一群人才能走得更远！不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人，都欢迎加入我们的圈子（技术交流、学习资源、职场吐槽、大厂内推、面试辅导），让我们一起学习成长！

{
#ifdef CONFIG_X86_32
loadsegment(fs, __KERNEL_PERCPU);
#else
loadsegment(gs, 0);
wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
#endif
load_stack_canary_segment();
}

/*
 * switch_to_new_gdt - load @cpu's GDT and reload the per-cpu segment.
 * @cpu: CPU whose GDT table is looked up with get_cpu_gdt_table()
 *
 * Current gdt points %fs at the "master" per-cpu area: after this,
 * it's on the real one.
 */
void switch_to_new_gdt(int cpu)
{
	struct desc_ptr gdt_descr;

	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
	gdt_descr.size = GDT_SIZE - 1;
	load_gdt(&gdt_descr);

	/* Reload the per-cpu base */
	load_percpu_segment(cpu);
}


（4）pcpu\_embed\_first\_chunk  
 pcpu\_embed\_first\_chunk() 由通用 percpu 设置使用。如果 arch config 需要或将使用通用设置，请构建它。  
 这是一个帮助程序，可以方便地设置嵌入的第一个percpu块，为percpu建立第一个chunk，可以在需要pcpu\_setup\_first\_chunk()的地方调用它。  
 如果此函数用于设置第一个块，则通过调用alloc\_fn来分配它，并按原样使用它，而不映射到vmalloc区域。分配总是与 atom\_size 对齐的 atom\_size 的整数倍。  
 这使第一个块能够返回到通常使用较大页面大小的线性物理映射。请注意，这可能导致NUMA机器上的cpu->unit 映射非常sparse，因此需要很大的vmalloc地址空间。如果 vmalloc 空间不是比node内存地址之间的距离大几个数量级（即 32 位 NUMA 机器），则不要使用此分配器。  
 dyn\_size 指定最小动态区域大小。  
 如果所需的大小小于最小或指定的unit size，则使用free\_fn返回剩余的大小。

/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu page
 * @free_fn: function to free percpu page
 *
 * One area of gi->nr_units * ai->unit_size bytes is obtained from
 * @alloc_fn for every group.  The static percpu image (__per_cpu_load)
 * is copied into every used unit, unused units and the unused tail of
 * each unit are handed back through @free_fn, and the resulting layout
 * is passed to pcpu_setup_first_chunk().
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
				  size_t atom_size,
				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
				  pcpu_fc_alloc_fn_t alloc_fn,
				  pcpu_fc_free_fn_t free_fn)
{
	void *base = (void *)ULONG_MAX;
	void **areas = NULL;
	struct pcpu_alloc_info *ai;
	size_t size_sum, areas_size, max_distance;
	int group, i, rc;

	/* gather the percpu layout for this machine into a pcpu_alloc_info */
	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	/* bytes actually used per unit: static + reserved + dynamic */
	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

	/*
	 * areas[] is a temporary table holding the start address of each
	 * group's area; it is released again at out_free.
	 */
	areas = alloc_bootmem_nopanic(areas_size);
	if (!areas) {
		rc = -ENOMEM;
		goto out_free;
	}

	/* allocate, copy and determine base address */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		unsigned int cpu = NR_CPUS;
		void *ptr;

		/* pick the first real cpu of the group for the allocation */
		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
			cpu = gi->cpu_map[i];
		BUG_ON(cpu == NR_CPUS);

		/* allocate space for the whole group */
		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
		if (!ptr) {
			rc = -ENOMEM;
			goto out_free_areas;
		}
		/* kmemleak tracks the percpu allocations separately */
		kmemleak_free(ptr);
		areas[group] = ptr;

		/*
		 * The lowest group address becomes the chunk base; group
		 * offsets are computed against it further down.
		 */
		base = min(ptr, base);
	}

	/*
	 * Copy data and free unused parts.  This should happen after all
	 * allocations are complete; otherwise, we may end up with
	 * overlapping groups.
	 */
	for (group = 0; group < ai->nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];
		void *ptr = areas[group];

		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
			if (gi->cpu_map[i] == NR_CPUS) {
				/* unused unit, free whole */
				free_fn(ptr, ai->unit_size);
				continue;
			}
			/* copy and return the unused part */
			memcpy(ptr, __per_cpu_load, ai->static_size);
			/*
			 * Only the first size_sum bytes of a unit are used;
			 * give back the remaining unit_size - size_sum.
			 */
			free_fn(ptr + size_sum, ai->unit_size - size_sum);
		}
	}

	/* base address is now known, determine group base offsets */
	max_distance = 0;
	for (group = 0; group < ai->nr_groups; group++) {
		ai->groups[group].base_offset = areas[group] - base;
		max_distance = max_t(size_t, max_distance,
				     ai->groups[group].base_offset);
	}
	max_distance += ai->unit_size;

	/* warn if maximum distance is further than 75% of vmalloc space */
	if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
		pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
			   "space 0x%lx\n", max_distance,
			   (unsigned long)(VMALLOC_END - VMALLOC_START));
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		/* and fail if we have fallback */
		rc = -EINVAL;
		goto out_free;
#endif
	}

	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
		ai->dyn_size, ai->unit_size);

	/* hand the layout over to build the first chunk */
	rc = pcpu_setup_first_chunk(ai, base);
	goto out_free;

out_free_areas:
	/*
	 * Reached when alloc_fn() failed part-way through the groups: only
	 * give back the areas that were actually obtained.
	 * NOTE(review): relies on areas[] starting out zeroed - confirm for
	 * alloc_bootmem_nopanic().
	 */
	for (group = 0; group < ai->nr_groups; group++)
		if (areas[group])
			free_fn(areas[group],
				ai->groups[group].nr_units * ai->unit_size);
out_free:
	pcpu_free_alloc_info(ai);
	if (areas)
		free_bootmem(__pa(areas), areas_size);
	return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */


小结：  
 setup\_per\_cpu\_areas函数首先计算出".data..percpu"section的空间大小（static\_size =\_\_per\_cpu\_end - \_\_per\_cpu\_start），static\_size是内核源码中所有用DEFINE\_PER\_CPU及其变体所定义出的静态per-CPU变量所占空间的大小。此外内核还为模块使用的per-CPU变量以及动态分配的per-CPU变量预留了空间，大小分别记为reserved\_size和dyn\_size。


然后setup\_per\_cpu\_areas函数（通过上面的pcpu\_embed\_first\_chunk）为每个group调用alloc\_fn回调来分配一段内存，用来保存per-CPU变量副本；在上面的代码中，alloc\_bootmem\_nopanic只用于分配临时的areas数组。此时因为系统的内存管理系统还没有建立起来，所以使用的是Linux引导期内存分配器。这块内存的大小要依赖于系统中CPU的数量，因为要为每个CPU创建变量的副本。内核代码称每个CPU变量副本所在内存空间为一个unit，所以代码中的nr\_units变量大致表示了CPU的数量（每个group的unit数会向上取整到upa的整数倍，因此可能略大于实际CPU数），每个unit的大小记为unit\_size，unit\_size = alloc\_size / upa，且不小于PFN\_ALIGN(static\_size + reserved\_size + dyn\_size)。如此，每个group的变量副本所在空间的大小就是该group的nr\_units \* unit\_size。指针变量pcpu\_base\_addr指向副本空间的起始地址。


（5）pcpu\_build\_alloc\_info  
 此函数确定单元的分组、它们到 cpu 的映射以及考虑所需的 percpu 大小、分配原子大小和 CPU 之间的距离的其他参数。用来收集整理该架构下的percpu信息，结果放在struct pcpu\_alloc\_info结构中。


每个group的大小总是atom size的整数倍；相互之间双向距离都是LOCAL\_DISTANCE的cpu会被分到同一组，并共享该组内各unit的空间。返回的配置保证位于不同节点上的CPU处于不同的组中，并且所分配的虚拟地址空间的使用率 >=75%。

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * Determines the grouping of units, their mapping to cpus and the other
 * size parameters, and stores the result in a newly allocated
 * pcpu_alloc_info.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
				size_t reserved_size, size_t dyn_size,
				size_t atom_size,
				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
	static int group_map[NR_CPUS] __initdata;
	static int group_cnt[NR_CPUS] __initdata;
	/* size of the static percpu section */
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	int nr_groups = 1, nr_units = 0;
	size_t size_sum, min_unit_size, alloc_size;
	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
	int last_allocs, group, unit;
	unsigned int cpu, tcpu;
	struct pcpu_alloc_info *ai;
	unsigned int *cpu_map;

	/* this function may be called multiple times */
	memset(group_map, 0, sizeof(group_map));
	memset(group_cnt, 0, sizeof(group_cnt));

	/* calculate size_sum and ensure dyn_size is enough for early alloc */
	size_sum = PFN_ALIGN(static_size + reserved_size +
			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
	/* dyn_size absorbs whatever the page alignment added */
	dyn_size = size_sum - static_size - reserved_size;

	/*
	 * Determine min_unit_size, alloc_size and max_upa such that
	 * alloc_size is multiple of atom_size and is the smallest
	 * which can accommodate 4k aligned segments which are equal to
	 * or larger than min_unit_size.
	 */
	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

	alloc_size = roundup(min_unit_size, atom_size);
	upa = alloc_size / min_unit_size;
	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
		upa--;
	max_upa = upa;

	/*
	 * Group cpus according to their proximity.  group_map[cpu] is the
	 * group a cpu ends up in, group_cnt[group] the number of cpus in it.
	 */
	for_each_possible_cpu(cpu) {
		group = 0;
	next_group:
		for_each_possible_cpu(tcpu) {
			if (cpu == tcpu)
				break;
			if (group_map[tcpu] == group && cpu_distance_fn &&
			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
				group++;
				nr_groups = max(nr_groups, group + 1);
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	/*
	 * Expand unit size until address space usage goes over 75%
	 * and then as much as possible without using more address
	 * space.
	 */
	last_allocs = INT_MAX;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
			continue;

		for (group = 0; group < nr_groups; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/*
		 * Don't accept if wastage is over 1/3.  The
		 * greater-than comparison ensures upa==1 always
		 * passes the following check.
		 */
		if (wasted > num_possible_cpus() / 3)
			continue;

		/* and then don't consume more memory */
		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	upa = best_upa;

	/* allocate and fill alloc_info */
	/* total unit count: each group's cpu count rounded up to upa */
	for (group = 0; group < nr_groups; group++)
		nr_units += roundup(group_cnt[group], upa);

	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
	if (!ai)
		return ERR_PTR(-ENOMEM);

	/*
	 * All groups share the single cpu_map array that starts at
	 * groups[0].cpu_map; carve out one slice per group.
	 */
	cpu_map = ai->groups[0].cpu_map;

	for (group = 0; group < nr_groups; group++) {
		ai->groups[group].cpu_map = cpu_map;
		cpu_map += roundup(group_cnt[group], upa);
	}

	ai->static_size = static_size;
	ai->reserved_size = reserved_size;
	ai->dyn_size = dyn_size;
	ai->unit_size = alloc_size / upa;
	ai->atom_size = atom_size;
	ai->alloc_size = alloc_size;

	/*
	 * Bound the walk by nr_groups rather than by group_cnt[group] being
	 * non-zero: with nr_groups == NR_CPUS the latter would read
	 * group_cnt[NR_CPUS], one past the end of the array.
	 */
	for (group = 0, unit = 0; group < nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];

		/*
		 * Initialize base_offset as if all groups are located
		 * back-to-back.  The caller should update this to
		 * reflect actual allocation.
		 */
		gi->base_offset = unit * ai->unit_size;

		/* record the cpu ids of this group, then pad to upa */
		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				gi->cpu_map[gi->nr_units++] = cpu;
		gi->nr_units = roundup(gi->nr_units, upa);
		unit += gi->nr_units;
	}
	BUG_ON(unit != nr_units);

	return ai;
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

// linux-3.10.1/mm/percpu.c

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

// linux-3.10.1/include/linux/percpu.h

extern void *pcpu_base_addr;
extern const unsigned long *pcpu_unit_offsets;

/* Describes one group of units that is allocated as a single area. */
struct pcpu_group_info {
int nr_units; /* aligned # of units */
unsigned long base_offset; /* base address offset */
unsigned int *cpu_map; /* unit->cpu map, empty entries contain NR_CPUS */
};

/* Layout description of the first percpu chunk, filled in by pcpu_build_alloc_info(). */
struct pcpu_alloc_info {
size_t static_size; /* size of the static percpu area */
size_t reserved_size; /* size of the reserved percpu area */
size_t dyn_size; /* size available for dynamic allocation */
size_t unit_size; /* size of one unit (alloc_size / units per alloc) */
size_t atom_size; /* allocation atom size */
size_t alloc_size; /* allocation size, rounded up to atom_size */
size_t __ai_size; /* internal, don't use */
int nr_groups; /* 0 if grouping unnecessary */
struct pcpu_group_info groups[]; /* per-group layout, nr_groups entries */
};


struct pcpu\_alloc\_info：  
 static\_size：静态定义的percpu变量占用内存区域长度。在内核初始化时，就直接被拷贝到了各个percpu内存块的static区。  
 reserved\_size：预留区域，在percpu内存分配指定为预留区域分配时，将使用该区域。内核模块中的静态percpu变量，当内核模块被加载到内存时，其静态percpu变量就会在这个区域分配内存。  
 dyn\_size：动态分配的percpu变量占用内存区域长度。  
 unit\_size：每个cpu的percpu空间所占得内存空间为一个unit, 每个unit的大小记为unit\_size。  
 atom\_size：PMD\_SIZE（CONFIG\_X86\_64）/ PAGE\_SIZE。  
 alloc\_size：要分配的percpu内存空间。  
 \_\_ai\_size：整个pcpu\_alloc\_info结构体的大小。  
 nr\_groups：该架构下的处理器分组数目。  
 struct pcpu\_group\_info groups[]：该架构下的处理器分组信息。


struct pcpu\_group\_info：  
 nr\_units：该组的处理器数目  
 base\_offset：组的percpu内存地址起始地址，即组内处理器数目×处理器percpu虚拟内存递进基本单位  
 unsigned int \*cpu\_map：组内cpu对应数组，保存cpu id号


（6）pcpu\_setup\_first\_chunk  
 初始化包含内核静态 percpu 区域的第一个 percpu 块。 此函数将从 arch percpu 区域设置路径中调用。  
 ai 包含初始化第一个块和启动动态 percpu 分配器所需的所有信息。


ai->static\_size 是静态 percpu 区域的大小。  
 ai->reserved\_size，如果非零，指定在第一个块中的静态区域之后要保留的字节数。 这会保留第一个块，使其仅可通过保留的 percpu 分配获得。 这主要用于在寻址模型对符号重定位的偏移范围有限的架构上为模块 percpu 静态区域提供服务，以确保模块 percpu 符号落在可重定位范围内。  
 ai->dyn\_size 确定第一个块中可用于动态分配的字节数。 ai->static\_size + ai->reserved\_size + ai->dyn\_size 和 ai->unit\_size 之间的区域未使用。  
 ai->unit\_size 指定单元大小，必须与 PAGE\_SIZE 对齐并且等于或大于 ai->static\_size + ai->reserved\_size + ai->dyn\_size。  
 ai->atom\_size 是分配原子大小，用作 vm 区域的对齐。  
 ai->alloc\_size 是分配大小，总是 ai->atom\_size 的倍数。 如果 ai->unit\_size 大于 ai->atom\_size，则它大于 ai->atom\_size。  
 ai->nr\_groups 和 ai->groups 描述 percpu 区域的虚拟内存布局。 应该合并的单元被放在同一个组中。 动态 VM 区域将根据这些分组进行分配。如果ai->nr\_groups为零，则假设一个包含所有单位的组。


调用者应该已经将第一个块映射到 base\_addr 并将静态数据复制到每个单元。


如果第一个块最终同时拥有保留区域和动态区域，那么它将由两个块提供服务——一个用于核心静态区域和保留区域，另一个用于动态区域。它们共享相同的vm和页面映射，但使用不同的区域分配映射来隔离彼此。后一个块在块槽中循环，可以像任何其他块一样进行动态分配。

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Validates @ai, builds the per-group and per-cpu lookup tables, records
 * the global layout parameters, and sets up the static chunk plus, when
 * a reserved area exists, a separate dynamic chunk.
 *
 * RETURNS:
 * 0 on success; sanity check failures end in BUG().
 */
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
				  void *base_addr)
{
	static char cpus_buf[4096] __initdata;
	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
	size_t dyn_size = ai->dyn_size;
	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
	struct pcpu_chunk *schunk, *dchunk = NULL;
	unsigned long *group_offsets;
	size_t *group_sizes;
	unsigned long *unit_off;
	unsigned int cpu;
	int *unit_map;
	int group, unit, i;

	cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);

#define PCPU_SETUP_BUG_ON(cond)	do {					\
	if (unlikely(cond)) {						\
		pr_emerg("PERCPU: failed to initialize, %s", #cond);	\
		pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf);	\
		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
		BUG();							\
	}								\
} while (0)

	/* sanity checks */
	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
	PCPU_SETUP_BUG_ON(!ai->static_size);
	PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
#endif
	PCPU_SETUP_BUG_ON(!base_addr);
	PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

	/* process group information and build config tables accordingly */
	/* per-group tables */
	group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
	group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
	/* per-cpu tables, indexed by cpu id */
	unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
	unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));

	/* UINT_MAX marks a cpu that has not been assigned a unit yet */
	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
		unit_map[cpu] = UINT_MAX;

	pcpu_low_unit_cpu = NR_CPUS;
	pcpu_high_unit_cpu = NR_CPUS;

	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
		const struct pcpu_group_info *gi = &ai->groups[group];

		/* offset and total size of this group's area */
		group_offsets[group] = gi->base_offset;
		group_sizes[group] = gi->nr_units * ai->unit_size;

		for (i = 0; i < gi->nr_units; i++) {
			cpu = gi->cpu_map[i];
			if (cpu == NR_CPUS)
				continue;

			/*
			 * unit_map[] and unit_off[] hold nr_cpu_ids entries,
			 * so cpu == nr_cpu_ids is already out of range.
			 */
			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

			/* unit index counted across all groups */
			unit_map[cpu] = unit + i;
			/* offset of this cpu's unit from base_addr */
			unit_off[cpu] = gi->base_offset + i * ai->unit_size;

			/* determine low/high unit_cpu */
			if (pcpu_low_unit_cpu == NR_CPUS ||
			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
				pcpu_low_unit_cpu = cpu;
			if (pcpu_high_unit_cpu == NR_CPUS ||
			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
				pcpu_high_unit_cpu = cpu;
		}
	}
	/* total number of units over all groups */
	pcpu_nr_units = unit;

	for_each_possible_cpu(cpu)
		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

	/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
	pcpu_dump_alloc_info(KERN_DEBUG, ai);

	/* publish the tables built above through the global variables */
	pcpu_nr_groups = ai->nr_groups;
	pcpu_group_offsets = group_offsets;
	pcpu_group_sizes = group_sizes;
	pcpu_unit_map = unit_map;
	pcpu_unit_offsets = unit_off;

	/* determine basic parameters */
	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_atom_size = ai->atom_size;
	/* chunk struct is followed by a populated bitmap, one bit per page */
	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);

	/*
	 * Allocate chunk slots.  The additional last slot is for
	 * empty chunks.
	 */
	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_slot[i]);

	/*
	 * Initialize static chunk.  If reserved_size is zero, the
	 * static chunk covers static area + dynamic allocation area
	 * in the first chunk.  If reserved_size is not zero, it
	 * covers static area + reserved area (mostly used for module
	 * static percpu allocation).
	 */
	schunk = alloc_bootmem(pcpu_chunk_struct_size);
	INIT_LIST_HEAD(&schunk->list);
	schunk->base_addr = base_addr;
	/* the allocation map lives in the static smap[] array */
	schunk->map = smap;
	schunk->map_alloc = ARRAY_SIZE(smap);
	schunk->immutable = true;
	/* mark every page of the unit as populated */
	bitmap_fill(schunk->populated, pcpu_unit_pages);

	if (ai->reserved_size) {
		/* the reserved area is the free space of the static chunk */
		schunk->free_size = ai->reserved_size;
		pcpu_reserved_chunk = schunk;
		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
	} else {
		/* no reserved area: the dynamic area is the free space */
		schunk->free_size = dyn_size;
		dyn_size = 0;			/* dynamic area covered */
	}
	schunk->contig_hint = schunk->free_size;

	/*
	 * Map entries: a negative value is the size of a used region, a
	 * positive value the size of a free region.
	 */
	schunk->map[schunk->map_used++] = -ai->static_size;
	if (schunk->free_size)
		schunk->map[schunk->map_used++] = schunk->free_size;

	/* init dynamic chunk if necessary */
	if (dyn_size) {
		dchunk = alloc_bootmem(pcpu_chunk_struct_size);
		INIT_LIST_HEAD(&dchunk->list);
		dchunk->base_addr = base_addr;
		/* the allocation map lives in the static dmap[] array */
		dchunk->map = dmap;
		dchunk->map_alloc = ARRAY_SIZE(dmap);
		dchunk->immutable = true;
		bitmap_fill(dchunk->populated, pcpu_unit_pages);

		dchunk->contig_hint = dchunk->free_size = dyn_size;
		/* static + reserved areas are in use, the rest is free */
		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
		dchunk->map[dchunk->map_used++] = dchunk->free_size;
	}

	/* link the first chunk in */
	pcpu_first_chunk = dchunk ?: schunk;
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* we're done */
	pcpu_base_addr = base_addr;
	return 0;
}


（7）  
 经过setup\_per\_cpu\_areas函数，per\_cpu变量从.data..percpu section 被拷贝到了各自CPU的虚拟地址空间。原来的per\_cpu变量区域，即\_\_per\_cpu\_start和\_\_per\_cpu\_end区域将会被删除。


在内核态编程中，我们无法使用物理内存地址，只能使用内核虚拟地址。per\_cpu变量拷贝到了各自CPU的虚拟地址空间我们才能在内核态中使用per\_cpu变量。


vmalloc区间包含了系统中所有CPU副本的存储空间。


（8）pcpu\_page\_first\_chunk  
 如果架构配置需要，则构建 pcpu\_page\_first\_chunk()。简单介绍下pcpu\_page\_first\_chunk函数。  
 这是一个帮助程序，可以简化设置页面重映射的第一个percpu块，可以在需要pcpu\_setup\_first\_chunk()的地方调用它。  
 这是基本的分配器。静态percpu区域被page-by-page分配到vmalloc区域。

/*
* pcpu_page_first_chunk -使用PAGE_SIZE页面映射第一个块
* @reserved_size: percpu预留区域的大小，以字节为单位
* @alloc_fn:分配每个cpu页面的函数，总是用PAGE_SIZE调用
* @free_fn:释放percpu页面的函数，总是用PAGE_SIZE调用
* @populate_pte_fn:填充pte的函数
*/
int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)


先分配一块bootmem区间p，作为一级指针，然后为每个CPU分配n个页，依次把指针存放在p中。p[0]…p[n-1]属于CPU0，p[n]…p[2n-1]属于CPU1，依次类推。接着建立一个长度为n×NR\_CPUS的虚拟空间（vmalloc\_early），并把虚拟空间对应的物理页框设置为p数组指向的pages。然后把每CPU变量\_\_per\_cpu\_load拷贝至每个CPU自己的虚拟地址空间中。

…
pages = alloc_bootmem(pages_size);
…
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
…


将.data..percpu中的数据拷贝到其中，每个CPU各有一份。由于数据从\_\_per\_cpu\_start处转移到各CPU自己的专有数据区中了，因此存取其中的变量就不能再用原先的值了，比如存取per\_cpu\_\_runqueues就不能再用per\_cpu\_\_runqueues了，需要做一个偏移量的调整，即需要加上各CPU自己的专有数据区首地址相对于\_\_per\_cpu\_start的偏移量。在这里也就是\_\_per\_cpu\_offset[i]，其中CPU i的专有数据区相对于\_\_per\_cpu\_start的偏移量为\_\_per\_cpu\_offset[i]。


经过这样的处理，.data..percpu这个section在系统初始化后就可以释放了。其中\_\_per\_cpu\_load被重定向到了.data..percpu区域，和\_\_per\_cpu\_start位置是一样的：

/**
 * PERCPU_SECTION - define output section for percpu area, simple version
 * @cacheline: cacheline size
 *
 * Align to PAGE_SIZE and outputs output section for percpu area.  This
 * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and
 * __per_cpu_start will be identical.
 *
 * This macro is equivalent to ALIGN(PAGE_SIZE); PERCPU_VADDR(@cacheline,,)
 * except that __per_cpu_load is defined as a relative symbol against
 * .data..percpu which is required for relocatable x86_32 configuration.
 */
#define PERCPU_SECTION(cacheline)					\
	. = ALIGN(PAGE_SIZE);						\
	.data..percpu	: AT(ADDR(.data..percpu) - LOAD_OFFSET) {	\
		VMLINUX_SYMBOL(__per_cpu_load) = .;			\
		PERCPU_INPUT(cacheline)					\
	}


### 2.6 模块per-CPU变量


模块使用的per-CPU变量，大小为reserved\_size：

// linux-4.10.1/include/linux/module.h

struct module {
…
#ifdef CONFIG_SMP
/* Per-cpu data. */
void __percpu *percpu;
unsigned int percpu_size;
#endif
…
}


模块per-CPU变量相关API

// linux-4.10.1/kernel/module.c

#ifdef CONFIG_SMP

/* Return the per-cpu area pointer stored in @mod. */
static inline void __percpu *mod_percpu(struct module *mod)
{
	void __percpu *area = mod->percpu;

	return area;
}

/*
 * percpu_modalloc - allocate the per-cpu area for a module being loaded.
 * @mod: module that receives the area (mod->percpu, mod->percpu_size)
 * @info: load info; the per-cpu section header is sechdrs[index.pcpu]
 *
 * An empty per-cpu section needs no allocation.  A section alignment above
 * PAGE_SIZE is warned about and clamped to PAGE_SIZE.
 *
 * Returns 0 on success (including the empty-section case), -ENOMEM when
 * __alloc_reserved_percpu() fails.
 */
static int percpu_modalloc(struct module *mod, struct load_info *info)
{
	Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
	unsigned long align = pcpusec->sh_addralign;

	if (!pcpusec->sh_size)
		return 0;

	if (align > PAGE_SIZE) {
		pr_warn("%s: per-cpu alignment %li > %li\n",
			mod->name, align, PAGE_SIZE);
		align = PAGE_SIZE;
	}

	mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
	if (!mod->percpu) {
		pr_warn("%s: Could not allocate %lu bytes percpu data\n",
			mod->name, (unsigned long)pcpusec->sh_size);
		return -ENOMEM;
	}
	mod->percpu_size = pcpusec->sh_size;
	return 0;
}

static void percpu_modfree(struct module *mod)
{
free_percpu(mod->percpu);
}

/*
 * Look up the per-cpu data section of the module image described by @info
 * and return whatever find_sec() reports for the name ".data..percpu".
 */
static unsigned int find_pcpusec(struct load_info *info)
{
	return find_sec(info, ".data..percpu");
}

static void percpu_modcopy(struct module *mod,
const void *from, unsigned long size)
{
int cpu;

for\_each\_possible\_cpu(cpu)
	memcpy(per\_cpu\_ptr(mod->percpu, cpu), from, size);

}

/**
* is_module_percpu_address - test whether address is from module static percpu
* @addr: address to test
*
* Test whether @addr belongs to module static percpu area.
*
* RETURNS:
* %true if @addr is from module static percpu area
*/
bool is_module_percpu_address(unsigned long addr)
{
struct module *mod;
unsigned int cpu;

preempt\_disable();

list\_for\_each\_entry\_rcu(mod, &modules, list) {
	if (mod->state == MODULE_STATE_UNFORMED)
		continue;
	if (!mod->percpu_size)
		continue;
	for\_each\_possible\_cpu(cpu) {
		void \*start = per\_cpu\_ptr(mod->percpu, cpu);

		if ((void \*)addr >= start &&
		    (void \*)addr < start + mod->percpu_size) {
			preempt\_enable();
			return true;
		}
	}
}

preempt\_enable();
return false;

}

// linux-4.10.1/mm/percpu.c

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: number of bytes to allocate
 * @align: required alignment of the area (max PAGE_SIZE)
 *
 * Allocates a zero-filled percpu area of @size bytes, aligned at @align,
 * out of the reserved percpu area when the arch has set one up;
 * otherwise the allocation is served from the same dynamic area.
 * Might sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
	void __percpu *area;

	area = pcpu_alloc(size, align, true, GFP_KERNEL);
	return area;
}


## 三、动态per-CPU变量


### 3.1 对应API


（1）alloc\_percpu

/*
 * alloc_percpu - allocate one per-cpu object of @type.
 * Expands to __alloc_percpu() with the size and alignment of @type and
 * casts the result to a __percpu pointer to @type.
 */
#define alloc_percpu(type)	\
	(typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type))

/\*\*
 \* \_\_alloc\_percpu - allocate dynamic percpu area
 \* @size: size of area to allocate in bytes
 \* @align: alignment of area (max PAGE\_SIZE)
 \*
 \* Allocate zero-filled percpu area of @size bytes aligned at @align.
 \* Might sleep. Might trigger writeouts.
 \*
 \* CONTEXT:
 \* Does GFP\_KERNEL allocation.
 \*
 \* RETURNS:
 \* Percpu pointer to the allocated area on success, NULL on failure.
 \*/
void __percpu \*\_\_alloc\_percpu(size\_t size, size\_t align)
{
	return pcpu\_alloc(size, align, false);
}
EXPORT\_SYMBOL\_GPL(__alloc_percpu);

动态percpu的实现类似于kmalloc，为系统上的每个处理器创建所需要的内存。
alloc_percpu给系统每个处理器分配一个指定类型对象(type)的实例，alloc_percpu会返回一个指针，用来间接引用动态创建的percpu数据。
__alloc_percpu的参数有两个：一个要分配的实际字节数，一个是分配时要按多少个字节对齐

(typeof(type) __percpu \*)\_\_alloc\_percpu(sizeof(type), \_\_alignof\_\_(type))

（2）free_percpu

/\*\*
 \* free\_percpu - free percpu area
 \* @ptr: pointer to area to free
 \*
 \* Free percpu area @ptr.
 \*
 \* CONTEXT:
 \* Can be called from atomic context.
 \*/
void free\_percpu(void __percpu \*ptr)
{
	......
}
EXPORT\_SYMBOL\_GPL(free_percpu);

（3）per_cpu_ptr

// linux-3.10.1/include/linux/smp.h

/*
 * get_cpu() disables preemption and evaluates to smp_processor_id(),
 * the current CPU ID; put_cpu() re-enables preemption.
 */
#define get_cpu()		({ preempt_disable(); smp_processor_id(); })
#define put_cpu()		preempt_enable()

/*
 * Use this to get to a cpu's version of the per-cpu object
 * dynamically allocated. Non-atomic access to the current CPU's
 * version should probably be combined with get_cpu()/put_cpu().
 */
#ifdef CONFIG_SMP
#define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
#endif

（4）get/put_cpu_ptr

/*
 * get_cpu_ptr() disables preemption and evaluates to this_cpu_ptr(var);
 * put_cpu_ptr() evaluates @var for its side effects only and re-enables
 * preemption.
 */
#define get_cpu_ptr(var) ({				\
	preempt_disable();				\
	this_cpu_ptr(var); })

#define put_cpu_ptr(var) do {				\
	(void)(var);					\
	preempt_enable();				\
} while (0)