Understanding the Linux Kernel's Physical Memory Model in One Article

I. Architecture and Memory Models

1. Architecture

  • Current multiprocessor systems use one of two architectures:

  1. Non-Uniform Memory Access (NUMA): memory is divided into multiple memory nodes, and the time it takes to access a node depends on the distance between the processor and that node. NUMA is the mainstream architecture for mid-range and high-end servers.

  2. Symmetric Multi-Processing (SMP), i.e. Uniform Memory Access (UMA): every processor takes the same time to access memory, and all processors have equal status. They are unequal only during kernel initialization: "processor 0, acting as the bootstrap processor, initializes the kernel while the other processors wait for initialization to finish."

  • In practice a hybrid architecture can be used: SMP inside each NUMA node.

2. Memory models

  • A memory model describes the physical memory layout as seen from the processor's point of view, and the kernel manages each model differently. The memory management subsystem supports three memory models; the sketch after this list shows how the choice affects the PFN-to-page conversion:

  1. Flat Memory: the physical address space of memory is contiguous, with no holes.

  2. Discontiguous Memory: the physical address space contains holes, and this model handles those holes efficiently.

  3. Sparse Memory: the physical address space contains holes; if memory hot-plug must be supported, sparse memory is the only possible choice.
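The choice of model shows up concretely in how the kernel converts a page frame number (PFN) into its struct page descriptor. The excerpt below is condensed and slightly simplified from include/asm-generic/memory_model.h of kernels from this era: flat memory indexes one global mem_map array, while sparse memory in its vmemmap flavor indexes a virtually mapped descriptor array, so holes cost no descriptor storage.

/* Condensed from include/asm-generic/memory_model.h (kernels of this era). */
#if defined(CONFIG_FLATMEM)

/* Flat memory: one global mem_map[] array covers every physical page. */
#define __pfn_to_page(pfn)    (mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)   ((unsigned long)((page) - mem_map) + \
                               ARCH_PFN_OFFSET)

#elif defined(CONFIG_SPARSEMEM_VMEMMAP)

/*
 * Sparse memory (vmemmap flavor): page descriptors live in a virtually
 * mapped array, so holes in the physical address space consume no
 * descriptor memory and hot-plugged sections slot in naturally.
 */
#define __pfn_to_page(pfn)    (vmemmap + (pfn))
#define __page_to_pfn(page)   (unsigned long)((page) - vmemmap)

#endif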

II. The Three-Level Structure (Node/Zone/Page)

  • The memory management subsystem describes physical memory with a three-level structure: node, zone, and page.


1. Memory nodes

  • In a NUMA system, memory nodes are divided according to the distance between processors and memory. In a UMA system with discontiguous memory, a node is the level above the zone: memory is divided by whether physical addresses are contiguous, and each block of contiguous physical memory forms one node. A memory node's layout is described by a pglist_data structure.

  • The member node_mem_map points to the array of page descriptors, one descriptor per physical page. node_mem_map may not point to the array's first element, because the page descriptor array must be aligned to 2 to the power of (MAX_ORDER-1), where (MAX_ORDER-1) is the largest order the page allocator can allocate. The pglist_data structure in the kernel source is as follows:

typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];          /* array of memory zones */
    struct zonelist node_zonelists[MAX_ZONELISTS]; /* fallback zone lists */
    int nr_zones;                                  /* number of memory zones */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
    struct page *node_mem_map;      /* page descriptor array (all models except sparse memory) */
#ifdef CONFIG_PAGE_EXTENSION
    struct page_ext *node_page_ext; /* extended page attributes */
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
    /*
     * Must be held any time you expect node_start_pfn,
     * node_present_pages, node_spanned_pages or nr_zones to stay constant.
     *
     * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
     * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
     * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
     *
     * Nests above zone->lock and zone->span_seqlock
     */
    spinlock_t node_size_lock;
#endif
    unsigned long node_start_pfn;     /* starting physical page frame number */
    unsigned long node_present_pages; /* total physical pages, excluding holes */
    unsigned long node_spanned_pages; /* total physical pages, including holes */
    int node_id;                      /* node identifier */
    wait_queue_head_t kswapd_wait;
    wait_queue_head_t pfmemalloc_wait;
    struct task_struct *kswapd;       /* Protected by mem_hotplug_begin/end() */
    int kswapd_order;
    enum zone_type kswapd_classzone_idx;
    int kswapd_failures;              /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
    int kcompactd_max_order;
    enum zone_type kcompactd_classzone_idx;
    wait_queue_head_t kcompactd_wait;
    struct task_struct *kcompactd;
#endif
    /*
     * This is a per-node reserve of pages that are not available
     * to userspace allocations.
     */
    unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
    /*
     * zone reclaim becomes active if more unmapped pages exist.
     */
    unsigned long min_unmapped_pages;
    unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
    /* Write-intensive fields used by page reclaim */
    ZONE_PADDING(_pad1_)
    spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    /*
     * If memory initialisation on large machines is deferred then this
     * is the first PFN that needs to be initialised.
     */
    unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    spinlock_t split_queue_lock;
    struct list_head split_queue;
    unsigned long split_queue_len;
#endif
    /* Fields commonly accessed by the page reclaim scanner */
    struct lruvec lruvec;
    unsigned long flags;
    ZONE_PADDING(_pad2_)
    /* Per-node vmstats */
    struct per_cpu_nodestat __percpu *per_cpu_nodestats;
    atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
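For orientation, here is a minimal sketch (not from the original article) that walks every online node and prints the pglist_data fields just described. NODE_DATA() and for_each_online_node() are standard kernel helpers; the function name itself is hypothetical.

/*
 * Minimal sketch: dump the basic layout of every online memory node.
 * NODE_DATA(nid) returns the pglist_data instance for node nid.
 */
#include <linux/init.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/printk.h>

static void __init dump_online_nodes(void)  /* hypothetical helper name */
{
    int nid;

    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        pr_info("node %d: start pfn %lu, present %lu, spanned %lu pages\n",
                pgdat->node_id, pgdat->node_start_pfn,
                pgdat->node_present_pages, pgdat->node_spanned_pages);
    }
}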

2. Memory zones

A memory node is divided into memory zones. The kernel defines the zone types as follows:

enum zone_type {
#ifdef CONFIG_ZONE_DMA
    /*
     * ZONE_DMA is used when there are devices that are not able
     * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
     * carve out the portion of memory that is needed for these devices.
     * The range is arch specific.
     *
     * Some examples
     *
     * Architecture         Limit
     * ---------------------------
     * parisc, ia64, sparc  <4G
     * s390, powerpc        <2G
     * arm                  Various
     * alpha                Unlimited or 0-16MB.
     *
     * i386, x86_64 and multiple other arches
     *                      <16M.
     */
    ZONE_DMA,       /* DMA zone: for direct memory access */
#endif
#ifdef CONFIG_ZONE_DMA32
    /*
     * x86_64 needs two ZONE_DMAs because it supports devices that are
     * only able to do DMA to the lower 16M but also 32 bit devices that
     * can only do DMA areas below 4G.
     */
    ZONE_DMA32,     /* 64-bit systems only */
#endif
    /*
     * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
     * performed on pages in ZONE_NORMAL if the DMA devices support
     * transfers to all addressable memory.
     */
    ZONE_NORMAL,    /* normal zone: the linearly mapped region (ARM processors
                       need page tables for this mapping; MIPS processors do not) */
#ifdef CONFIG_HIGHMEM
    /*
     * A memory area that is only addressable by the kernel through
     * mapping portions into its own address space. This is for example
     * used by i386 to allow the kernel to address the memory beyond
     * 900MB. The kernel will set up special mappings (page
     * table entries on i386) for each page that the kernel needs to
     * access.
     */
    ZONE_HIGHMEM,   /* high memory zone: on 64-bit systems the kernel virtual
                       address space is so large that this zone is not needed */
#endif
    ZONE_MOVABLE,   /* movable zone: a pseudo zone used to limit memory fragmentation */
#ifdef CONFIG_ZONE_DEVICE
    ZONE_DEVICE,    /* device zone: supports persistent memory (regions added via hotplug) */
#endif
    __MAX_NR_ZONES
};

Each memory zone is described by a zone structure; the corresponding kernel source is as follows:

struct zone {
    /* Read-mostly fields */

    /* zone watermarks, access with *_wmark_pages(zone) macros */
    unsigned long _watermark[NR_WMARK]; /* watermarks used by the page allocator */
    unsigned long watermark_boost;

    unsigned long nr_reserved_highatomic;

    /*
     * We don't know if the memory that we're going to allocate will be
     * freeable or/and it will be released eventually, so to avoid totally
     * wasting several GB of ram we must reserve some of the lower zone
     * memory (otherwise we risk to run OOM on the lower zones despite
     * there being tons of freeable ram on the higher zones). This array is
     * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
     * changes.
     */
    long lowmem_reserve[MAX_NR_ZONES]; /* used by the page allocator: pages this zone
                                          reserves and must not lend to higher zone types */
#ifdef CONFIG_NUMA
    int node;
#endif
    struct pglist_data *zone_pgdat;           /* points to the node's pglist_data instance */
    struct per_cpu_pageset __percpu *pageset; /* per-processor page sets */
#ifndef CONFIG_SPARSEMEM
    /*
     * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
     * In SPARSEMEM, this map is stored in struct mem_section
     */
    unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    unsigned long zone_start_pfn;

    /*
     * spanned_pages is the total pages spanned by the zone, including
     * holes, which is calculated as:
     *     spanned_pages = zone_end_pfn - zone_start_pfn;
     *
     * present_pages is physical pages existing within the zone, which
     * is calculated as:
     *     present_pages = spanned_pages - absent_pages(pages in holes);
     *
     * managed_pages is present pages managed by the buddy system, which
     * is calculated as (reserved_pages includes pages allocated by the
     * bootmem allocator):
     *     managed_pages = present_pages - reserved_pages;
     *
     * So present_pages may be used by memory hotplug or memory power
     * management logic to figure out unmanaged pages by checking
     * (present_pages - managed_pages). And managed_pages should be used
     * by page allocator and vm scanner to calculate all kinds of watermarks
     * and thresholds.
     *
     * Locking rules:
     *
     * zone_start_pfn and spanned_pages are protected by span_seqlock.
     * It is a seqlock because it has to be read outside of zone->lock,
     * and it is done in the main allocator path. But, it is written
     * quite infrequently.
     *
     * The span_seq lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock. It's good to
     * give them a chance of being in the same cacheline.
     *
     * Write access to present_pages at runtime should be protected by
     * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
     * present_pages should get_online_mems() to get a stable value.
     */
    atomic_long_t managed_pages; /* physical pages managed by the buddy allocator */
    unsigned long spanned_pages; /* total pages spanned by this zone, including holes */
    unsigned long present_pages; /* physical pages present in this zone, excluding holes */

    const char *name;            /* zone name */

#ifdef CONFIG_MEMORY_ISOLATION
    /*
     * Number of isolated pageblock. It is used to solve incorrect
     * freepage counting problem due to racy retrieving migratetype
     * of pageblock. Protected by zone->lock.
     */
    unsigned long nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
    /* see spanned/present_pages for more description */
    seqlock_t span_seqlock;
#endif

    int initialized;

    /* Write-intensive fields used from the page allocator */
    ZONE_PADDING(_pad1_)

    /* free areas of different orders */
    struct free_area free_area[MAX_ORDER];

    /* zone flags, see below */
    unsigned long flags;

    /* Primarily protects free_area */
    spinlock_t lock;

    /* Write-intensive fields used by compaction and vmstats. */
    ZONE_PADDING(_pad2_)

    /*
     * When free pages are below this point, additional steps are taken
     * when reading the number of free pages to avoid per-cpu counter
     * drift allowing watermarks to be breached
     */
    unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* pfn where compaction free scanner should start */
    unsigned long compact_cached_free_pfn;
    /* pfn where async and sync compaction migration scanner should start */
    unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
    /*
     * On compaction failure, 1<<compact_defer_shift compactions
     * are skipped before trying again. The number attempted since
     * last failure is tracked with compact_considered.
     */
    unsigned int compact_considered;
    unsigned int compact_defer_shift;
    int compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* Set to true when the PG_migrate_skip bits should be cleared */
    bool compact_blockskip_flush;
#endif

    bool contiguous;

    ZONE_PADDING(_pad3_)
    /* Zone statistics */
    atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
    atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
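A minimal sketch (not from the original article) of reading these fields in practice: it walks every populated zone and prints the watermarks through the *_wmark_pages() accessors mentioned in the struct comment. for_each_populated_zone(), zone_to_nid() and zone_managed_pages() are standard helpers in kernels of this era; the function name is hypothetical.

/* Minimal sketch: dump per-zone sizes and allocator watermarks. */
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/printk.h>

static void dump_zone_watermarks(void)  /* hypothetical helper name */
{
    struct zone *zone;

    for_each_populated_zone(zone) {
        pr_info("node %d zone %s: managed %lu, min %lu, low %lu, high %lu\n",
                zone_to_nid(zone), zone->name,
                zone_managed_pages(zone),
                min_wmark_pages(zone), low_wmark_pages(zone),
                high_wmark_pages(zone));
    }
}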

3. Physical pages

Each physical page corresponds to a page structure, called the page descriptor. The node_mem_map member of a node's pglist_data instance points to the array of page descriptors covering all physical pages in that node.

The kernel function page_to_nid returns the number of the memory node a physical page belongs to; the source is as follows:

#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
    struct page *p = (struct page *)page;

    return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

page_zonenum returns the type of the memory zone a physical page belongs to:

static inline enum zone_type page_zonenum(const struct page *page)
{
    return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}
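The kernel combines these two helpers to locate a page's zone structure; in include/linux/mm.h of this era the lookup is roughly:

/* Roughly as in include/linux/mm.h: index into the owning node's zone array. */
static inline struct zone *page_zone(const struct page *page)
{
    return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

Both the node id and the zone index are read from bit fields packed into page->flags, so locating a page's node and zone needs no extra lookup tables.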

III. The Bootmem and Memblock Allocators

  • The kernel must allocate memory while it is still initializing, so it provides a temporary boot memory allocator. Once the page allocator and the block (slab) allocator are initialized, the remaining free physical pages are handed over to the page allocator and the boot allocator is discarded.

1. Data structure used by the bootmem allocator:

// The node_bootmem_map member of this structure points to a bitmap with one
// bit per physical page; when a physical page is allocated, its bit is set to 1.
struct bootmem_data;

  • Older kernel versions have the bootmem_data structure; newer kernels keep only the memblock structures.

2. Data structures used by the memblock allocator:

/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions
 * @name: the memory type symbolic name
 */
// The memory block type data structure:
struct memblock_type {
    unsigned long cnt;               /* number of regions */
    unsigned long max;               /* size of the allocated array */
    phys_addr_t total_size;          /* total size of all regions */
    struct memblock_region *regions; /* array of memory block regions */
    char *name;                      /* symbolic name of the memory block type */
};

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usable memory regions
 * @reserved: reserved memory regions
 * @physmem: all physical memory
 */
struct memblock {
    bool bottom_up;              /* allocation direction: true means allocate from low
                                    addresses upward, false from high addresses downward */
    phys_addr_t current_limit;   /* maximum physical address that can be allocated */
    struct memblock_type memory;   /* memory type (covers allocated and free memory) */
    struct memblock_type reserved; /* reserved type */
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
    struct memblock_type physmem;  /* physical memory type */
#endif
};

Difference between the physical memory type and the memory type:

The memory type is a subset of the physical memory type. When booting the kernel, a kernel parameter can be used to limit the amount of usable memory; the physical memory type always contains all memory ranges, while the memory type contains only the usable ranges.

/**
 * enum memblock_flags - definition of memory region attributes
 * @MEMBLOCK_NONE: no special request
 * @MEMBLOCK_HOTPLUG: hotpluggable region
 * @MEMBLOCK_MIRROR: mirrored region
 * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
 */
enum memblock_flags {
    MEMBLOCK_NONE    = 0x0, /* region with no special requirements */
    MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
    MEMBLOCK_MIRROR  = 0x2, /* mirrored region */
    MEMBLOCK_NOMAP   = 0x4, /* don't add to the kernel direct mapping (linear mapping) */
};

The memory block region data structure is as follows:

/**
 * struct memblock_region - represents a memory region
 * @base: physical address of the region
 * @size: size of the region
 * @flags: memory region attributes
 * @nid: NUMA node id
 */
struct memblock_region {
    phys_addr_t base;          /* starting physical address */
    phys_addr_t size;          /* size */
    enum memblock_flags flags; /* attributes */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
    int nid;                   /* node id */
#endif
};
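Before turning to the ARM64 specifics, here is a minimal sketch (not from the original article) of how early boot code drives these structures through the public memblock API. memblock_add(), memblock_reserve() and memblock_alloc() are real interfaces in kernels of this era; the addresses and the wrapper function are invented for illustration.

/* Minimal sketch: registering and reserving boot memory via memblock. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/sizes.h>

static void __init early_mem_example(void)  /* hypothetical function */
{
    void *buf;

    /* Report a 1 GiB bank of RAM: appends a region to memblock.memory. */
    memblock_add(0x40000000ULL, SZ_1G);

    /* Carve out firmware memory: appends a region to memblock.reserved. */
    memblock_reserve(0x40000000ULL, SZ_16M);

    /* Allocate zeroed boot memory from the free set (memory minus reserved). */
    buf = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
    if (!buf)
        panic("early allocation failed\n");
}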

The ARM64 kernel initializes the memblock allocator in two steps:

a. Parse the /memory node in the device tree blob and add all physical memory ranges to memblock.memory.

b. Initialize memblock in the kernel function arm64_memblock_init:

void __init arm64_memblock_init(void)
{
    const s64 linear_region_size = -(s64)PAGE_OFFSET;

    /* Handle linux,usable-memory-range property */
    fdt_enforce_memory_region();

    /* Remove memory above our supported physical address size */
    memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

    /*
     * Ensure that the linear region takes up exactly half of the kernel
     * virtual address space. This way, we can distinguish a linear address
     * from a kernel/module/vmalloc address by testing a single bit.
     */
    BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1));

    /*
     * Select a suitable value for the base of physical memory.
     */
    memstart_addr = round_down(memblock_start_of_DRAM(),
                               ARM64_MEMSTART_ALIGN);

    /*
     * Remove the memory that we will not be able to cover with the
     * linear mapping. Take care not to clip the kernel which may be
     * high in memory.
     */
    memblock_remove(max_t(u64, memstart_addr + linear_region_size,
                          __pa_symbol(_end)), ULLONG_MAX);
    if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
        /* ensure that memstart_addr remains sufficiently aligned */
        memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
                                 ARM64_MEMSTART_ALIGN);
        memblock_remove(0, memstart_addr);
    }

    /*
     * Apply the memory limit if it was set. Since the kernel may be loaded
     * high up in memory, add back the kernel region that must be accessible
     * via the linear mapping.
     */
    if (memory_limit != PHYS_ADDR_MAX) {
        memblock_mem_limit_remove_map(memory_limit);
        memblock_add(__pa_symbol(_text), (u64)(_end - _text));
    }

    if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
        /*
         * Add back the memory we just removed if it results in the
         * initrd to become inaccessible via the linear mapping.
         * Otherwise, this is a no-op
         */
        u64 base = phys_initrd_start & PAGE_MASK;
        u64 size = PAGE_ALIGN(phys_initrd_size);

        /*
         * We can only add back the initrd memory if we don't end up
         * with more memory than we can address via the linear mapping.
         * It is up to the bootloader to position the kernel and the
         * initrd reasonably close to each other (i.e., within 32 GB of
         * each other) so that all granule/#levels combinations can
         * always access both.
         */
        if (WARN(base < memblock_start_of_DRAM() ||
                 base + size > memblock_start_of_DRAM() +
                               linear_region_size,
                 "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
            initrd_start = 0;
        } else {
            memblock_remove(base, size); /* clear MEMBLOCK_ flags */
            memblock_add(base, size);
            memblock_reserve(base, size);
        }
    }

    if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
        extern u16 memstart_offset_seed;
        u64 range = linear_region_size -
                    (memblock_end_of_DRAM() - memblock_start_of_DRAM());

        /*
         * If the size of the linear region exceeds, by a sufficient
         * margin, the size of the region that the available physical
         * memory spans, randomize the linear region as well.
         */
        if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
            range /= ARM64_MEMSTART_ALIGN;
            memstart_addr -= ARM64_MEMSTART_ALIGN *
                             ((range * memstart_offset_seed) >> 16);
        }
    }

    /*
     * Register the kernel text, kernel data, initrd, and initial
     * pagetables with memblock.
     */
    memblock_reserve(__pa_symbol(_text), _end - _text);
    if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
        /* the generic initrd code expects virtual addresses */
        initrd_start = __phys_to_virt(phys_initrd_start);
        initrd_end = initrd_start + phys_initrd_size;
    }

    early_init_fdt_scan_reserved_mem();

    /* 4GB maximum for 32-bit only capable devices */
    if (IS_ENABLED(CONFIG_ZONE_DMA32))
        arm64_dma_phys_limit = max_zone_dma_phys();
    else
        arm64_dma_phys_limit = PHYS_MASK + 1;

    reserve_crashkernel();

    reserve_elfcorehdr();

    high_memory = __va(memblock_end_of_DRAM() - 1) + 1;

    dma_contiguous_reserve(arm64_dma_phys_limit);
}
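Once paging_init() has built the node and zone structures, memblock's job is done. A condensed sketch of the handoff (an assumption based on arch/arm64/mm/init.c of roughly the same kernel generation, not part of the original article):

/*
 * Condensed sketch: mem_init() releases every free memblock region to the
 * buddy (page) allocator; after this point memblock is no longer used for
 * allocations.
 */
void __init mem_init(void)
{
    /* put all unused memory onto the page allocator's free lists */
    memblock_free_all();

    mem_init_print_info(NULL); /* prints the boot-time "Memory: ..." line */
}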

 
