概述
随着大内存和内存热插拔技术的发展,内核管理的物理内存越来越不连续,用于管理这种非连续物理内存的元数据(metadata)也需要随之演进,以避免内存浪费,内核的sparse memory(稀疏内存)模型就是为了解决该问题而引入的。
稀疏内存模型的核心思想是对更小粒度的连续内存块进行更为精细的管理,用于管理连续内存的基本单位是section(Linux中即struct mem_section),内存热插拔的基本单位也是一个section。由于sparse memory model经过了多次优化,有好几个macro(配置宏)控制其逻辑,本文以Linux 5.9 x86_64下打开如下特性为例说明:
CONFIG_SPARSEMEM=y
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_VMEMMAP=y
CONFIG_X86_5LEVEL=y
数据结构
mmzone.h
/*
 * Number of struct mem_section entries that fit in one page (one "root").
 * With a 4096-byte page and a 16-byte struct mem_section: 4096 / 16 = 256.
 */
#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
/*
 * Map a section number to the root it belongs to, i.e. the index into the
 * first dimension of the two-dimensional mem_section array.
 */
#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
/*
 * Number of roots needed to cover all sections, rounded up.
 * The original note gives 2048 here (i.e. NR_MEM_SECTIONS == 2^19).
 * NOTE(review): 2048 presumably corresponds to 46-bit physical addressing;
 * when 5-level paging is actually active NR_MEM_SECTIONS is likely larger,
 * which would make this value larger too -- confirm.
 */
#define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
/*
 * Mask that extracts the index within a root from a section number
 * (used as "nr & SECTION_ROOT_MASK"); relies on SECTIONS_PER_ROOT being a
 * power of two.
 */
#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
/*
 * Conceptually a [NR_SECTION_ROOTS][SECTIONS_PER_ROOT] two-dimensional array
 * ([2048][256] with the figures above) whose elements are struct mem_section.
 */
extern struct mem_section **mem_section;
/* Descriptor for one memory section; element type of the mem_section array. */
struct mem_section {
/*
 * To save space this single word packs several pieces of information:
 * an encoded mem_map pointer in the upper bits and the SECTION_* state
 * flags in the low bits. See the comment above the SECTION_* flag
 * definitions for the exact encoding.
 */
unsigned long section_mem_map;
/* Pointer to this section's struct mem_section_usage (defined elsewhere). */
struct mem_section_usage *usage;
};
/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information. The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum). The result is
 * aligned to the minimum alignment of the two values:
 * 1. All mem_map arrays are page-aligned.
 * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 * lowest bits. PFN_SECTION_SHIFT is arch-specific
 * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 * worst combination is powerpc with 256k pages,
 * which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available.
 */
/* Bit 0: tested by present_section(). */
#define SECTION_MARKED_PRESENT (1UL<<0)
/* Bit 1: tested by valid_section(). */
#define SECTION_HAS_MEM_MAP (1UL<<1)
/* Bit 2: online state flag (no user of this flag is shown in this excerpt). */
#define SECTION_IS_ONLINE (1UL<<2)
/* Bit 3: tested by early_section(). */
#define SECTION_IS_EARLY (1UL<<3)
/* First bit above the flag bits; bits 0..3 are flags. */
#define SECTION_MAP_LAST_BIT (1UL<<4)
/* Clears the four low flag bits, leaving only the encoded mem_map pointer. */
#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
/*
 * NOTE(review): a shift of 3 places the shifted value starting at bit 3,
 * the same bit as SECTION_IS_EARLY. Its use is not shown in this excerpt --
 * confirm against the kernel version being described.
 */
#define SECTION_NID_SHIFT 3
结构图:
函数源码介绍
pfn和section的互相转换
/*
 * Convert a page frame number into a section number by discarding the
 * low PFN_SECTION_SHIFT bits of the pfn.
 */
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
	unsigned long section_nr = pfn >> PFN_SECTION_SHIFT;

	return section_nr;
}
/*
 * Convert a section number into a page frame number by shifting it left
 * by PFN_SECTION_SHIFT; the low PFN_SECTION_SHIFT bits of the result are
 * therefore always zero.
 */
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
	unsigned long first_pfn = sec << PFN_SECTION_SHIFT;

	return first_pfn;
}
section号转换为mem_section对象的地址:我们假设nr从0开始(仅仅假设),由于SECTIONS_PER_ROOT为256,__nr_to_section(256)对应的root下标为256 / 256 = 1、root内下标为256 & 255 = 0,也就是获取上图第二个蓝色数组中的第一项代表的mem_section对象的地址。
/*
 * Translate a section number into the address of its struct mem_section.
 *
 * SECTION_NR_TO_ROOT(nr) selects the root (first dimension of mem_section)
 * and nr & SECTION_ROOT_MASK selects the entry within that root.
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
/* Part of the function body is elided in this excerpt. */
...
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}
通过__nr_to_section获取到具体的mem_section对象的指针后,就可以通过mem_section->section_mem_map字段获取section的具体状态:
/*
 * Report whether a section is marked present.
 *
 * Returns 1 when @section is non-NULL and SECTION_MARKED_PRESENT is set in
 * its section_mem_map word, 0 otherwise (including for a NULL pointer).
 */
static inline int present_section(struct mem_section *section)
{
	if (!section)
		return 0;

	return (section->section_mem_map & SECTION_MARKED_PRESENT) != 0;
}
/*
 * Same check as present_section(), but addressed by section number:
 * the number is first resolved to its struct mem_section via
 * __nr_to_section().
 */
static inline int present_section_nr(unsigned long nr)
{
	struct mem_section *ms = __nr_to_section(nr);

	return present_section(ms);
}
/*
 * Report whether a section has a mem_map.
 *
 * Returns 1 when @section is non-NULL and SECTION_HAS_MEM_MAP is set in
 * its section_mem_map word, 0 otherwise (including for a NULL pointer).
 */
static inline int valid_section(struct mem_section *section)
{
	return section ? !!(section->section_mem_map & SECTION_HAS_MEM_MAP) : 0;
}
/*
 * Report whether a section carries the SECTION_IS_EARLY flag.
 *
 * Returns 1 when @section is non-NULL and the flag is set in its
 * section_mem_map word, 0 otherwise (including for a NULL pointer).
 */
static inline int early_section(struct mem_section *section)
{
	unsigned long early_bit;

	if (!section)
		return 0;

	early_bit = section->section_mem_map & SECTION_IS_EARLY;
	return early_bit != 0;
}
page和pfn的转换
采用SparseMem模型之后,page和pfn的转换也需要高效的实现,内核在开启CONFIG_SPARSEMEM_VMEMMAP的情况下使用如下公式即可互相转换:
/* memmap is virtually contiguous. */
/* pfn -> page: offset the vmemmap base pointer by pfn elements. */
#define __pfn_to_page(pfn) (vmemmap + (pfn))
/* page -> pfn: element distance of the page pointer from the vmemmap base. */
#define __page_to_pfn(page) (unsigned long)((page) - vmemmap)
其中vmemmap根据macro配置的不同,可以是固定地址,也可以动态计算得到,这里不具体展开。