Linux 内核学习(4) - 内存管理

最新推荐文章于 2023-12-08 13:35:58 发布

两个幽灵

最新推荐文章于 2023-12-08 13:35:58 发布

阅读量296

点赞数

分类专栏： Linux

原文链接：https://www.bilibili.com/video/BV12J41137Dv?p=30

版权

Linux 专栏收录该内容

19 篇文章 2 订阅

订阅专栏

内存管理

内存初始化

内存布局探测：E820图

E820介绍
- 在x86的机器上，由bios提供的中断，中断号是0x15，在调用的时候AX寄存器必须为0xE820，每次返回一段内存的空间的起始地址和大小以及它的属性（可用的RAM or 被BIOS保留的）

注：以下代码有的是linux 5.10，有的是linux 2.6.30.4

代码1 arch/x86/boot/memory.c

/*
 * Query the BIOS E820 service (int 0x15 with AX = 0xe820) one entry at a
 * time, copy every returned entry into boot_params.e820_table and record
 * the number of stored entries in boot_params.e820_entries.
 */
static void detect_memory_e820(void)
{
	struct biosregs ireg, oreg;
	static struct boot_e820_entry buf; /* static so it is zeroed */
	struct boot_e820_entry *table = boot_params.e820_table;
	int nr_entries = 0;

	initregs(&ireg);
	ireg.ax  = 0xe820;		/* function code required by the call */
	ireg.cx  = sizeof buf;		/* size of the result buffer */
	ireg.edx = SMAP;		/* signature passed to the BIOS */
	ireg.di  = (size_t)&buf;	/* where the BIOS stores the entry */

	/*
	 * Note: at least one BIOS is known which assumes that the
	 * buffer pointed to by one e820 call is the same one as
	 * the previous call, and only changes modified fields.  Therefore,
	 * we use a temporary buffer and copy the results entry by entry.
	 *
	 * This routine deliberately does not try to account for
	 * ACPI 3+ extended attributes.  This is because there are
	 * BIOSes in the field which report zero for the valid bit for
	 * all ranges, and we don't currently make any use of the
	 * other attribute bits.  Revisit this if we see the extended
	 * attribute bits deployed in a meaningful way in the future.
	 */

	for (;;) {
		/* Issue int 0x15; results come back in oreg and buf. */
		intcall(0x15, &ireg, &oreg);

		/* The returned %ebx is the continuation value for the next call. */
		ireg.ebx = oreg.ebx;

		/*
		 * BIOSes which terminate the chain with CF = 1 as opposed
		 * to %ebx = 0 don't always report the SMAP signature on
		 * the final, failing, probe, so test CF before the signature.
		 */
		if (oreg.eflags & X86_EFLAGS_CF)
			break;

		/*
		 * Some BIOSes stop returning SMAP in the middle of the
		 * search loop.  The map gathered so far may be partial,
		 * complete or garbage, so discard it and report failure.
		 */
		if (oreg.eax != SMAP) {
			nr_entries = 0;
			break;
		}

		table[nr_entries++] = buf;

		/* Stop once %ebx comes back as zero or the table is full. */
		if (!ireg.ebx ||
		    nr_entries >= ARRAY_SIZE(boot_params.e820_table))
			break;
	}

	boot_params.e820_entries = nr_entries;
}

代码2 e820 entry定义 /usr/include/x86_64-linux-gnu/asm/bootparam.h

/*
 * One entry of the E820 memory map (the element type of
 * boot_params.e820_table).  The struct is declared packed, so the
 * compiler inserts no padding between or after the fields.
 */
struct boot_e820_entry {
	__u64 addr;	/* start address of the memory range */
	__u64 size;	/* size of the memory range */
	__u32 type;	/* attribute of the range, e.g. usable RAM vs. reserved by the BIOS */
} __attribute__((packed));

代码3 打印E820图 arch/x86/kernel/e820.c

/*
 * Print every entry of the global e820 map, one line per entry:
 * the @who prefix, the start address, the end address (addr + size)
 * and the entry type as rendered by e820_print_type().
 */
void __init e820_print_map(char *who)
{
	int idx = 0;

	while (idx < e820.nr_map) {
		unsigned long long start = e820.map[idx].addr;
		unsigned long long end = e820.map[idx].addr + e820.map[idx].size;

		printk(KERN_INFO " %s: %016Lx - %016Lx ", who, start, end);
		e820_print_type(e820.map[idx].type);
		/* Finish the line started by the KERN_INFO printk above. */
		printk(KERN_CONT "\n");
		idx++;
	}
}

memblock

用于启动阶段的一个简单的分配器，它负责page alloc初始化之前的内存分配管理，以及在系统boot阶段满足大块内存的请求(请求大小超过page alloc的最大限制)

实现：所有状态都保存在一个全局变量中

代码4 include/linux/memblock.h

/**
 * struct memblock_region - represents a memory region
 * @base: base address of the region
 * @size: size of the region
 * @flags: memory region attributes
 * @nid: NUMA node id (member exists only when CONFIG_NEED_MULTIPLE_NODES
 *       is defined)
 */
struct memblock_region {
	phys_addr_t base;
	phys_addr_t size;
	enum memblock_flags flags;
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int nid;  // node id; NUMA stands for Non-Uniform Memory Access
#endif
};

/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions
 * @name: the memory type symbolic name
 */
struct memblock_type {  // a set of memory regions of one type
	unsigned long cnt;  // number of regions currently in the array
	unsigned long max;  // capacity of the allocated regions array
	phys_addr_t total_size;  // combined size of all regions
	struct memblock_region *regions;  // points to an array, not a single region
	char *name;  // symbolic name of this memory type
};

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usable memory regions
 * @reserved: reserved memory regions
 */
struct memblock {
	bool bottom_up;  /* is bottom up direction? */
	phys_addr_t current_limit;  // physical address of the current allocation limit
	struct memblock_type memory;    // usable memory regions
	struct memblock_type reserved;  // reserved memory regions
};

在这里插入图片描述

代码5 添加内存区域 mm/memblock.c

/**
 * memblock_add_range - add new memblock region
 * @type: memblock type to add new region into
 * @base: base address of the new region
 * @size: size of the new region
 * @nid: nid of the new region
 * @flags: flags of the new region
 *
 * Add new memblock region [@base, @base + @size) into @type.  The new region
 * is allowed to overlap with existing ones - overlaps don't affect already
 * existing regions.  @type is guaranteed to be minimal (all neighbouring
 * compatible regions are merged) after the addition.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
static int __init_memblock memblock_add_range(struct memblock_type *type,
				phys_addr_t base, phys_addr_t size,
				int nid, enum memblock_flags flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	/*
	 * memblock_cap_size() is handed &size and so may adjust it; @end is
	 * derived from its return value.
	 * NOTE(review): presumably it clamps the size so that base + size
	 * cannot overflow - confirm against its definition.
	 */
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx, nr_new;
	struct memblock_region *rgn;

	/* nothing to add */
	if (!size)
		return 0;

	/* special case for empty array */
	if (type->regions[0].size == 0) {  // no real region yet: fill slot 0 directly
		/* an empty type is expected to have cnt == 1 and total_size == 0 */
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}
repeat:
	/*
	 * The following is executed twice.  Once with %false @insert and
	 * then with %true.  The first counts the number of regions needed
	 * to accommodate the new area.  The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for_each_memblock_type(idx, type, rgn) {  // walk existing regions looking for overlap with [base, end)
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		if (rbase >= end)  // region starts at or above the end of the new area: stop scanning
			break;
		if (rend <= base)
			continue;  // region ends at or below the remaining new area: no overlap
		/*
		 * @rgn overlaps.  If it separates the lower part of new
		 * area, insert that portion.
		 */
		if (rbase > base) {
#ifdef CONFIG_NEED_MULTIPLE_NODES
			WARN_ON(nid != memblock_get_region_node(rgn));
#endif
			WARN_ON(flags != rgn->flags);
			nr_new++;
			/* idx++ steps past the region just inserted in front of @rgn */
			if (insert)
				memblock_insert_region(type, idx++, base,
						       rbase - base, nid,
						       flags);
		}
		/* area below @rend is dealt with, forget about it */
		base = min(rend, end);
	}

	/* insert the remaining portion */
	if (base < end) {  // part of the new area is still uncovered after the scan
		nr_new++;
		if (insert)
			memblock_insert_region(type, idx, base, end - base,
					       nid, flags);
	}

	/* the new area is entirely covered by existing regions */
	if (!nr_new)
		return 0;

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {
		while (type->cnt + nr_new > type->max)  // keep growing the array until the new regions fit
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		insert = true;
		goto repeat;  // second pass: run the scan again, this time inserting
	} else {
		memblock_merge_regions(type);  // merge compatible neighbouring regions
		return 0;
	}
}

将一段区域设为可用 memblock_add_node
将一段区域设为保留 memblock_reserve
从memblock中分配内存:
- 其基本算法是，找到在memblock.memory但不在memblock.reserved的满足size大小的区域，然后将该段区域加入到memblock.reserved中

代码6 memblock分配内存

static void * __init memblock_alloc_internal(
				phys_addr_t size, phys_addr_t align,
				phys_addr_t min_addr, phys_addr_t max_addr,
				int nid, bool exact_nid)

memblock释放内存：memblock_free()
Linux Kernel中memblock的使用
- kernel将自己占用的内存部分设为reserved，例如kernel的image所占内存,initrd所占的内存等
- 将e820探测的可用内存加入到memblock.memory中
- 总而言之，系统的空闲内存存在于memory中，但不包括reserved的部分

page allocator

Linux内存中在运行阶段可用的大内存分配器，是以页为单位
分配的大小以2的幂为单位（单位为页面），范围从2⁰到2^(MAX_ORDER-1)个页面，MAX_ORDER可以通过编译选项CONFIG_FORCE_MAX_ZONEORDER配置，默认是11，即一次最大可以请求2¹⁰个页面，一个页面是4K
涉及的基本概念
- Node: NUMA的概念，即系统中的内存节点，每个node都在struct pglist_data *node_data[]中有对应的一项，以node的ID为序号。NUMA的内存布局探测是在ACPI中完成的，和E820不一样。在代码中对应NODE_DATA()
- Zone: 可以理解为每个页面的类型
  - 每个node中都有对应的zone, 存放在node_data[node.id]->node_zones[MAX_NR_ZONES];
  - Zone的类型有：ZONE_DMA;ZONE_DMA32;ZONE_NORMAL
- Zone order：即zone的查找次序，它决定当前请求的zone类型无法满足时，接下来应该到哪个zone中去分配。比如，用户可能想请求ZONE_HIGHMEM，但ZONE_HIGHMEM中的内存已经分配完了，这时候可以让它到其它的ZONE中去分配
- Kernel中有两种类型的order
  - A: NODE序，即所有请求都优先在本地节点完成
  - B: ZONE序，即在各个节点中优先分配相同类型的内存
- Kernel中每一个页面都有一个表示结构，即struct page，存放在struct page mem_map[]中，它以物理页面的序号作为索引，每个页面只能属于一个zone，函数page_zone(page)可以找到page的zone

在这里插入图片描述

算法：伙伴系统
- 需要高效，且尽量避免碎片
- 按照2的幂大小来组织内存，幂为0~MAX_ORDER-1，对应有MAX_ORDER条链表来组织空间
- 每个ZONE都有MAX_ORDER条链表，存放在zone->free_area[MAX_ORDER]中
- 分配时，根据请求的大小匹配到最佳空闲区，然后进行分配
- 如果最佳空闲区没有空闲页面了，则一直往上层请求，然后将上层的空闲块进行拆分，例如2ⁿ的空闲区为空时，就去请求2⁽ⁿ⁺¹⁾的空闲块并将其一分为二
- 释放页面时，看它是否和相近的合并，如这种情况：
- 释放页面时，如果可以合并则拼成一个大的空闲区并将之移动到上层，一直这样合并下去：
- Linux Kernel中的页面组织方式
  - 为了让struct page尽可能的小，page allocator和其它子系统复用很多的成员
  - page->lru链表用来链接对应order的空闲块
  - page->_mapcount为PAGE_BUDDY_MAPCOUNT_VALUE时表示以该page为起始页面的内存块是空闲的（中间块的计数为-1），如果分配出去则置为-1
  - page->private表示该page为起始页面的内存块所有的order，即该内存块的大小是2^{page->private}
  - page所有的node和zone的信息都被编码在page->flags中，通过page_to_nid(page)可得到该page所在的node，通过page_zonenum(page)可得到该page所有的zone类型，page_zone(page)直接返回page所在的zone
  - 找到自己的小伙伴的过程
    - 先将自己的页号对 (1 << MAX_ORDER) 取余，即：
      page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1)
    - 然后找到小伙伴的index也就是 __find_buddy_index()
      buddy_idx = page_idx ^ (1 << order)
    - 最后找到小伙伴的页面
      buddy = page + (buddy_idx - page_idx)
- 基于 page migration type 的页面分组：
  - 将页面按照migration进行分组，为了避免外碎片而导入的机制，对应migration type的请求都会到相应的区域中寻找
  - 内部碎片的产生：因为所有的内存分配必须起始于可被 4、8 或 16 整除（视处理器体系结构而定）的地址，或者因为MMU的分页机制的限制，内存分配算法只能把预定大小的内存块分配给客户。假设某个客户请求一个 43 字节的内存块，因为没有正好合适大小的内存块，它可能会获得 44 字节、48 字节等稍大一点的内存块，这种由所需大小向上取整而产生的多余空间就叫内部碎片。
    外部碎片的产生：频繁的分配与回收物理页面会导致大量的、连续且小的页面块夹杂在已分配的页面中间，就会产生外部碎片。
  - 如果想请求的type不能满足，会fallback到其它类型中
  - 每个zone都有自己单独的分组
  - enum {MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_PCPTYPES, MIGRATE_RESERVE}
- Linux Kernel为了加速单个页面分配释放同时又提高cache利用率而导入的缓存
  - 每个zone都有一个称为pcp（struct per_cpu_pages）的percpu缓存，例如
    pcp = &this_cpu_ptr(zone->pageset)->pcp
  - 如果是cold的页面，也就是说长时间没有使用、可能已经不在cache中的页面，在释放的时候加到list末尾，否则加到头部(为了优先分配hot页面)

位于：include/linux/mmzone.h

内存映射

硬件背景

使用从page allocator中得到的物理页面
- 页面对应的是物理地址
- CPU仅能使用虚拟地址来访问内存
- 所以，应将物理地址关联到CPU寻址的虚拟地址
x86背景
- 由于历史原因，x86寻址比较复杂：段映射+页面映射
- 段映射在x86_64中被废弃掉了
- 简言之，就是一个radix-tree like的算法，将线性地址分成几个区域，然后各区域值作为对应页表(paging-structure)的偏移
CPU模式不同，寻址方式上有些小差别
- 32位：2层页表(10+10+12)
- 32位 PAE: 3层页表(2+9+9+12)
- 64位：4层页表(9+9+9+9+12)
如果映射有异常，CPU会产生page fault异常
TLB介绍
- TLB用来缓存从虚拟地址到物理地址的映射
- 在reload页表的时候会自动刷新
- 如果映射关系有修改需要手动刷新TLB项

Linux Kernel 地址空间

起源是CPU的内存保护机制：特权级和非特权级
- 特权级可以做一切事情
- 非特权级不能执行特权指令来修改系统资源
两大空间：内核空间和用户空间
- 内核空间位于特权层
- 用户空间位于非特权层
- x86中有4个特权级，但是Linux内核只用了两级
- x86_64有2个特权级
内核空间和用户空间复用一部分地址空间
- 用来避免同一进程内核态和用户态转换时对TLB的刷新
- 在x86 CPU是内核空间1G，用户空间3G
- 在x86_64（4级页表）中，用户空间为0x0000000000000000～0x00007fffffffffff，内核空间从0xffff800000000000开始；0xffff880000000000是旧版内核中物理内存直接映射区(PAGE_OFFSET)的起始地址
将映射层次抽象成PGD, PUD, PMD, PTE，如果某层不存在，则其对应位数为0
在内核中，虚拟地址的映射如下：
- 基础映射，物理地址和虚拟地址在偏移地址(PAGE_OFFSET)上是以1比1的关系映射的，即在内核中，物理地址PA对应的虚拟地址为: VA=PAGE_OFFSET+PA
- 内存拼接的映射，即vmalloc()映射的内存，用来将不连续的物理内存拼接成连续的虚拟地址供用户使用，用来减少内存碎片

Linux Kernel 内存映射

32位CPU上的映射问题
- 内核仅能使用1G的地址空间，也就是在同一时刻最大能使用1G的物理地址空间，如果物理内存超过1G，如何访问1G以上的内存？
- 在这种情况下，kernel将一部分地址用来做动态映射，将不能直接访问的物理内存映射到这个地址中
各种API
- vmalloc的APIs:
  - void *vmalloc(unsigned long size) / vfree(void *p)
- kmap的APIs：
  - void *kmap(struct page *page) / kunmap(struct page *page)
- kmap_atomic的APIs：
  - void *kmap_atomic(struct page *page) / kunmap_atomic(void *addr)

slab allocator

目的：相对于以页面(4K)为分配单位的page allocator来说，slab是提供小内存分配的分配器
背景：
- 当前linux内核中提供了3种slab分配器，分别是slab,slub,slob，他们提供给外部使用的API都是一样的，在编译内核的时候只能选择其一
- 默认为slub，slob仅适合用于嵌入式中（占用资源极少）
- slab vs slub
  - slab管理结构很大，设计很复杂，slub简化了一切
  - slub便于调试
  - 下面的算法分析以slub为例
- slab简而言之，就是一个对象的缓存器，当有对象释放的时候，就缓存到slab里面，然后需要分配的时候，就从slab缓存中取出来
- slab以page allocator作为后端，当缓存对象不够时，就从page allocator中取
- API
  - 创建一个slab: struct kmem_cache *kmem_cache_create(name, size, align, flags, ctor)
  - 销毁slab: kmem_cache_destroy
  - 从slab中分配对象 kmem_cache_alloc
  - 将内存释放到slab中：kmem_cache_free
- 除此之外，slab还内建了一些slab cache, 用于不需要特殊处理的对象分配，对用户可见的接口为
  - 内存分配 kmalloc
  - 内存释放 kfree
  - 对于大块的内存请求，会落入到page allocator中

两个幽灵

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Linux 内核学习(4) - 内存管理

内存管理内存初始化内存布局探测：E820图E820介绍在x86的机器上，由bios提供的中断，中断号是0x15，在调用的时候AX寄存器必须为0xE820，每次返回一段内存的空间的起始地址和大小以及它的属性（可用的RAM or 被BIOS保留的）注：以下代码有的是linux 5.10，有的是linux 2.6.30.4代码1 arch/x86/boot/memory.cstatic void detect_memory_e820(void){ int count = 0; st
复制链接

扫一扫