Memory Management [LKD 12]

最新推荐文章于 2020-11-06 21:18:05 发布

scutth

最新推荐文章于 2020-11-06 21:18:05 发布

阅读量244

点赞数

分类专栏： LKD3 Linux 文章标签： LKD

本文链接：https://blog.csdn.net/scutth/article/details/106415302

版权

Linux 同时被 2 个专栏收录

37 篇文章 0 订阅

订阅专栏

LKD3

8 篇文章 0 订阅

订阅专栏

kernel中和user space存在很大不同，从user space角度看，分配/释放内存易如反掌，即便失败了也容易处理，kernel里面不一样。比如有些kernel code不允许sleep，或者失败的时候难以处理等。考虑到kernel种种的复杂性，就需要轻量级效率高的内存管理方式。

Pages

kernel管理物理内存的基本单位是page，尽管处理器能够处理的最小单位是byte或者word。MMU作为内存管理单元，处理的最小单位就是page，因此MMU中的page table就是以page作为粒度对内存进行管理，这也是为什么虚拟地址到物理地址映射表被称为page table的原因。

一个page到底多大，取决于架构。按照LKD的说法，32位系统上一般是4KB，64位系统上一般是8KB；不过现在常见的64位架构（如x86_64、arm64）通常使用的也是4KB的page。

page在kernel中使用struct page来管理：

/*
 * struct page - the descriptor the kernel keeps for each physical page.
 *
 * Most members live inside unions, so the same storage is interpreted
 * differently depending on how the page is currently used (page cache,
 * SLAB/SLUB/SLOB, compound tail page, ZONE_DEVICE, ...), as the per-member
 * comments below indicate. Member order therefore matters: do not reorder.
 */
struct page {
	/* First double word block */
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	union {
		/* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */
		struct address_space *mapping;

		void *s_mem;			/* slab first object */
		atomic_t compound_mapcount;	/* first tail page */
		/* page_deferred_list().next	 -- second tail page */
	};

	/* Second double word */
	union {
		pgoff_t index;		/* Our offset within mapping. */
		void *freelist;		/* sl[aou]b first free object */
		/* page_deferred_list().prev	-- second tail page */
	};

	union {
		_slub_counter_t counters;
		unsigned int active;		/* SLAB */
		struct {			/* SLUB */
			unsigned inuse:16;
			unsigned objects:15;
			unsigned frozen:1;
		};
		int units;			/* SLOB */

		struct {			/* Page cache */
			/*
			 * Count of ptes mapped in mms, to show when
			 * page is mapped & limit reverse map searches.
			 *
			 * Extra information about page type may be
			 * stored here for pages that are never mapped,
			 * in which case the value MUST BE <= -2.
			 * See page-flags.h for more details.
			 */
			atomic_t _mapcount;

			/*
			 * Usage count, *USE WRAPPER FUNCTION* when manual
			 * accounting. See page_ref.h
			 */
			atomic_t _refcount;
		};
	};

	/*
	 * WARNING: bit 0 of the first word encode PageTail(). That means
	 * the rest users of the storage space MUST NOT use the bit to
	 * avoid collision and false-positive PageTail().
	 */
	union {
		struct list_head lru;	/* Pageout list, eg. active_list
					 * protected by zone_lru_lock !
					 * Can be used as a generic list
					 * by the page owner.
					 */
		struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
					    * lru or handled by a slab
					    * allocator, this points to the
					    * hosting device page map.
					    */
		struct {		/* slub per cpu partial pages */
			struct page *next;	/* Next partial slab */
#ifdef CONFIG_64BIT
			int pages;	/* Nr of partial slabs left */
			int pobjects;	/* Approximate # of objects */
#else
			short int pages;
			short int pobjects;
#endif
		};

		struct rcu_head rcu_head;	/* Used by SLAB
						 * when destroying via RCU
						 */
		/* Tail pages of compound page */
		struct {
			unsigned long compound_head; /* If bit zero is set */

			/* First tail page only */
			unsigned char compound_dtor;
			unsigned char compound_order;
			/* two/six bytes available here */
		};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
		struct {
			unsigned long __pad;	/* do not overlay pmd_huge_pte
						 * with compound_head to avoid
						 * possible bit 0 collision.
						 */
			pgtable_t pmd_huge_pte; /* protected by page->ptl */
		};
#endif
	};

	union {
		/*
		 * Mapping-private opaque data:
		 * Usually used for buffer_heads if PagePrivate
		 * Used for swp_entry_t if PageSwapCache
		 * Indicates order in the buddy system if PageBuddy
		 */
		unsigned long private;
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
		spinlock_t *ptl;
#else
		spinlock_t ptl;
#endif
#endif
		struct kmem_cache *slab_cache;	/* SL[AU]B: Pointer to slab */
	};

#ifdef CONFIG_MEMCG
	struct mem_cgroup *mem_cgroup;
#endif

	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif
} _struct_page_alignment;	/* NOTE(review): presumably an alignment attribute macro defined elsewhere - confirm */

struct page比较大（不过也没task_struct那么大），这里把它的内容都列了出来，struct page定义在<include/linux/mm_types.h>里。这里只介绍重点的几个：

/*
 * Abridged view of struct page: only the members discussed in this article
 * are kept and the unions of the real definition are flattened away, so this
 * excerpt is for reading only and is NOT layout-compatible with the kernel's
 * struct page (see <linux/mm_types.h> for the real one).
 */
struct page {
    /* Atomic flags, some possibly updated asynchronously */
    unsigned long flags;

    /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */
    struct address_space *mapping;

    /* Our offset within mapping. */
    pgoff_t index;

    struct {	/* Page cache */
        /*
         * Count of ptes mapped in mms, to show when
         * page is mapped & limit reverse map searches.
         *
         * Extra information about page type may be
         * stored here for pages that are never mapped,
         * in which case the value MUST BE <= -2.
         * See page-flags.h for more details.
         */
        atomic_t _mapcount;

        /*
         * Usage count, *USE WRAPPER FUNCTION* when manual
         * accounting. See page_ref.h
         */
        atomic_t _refcount;
    };

    /* Pageout list, eg. active_list
     * protected by zone_lru_lock !
     * Can be used as a generic list
     * by the page owner.
     */
    struct list_head lru;

    /*
     * Mapping-private opaque data:
     * Usually used for buffer_heads if PagePrivate
     * Used for swp_entry_t if PageSwapCache
     * Indicates order in the buddy system if PageBuddy
     */
    unsigned long private;

    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    /* Kernel virtual address (NULL if
     * not kmapped, ie. highmem)
     */
    void *virtual;
#endif /* WANT_PAGE_VIRTUAL */

    /* ... remaining members omitted ... */
};

flags记录了page当前的status，比如是否dirty，是否被lock等，这些flag中的bitmask都定义在<linux/page-flags.h>里，这里对page flag的种类做了列举，但是不深入解释。

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages, which can never be swapped out. Some
 * of them might not even exist...
 *
 * The PG_private bitflag is set on pagecache pages if they contain filesystem
 * specific data (which is normally at page->private). It can be used by
 * private allocations for its own usage.
 *
 * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
 * and cleared when writeback _starts_ or when read _completes_. PG_writeback
 * is set before writeback starts and cleared when it finishes.
 *
 * PG_locked also pins a page in pagecache, and blocks truncation of the file
 * while it is held.
 *
 * page_waitqueue(page) is a wait queue of all tasks waiting for the page
 * to become unlocked.
 *
 * PG_uptodate tells whether the page's contents is valid.  When a read
 * completes, the page becomes uptodate, unless a disk I/O error happened.
 *
 * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
 * file-backed pagecache (see mm/vmscan.c).
 *
 * PG_error is set to indicate that an I/O error occurred on this page.
 *
 * PG_arch_1 is an architecture specific page state bit.  The generic code
 * guarantees that this bit is cleared for a page when it first is entered into
 * the page cache.
 *
 * PG_hwpoison indicates that a page got corrupted in hardware and contains
 * data with incorrect ECC bits that triggered a machine check. Accessing is
 * not safe since it may cause another machine check. Don't touch!
 */

/*
 * Don't use the *_dontuse flags.  Use the macros.  Otherwise you'll break
 * locked- and dirty-page accounting.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
/*
 * Bit numbers of the individual page->flags bits. The enumerators up to
 * __NR_PAGEFLAGS are the real bits (so __NR_PAGEFLAGS equals how many are
 * configured); every enumerator after it is an alias that reuses one of the
 * earlier bits for a specific user. Order matters: it fixes the bit values.
 */
enum pageflags {
	PG_locked,		/* Page is locked. Don't touch. */
	PG_error,		/* An I/O error occurred on this page */
	PG_referenced,		/* Used for page reclaim (see mm/vmscan.c) */
	PG_uptodate,		/* Page's contents are valid */
	PG_dirty,
	PG_lru,
	PG_active,
	PG_waiters,		/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
	PG_slab,
	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
	PG_arch_1,		/* Architecture specific page state bit */
	PG_reserved,		/* Special page, can never be swapped out */
	PG_private,		/* If pagecache, has fs-private data */
	PG_private_2,		/* If pagecache, has fs aux data */
	PG_writeback,		/* Page is under writeback */
	PG_head,		/* A head page */
	PG_mappedtodisk,	/* Has blocks allocated on-disk */
	PG_reclaim,		/* To be reclaimed asap */
	PG_swapbacked,		/* Page is backed by RAM/swap */
	PG_unevictable,		/* Page is "unevictable"  */
#ifdef CONFIG_MMU
	PG_mlocked,		/* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
	PG_uncached,		/* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
	PG_hwpoison,		/* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
	PG_young,
	PG_idle,
#endif
	__NR_PAGEFLAGS,

	/* Filesystems */
	PG_checked = PG_owner_priv_1,

	/* SwapBacked */
	PG_swapcache = PG_owner_priv_1,	/* Swap page: swp_entry_t in private */

	/* Two page bits are conscripted by FS-Cache to maintain local caching
	 * state.  These bits are set on pages belonging to the netfs's inodes
	 * when those inodes are being locally cached.
	 */
	PG_fscache = PG_private_2,	/* page backed by cache */

	/* XEN */
	/* Pinned in Xen as a read-only pagetable page. */
	PG_pinned = PG_owner_priv_1,
	/* Pinned as part of domain save (see xen_mm_pin_all()). */
	PG_savepinned = PG_dirty,
	/* Has a grant mapping of another (foreign) domain's page. */
	PG_foreign = PG_owner_priv_1,

	/* SLOB */
	PG_slob_free = PG_private,

	/* Compound pages. Stored in first tail page's flags */
	PG_double_map = PG_private_2,

	/* non-lru isolated movable page */
	PG_isolated = PG_reclaim,
};

接着看struct page中的重点成员——_refcount（LKD所讲的2.6内核中叫_count），这个是page的引用计数，用来记录这个page被引用了多少次。按照LKD的描述，当这个值是-1的时候，说明当前的这个page没有人使用。kernel中的code应当使用page_count()来获取某个page的引用计数，而不是直接读这个值。注意，在LKD描述的实现里，即便page是free的，_count是-1，page_count()也会返回0，表示当前没有人使用，而不是-1；如果有人使用，就返回被引用次数。（在上面列出的较新的struct page里，已经没有这个-1的偏移：free page的_refcount就是0，page_count()直接返回_refcount的值。）page在很多时候都会被引用，比如被page cache使用（struct page中的mapping就会指向和这个page关联的address_space object），作为private data被使用；或者被process使用，记录在process对应的page table里。

这里强调了一点，struct page是用来管理物理内存，但是和物理内存中的内容不直接关联，比如page对应某个物理内存地址，但是里面的内容可能并不是之前的内容，因为有可能发生swap，原来物理内存中的内容被swap到磁盘上，里面存放的是新的内容，但是物理地址是同一个地址。

kernel使用struct page来管理和记录每个page，因为kernel需要知道某个page是否是free的，然后才能决定是否分配，以及分配哪些page给caller。如果一个page不是free，那么kernel需要知道owner是谁，可能的owner包括用户态进程，动态分配的kernel data，静态的kernel code，以及page cache等。

另外，kernel对物理内存的认识，就是一个大的array，这个数组里包含了所有物理内存对应的page结构体，也就是说，kernel会为物理内存中的每一个page分配一个struct page来管理它。

Zones

物理内存对于kernel来说，本来是一个平坦的地址空间，但是因为某些硬件的种种限制，导致kernel不得不把物理内存分成不同的zone，同一个zone里的物理内存有一样的属性，不同zone的物理内存具有不一样的属性。硬件引入的限制主要是两点：

1. 有些硬件无法寻址所有的物理内存。因为有些硬件总线宽度不够，比如ISA设备，只有24bit宽，那就只能寻址16M的物理地址空间，那么kernel为ISA设备分配DMA用的memory时，就只能从这16M的范围内分配，否则一旦超出16M的范围，ISA设备就无法访问了。

2. 系统中的物理内存可能超过kernel能够直接映射的范围。尤其在32位CPU上，虚拟地址空间一共只有4GB，其中留给kernel的部分更小（x86-32上默认是1GB，能够永久直接映射的物理内存大约只有896MB），超出的那部分物理内存kernel没有可以直接访问的虚拟地址，只能在需要时临时映射。这样的memory在kernel看来，属于high memory。

正是因为上面的限制，kernel把物理内存分为了大概四个部分：

1. ZONE_DMA——这里面的物理内存可以直接用来做DMA，也就是所有的device都可以直接做DMA。

2. ZONE_DMA32——这里的物理内存只能给32位的设备直接做DMA。

3. ZONE_NORMAL——这个zone里包含普通的，已经被map过的page。

4. ZONE_HIGHMEM——这个zone里就是high memory，也就是没有被kernel永久map进来的物理内存。

kernel的zone类型，不止上面的四种，下面把kernel的zone type都列出来（mmzone.h）：

/*
 * Zone types. Which zones exist is decided at build time by the CONFIG_*
 * options below; only ZONE_NORMAL and ZONE_MOVABLE are unconditional.
 * __MAX_NR_ZONES comes last, so its value is the number of zones that are
 * configured in. Order matters: it fixes the enumerator values.
 */
enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA is used when there are devices that are not able
	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
	 * carve out the portion of memory that is needed for these devices.
	 * The range is arch specific.
	 *
	 * Some examples
	 *
	 * Architecture		Limit
	 * ---------------------------
	 * parisc, ia64, sparc	<4G
	 * s390			<2G
	 * arm			Various
	 * alpha		Unlimited or 0-16MB.
	 *
	 * i386, x86_64 and multiple other arches
	 * 			<16M.
	 */
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 needs two ZONE_DMAs because it supports devices that are
	 * only able to do DMA to the lower 16M but also 32 bit devices that
	 * can only do DMA areas below 4G.
	 */
	ZONE_DMA32,
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,
#endif
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	__MAX_NR_ZONES	/* Not a zone: count of the zone types above */

};

从kernel中对zone type的定义也能看出来，这些zone type是跟架构相关的。

在x86-32位机器上，这些zone对应的物理内存范围是（原文此处为图）：ZONE_DMA对应16MB以下，ZONE_NORMAL对应16MB～896MB，ZONE_HIGHMEM对应896MB以上。

kernel把物理内存划分为这些zone，只是方便对物理内存的管理，比如当driver需要DMA的物理内存时，就从ZONE_DMA中分配page。不过有的时候driver设置了zone，并不是说只能从这个zone里分配memory。比如driver设置了zone为ZONE_NORMAL，那么kernel可能从ZONE_DMA中分配，也可能从ZONE_NORMAL中分配page；如果设置了zone为ZONE_HIGHMEM，那么kernel可能从上面三个zone中的任意一个分配page。

因为zone是和架构相关的，有些架构上没有包含所有这些zone，比如intel的x86_64上，就没有ZONE_HIGHMEM，所有的memory都位于ZONE_DMA、ZONE_DMA32和ZONE_NORMAL中。每个zone都使用结构struct zone来表示：

/*
 * struct zone - describes one memory zone.
 *
 * ZONE_PADDING() separates groups of members; the member order is part of
 * the layout and must not be changed.
 */
struct zone {
	/* min, low and high watermarks used to manage the zone's free memory */
	unsigned long watermark[NR_WMARK];
	unsigned long nr_reserved_highatomic;
	long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;

	unsigned long		managed_pages;
	unsigned long		spanned_pages;
	unsigned long		present_pages;

	/* zone name, e.g. "DMA", "Normal", "HighMem" */
	const char		*name;

#ifdef CONFIG_MEMORY_ISOLATION
	unsigned long		nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t		span_seqlock;
#endif

	int initialized;
	ZONE_PADDING(_pad1_)
	/* free areas of different sizes */
	struct free_area	free_area[MAX_ORDER];
	/* zone flags, see below */
	unsigned long		flags;
	/* Primarily protects free_area */
	spinlock_t		lock;
	/* Write-intensive fields used by compaction and vmstats. */
	ZONE_PADDING(_pad2_)
	unsigned long percpu_drift_mark;

	bool			contiguous;

	ZONE_PADDING(_pad3_)
	/* Zone statistics */
	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t		vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;

因为系统中一般是三个zone，所以一共是三个struct zone全局变量。下面看一下其中重点的几个数据成员。

第一个是lock，这个是用来保证对这个zone的访问是串行的，因为每个zone只有一个，因此可能会有多个进程或者线程同时从这个zone中申请内存，所以要加锁以保证互斥访问。

第二个是watermark数组，其中包含了minimum, low, and high watermarks，也就是这个zone的基准线，用来管理zone中的free memory。

第三个是name，也就是这个zone的名字，在mm/page_alloc.c中初始化，一般是三个：DMA，Normal，HighMem。

Getting Pages

我们已经知道kernel是通过zone以及page来管理物理内存，下面看一下如何通过kernel提供的接口来分配/释放物理内存。首先，kernel提供了一些以page为单位的interface，比如：

struct page * alloc_pages(gfp_t gfp_mask, unsigned int order);

这些接口的声明或者定义都在include/linux/gfp.h中。alloc_pages会分配2^order个page，并返回第一个page的结构体指针，如果分配失败就返回NULL。下面一个接口是获取page对应的逻辑地址：

void *page_address(const struct page *page);

如果只想要内存地址，不需要page结构体，可以使用接口：

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);

来分配物理内存。如果你只想要单个的page，可以使用接口：

struct page * alloc_page(gfp_t gfp_mask);
unsigned long __get_free_page(gfp_t gfp_mask);

Getting Zeroed Pages

如果希望分配的page里全部填0，可以使用接口：

unsigned long get_zeroed_page(gfp_t gfp_mask);

这个接口返回一个page，page中的内容全部被填0。主要应用于给用户态程序分配page，这样用户态程序不会获取到原来page中的内容，起到安全的作用。

Freeing Pages

如果要释放之前分配的page，可以使用这些接口：

void __free_pages(struct page *page, unsigned int order)
void free_pages(unsigned long addr, unsigned int order)
void free_page(unsigned long addr)

尤其要注意的是，只能释放自己分配的page，否则一旦地址或者page结构体不对，就会导致kernel hang。

以上这些接口适用于要分配的内存是page的倍数，如果需要的memory不是page倍数，那么可以使用kmalloc。

kmalloc()

kmalloc可以按照字节为单位来分配内存。函数声明位于linux/slab.h中：

void *kmalloc(size_t size, gfp_t flags)

kmalloc会分配至少size个byte，并且物理地址是连续的，如果出错，返回NULL。

/*
 * kmalloc - allocate at least @size bytes of physically contiguous memory.
 * @size: number of bytes requested
 * @flags: gfp flags controlling how the memory is allocated
 *
 * Returns a pointer to the memory, or NULL on failure (ZERO_SIZE_PTR is
 * returned on the constant-size path when kmalloc_index() yields 0).
 */
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	/* Shortcuts below are taken only when size is a compile-time constant. */
	if (__builtin_constant_p(size)) {
		/* Too big for the kmalloc caches: hand over to kmalloc_large(). */
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
		/* Non-DMA request: pick the kmalloc cache directly by index. */
		if (!(flags & GFP_DMA)) {
			unsigned int index = kmalloc_index(size);

			/* NOTE(review): index 0 presumably means a zero-byte request - confirm */
			if (!index)
				return ZERO_SIZE_PTR;

			return kmem_cache_alloc_trace(kmalloc_caches[index],
					flags, size);
		}
#endif
	}
	/* Everything else (non-constant size, DMA, SLOB) goes through __kmalloc(). */
	return __kmalloc(size, flags);
}

KMALLOC_MAX_CACHE_SIZE是kmalloc cache能够提供的最大object size，具体取值取决于所用的分配器（SLOB下是一个page，SLUB下是两个page，SLAB下更大）。如果size超过这个值，就调用kmalloc_large来分配。

gfp_mask Flags

gfp是get free page的缩写。在上面讲的分配page和kmalloc的时候都需要gfp 参数，用来控制memory的分配。gfp被分成了三个类型：action modifier，zone modifier以及types。其中，action modifier用来控制kernel是通过什么样的方式来分配内存，比如在interrupt handler中分配内存就不能sleep；zone modifier控制kernel从哪个zone里分配内存；types是action modifier和zone modifier的组合，用来指定要分配的是哪种类型的内存，比如GFP_KERNEL，就是用来给kernel中的process context分配内存等。

Action Modifiers

action modifier一般不会被driver直接使用，上面也提到过，types是action modifier和zone modifier的组合，是device driver直接使用的gfp flag。这里把action modifier都列了一下：

上面的这些gfp可以组合使用，比如：

ptr = kmalloc(size, __GFP_WAIT | __GFP_IO | __GFP_FS);

上面的这个sample就告诉kernel，在分配时可以被block，执行IO，或者文件系统操作。（注意：__GFP_WAIT是LKD所用的2.6内核中的flag，从4.4内核开始已经被__GFP_RECLAIM取代。）

Zone Modifiers

zone modifier主要是三个：__GFP_DMA、__GFP_DMA32和__GFP_HIGHMEM，分别对应ZONE_DMA、ZONE_DMA32和ZONE_HIGHMEM。其中，__GFP_DMA/__GFP_DMA32意味着只能从ZONE_DMA/ZONE_DMA32里分配内存；不指定zone modifier时（相当于ZONE_NORMAL），可以从ZONE_DMA/ZONE_DMA32以及ZONE_NORMAL中分配内存；__GFP_HIGHMEM可以从ZONE_NORMAL以及ZONE_HIGHMEM中分配内存。

如果在分配内存时没有指定zone modifier，kernel默认从ZONE_DMA和ZONE_NORMAL中分配内存，并且优先使用ZONE_NORMAL。

要注意的是，__get_free_pages()和kmalloc()中不可以指定gfp为__GFP_HIGHMEM，因为它们的返回值是logical address，但是high memory在kernel中没有逻辑地址，所以只有使用alloc_pages()时才可以指定__GFP_HIGHMEM。

Type Flags

type flag是分配内存时直接使用的gfp：

在type的gfp flag背后，其实就是action modifier和zone modifier，这些type的具体组合为：

下面我们看一些kernel中经常使用的type。第一个是GFP_KERNEL，这个是kernel中用的比较多的gfp，设置这个flag在分配内存时可能会sleep，因此只能用在process context中，因为对kernel如何分配内存没有限制，所以kernel会有很高的自由度来分配内存。

还有一个用的比较多的是GFP_ATOMIC，设置这个flag，kernel在分配内存不会sleep，但是在没有足够大的连续内存时会失败。这个flag一般用在不允许sleep的上下文，如interrupt handler，softirq，tasklet等。

此外，还有GFP_NOIO和GFP_NOFS，这两个flag可能会block，但是会限制某些操作。比如GFP_NOIO，就表示在分配内存时不要发起任何磁盘IO操作，它用在底层的block IO代码里，因为此时再触发IO可能导致递归甚至死锁；类似的，GFP_NOFS告诉kernel在分配内存时可以做IO，但是不要发起文件系统操作，它用在文件系统的代码里，避免在分配内存时又重新进入文件系统。

GFP_DMA告诉kernel一定要从DMA zone里分配内存。通常还会和GFP_ATOMIC或者GFP_KERNEL组合使用。

那么在哪些情况下使用哪些flag呢，这里有个汇总：

什么时候用哪个flag，取决于：1. 在什么样的context中；2. 分配的内存做什么用。只要认清楚这两点，就能决定怎么设置gfp。

kfree()

和kmalloc对应的就是kfree：

void kfree(const void *ptr)

这里要注意的是，kfree释放的memory必须是通过kmalloc分配出来的，否则会出问题。另外，绝对禁止同一块内存释放两次！不过kfree(NULL)是可以的。

vmalloc()

vmalloc的用法和kmalloc极为相似，最大的区别在于vmalloc分配出来的内存，虚拟地址是连续的，但是物理内存不一定连续，kmalloc分配出来的内存，虚拟地址和物理地址都是连续的。vmalloc和用户态的malloc工作原理是类似的。

一般来说，只有device driver需要物理地址连续的内存，因为很多外设需要做DMA，但是不支持离散的物理内存访问，所以一次DMA用到的内存都是分配物理地址连续的内存。

但是实际情况并不是这样，在kernel中，需要分配memory的时候，绝大部分情况都是使用kmalloc，而不是vmalloc，最主要的原因就是performance：kmalloc分配的是物理地址连续的内存，直接落在kernel已经建立好的线性映射里，不需要额外修改page table；vmalloc不一样，它必须专门建立page table entry，把物理上不连续的page逐个映射到连续的虚拟地址上，这会带来更多的TLB miss，所以performance比较差。vmalloc的原型：

void * vmalloc(unsigned long size)
void vfree(const void *addr)

vmalloc会分配至少size这么大的虚拟地址连续的内存，如果失败了就返回NULL，vmalloc可能会sleep，因此不能在atomic context中使用，如interrupt context，softirq等。释放需要使用vfree函数，要注意的是，vfree中也会sleep，因此不能在atomic context中使用。

Slab Layer

在kernel中，为结构体分配内存是非常频繁的操作，这些内存往往不大，但是会有频繁的分配和释放，为了防止频繁的分配和释放内存，kernel引入了free list，当需要为结构体分配内存时，从free list找一段free的memory给它用，当它用完以后free的时候，只是把它加到free list中，并不真的释放内存，这样可以避免频繁的分配和释放内存。

kernel中的这个free list就是slab layer，slab layer的实现有几点考虑：

1. 使用频率较高的数据结构可能会被频繁的分配和释放，所以对这些结构体做cache。

2. 频繁的分配和释放内存会导致碎片产生，free list本身一定要是物理内存连续的，否则自身内存的分配就可能导致碎片；如果物理内存连续，从里面分配和释放内存都不会导致碎片产生。

3. 使用free list可以提高performance，因为分配和释放都是立即完成。

4. 如果这个分配器知道object size，page size，以及free list的总的size，那么它就可以做出更加智能的决策；

5. 如果其中的cache可以做成per-processor的，那么在SMP上，就不用加锁。

6. 如果分配器知道是否有NUMA，那么就可以把内存分配在和请求者同一个node上。

7. 被存储的object，可以被着色，从而防止多个object被map到同样的cache line上。

Design of the Slab Layer

在slab中，每一种object都有一个类型的cache和它对应，每个type的cache里，都是对应type的object。kmalloc就是一种cache，只不过这种cache是general purpose的，并不只是针对某种特定的object。

一个cache会被分割成很多的slab，这些slab位于一个或者多个连续的page上，比较典型的是一个page的情况，而一个cache可能包含多个slab，每个slab有可能包含多个object，object就是一个结构体实例。每个slab都有三个状态：full，partial，empty。full表示这个slab已经分完了，没有剩余的空间再分配；partial是这个slab有一部分内存被分配出去；empty表示这个slab完全是空的，没有被使用过。当kernel要给结构体分配内存时，会先从partial中分配，如果没有，就从empty里分，如果没有empty，就会创建一个slab。如下所示，是cache，slab和object的对应关系：

cache在kernel中使用kmem_cache结构体来表示：

/*
 * Slab cache management.
 *
 * struct kmem_cache describes one slab cache. cpu_slab points to the
 * per-CPU state (struct kmem_cache_cpu), and node[] holds one
 * kmem_cache_node pointer per slot up to MAX_NUMNODES. Optional members are
 * compiled in according to the CONFIG_* options guarding them.
 */
struct kmem_cache {
	struct kmem_cache_cpu __percpu *cpu_slab;
	/* Used for retrieving partial slabs etc */
	slab_flags_t flags;
	unsigned long min_partial;
	int size;		/* The size of an object including meta data */
	int object_size;	/* The size of an object without meta data */
	int offset;		/* Free pointer offset. */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	int cpu_partial;	/* Number of per cpu partial objects to keep around */
#endif
	struct kmem_cache_order_objects oo;

	/* Allocation and freeing of slabs */
	struct kmem_cache_order_objects max;
	struct kmem_cache_order_objects min;
	gfp_t allocflags;	/* gfp flags to use on each alloc */
	int refcount;		/* Refcount for slab cache destroy */
	void (*ctor)(void *);
	int inuse;		/* Offset to metadata */
	int align;		/* Alignment */
	int reserved;		/* Reserved bytes at the end of slabs */
	int red_left_pad;	/* Left redzone padding size */
	const char *name;	/* Name (only for display!) */
	struct list_head list;	/* List of slab caches */
#ifdef CONFIG_SYSFS
	struct kobject kobj;	/* For sysfs */
	struct work_struct kobj_remove_work;
#endif
#ifdef CONFIG_MEMCG
	struct memcg_cache_params memcg_params;
	int max_attr_size; /* for propagation, maximum size of a stored attr */
#ifdef CONFIG_SYSFS
	struct kset *memcg_kset;
#endif
#endif

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	unsigned long random;
#endif

#ifdef CONFIG_NUMA
	/*
	 * Defragmentation by allocating from a remote node.
	 */
	int remote_node_defrag_ratio;
#endif

#ifdef CONFIG_SLAB_FREELIST_RANDOM
	unsigned int *random_seq;
#endif

#ifdef CONFIG_KASAN
	struct kasan_cache kasan_info;
#endif

	struct kmem_cache_node *node[MAX_NUMNODES];
};

其中的cpu_slab是per CPU的指针，里面记录了slab的信息：

/*
 * Per-CPU slab state; struct kmem_cache refers to it through its __percpu
 * cpu_slab pointer. The partial list and the statistics array are only
 * present when the corresponding CONFIG_SLUB_* option is enabled.
 */
struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
#ifdef CONFIG_SLUB_STATS
	unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};

scutth

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Memory Management [LKD 12]

kernel中和user space存在很大不同，从user space角度看，分配/释放内存易如反掌，即便失败了也容易处理，kernel里面不一样。比如有些kernel code不允许sleep，或者失败的时候难以处理等。考虑到kernel种种的复杂性，就需要轻量级效率高的内存管理方式。Pages...
复制链接

扫一扫

专栏目录