The main algorithm for memory allocation is the buddy algorithm; on top of it sit the slab, slub, and slob allocators.
1. Analyze the buddy algorithm in kernel versions after 5.0, and give a flowchart and your takeaways.
All of the analysis below is based on the 5.1 kernel source:
Zoned buddy allocator: defined in include/linux/mmzone.h
// Describes how free memory blocks of the same size are organized in the buddy system
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;// number of free blocks of this size currently in the buddy system;
// it decreases as memory is allocated and increases as memory is reclaimed
};
// Note that nr_free does not count individual free pages but free blocks. For order-0 blocks nr_free does equal the number of single pages, because an order-0 block is made up of one page; for order-1 blocks it counts groups of 2 pages, and in general, for order-n blocks, nr_free counts groups of 2^n pages.
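As a quick sanity check on this accounting, here is a minimal user-space sketch (illustrative values only, not kernel code) that derives the total number of free pages in a zone from the per-order nr_free counts:
#include <stdio.h>

#define MAX_ORDER 11

int main(void)
{
	/* hypothetical nr_free values, one per order, as free_area[order].nr_free would hold */
	unsigned long nr_free[MAX_ORDER] = { 100, 50, 25, 10, 4, 2, 1, 0, 0, 0, 1 };
	unsigned long total_pages = 0;

	for (int order = 0; order < MAX_ORDER; order++)
		total_pages += nr_free[order] << order;	/* each order-n block holds 2^n pages */

	printf("free pages in this zone: %lu\n", total_pages);
	return 0;
}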
(include/linux/mmzone.h)
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long _watermark[NR_WMARK];
unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
/*
* We don't know if the memory that we're going to allocate will be
* freeable or/and it will be released eventually, so to avoid totally
* wasting several GB of ram we must reserve some of the lower zone
* memory (otherwise we risk to run OOM on the lower zones despite
* there being tons of freeable ram on the higher zones). This array is
* recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
* changes.
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/end(). Any reader who can't tolerant drift of
* present_pages should get_online_mems() to get a stable value.
*/
// number of physical pages managed by the buddy system
atomic_long_t managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* freepage counting problem due to racy retrieving migratetype
* of pageblock. Protected by zone->lock.
*/
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
// core data structure of the buddy system
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags;
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
unsigned long compact_init_migrate_pfn;
unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
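To keep the three page counters straight, here is a small sketch (with made-up numbers) that simply restates the formulas documented in the struct zone comment above:
#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical values for one zone */
	unsigned long zone_start_pfn = 0x100000;
	unsigned long zone_end_pfn   = 0x140000;
	unsigned long absent_pages   = 0x2000;	/* pages in holes */
	unsigned long reserved_pages = 0x800;	/* e.g. taken by the boot-time allocator */

	unsigned long spanned_pages = zone_end_pfn - zone_start_pfn;
	unsigned long present_pages = spanned_pages - absent_pages;
	unsigned long managed_pages = present_pages - reserved_pages;

	assert(managed_pages <= present_pages && present_pages <= spanned_pages);
	/* pages within the zone that the buddy system does not manage */
	printf("unmanaged pages: %lu\n", present_pages - managed_pages);
	return 0;
}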
The buddy algorithm: the code analyzed below is from Linux 5.1, mm/page_alloc.c
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
bool no_fallback;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;
if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}
if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;
/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac_classzone_idx(ac), alloc_flags)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac_classzone_idx(ac), alloc_flags))
goto try_this_zone;
continue;
}
}
try_this_zone:
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}
/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
return NULL;
}
This function walks the zonelist and tries to allocate pages from each zone in turn; for_next_zone_zonelist_nodemask() performs the iteration. Before attempting an allocation in a zone, it checks whether the zone has allocatable memory, whether alloc_flags (and the current cpuset) allow allocating from this zone, and whether the zone passes the watermark check, i.e. whether it still has enough free memory.
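The core of that watermark check can be sketched as follows; this is a simplified user-space approximation of the idea behind zone_watermark_ok(), ignoring details such as the ALLOC_HARDER/ALLOC_HIGH adjustments and per-migratetype suitability:
#include <stdbool.h>

#define MAX_ORDER 11

struct fake_zone {
	unsigned long free_pages;		/* total free pages in the zone */
	unsigned long lowmem_reserve;		/* reserve kept for higher-zone fallbacks */
	unsigned long nr_free[MAX_ORDER];	/* free blocks per order */
};

static bool watermark_ok(struct fake_zone *z, unsigned int order, unsigned long mark)
{
	/* 1. After handing out 2^order pages, the zone must stay above mark + reserve. */
	long free_after = (long)z->free_pages - (1L << order);
	if (free_after <= (long)(mark + z->lowmem_reserve))
		return false;

	if (!order)
		return true;

	/* 2. A high-order request additionally needs at least one free block of
	 *    order >= the requested order, otherwise it cannot be satisfied anyway. */
	for (unsigned int o = order; o < MAX_ORDER; o++)
		if (z->nr_free[o])
			return true;

	return false;
}

int main(void)
{
	struct fake_zone z = { .free_pages = 1000, .lowmem_reserve = 64,
			       .nr_free = { [0] = 500, [3] = 10 } };
	return watermark_ok(&z, 3, 128) ? 0 : 1;
}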
rmqueue() ultimately calls __rmqueue(); its implementation is shown below:
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
unsigned int alloc_flags)
{
struct page *page;
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
if (!page && __rmqueue_fallback(zone, order, migratetype,
alloc_flags))
goto retry;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
The goal of this function is to allocate a block of pages from the given zone. It first tries __rmqueue_smallest(); if that fails and the request is MIGRATE_MOVABLE, it tries the CMA fallback, then the general fallback path; if the fallback manages to steal pages, it retries from the beginning. A tracepoint records the allocation to help with debugging and performance analysis.
Two key functions inside it deserve a closer look:
1. __rmqueue_smallest()
2. __rmqueue_fallback()
Let us analyze __rmqueue_fallback() first.
Below is the __rmqueue_fallback() code:
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
if (alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
/*
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
return true;
}
This function is used when the requested allocation cannot be satisfied from the free lists of its own migratetype: it looks for free blocks belonging to other migratetypes to fall back on.
It is an always-inline function named __rmqueue_fallback that takes four parameters: a pointer to struct zone, an int order, an int start_migratetype and an unsigned int alloc_flags, and it returns a bool.
Its execution flow is shown in the flowchart below:
The following code is the expand() function:
// expand() takes six parameters: a pointer to the zone, a pointer to the page block being split, the requested order (low), the order the block currently has (high), the free_area for that order, and the migratetype.
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	// size of the block being split, in pages: 2^high
	unsigned long size = 1 << high;

	// keep splitting while the block is still larger than the requested order
	while (high > low) {
		// move to the free_area one order lower, since the split halves are smaller
		area--;
		high--;
		// each split halves the block size
		size >>= 1;
		// sanity check: the upper half must lie within the zone, otherwise BUG
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		// mark the upper half as a guard page; guarded halves are not put on a
		// free list, but can still be merged back into the allocator later when
		// their buddy is freed. If the page was turned into a guard page, skip
		// straight to the next split.
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		// otherwise, hang the upper half on the free_list one order below,
		// bump that order's free-block count and record its order in the page
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
} // end of expand()
Despite its name, expand() implements the splitting half of the buddy allocator.
In short, its job is to break a larger block into smaller blocks and put the unused halves back on the zone's free_list for later use. For example, carving an order-1 block out of an order-3 block leaves one order-2 block and one order-1 block on the corresponding free lists. This is part of the buddy memory management path.
The following code is __rmqueue_smallest():
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype)
{
	// __always_inline: the function is always inlined to avoid call overhead.
	// current_order is the order currently being examined, area the corresponding
	// free_area of the zone, and page the block eventually handed out.
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		// get the free_area for the current order
		area = &(zone->free_area[current_order]);
		// take the first block on this order's list for the requested
		// migratetype, or NULL if the list is empty
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page)
			continue;	// nothing at this order, try the next higher one
		// unhook the block from the free list: it is about to be handed out
		list_del(&page->lru);
		// clear the order/buddy information stored in the page
		rmv_page_order(page);
		// one fewer free block at this order
		area->nr_free--;
		// split the block down to the requested order; the unused halves are
		// returned to the lower-order free lists by expand()
		expand(zone, page, order, current_order, area, migratetype);
		// record the migratetype so the page can be handled correctly later
		set_pcppage_migratetype(page, migratetype);
		return page;	// the allocated block
	}
	return NULL;	// no block large enough was found
}
This function implements the core of the allocation algorithm. The for() loop starts at the requested order in the buddy free lists: if the list at that order is non-empty, a free block is taken directly from it with list_del() to satisfy the request; if the list is empty, the search moves up to the next higher order until a non-empty list is found, and if even the highest order is empty the allocation fails. Otherwise, once a non-empty list is found, the free block is removed from it with list_del() and then split in half repeatedly by expand(), with each unused half hung on the free list one order below, until the block has been split down to exactly the requested order. The resulting block is returned, and at that point the pages have been allocated.
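To pull this together, here is a compact user-space toy model (one free list per order, migratetypes ignored, block counts only) of the "find the first non-empty order, then split back down" behaviour described above; it is an illustration, not kernel code:
#include <stdio.h>

#define MAX_ORDER 11

static unsigned long nr_free[MAX_ORDER];	/* free blocks per order */

static int alloc_order(unsigned int order)
{
	unsigned int current_order;

	/* __rmqueue_smallest(): walk upwards from the requested order */
	for (current_order = order; current_order < MAX_ORDER; current_order++) {
		if (!nr_free[current_order])
			continue;
		nr_free[current_order]--;	/* take the block off its list */

		/* expand(): split repeatedly, putting the unused half back one order below */
		while (current_order > order) {
			current_order--;
			nr_free[current_order]++;
			printf("  returned one half to the order-%u list\n", current_order);
		}
		return 0;	/* success */
	}
	return -1;		/* no block large enough */
}

int main(void)
{
	nr_free[3] = 1;			/* one free order-3 block (8 pages) */
	if (alloc_order(1) == 0)	/* request an order-1 block (2 pages) */
		printf("allocated order-1; leftovers: order-2 x%lu, order-1 x%lu\n",
		       nr_free[2], nr_free[1]);
	return 0;
}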
2. Following the method given in https://www.51cto.com/article/645546.html, carry out the practice and provide screenshots (5 points); give the relevant principles and code analysis, a summary, and takeaways (10 points, optional).
The article mainly deals with the problem of memory leaks:
"A program requests memory from the system but, once it no longer needs it, never frees it back to the system, so the allocated memory is wasted."
How do we track down these leaked address ranges?
The troubleshooting approach is:
1. Monitor the PSS consumed by each user process in the system (using the pmap tool: pmap pid).
PSS: proportionally attributed physical memory. For example, if process A uses 20 MB of physical memory, of which 5 MB is shared with process B, then A's PSS is (20-5) + 5/2 = 17.5 MB. (A small C sketch for reading a process's PSS follows this list.)
2. Monitor the output of /proc/meminfo, paying particular attention to Slab usage and the corresponding /proc/slabinfo information.
3. Using /proc/meminfo as a reference, compute changes in memory that is not otherwise accounted for; for example, memory that kernel driver code
takes straight from the buddy allocator via alloc_page() is not counted separately.
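As referenced in item 1, a process's PSS can also be read programmatically by summing the "Pss:" lines of /proc/<pid>/smaps; the sketch below assumes the standard smaps format and keeps error handling minimal:
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	unsigned long kb, total = 0;

	snprintf(path, sizeof(path), "/proc/%s/smaps", argc > 1 ? argv[1] : "self");
	FILE *fp = fopen(path, "r");
	if (!fp) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		if (sscanf(line, "Pss: %lu kB", &kb) == 1)
			total += kb;	/* each mapping contributes its proportional share */
	}
	fclose(fp);
	printf("PSS: %lu kB\n", total);
	return 0;
}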
Slab usage was also monitored with slabtop during the investigation.
Here, bpftrace is used to probe for the memory leak:
1. First, create a file mem_check.c with one allocation path that leaks memory and one that does not.
#include <stdio.h>
#include <stdlib.h>

int main()
{
	char *p1 = NULL;
	char *p2 = NULL;

	for (int i = 0; i < 5; i++) {
		p1 = malloc(16);
	}
	for (int i = 0; i < 5; i++) {
		p2 = malloc(32);
		free(p2);
	}
	getchar();
	return 0;
}
The code above allocates 16 bytes five times without ever freeing them, so it leaks; it also allocates 32 bytes five times and frees each one, so that path does not leak. How do we locate the leak with bpftrace?
We use bpftrace to dynamically account for mem_check's allocations and frees and pinpoint the leak. Two key interfaces need to be probed, malloc and free, both of which are implemented in libc.
2. Compile mem_check.c to produce the executable:
gcc -o mem_check mem_check.c
3. Probe the mem_check executable
Background: bpftrace can probe both kernel space and user space; the two kinds of probes are:
Kernel-space probes: kprobe/kretprobe
User-space probes: uprobe/uretprobe
mem_check is an application running in user space, so the user-space probes uprobe/uretprobe are what we need.
We first verify with a one-liner that uprobe can catch the malloc calls made by mem_check; the probe format is uprobe:binary_path:function_name.
Turning the one-liner above into a bpftrace script, bpf_test.bt:
bpf_test.bt:
BEGIN {
printf("start probe\n");
}
uprobe:/lib/x86_64-linux-gnu/libc.so.6:malloc /comm == "mem_check"/{
printf("malloc size: %d\n", arg0);
}
END {
printf("end probe\n");
}
This probes the size requested by each malloc call in mem_check.
The figure below shows all of mem_check's allocations; the final malloc size 1024 is the stdio output buffer that mem_check creates automatically.
Probing the return value of malloc in mem_check:
The return value of malloc is an address, which must be captured with uretprobe; a function's return value is available through the built-in retval. The uretprobe filter is the same as for probing malloc's argument. The script becomes:
BEGIN {
printf("start probe\n");
}
uprobe:/lib/x86_64-linux-gnu/libc.so.6:malloc /comm == "mem_check"/{
printf("malloc size: %d\n", arg0);
}
uretprobe:/lib/x86_64-linux-gnu/libc.so.6:malloc /comm == "mem_check"/{
printf("addr = %p\n", retval);
}
END {
printf("end probe\n");
}
The output is:
Probing free in mem_check
We have already captured the size and address of each malloc in mem_check; by also probing free and matching frees against mallocs we can find the leak. The script becomes:
BEGIN {
printf("start probe\n");
}
uprobe:/lib/x86_64-linux-gnu/libc.so.6:malloc /comm == "mem_check"/{
printf("malloc size: %d\n", arg0);
}
uretprobe:/lib/x86_64-linux-gnu/libc.so.6:malloc /comm == "mem_check"/{
printf("addr = %p\n", retval);
}
uprobe:/lib/x86_64-linux-gnu/libc.so.6:free /comm == "mem_check"/{
printf("free addr = %p\n", arg0);
}
END {
printf("end probe\n");
}
The output:
Detecting the memory leak:
We now have mem_check's malloc and free activity. The malloc addresses that never show up in a free are the addresses of the leaked memory.
bpftrace uses eBPF maps as its underlying storage; the steps are:
1. Define a map variable @mem to record the addresses returned by malloc.
2. When a free call is observed, delete the corresponding address from @mem.
3. Whatever remains in @mem at the end is the set of leaked addresses.
The leak-detection script is as follows:
BEGIN {
printf("start probe\n");
}
uprobe:/lib/x86_64-linux-gnu/libc.so.6:malloc /comm == "mem_check"/{
printf("malloc size: %d\n", arg0);
@size = arg0;
}
uretprobe:/lib/x86_64-linux-gnu/libc.so.6:malloc /comm == "mem_check"/{
printf("addr = %p\n", retval);
@mem[retval] = @size;
}
uprobe:/lib/x86_64-linux-gnu/libc.so.6:free /comm == "mem_check"/{
printf("free addr = %p\n", arg0);
delete(@mem[arg0]);
}
END {
printf("end probe\n");
}
The output:
As shown above, the entries listed after @mem are the addresses that were never freed, together with their sizes.
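For comparison, the same record-on-malloc / delete-on-free idea that the script implements with a BPF map can be mimicked inside a program in plain C; this is only a toy sketch with a fixed-size table, not a replacement for the bpftrace approach:
#include <stdio.h>
#include <stdlib.h>

#define SLOTS 128

static void  *addr_tab[SLOTS];	/* recorded allocation addresses */
static size_t size_tab[SLOTS];	/* and their sizes */

static void *leak_malloc(size_t size)
{
	void *p = malloc(size);
	for (int i = 0; p && i < SLOTS; i++)
		if (!addr_tab[i]) { addr_tab[i] = p; size_tab[i] = size; break; }
	return p;
}

static void leak_free(void *p)
{
	for (int i = 0; i < SLOTS; i++)
		if (addr_tab[i] == p) { addr_tab[i] = NULL; break; }
	free(p);
}

static void leak_report(void)
{
	for (int i = 0; i < SLOTS; i++)
		if (addr_tab[i])
			printf("leaked %zu bytes at %p\n", size_tab[i], addr_tab[i]);
}

int main(void)
{
	void *a = leak_malloc(16);	/* never freed -> reported */
	void *b = leak_malloc(32);
	leak_free(b);			/* freed -> not reported */
	(void)a;
	leak_report();
	return 0;
}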
Summary:
By writing a few simple bpftrace scripts we can watch an application's allocation and free events and catch the signs of a memory leak. This kind of direct, real-time monitoring gives developers feedback the moment a problem appears.