本文分析基于linux内核 4.19.195。
Linux内核的page owner特性(参考内核文档Documentation/vm/page_owner.rst),主要用来跟踪每个内存页是由谁分配的(Tracking about who allocated each page),方便定位内存泄漏、内存占用问题。本文从源码角度简单分析page owner的实现原理。
page owner特性的总体设计思路非常简单:为每个page配备一个扩展结构体(struct page_ext,而不是直接修改struct page本身),用于存储该page被分配时的调用栈及标志位;然后hook内存页的分配和释放接口,在内存页被分配时,保存调用栈信息,设置标志位;在内存页被释放的时候,清除调用栈信息,清除标志位。最后,通过一个debugfs接口,将读取该接口时所有已被分配出去的内存页的调用栈信息传递给用户态,并提供了一个用户态工具,用于统计这些调用栈的信息。
本文着重分析内核态在页被分配出去时,page owner特性的行为。
我们知道,内存页被分配出去前,会走进post_alloc_hook函数,进行一些处理,post_alloc_hook函数会调用set_page_owner函数,完成内存页分配调用栈的保存。
/*
 * set_page_owner - inline entry point for recording who allocated a page.
 *
 * Forwards @page, @order and @gfp_mask to __set_page_owner() only while the
 * page_owner_inited static key is enabled; otherwise returns immediately.
 */
static inline void set_page_owner(struct page *page,
			unsigned int order, gfp_t gfp_mask)
{
	/* Feature not initialised: nothing to record. */
	if (!static_branch_unlikely(&page_owner_inited))
		return;

	__set_page_owner(page, order, gfp_mask);
}
/*
 * __set_page_owner - record the allocation call stack for @page.
 *
 * Looks up the page_ext belonging to @page, captures the current call stack
 * through save_stack() and passes the resulting handle, together with @order
 * and @gfp_mask, to __set_page_owner_handle(). Returns without recording
 * anything when no page_ext is found.
 */
noinline void __set_page_owner(struct page *page, unsigned int order,
gfp_t gfp_mask)
{
struct page_ext *page_ext = lookup_page_ext(page);
depot_stack_handle_t handle;
/* No extension data for this page, so there is nowhere to store the owner. */
if (unlikely(!page_ext))
return;
/* gfp_mask is forwarded so save_stack() can hand it on to the stack depot. */
handle = save_stack(gfp_mask);
__set_page_owner_handle(page_ext, handle, order, gfp_mask);
}
第一步,调用函数lookup_page_ext获取该page对应的struct page_ext结构体。内核为每个page结构体都配备了一个struct page_ext结构体,相当于是page结构体的扩展,我猜测是因为实在不希望扩大page结构体的体积,才做了这样的折中,不然完全可以直接写进page结构体里面的。
第二步,调用save_stack函数,将该页被分配时的调用栈保存下来,并返回一个类似句柄的handle,在第三步中使用。这个函数下面重点分析。
第三步,将第二步获取到的handle,写入到对应的struct page_ext结构体,并设置标志位,表明该页已被分配出去。
第一步和第三步都比较简单,下面分析第二步。
/*
 * save_stack - capture the current call stack and store it in the stack depot.
 * @flags: allocation flags passed through to depot_save_stack().
 *
 * Returns the depot handle for the captured trace, dummy_handle when
 * check_recursive_alloc() reports a recursive allocation, or failure_handle
 * when depot_save_stack() returns 0.
 */
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
/* On-stack buffer; the trace holds at most PAGE_OWNER_STACK_DEPTH entries. */
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
.nr_entries = 0,
.entries = entries,
.max_entries = PAGE_OWNER_STACK_DEPTH,
/*
 * NOTE(review): presumably skips the frames of save_stack() and
 * __set_page_owner() so the trace starts at the real allocator path;
 * confirm against save_stack_trace() semantics.
 */
.skip = 2
};
depot_stack_handle_t handle;
save_stack_trace(&trace);
/*
 * Drop a trailing ULONG_MAX entry so it is not stored as part of the trace
 * (presumably an end-of-trace marker written by save_stack_trace()).
 */
if (trace.nr_entries != 0 &&
trace.entries[trace.nr_entries-1] == ULONG_MAX)
trace.nr_entries--;
/*
* We need to check recursion here because our request to stackdepot
* could trigger memory allocation to save new entry. New memory
* allocation would reach here and call depot_save_stack() again
* if we don't catch it. There is still not enough memory in stackdepot
* so it would try to allocate memory again and loop forever.
*/
if (check_recursive_alloc(&trace, _RET_IP_))
return dummy_handle;
handle = depot_save_stack(&trace, flags);
/* A zero handle means the depot could not store the trace. */
if (!handle)
handle = failure_handle;
return handle;
}
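首先,调用函数save_stack_trace获取调用栈信息;
接着,调用check_recursive_alloc检查是否发生了递归:往stackdepot保存调用栈本身也可能触发内存分配,从而再次走到这里,如果不拦截就会无限循环,此时直接返回dummy_handle;
然后,使用函数depot_save_stack将调用栈存储到内存中,如果存储失败,则返回failure_handle。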
depot_save_stack是个神马东东呢?
在lib/stackdepot.c文件里,我们找到了该函数的实现。从该文件最开头的注释中,可以了解到,该文件实现了一个存放调用栈的库,这个库只会往内存中存入调用栈信息,不会删除这些信息。并且,它使用一个哈希表来索引已保存的调用栈记录,以便快速查找相同的调用栈。另外,用于存放调用栈信息的内存,相当于就是一个大数组,每次只需要往这块内存的末尾追加数据,并记录该调用栈信息的起始位置即可。下面来看一下函数depot_save_stack的具体实现方法。
static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; // table of storage slabs for stack records; each element points to one chunk of memory holding stack data, so memory is obtained chunk by chunk instead of all at once
static int depot_index; // index into stack_slabs of the slab currently being filled
static int next_slab_inited; // indicates whether the slab at the next index already has memory allocated
static size_t depot_offset; // byte offset of the next free position inside stack_slabs[depot_index]
/**
* depot_save_stack - save stack in a stack depot.
* @trace - the stacktrace to save.
* @alloc_flags - flags for allocating additional memory if required.
*
* A record that find_stack() matches for @trace is reused instead of storing
* the trace a second time.
*
* Returns the handle of the stack struct stored in depot, or 0 if @trace is
* empty or no record could be found or created.
*/
depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
gfp_t alloc_flags)
{
u32 hash;
depot_stack_handle_t retval = 0;
struct stack_record *found = NULL, **bucket;
unsigned long flags;
struct page *page = NULL;
void *prealloc = NULL;
/* An empty trace is never stored; the caller gets handle 0. */
if (unlikely(trace->nr_entries == 0))
goto fast_exit;
/* Select the hash bucket (chain of records) for this trace. */
hash = hash_stack(trace->entries, trace->nr_entries);
bucket = &stack_table[hash & STACK_HASH_MASK];
/*
* Fast path: look the stack trace up without locking.
* The smp_load_acquire() here pairs with smp_store_release() to
* |bucket| below.
*/
found = find_stack(smp_load_acquire(bucket), trace->entries,
trace->nr_entries, hash);
if (found)
goto exit;
/*
* Check if the current or the next stack slab need to be initialized.
* If so, allocate the memory - we won't be able to do that under the
* lock.
*
* The smp_load_acquire() here pairs with smp_store_release() to
* |next_slab_inited| in depot_alloc_stack() and init_stack_slab().
*/
if (unlikely(!smp_load_acquire(&next_slab_inited))) {
/*
* Zero out zone modifiers, as we don't have specific zone
* requirements. Keep the flags related to allocation in atomic
* contexts and I/O.
*/
alloc_flags &= ~GFP_ZONEMASK;
alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
alloc_flags |= __GFP_NOWARN;
page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
/* Allocation failure is tolerated here: prealloc simply stays NULL. */
if (page)
prealloc = page_address(page);
}
/*
 * Slow path: repeat the lookup under depot_lock, since the lockless lookup
 * above may have missed a record that was inserted concurrently.
 */
raw_spin_lock_irqsave(&depot_lock, flags);
found = find_stack(*bucket, trace->entries, trace->nr_entries, hash);
if (!found) {
struct stack_record *new =
depot_alloc_stack(trace->entries, trace->nr_entries,
hash, &prealloc, alloc_flags);
if (new) {
/* Link the new record in front of the bucket's existing chain. */
new->next = *bucket;
/*
* This smp_store_release() pairs with
* smp_load_acquire() from |bucket| above.
*/
smp_store_release(bucket, new);
found = new;
}
} else if (prealloc) {
/*
* We didn't need to store this stack trace, but let's keep
* the preallocated memory for the future.
*/
WARN_ON(!init_stack_slab(&prealloc));
}
raw_spin_unlock_irqrestore(&depot_lock, flags);
exit:
/*
 * NOTE(review): prealloc is passed by address to depot_alloc_stack() and
 * init_stack_slab(); presumably they set it to NULL when they take the
 * memory, so a non-NULL value here means it is still owned by us.
 */
if (prealloc) {
/* Nobody used this memory, ok to free it. */
free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER);
}
if (found)
retval = found->handle.handle;
fast_exit:
return retval;
}
EXPORT_SYMBOL_GPL(depot_save_stack);
首先使用函数hash_stack计算调用栈的哈希值,确定该调用栈对应哪个哈希桶;相同的调用栈只保存一份,以减少重复的内存消耗。
然后,find_stack函数往对应哈希桶里面查找是否有一样的调用栈,有的话,就可以直接返回了,毕竟调用栈已经存放在内存中了。
接下来通过变量next_slab_inited,来判断是否需要预先申请内存;之所以在加锁之前申请,是因为持有depot_lock自旋锁期间不能再去分配内存。
加锁之后再次调用find_stack,防止在第一次无锁查找之后、加锁之前,已有其他执行流存入了相同的调用栈。
如果之前没有存放此次调用栈,则使用depot_alloc_stack函数存放调用栈信息。
存放完成后,如果预先申请的内存没有被用到,则将其释放,然后函数返回该调用栈对应的句柄。
下面继续看看depot_alloc_stack函数的实现
/*
 * Allocation of a new stack in raw storage.
 *
 * @entries:     stack trace entries to copy into the new record.
 * @size:        number of entries in @entries.
 * @hash:        hash of the trace, stored in the record.
 * @prealloc:    address of a pointer to preallocated memory, handed to
 *               init_stack_slab().
 * @alloc_flags: not used in this function body.
 *
 * Called from depot_save_stack() with depot_lock held. Returns the new
 * record, or NULL if all slabs are exhausted or the current slab has no
 * backing memory.
 */
static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
u32 hash, void **prealloc, gfp_t alloc_flags)
{
/* Record header up to the entries array, plus @size trace entries. */
int required_size = offsetof(struct stack_record, entries) +
sizeof(unsigned long) * size;
struct stack_record *stack;
/* Round up so every record starts on a (1 << STACK_ALLOC_ALIGN) boundary. */
required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN);
/* The record does not fit in the current slab: move on to the next one. */
if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) {
if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) {
WARN_ONCE(1, "Stack depot reached limit capacity");
return NULL;
}
depot_index++;
depot_offset = 0;
/*
* smp_store_release() here pairs with smp_load_acquire() from
* |next_slab_inited| in depot_save_stack() and
* init_stack_slab().
*/
/* Only request another slab when a slot for it still exists. */
if (depot_index + 1 < STACK_ALLOC_MAX_SLABS)
smp_store_release(&next_slab_inited, 0);
}
/*
 * NOTE(review): init_stack_slab() is not shown here; presumably it installs
 * *prealloc as the current or the next slab when one is missing.
 */
init_stack_slab(prealloc);
/* Still no memory behind the current slab: the trace cannot be stored. */
if (stack_slabs[depot_index] == NULL)
return NULL;
stack = stack_slabs[depot_index] + depot_offset;
stack->hash = hash;
stack->size = size;
stack->handle.slabindex = depot_index;
/* The offset is stored in units of (1 << STACK_ALLOC_ALIGN) bytes. */
stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN;
stack->handle.valid = 1;
memcpy(stack->entries, entries, size * sizeof(unsigned long));
/* Advance the write position past the record just written. */
depot_offset += required_size;
return stack;
}
嗯,基本就是memcpy了,把调用栈信息拷贝到"大数组"中。
总体框架基本就是这样,还是多看原作者的注释,能够获取到不少信息。