http://blog.chinaunix.net/uid-20729583-id-1884604.html
/*
 * The alloc_pages(gfp_mask, order) macro below requests 2^order contiguous page frames */
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
#define numa_node_id() (cpu_to_node(raw_smp_processor_id()))
/* Returns the number of the node containing CPU 'cpu' */
static inline int cpu_to_node(int cpu)
{
return cpu_2_node[cpu];
}
int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; //every CPU has a corresponding node; __read_mostly is a gcc attribute
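/*
 * A quick aside on that initializer syntax: "[0 ... NR_CPUS-1] = 0" is a GNU C
 * range designator. The snippet below is a standalone, hypothetical userspace
 * demo (the array name node_of and the size 8 are made up for illustration);
 * it compiles with gcc and shows every element in the range getting the value.
 */
#include <stdio.h>

int node_of[8] = { [0 ... 7] = 0 }; /* all eight entries initialized to 0 */

int main(void)
{
        node_of[3] = 1; /* pretend CPU 3 sits on node 1 */
        for (int i = 0; i < 8; i++)
                printf("cpu %d -> node %d\n", i, node_of[i]);
        return 0;
}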
//The page allocation function. It is fairly involved and touches many other subsystems, especially process management.
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
if (unlikely(order >= MAX_ORDER)) /* If the requested order is too large the allocation fails outright. MAX_ORDER is the limit on the allocation order: for the buddy allocator the largest block is 2^10 = 1024 pages, a fact used throughout the buddy code, so MAX_ORDER is 11. Any order above 10 exceeds the maximum and is rejected immediately. */
return NULL; /* From this check we see the largest possible allocation is 2^10 = 1024 pages, i.e. at most 4MB with 4KB pages. */
/* Unknown node is current node */
if (nid < 0)
nid = numa_node_id(); /* expands to: #define numa_node_id() (cpu_to_node(raw_smp_processor_id()))
                         the result here is 0, assuming a single-CPU system */
/*
 * Returns the number of the node containing CPU 'cpu':
 *
 *   static inline int cpu_to_node(int cpu)
 *   {
 *           return cpu_2_node[cpu];
 *   }
 *
 * with: int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
 * The "[0 ... NR_CPUS-1] = 0" form is a range initializer, a C extension.
 * __read_mostly variables are grouped together at link time, which is
 * considered an efficiency win: it improves access times on multi-CPU systems.
 */
return __alloc_pages(gfp_mask, order,
NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask)); /* This calls into the core of the buddy allocator. node_zonelists is an array of struct zonelist, and gfp_zone() returns the zone index encoded in gfp_mask: ZONE_DMA is 0, ZONE_NORMAL is 1 and ZONE_HIGHMEM is 2. The third argument therefore selects the management zone __alloc_pages allocates from: a return value of 0 allocates from ZONE_DMA, 1 from ZONE_NORMAL and 2 from ZONE_HIGHMEM. */
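/*
 * A simplified sketch of how gfp_zone() derives that index; treat it as an
 * approximation, since the real function in this kernel generation also
 * handles __GFP_DMA32 and __GFP_MOVABLE on some configurations:
 *
 *   static inline enum zone_type gfp_zone(gfp_t flags)
 *   {
 *           if (flags & __GFP_DMA)
 *                   return ZONE_DMA;
 *           if (flags & __GFP_HIGHMEM)
 *                   return ZONE_HIGHMEM;
 *           return ZONE_NORMAL;
 *   }
 */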
//Below is the definition behind NODE_DATA:
/*
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -- MAX_NUMNODES is 1 here, i.e. only a single node is defined
*/
}
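/*
 * To make the entry point concrete, here is a minimal usage sketch. It is a
 * hypothetical kernel-context snippet (demo_alloc is not a real kernel
 * function): allocate 2^2 = 4 contiguous page frames, zero them, free them.
 */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>

static int demo_alloc(void)
{
        struct page *page = alloc_pages(GFP_KERNEL, 2); /* order 2 => 4 pages */

        if (!page)
                return -ENOMEM; /* order >= MAX_ORDER, or memory exhausted */

        memset(page_address(page), 0, 4 * PAGE_SIZE); /* page_address() is valid for lowmem */

        __free_pages(page, 2); /* the order must match the allocation */
        return 0;
}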
/*
 * This is the 'heart' of the zoned buddy allocator.
 * This function is the core operation of the buddy algorithm.
 */
struct page * fastcall __alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
const gfp_t wait = gfp_mask & __GFP_WAIT; /* records whether the kernel is allowed to block the current process while it waits for free page frames */
struct zone **z; //zonelist->zones is an array of struct zone pointers, so the iterator over it is a double pointer
struct page *page; //pointer to the page descriptor
struct reclaim_state reclaim_state; //bookkeeping for the page-reclaim operation
/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim; it is used by the page-reclaim path
 */
struct task_struct *p = current; //p points at the current process
int do_retry; //whether to retry the allocation
int alloc_flags; //allocation flags
int did_some_progress; //whether reclaim made any progress
might_sleep_if(wait); //annotation for functions that may sleep: warns if we could sleep while in atomic context
if (should_fail_alloc_page(gfp_mask, order)) /* fault-injection check of whether this allocation should be forced to fail; if so, return NULL immediately, otherwise carry on with the allocation */
return NULL;
restart:
z = zonelist->zones; /* the list of zones suitable for gfp_mask */ //start by pointing z at the first zone
if (unlikely(*z == NULL)) { /* unlikely() is a branch-prediction hint, sketched below; if the zonelist turns out to be empty we return NULL, otherwise we continue */
/* Should this ever happen?? */
return NULL;
}
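/*
 * About unlikely(): the kernel defines it (in <linux/compiler.h>) roughly as:
 *
 *   #define likely(x)   __builtin_expect(!!(x), 1)
 *   #define unlikely(x) __builtin_expect(!!(x), 0)
 *
 * The !!(x) normalizes any truth value to 0 or 1, and __builtin_expect lets
 * gcc lay out the code so that the expected path falls straight through
 * without a taken branch.
 */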
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); //first attempt: take 2^order pages from the free lists, using the low watermark
//The prototype of get_page_from_freelist:
// get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, int alloc_flags)
if (page)
goto got_pg; //if we got the pages we are done; otherwise keep going
/*
 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
 * __GFP_NOWARN set) should not cause reclaim since the subsystem
 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
 * using a larger set of nodes after it has established that the
 * allowed per node queues are empty and that nodes are
 * over allocated.
 *
 * #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
 */
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) //on a NUMA build, a GFP_THISNODE allocation must not fall back to reclaim, so fail right away
goto nopage;
for (z = zonelist->zones; *z; z++)
wakeup_kswapd(*z, order); //wake kswapd on each zone to reclaim pages in the background (the function is quoted below)
/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 *
 * void wakeup_kswapd(struct zone *zone, int order)
 * {
 *         pg_data_t *pgdat;
 *         if (!populated_zone(zone))    // returns !!(zone->present_pages); present_pages is the total
 *                 return;               // zone size in pages, so a zone with no pages is skipped
 *         pgdat = zone->zone_pgdat;
 *         if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
 *                 return;
 *         if (pgdat->kswapd_max_order < order)
 *                 pgdat->kswapd_max_order = order;
 *         if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 *                 return;
 *         if (!waitqueue_active(&pgdat->kswapd_wait))
 *                 return;
 *         wake_up_interruptible(&pgdat->kswapd_wait);
 * }
 */
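/*
 * For reference, the three watermarks consulted above are per-zone fields
 * (a heavily trimmed sketch of struct zone from this kernel generation):
 *
 *   struct zone {
 *           unsigned long pages_min, pages_low, pages_high;
 *           ...
 *   };
 *
 * kswapd is woken once free pages drop below pages_low and reclaims until
 * pages_high is reached again; pages_min marks the reserve that only
 * ALLOC_HARDER/ALLOC_HIGH callers may dig into.
 */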
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags = ALLOC_WMARK_MIN; //start from the min watermark; the ALLOC_* flags are listed below
/*
 * #define ALLOC_NO_WATERMARKS 0x01 // don't check watermarks at all
 * #define ALLOC_WMARK_MIN     0x02 // use pages_min watermark
 * #define ALLOC_WMARK_LOW     0x04 // use pages_low watermark
 * #define ALLOC_WMARK_HIGH    0x08 // use pages_high watermark
 * #define ALLOC_HARDER        0x10 // try to alloc harder
 * #define ALLOC_HIGH          0x20 // __GFP_HIGH set
 * #define ALLOC_CPUSET        0x40 // check for correct cpuset
 */
if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
alloc_flags |= ALLOC_HARDER;
if (gfp_mask & __GFP_HIGH)
alloc_flags |= ALLOC_HIGH;
if (wait)
alloc_flags |= ALLOC_CPUSET;
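/*
 * A worked example of how these branches combine: a GFP_ATOMIC caller (in
 * this kernel generation GFP_ATOMIC is defined as __GFP_HIGH, with no
 * __GFP_WAIT) has wait == 0, so the first test sets ALLOC_HARDER; the
 * __GFP_HIGH bit sets ALLOC_HIGH; and since wait is clear, ALLOC_CPUSET is
 * skipped. The result is alloc_flags = ALLOC_WMARK_MIN | ALLOC_HARDER |
 * ALLOC_HIGH, matching the "set both ALLOC_HARDER (!wait) and ALLOC_HIGH
 * (__GFP_HIGH)" remark in the comment above.
 */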
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
* coming from realtime tasks go deeper into reserves.
*
* This is the last chance, in general, before the goto nopage.
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); //retry the allocation now that background reclaim has been kicked
if (page)
goto got_pg; //on success, return the page
/* This allocation should allow future memory freeing. */
rebalance:
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) //#define PF_MEMALLOC 0x00000800 /* Allocating memory */; TIF_MEMDIE = 16
/*
 * #define test_thread_flag(flag) \
 *         test_ti_thread_flag(current_thread_info(), flag)
 *
 * static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 * {
 *         return test_bit(flag, &ti->flags);
 * }
 */
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
page = get_page_from_freelist(gfp_mask, order,
zonelist, ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;
if (gfp_mask & __GFP_NOFAIL) {
congestion_wait(WRITE, HZ/50);
goto nofail_alloc;
}
}
goto nopage;
}
/* Atomic allocations - we can't balance anything */
if (!wait) //an atomic allocation cannot sleep, so we cannot reclaim anything: jump to nopage
goto nopage;
cond_resched();
/* We now go into synchronous (direct) reclaim */
cpuset_memory_pressure_bump();
p->flags |= PF_MEMALLOC; //flag this task as performing reclaim so its own allocations may dip into reserves instead of recursing into reclaim
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
cond_resched();
if (likely(did_some_progress)) {
page = get_page_from_freelist(gfp_mask, order,
zonelist, alloc_flags);
if (page)
goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { //if __GFP_FS were clear the kernel could not perform filesystem-dependent operations, and __GFP_NORETRY would limit us to a single attempt; this branch therefore handles callers that allow both FS activity and repeated attempts
/*
* Go through the zonelist yet one more time, keep
* very high watermark here, this is only to catch
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
if (page)
goto got_pg;
/* The OOM killer will not help higher order allocs so fail */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto nopage;
/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service. That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 *
 * #define PAGE_ALLOC_COSTLY_ORDER 3
 */
out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
/*
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
*
* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
* <= 3, but that may not be true in other implementations.
*/
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
(gfp_mask & __GFP_REPEAT))
do_retry = 1;
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
}
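/*
 * To summarize the decision just computed, assuming __GFP_NORETRY is clear:
 * an order <= PAGE_ALLOC_COSTLY_ORDER always retries (the implicit
 * __GFP_NOFAIL behaviour described above); a larger order retries only if
 * __GFP_REPEAT is set; and __GFP_NOFAIL forces a retry regardless of order.
 * Everything else falls through to the nopage path.
 */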
if (do_retry) {
congestion_wait(WRITE, HZ/50);
goto rebalance;
}
nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n",
p->comm, order, gfp_mask);
dump_stack();
/*
 * The architecture-independent dump_stack generator:
 *
 * void dump_stack(void)
 * {
 *         unsigned long stack;
 *         show_trace(current, NULL, &stack);
 * }
 *
 * void show_trace(struct task_struct *task, struct pt_regs *regs,
 *                 unsigned long *stack)
 * {
 *         show_trace_log_lvl(task, regs, stack, "");
 * }
 */
show_mem(); //no page could be allocated: dump the current memory layout, listing the relevant per-zone information
}
got_pg:
return page;
}
EXPORT_SYMBOL(__alloc_pages);
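/*
 * For completeness, the other common allocation entry point funnels into the
 * same core. A sketch from the same kernel generation (simplified; the real
 * function also rejects __GFP_HIGHMEM, since a highmem page may have no
 * kernel virtual address to return):
 *
 * unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 * {
 *         struct page *page = alloc_pages(gfp_mask, order);
 *         if (!page)
 *                 return 0;
 *         return (unsigned long) page_address(page);
 * }
 */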