===================================
This article is original content; reposting is welcome!
When reposting, please credit the source: http://blog.csdn.net/gdt_A20
===================================
I. Overview
The most important operations are allocation and freeing; let's look at the relevant functions. (A short usage sketch follows the two lists below.)
Allocation functions:
1. alloc_pages(gfp_t gfp_mask, unsigned int order);
Requests 2^order contiguous pages and returns the page descriptor (struct page *) of the first page.
2. alloc_page(gfp_mask);
Requests a single page and returns its descriptor.
3. __get_free_page(gfp_mask);
Requests a single page, but returns its linear address.
4. __get_free_pages(gfp_mask, order);
Requests 2^order contiguous pages, but returns the linear address of the first page.
5. get_zeroed_page(gfp_mask);
Requests one page, fills it with zeros, and returns its linear address.
6. __get_dma_pages(gfp_mask, order);
Requests 2^order contiguous pages from the DMA zone; since it is a wrapper around __get_free_pages, it returns the linear address of the first page, not a descriptor.
Free functions:
1. free_pages(addr, order);
Note that the pages are only actually freed once their reference count drops to 0.
2. free_page(addr);
3. __free_page(page);
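To see how the pairs fit together, here is a minimal kernel-style sketch; the function name, the error handling, and the choice of GFP_KERNEL are illustrative assumptions, not from the original post:

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int demo_alloc(void)
{
        struct page *page;
        unsigned long addr;

        /* Descriptor-based pair: alloc_pages() / __free_pages() */
        page = alloc_pages(GFP_KERNEL, 2);      /* 2^2 = 4 contiguous pages */
        if (!page)
                return -ENOMEM;
        __free_pages(page, 2);

        /* Address-based pair: __get_free_pages() / free_pages() */
        addr = __get_free_pages(GFP_KERNEL, 0); /* one page, linear address */
        if (!addr)
                return -ENOMEM;
        free_pages(addr, 0);

        /* A single zeroed page */
        addr = get_zeroed_page(GFP_KERNEL);
        if (!addr)
                return -ENOMEM;
        free_page(addr);

        return 0;
}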
II. A closer look at these functions
include/linux/gfp.h
1. alloc_page(gfp_mask);
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
2. __get_free_page(gfp_mask);
#define __get_free_page(gfp_mask) \
        __get_free_pages((gfp_mask), 0)
3. __get_free_pages(gfp_mask, order);
mm/page_alloc.c
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
        struct page *page;

        /*
         * __get_free_pages() returns a 32-bit address, which cannot represent
         * a highmem page
         */
        VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return (unsigned long) page_address(page);
}
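The VM_BUG_ON above is why highmem pages cannot be requested this way: they have no permanent kernel mapping, so there is no linear address to return. Such pages go through alloc_pages() plus kmap() instead. A hedged sketch (the function name and the use of GFP_HIGHUSER here are illustrative):

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/string.h>

static void demo_highmem(void)
{
        struct page *page = alloc_pages(GFP_HIGHUSER, 0); /* may come from ZONE_HIGHMEM */
        void *vaddr;

        if (!page)
                return;
        vaddr = kmap(page);             /* set up a temporary kernel mapping */
        memset(vaddr, 0, PAGE_SIZE);    /* now the page can be touched */
        kunmap(page);                   /* tear the mapping down again */
        __free_page(page);
}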
4. get_zeroed_page(gfp_mask);
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
        return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
5. __get_dma_pages(gfp_mask, order);
#define __get_dma_pages(gfp_mask, order) \
        __get_free_pages((gfp_mask) | GFP_DMA, (order))
As you can see, they are all variations on a theme: every one of them ends up calling alloc_pages(gfp_t gfp_mask, unsigned int order).
Let's take a closer look at that function:
include/linux/gfp.h
#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)
This allocates pages on the current NUMA node:
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
{
        /* Unknown node is current node */
        if (nid < 0)
                nid = numa_node_id();

        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
                struct zonelist *zonelist)
{
        return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                        struct zonelist *zonelist, nodemask_t *nodemask)
{
        enum zone_type high_zoneidx = gfp_zone(gfp_mask); /* pick the zone from the mask */
        struct zone *preferred_zone;
        struct page *page;
        int migratetype = allocflags_to_migratetype(gfp_mask); /* which type of free list to use */

        gfp_mask &= gfp_allowed_mask;

        lockdep_trace_alloc(gfp_mask);

        might_sleep_if(gfp_mask & __GFP_WAIT); /* may we sleep? */

        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;

        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
         * of GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone)) /* no zones at all, bail out */
                return NULL;

        get_mems_allowed();
        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx, /* find the zone matching the arguments */
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
        if (!preferred_zone) { /* none found, give up */
                put_mems_allowed();
                return NULL;
        }

        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, /* the fast path, counterpart of the slowpath below; */
                        zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, /* allocation is attempted against the low watermark */
                        preferred_zone, migratetype);
        if (unlikely(!page))
                page = __alloc_pages_slowpath(gfp_mask, order, /* the fast path failed; the slowpath may lower the watermark */
                                zonelist, high_zoneidx, nodemask, /* and may kick off page reclaim */
                                preferred_zone, migratetype);

        put_mems_allowed();

        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
        return page;
}
Depending on the gfp_mask flags, the kernel takes different paths through this function: whether or not it may sleep, whether high or low memory is acceptable, and so on.
That's as far as we'll follow the allocation side; the overall flow is as described above.
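For reference, the familiar gfp_mask values are themselves compositions of the low-level flags used above. These definitions are from include/linux/gfp.h of the same kernel era (quoted from memory, so check them against your tree):

#define GFP_ATOMIC      (__GFP_HIGH)
#define GFP_NOIO        (__GFP_WAIT)
#define GFP_NOFS        (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL      (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_USER        (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER    (GFP_USER | __GFP_HIGHMEM)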
Now let's look at the free functions.
1. free_page(addr);
#define free_page(addr) free_pages((addr), 0)
2. __free_page(page);
#define __free_page(page) __free_pages((page), 0)
3. free_pages(addr, order);
free_page also ends up here:
void free_pages(unsigned long addr, unsigned int order)
{
        if (addr != 0) {
                VM_BUG_ON(!virt_addr_valid((void *)addr));
                __free_pages(virt_to_page((void *)addr), order);
        }
}
All three free functions thus end up in the same place, __free_pages:
void __free_pages(struct page *page, unsigned int order)
{
        if (put_page_testzero(page)) {
                if (order == 0) /* a single order-0 page goes to the hot/cold per-CPU cache */
                        free_hot_cold_page(page, 0);
                else
                        __free_pages_ok(page, order); /* otherwise free straight back to the buddy system */
        }
}
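put_page_testzero() decrements the page's reference count and tests whether it reached zero, which is what the earlier note about count meant: the page only really goes away on the last put. A hedged sketch (the function name and the extra get_page() are illustrative assumptions):

#include <linux/mm.h>
#include <linux/gfp.h>

static void demo_refcount(void)
{
        struct page *page = alloc_pages(GFP_KERNEL, 0); /* refcount == 1 */

        if (!page)
                return;
        get_page(page);         /* take an extra reference, refcount == 2 */
        __free_pages(page, 0);  /* drops to 1: the page survives */
        __free_pages(page, 0);  /* drops to 0: now it is really freed */
}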
The hot/cold per-CPU page path:
/*
 * Free a 0-order page
 * cold == 1 ? free a cold page : free a hot page
 */
void free_hot_cold_page(struct page *page, int cold)
{
        struct zone *zone = page_zone(page); /* the zone this page belongs to */
        struct per_cpu_pages *pcp;
        unsigned long flags;
        int migratetype;
        int wasMlocked = __TestClearPageMlocked(page);

        if (!free_pages_prepare(page, 0))
                return;

        migratetype = get_pageblock_migratetype(page); /* which migratetype list the page belongs to */
        set_page_private(page, migratetype);
        local_irq_save(flags);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
        __count_vm_event(PGFREE);

        /*
         * We only track unmovable, reclaimable and movable on pcp lists.
         * Free ISOLATE pages back to the allocator because they are being
         * offlined but treat RESERVE as movable pages so we can get those
         * areas back if necessary. Otherwise, we may have to free
         * excessively into the page allocator
         */
        if (migratetype >= MIGRATE_PCPTYPES) { /* types not tracked on the per-CPU lists */
                if (unlikely(migratetype == MIGRATE_ISOLATE)) {
                        free_one_page(zone, page, 0, migratetype); /* free to the matching buddy free list */
                        goto out;
                }
                migratetype = MIGRATE_MOVABLE;
        }

        pcp = &this_cpu_ptr(zone->pageset)->pcp; /* this CPU's hot/cold page structure */
        if (cold)
                list_add_tail(&page->lru, &pcp->lists[migratetype]); /* cold pages go to the tail */
        else
                list_add(&page->lru, &pcp->lists[migratetype]); /* hot pages go to the head */
        pcp->count++;
        if (pcp->count >= pcp->high) { /* too many cached pages; return a batch to the buddy system */
                free_pcppages_bulk(zone, pcp->batch, pcp);
                pcp->count -= pcp->batch;
        }

out:
        local_irq_restore(flags);
}
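The high/batch policy at the end is easy to see with a toy user-space model; the values 186 and 31 below are made-up examples (the real ones are computed per zone at boot):

#include <stdio.h>

int main(void)
{
        int count = 0, high = 186, batch = 31; /* illustrative values */
        int i;

        for (i = 0; i < 400; i++) {
                count++;                /* one page freed onto the pcp list */
                if (count >= high) {
                        count -= batch; /* free_pcppages_bulk() hands a batch back to buddy */
                        printf("flush: returned %d pages, pcp count now %d\n",
                               batch, count);
                }
        }
        return 0;
}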
The path that frees pages back to the buddy system:
static void __free_pages_ok(struct page *page, unsigned int order)
{
        unsigned long flags;
        int wasMlocked = __TestClearPageMlocked(page);

        if (!free_pages_prepare(page, order))
                return;

        local_irq_save(flags);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
        __count_vm_events(PGFREE, 1 << order);
        free_one_page(page_zone(page), page, order,
                                        get_pageblock_migratetype(page));
        local_irq_restore(flags);
}
static void free_one_page(struct zone *zone, struct page *page, int order,
                                int migratetype)
{
        spin_lock(&zone->lock);
        zone->all_unreclaimable = 0;
        zone->pages_scanned = 0;

        __free_one_page(page, zone, order, migratetype);
        __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
        spin_unlock(&zone->lock);
}
This finally lands in __free_one_page. Its parameters are the page to free, the zone it lives in, the order of the block, and the migratetype free list it belongs to:
static inline void __free_one_page(struct page *page,
                struct zone *zone, unsigned int order,
                int migratetype)
{
        unsigned long page_idx;
        unsigned long combined_idx;
        unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;

        if (unlikely(PageCompound(page)))
                if (unlikely(destroy_compound_page(page, order)))
                        return;

        VM_BUG_ON(migratetype == -1);

        page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); /* page index within the MAX_ORDER-aligned block */

        VM_BUG_ON(page_idx & ((1 << order) - 1));
        VM_BUG_ON(bad_range(zone, page));

        while (order < MAX_ORDER-1) { /* order is the exponent: order 3 means 2^3 pages */
                buddy_idx = __find_buddy_index(page_idx, order); /* find the buddy's index; it may lie before or after us */
                buddy = page + (buddy_idx - page_idx); /* the buddy's struct page */
                if (!page_is_buddy(page, buddy, order)) /* cannot merge, stop here */
                        break;

                /* Our buddy is free, merge with it and move up one order. */
                list_del(&buddy->lru); /* the buddy is idle; unlink it from its free list */
                zone->free_area[order].nr_free--; /* one fewer free block of this order in the zone */
                rmv_page_order(buddy); /* clear the buddy's PageBuddy marking */
                combined_idx = buddy_idx & page_idx; /* index of the merged, larger block */
                page = page + (combined_idx - page_idx); /* struct page of the merged block */
                page_idx = combined_idx; /* continue from the merged block's index */
                order++; /* and look for a buddy of the next order up */
        }
        set_page_order(page, order);

        /* We get here either because the buddy check failed or because the loop reached MAX_ORDER-1. */
        /*
         * If this is not the largest possible page, check if the buddy
         * of the next-highest order is free. If it is, it's possible
         * that pages are being freed that will coalesce soon. In case,
         * that is happening, add the free page to the tail of the list
         * so it's less likely to be used soon and more likely to be merged
         * as a higher order page
         */
        if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { /* rules out the hit-the-top case; handles the failed-buddy case */
                struct page *higher_page, *higher_buddy;
                combined_idx = buddy_idx & page_idx; /* index of the would-be merged block */
                higher_page = page + (combined_idx - page_idx); /* its struct page */
                buddy_idx = __find_buddy_index(combined_idx, order + 1); /* the buddy one order up */
                higher_buddy = page + (buddy_idx - combined_idx); /* that buddy's struct page */
                if (page_is_buddy(higher_page, higher_buddy, order + 1)) { /* would the next order up merge? */
                        list_add_tail(&page->lru,
                                &zone->free_area[order].free_list[migratetype]); /* then queue at the tail and bail out */
                        goto out;
                }
        }

        list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); /* no more merging possible: add to this order's free list */
out:
        zone->free_area[order].nr_free++;
}
The net result: the freed page is merged with as many buddies as possible, into the largest block that can be formed.
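The index arithmetic of that loop can be replayed in user space. This little program is a sketch that assumes every buddy happens to be free and uses a made-up MAX_ORDER of 4; it prints each merge step:

#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 5;     /* hypothetical index of the freed page */
        unsigned int order = 0;
        unsigned int max_order = 4;     /* stand-in for MAX_ORDER */

        while (order < max_order - 1) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);    /* __find_buddy_index() */
                unsigned long combined_idx = buddy_idx & page_idx;      /* index of the merged block */

                printf("order %u: block %lu + buddy %lu -> block %lu\n",
                       order, page_idx, buddy_idx, combined_idx);
                page_idx = combined_idx;
                order++;
        }
        return 0;
}

It prints: order 0 merges block 5 with buddy 4 into block 4, order 1 merges 4 with 6 into 4, and order 2 merges 4 with 0 into 0.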
/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline unsigned long
__find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1 << order);
}
Working through a few examples of this calculation: with order 0 and page_idx == 0, the buddy is 0 ^ 1 == 1. With order 4 and page_idx == 0, the buddy is 0 ^ (1 << 4) == 16, matching 2^4 == 16. In general, for the block of 2^order pages starting at page_idx, this returns the index of its buddy block.
With page_idx == 5, order 0: 5 ^ (1 << 0) == 4, the buddy just before it.
With page_idx == 5, order 1: 5 ^ (1 << 1) == 7, the buddy just after it.
With page_idx == 5, order 2: 5 ^ (1 << 2) == 1, the buddy before it.
In short: the XOR flips bit `order` of page_idx. If that bit is set, the buddy is the preceding block; if it is clear, the buddy is the following block.
#### Having found the buddy, we still need to check whether it can actually be merged:
/*
 * This function checks whether a page is free && is the buddy
 * we can do coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
                                                                int order)
{
        if (!pfn_valid_within(page_to_pfn(buddy))) /* the buddy must be real memory, not a hole */
                return 0;

        if (page_zone_id(page) != page_zone_id(buddy)) /* the page and its buddy must be in the same zone */
                return 0;

        if (PageBuddy(buddy) && page_order(buddy) == order) { /* the buddy must be free and of the same order */
                VM_BUG_ON(page_count(buddy) != 0);
                return 1;
        }
        return 0;
}
####
From this we can read off the preconditions for merging with a buddy:
1. The buddy is not a memory hole.
2. It is in the same zone.
3. It has the same order.
4. It is actually free, i.e. currently in the buddy system (PageBuddy is set).
III. Summary
That wraps up the buddy system's allocation and free functions.