linux内核内存管理的伙伴算法

1.     Linux 内核对各个 zone 都有一个 buddy system.

2.     数据结构:

mem_map: 一个 Struct page 数组,对应系统中所有的物理内存页。

而每一个 zone 结构里都有一个 zone_mem_map 域指向这个 zone 的第一个 page mem_map 的位置,还有一个域 size 代表这个区的大小,即总共有多少页。

每一个 zone 都有自己的 buddy system, 由下面的 zone 结构就可以看出。

空闲块是根据其大小做的保存,特别强调的是 struct free_area  free_area[MAX_ORDER];

保存着 zone 中的空闲块。数组中的每一个元素都有个双链表结构。比如说 free_area 中第 k 个元素保存着大小为 2^k 个页面的块的链表结构。数组中保存的是表头结构,即指向第一个 2^k 大小块的第一个页面。那块的剩余的页面怎么办?不用管,因为都是按块来操作的,只需要知道块的第一个页面即可,最后一个页面就是第一个页面加上 2^k − 1。同属于一个链表的块与块之间由每一个块的第一个页面的 struct page 中的 list_head lru 来相互链接。

/*
 * Highest buddy order + 1: free_area[] has MAX_ORDER entries, so the
 * largest block the allocator manages is 2^(MAX_ORDER-1) pages.
 * Architectures may override the default via CONFIG_FORCE_MAX_ZONEORDER.
 */
#ifndef CONFIG_FORCE_MAX_ZONEORDER

#define MAX_ORDER 11

#else

#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER

#endif

/* Number of pages in a block of the largest order. */
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))

/* One bucket of the buddy allocator: all free blocks of a single order. */
struct free_area {

       /* doubly linked list of free blocks, linked through the
        * first page's page->lru */
       struct list_head  free_list;

       /* number of blocks currently on free_list */
       unsigned long            nr_free;

};

/*
 * Per-zone memory-management state.  Each zone owns its own buddy
 * allocator (free_area[]) and its own page-reclaim lists.  Fields are
 * grouped by access pattern and separated with ZONE_PADDING so that
 * allocator-hot, reclaim-hot and rarely used fields land in different
 * cache lines.
 */
struct zone {

       /* Fields commonly accessed by the page allocator */

       /* current number of free pages in the zone, summed over all orders */
       unsigned long            free_pages;

       /* allocation watermarks for this zone */
       unsigned long             pages_min, pages_low, pages_high;

       /*

         * We don't know if the memory that we're going to allocate will be freeable

         * or/and it will be released eventually, so to avoid totally wasting several

         * GB of ram we must reserve some of the lower zone memory (otherwise we risk

         * to run OOM on the lower zones despite there's tons of freeable ram

         * on the higher zones). This array is recalculated at runtime if the

         * sysctl_lowmem_reserve_ratio sysctl changes.

         */

       unsigned long             lowmem_reserve[MAX_NR_ZONES];

 

       /* per-CPU page sets: one pointer per CPU on NUMA, one object
        * per CPU otherwise */
#ifdef CONFIG_NUMA

       struct per_cpu_pageset     *pageset[NR_CPUS];

#else

       struct per_cpu_pageset     pageset[NR_CPUS];

#endif

       /*

         * free areas of different sizes

         */

       /* protects the buddy free_area[] lists below (see __rmqueue()) */
       spinlock_t            lock;

#ifdef CONFIG_MEMORY_HOTPLUG

       /* see spanned/present_pages for more description */

       seqlock_t             span_seqlock;

#endif

       /* the buddy lists: free_area[k] holds the free blocks of 2^k pages */
       struct free_area free_area[MAX_ORDER];

 

       ZONE_PADDING(_pad1_)

 

       /* Fields commonly accessed by the page reclaim scanner */

       /* protects the active/inactive lists and their counters below */
       spinlock_t            lru_lock;      

       struct list_head    active_list;

       struct list_head    inactive_list;

       unsigned long             nr_scan_active;

       unsigned long             nr_scan_inactive;

       unsigned long             nr_active;

       unsigned long             nr_inactive;

       unsigned long             pages_scanned;      /* since last reclaim */

       int                 all_unreclaimable; /* All pages pinned */

 

       /* A count of how many reclaimers are scanning this zone */

       atomic_t              reclaim_in_progress;

 

       /*

         * timestamp (in jiffies) of the last zone reclaim that did not

         * result in freeing of pages. This is used to avoid repeated scans

         * if all memory in the zone is in use.

         */

       unsigned long             last_unsuccessful_zone_reclaim;

 

       /*

         * prev_priority holds the scanning priority for this zone.  It is

         * defined as the scanning priority at which we achieved our reclaim

         * target at the previous try_to_free_pages() or balance_pgdat()

         * invocation.

         *

         * We use prev_priority as a measure of how much stress page reclaim is

         * under - it drives the swappiness decision: whether to unmap mapped

         * pages.

         *

         * temp_priority is used to remember the scanning priority at which

         * this zone was successfully refilled to free_pages == pages_high.

         *

         * Access to both these fields is quite racy even on uniprocessor.  But

         * it is expected to average out OK.

         */

       int temp_priority;

       int prev_priority;

 

 

       ZONE_PADDING(_pad2_)

       /* Rarely used or read-mostly fields */

 

       /*

         * wait_table             -- the array holding the hash table

         * wait_table_size     -- the size of the hash table array

         * wait_table_bits      -- wait_table_size == (1 << wait_table_bits)

         *

         * The purpose of all these is to keep track of the people

         * waiting for a page to become available and make them

         * runnable again when possible. The trouble is that this

         * consumes a lot of space, especially when so few things

         * wait on pages at a given time. So instead of using

         * per-page waitqueues, we use a waitqueue hash table.

         *

         * The bucket discipline is to sleep on the same queue when

         * colliding and wake all in that wait queue when removing.

         * When something wakes, it must check to be sure its page is

         * truly available, a la thundering herd. The cost of a

         * collision is great, but given the expected load of the

         * table, they should be so rare as to be outweighed by the

         * benefits from the saved space.

         *

         * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the

         * primary users of these fields, and in mm/page_alloc.c

         * free_area_init_core() performs the initialization of them.

         */

       wait_queue_head_t    * wait_table;

       unsigned long             wait_table_size;

       unsigned long             wait_table_bits;

 

       /*

         * Discontig memory support fields.

         */

       /* owning node's pglist_data */
       struct pglist_data       *zone_pgdat;

       /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */

       unsigned long             zone_start_pfn;

 

       /*

         * zone_start_pfn, spanned_pages and present_pages are all

         * protected by span_seqlock.  It is a seqlock because it has

         * to be read outside of zone->lock, and it is done in the main

         * allocator path.  But, it is written quite infrequently.

         *

         * The lock is declared along with zone->lock because it is

         * frequently read in proximity to zone->lock.  It's good to

         * give them a chance of being in the same cacheline.

         */

       unsigned long             spanned_pages;   /* total size, including holes */

       unsigned long             present_pages;     /* amount of memory (excluding holes) */

 

       /*

         * rarely used fields:

         */

       /* textual name of the zone */
       char                     *name;

} ____cacheline_internodealigned_in_smp;

 

分配页面块

分配一个大小为 2^m 个页面的块时,首先看 free_area 的第 m 个元素,如果其 nr_free 大于 0,则从这个链表中取出一个块来满足要求;如果不大于 0,则看数组中第 m+1 个元素,依此类推。如果找到能够分配的,那么就将块的第一部分(大小为 2^m)分出去,剩下的继续保存在 buddy system 中。看代码会比较详细:

/*

  * Do the hard work of removing an element from the buddy allocator.

  * Call me with the zone->lock already held.

  */

static struct page *__rmqueue(struct zone *zone, unsigned int order)

{

       struct free_area * area;

       unsigned int current_order;

       struct page *page;

 

       for (current_order = order; current_order < MAX_ORDER; ++current_order) {

              area = zone->free_area + current_order;

              if (list_empty(&area->free_list))

                     continue;

 

              page = list_entry(area->free_list.next, struct page, lru);

              list_del(&page->lru);

              rmv_page_order(page);

              area->nr_free--;

              zone->free_pages -= 1UL << order;

              expand(zone, page, order, current_order, area);

              return page;

       }

 

       return NULL;

}

 

 


释放页面块

 

 

/*

  * Locate the struct page for both the matching buddy in our

  * pair (buddy1) and the combined O(n+1) page they form (page).

  *

  * 1) Any buddy B1 will have an order O twin B2 which satisfies

  * the following equation:

  *     B2 = B1 ^ (1 << O)

  * For example, if the starting buddy (buddy2) is #8 its order

  * 1 buddy is #10:

  *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10

  *

  * 2) Any buddy B will have an order O+1 parent P which

  * satisfies the following equation:

  *      P = B & ~(1 << O)

  *

  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER

  */

/*
 * Return the mem_map index of the order+1 block formed when the block
 * starting at page_idx merges with its order-'order' buddy: the parent
 * block starts at the index with bit 'order' cleared (P = B & ~(1 << O)).
 */
static inline unsigned long

__find_combined_index(unsigned long page_idx, unsigned int order)

{

       /* 1UL keeps the mask in unsigned long; ~(1 << order) would build
        * it in signed int and rely on sign-extension to work. */
       return (page_idx & ~(1UL << order));

}

 

/*
 * Release 'count' blocks of 2^order pages each back to the zone's
 * buddy allocator.  'list' links the first page of every block via
 * page->lru; blocks are consumed from the tail of the list.
 */
static void free_pages_bulk(struct zone *zone, int count,
struct list_head *list, int order)
{
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
for (; count; count--) {
struct page *blk;

BUG_ON(list_empty(list));
blk = list_entry(list->prev, struct page, lru);
/* unlink first: __free_one_page() relinks the page itself */
list_del(&blk->lru);
__free_one_page(blk, zone, order);
}
spin_unlock(&zone->lock);
}

/*
 * Free one block of 2^order pages back to the zone's buddy lists,
 * merging it with free buddies into larger blocks as long as possible.
 * Caller must hold zone->lock (see free_pages_bulk()).
 */
static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order)
{
unsigned long page_idx;
int order_size = 1 << order;

if (unlikely(PageCompound(page)))
destroy_compound_page(page, order);

/* index of the page within its MAX_ORDER-aligned mem_map section */
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

/* a 2^order block must start on a 2^order boundary */
BUG_ON(page_idx & (order_size - 1));
BUG_ON(bad_range(zone, page));

zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
unsigned long combined_idx;
struct free_area *area;
struct page *buddy;

buddy = __page_find_buddy(page, page_idx, order);
if (!page_is_buddy(buddy, order))
break; /* Move the buddy up one level. */

/* buddy is free and the same order: pull it off its free
 * list and fold the pair into one order+1 block */
list_del(&buddy->lru);
area = zone->free_area + order;
area->nr_free--;
rmv_page_order(buddy);
combined_idx = __find_combined_index(page_idx, order);
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
}
/* record the final (possibly merged) order and queue the block */
set_page_order(page, order);
list_add(&page->lru, &zone->free_area[order].free_list);
zone->free_area[order].nr_free++;
}

第一个函数的作用是释放 count 个大小为 1<<order 页的块,list 链接着这些块的第一个页框;而第二个函数是释放一个大小为 1<<order 页的块。另外,第二个函数中的
buddy = __page_find_buddy(page, page_idx, order);
/*
 * Given the first page of a 2^order block and its index page_idx
 * within the mem_map section, return the struct page of the block's
 * buddy.  Flipping bit 'order' of the index moves either forward or
 * backward by exactly 2^order pages.
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
unsigned long buddy_idx = page_idx ^ (1 << order);

return page - page_idx + buddy_idx;
}
page_idx ^ (1 << order)
操作是找到 buddy index, 根据是 1) Any buddy B1 will have an order O twin B2 which satisfies the following equation:

   B2 = B1 ^ (1 << O)

An exclusive OR (XOR) using the (1 << order) mask switches the value of the order-th bit of page_idx. Therefore, if the bit was previously zero, buddy_idx is equal to page_idx + order_size; conversely, if the bit was previously one, buddy_idx is equal to page_idx - order_size.

两个合并后,要得到合并后的块的 index

combined_idx = __find_combined_index(page_idx, order);

__find_combined_index 为:

static inline unsigned long

__find_combined_index(unsigned long page_idx, unsigned int order)

{
      /* Clearing bit 'order' yields the start index of the merged
       * order+1 parent block: P = B & ~(1 << O). */
      return (page_idx & ~(1 << order));

}

其原理是:

2) Any buddy B will have an order O+1 parent P which satisfies the following equation:

    P = B & ~(1 << O)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值