启用伙伴算法

5.8 初始化内存管理

回到start_kernel,下一个函数执行mm_init()。这个函数很重要了,来自同一个文件。

 

static void __init mm_init(void)

{

       /*

        * page_cgroup requires countinous pages as memmap

        * and it's bigger than MAX_ORDER unless SPARSEMEM.

        */

       page_cgroup_init_flatmem();

       mem_init();

       kmem_cache_init();

       pgtable_cache_init();

       vmalloc_init();

}

 

这五个函数,其中由于我们没有配置CONFIG_CGROUP_MEM_RES_CTLR,所以第一个函数page_cgroup_init_flatmem是个空函数。其余几个函数各个都是重点。

 

该函数执行完后不能再用像alloc_bootmem()、alloc_bootmem_low()、alloc_bootmem_pages()等申请低端内存的函数来申请内存,也就不能申请大块的连续物理内存了。

 

5.8.1 启用伙伴算法

首先是mem_init,来自arch/x86/mm/init_32.c

867void __init mem_init(void)

 868{

 869        int codesize, reservedpages, datasize, initsize;

 870        int tmp;

 871

 872        pci_iommu_alloc();

 873

 874#ifdef CONFIG_FLATMEM

 875        BUG_ON(!mem_map);

 876#endif

 877        /* this will put all low memory onto the freelists */

 878        totalram_pages += free_all_bootmem();

 879

 880        reservedpages = 0;

 881        for (tmp = 0; tmp < max_low_pfn; tmp++)

 882                /*

 883                 * Only count reserved RAM pages:

 884                 */

 885                if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))

 886                        reservedpages++;

 887

 888        set_highmem_pages_init();

 889

 890        codesize =  (unsigned long) &_etext - (unsigned long) &_text;

 891        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;

 892        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

 893

 894        printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "

 895                        "%dk reserved, %dk data, %dk init, %ldk highmem)\n",

 896                nr_free_pages() << (PAGE_SHIFT-10),

 897                num_physpages << (PAGE_SHIFT-10),

 898                codesize >> 10,

 899                reservedpages << (PAGE_SHIFT-10),

 900                datasize >> 10,

 901                initsize >> 10,

 902                totalhigh_pages << (PAGE_SHIFT-10));

 903

 904        printk(KERN_INFO "virtual kernel memory layout:\n"

 905                "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"

 906#ifdef CONFIG_HIGHMEM

 907                "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"

 908#endif

 909                "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"

 910                "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"

 911                "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"

 912                "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"

 913                "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",

 914                FIXADDR_START, FIXADDR_TOP,

 915                (FIXADDR_TOP - FIXADDR_START) >> 10,

 916

 917#ifdef CONFIG_HIGHMEM

 918                PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,

 919                (LAST_PKMAP*PAGE_SIZE) >> 10,

 920#endif

 921

 922                VMALLOC_START, VMALLOC_END,

 923                (VMALLOC_END - VMALLOC_START) >> 20,

 924

 925                (unsigned long)__va(0), (unsigned long)high_memory,

 926                ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

 927

 928                (unsigned long)&__init_begin, (unsigned long)&__init_end,

 929                ((unsigned long)&__init_end -

 930                 (unsigned long)&__init_begin) >> 10,

 931

 932                (unsigned long)&_etext, (unsigned long)&_edata,

 933                ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

 934

 935                (unsigned long)&_text, (unsigned long)&_etext,

 936                ((unsigned long)&_etext - (unsigned long)&_text) >> 10);

 937

 938        /*

 939         * Check boundaries twice: Some fundamental inconsistencies can

 940         * be detected at build time already.

 941         */

 942#define __FIXADDR_TOP (-PAGE_SIZE)

 943#ifdef CONFIG_HIGHMEM

 944        BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE  > FIXADDR_START);

 945        BUILD_BUG_ON(VMALLOC_END                        > PKMAP_BASE);

 946#endif

 947#define high_memory (-128UL << 20)

 948        BUILD_BUG_ON(VMALLOC_START                      >= VMALLOC_END);

 949#undef high_memory

 950#undef __FIXADDR_TOP

 951

 952#ifdef CONFIG_HIGHMEM

 953        BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);

 954        BUG_ON(VMALLOC_END                              > PKMAP_BASE);

 955#endif

 956        BUG_ON(VMALLOC_START                            >= VMALLOC_END);

 957        BUG_ON((unsigned long)high_memory               > VMALLOC_START);

 958

 959        if (boot_cpu_data.wp_works_ok < 0)

 960                test_wp_bit();

 961

 962        save_pg_dir();

 963        zap_low_mappings(true);

 964}

 

872行,Intel IOMMU架构在Linux上的初始化函数pci_iommu_alloc。这个函数不是我们关注的重点,我们就不深入下去了,这里仅仅粗略地介绍一下。该函数首先通过读取 DMA Remapping table,来判断是否支持DMAR设备。随后调用pci_swiotlb_init函数对其进行初始化,解析DMAR table,并逐一打印每个dmar项。最后设置全局变量dma_ops,把初始化后的swiotlb_dma_ops传递给它,后者定义了IOMMU架构中所有的swiotlb方法。对IOMMU感兴趣的同学可以去查阅相关资料,这里就不详细介绍了。

 

878行,totalram_pages这个全局变量我们第一次遇见。它编译的时候初始化为0,现在它就等于free_all_bootmem函数的返回值,该函数在mm/bootmem.c中定义:

 

unsigned long __init free_all_bootmem(void)

{

#ifdef CONFIG_NO_BOOTMEM

       /*

        * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id

        *  because in some case like Node0 doesnt have RAM installed

        *  low ram will be on Node1

        * Use MAX_NUMNODES will make sure all ranges in early_node_map[]

        *  will be used instead of only Node0 related

        */

       return free_all_memory_core_early(MAX_NUMNODES);

#else

       unsigned long total_pages = 0;

       bootmem_data_t *bdata;

 

       list_for_each_entry(bdata, &bdata_list, list)

              total_pages += free_all_bootmem_core(bdata);

 

       return total_pages;

#endif

}

 

我们看到,由于CONFIG_NO_BOOTMEM起作用,并且MAX_NUMNODES为1,所以函数直接调用free_all_memory_core_early(1),怎么样,前面说得没错吧,终于碰到了这个函数:

 

200unsigned long __init free_all_memory_core_early(int nodeid)

 201{

 202        int i;

 203        u64 start, end;

 204        unsigned long count = 0;

 205        struct range *range = NULL;

 206        int nr_range;

 207

 208        nr_range = get_free_all_memory_range(&range, nodeid);

 209

 210        for (i = 0; i < nr_range; i++) {

 211                start = range[i].start;

 212                end = range[i].end;

 213                count += end - start;

 214                __free_pages_memory(start, end);

 215        }

 216

 217        return count;

 218}

 

205行的那个range结构很简单:

struct range {

       u64   start;

       u64   end;

};

 

所以首先208行调用get_free_all_memory_range函数:

 

393int __init get_free_all_memory_range(struct range **rangep, int nodeid)

 394{

 395        int i, count;

 396        u64 start = 0, end;

 397        u64 size;

 398        u64 mem;

 399        struct range *range;

 400        int nr_range;

 401

 402        count  = 0;

 403        for (i = 0; i < max_early_res && early_res[i].end; i++)

 404                count++;

 405

 406        count *= 2;

 407

 408        size = sizeof(struct range) * count;

 409        end = get_max_mapped();

 410#ifdef MAX_DMA32_PFN

 411        if (end > (MAX_DMA32_PFN << PAGE_SHIFT))

 412                start = MAX_DMA32_PFN << PAGE_SHIFT;

 413#endif

 414        mem = find_fw_memmap_area(start, end, size, sizeof(struct range));

 415        if (mem == -1ULL)

 416                panic("can not find more space for range free");

 417

 418        range = __va(mem);

 419        /* use early_node_map[] and early_res to get range array at first */

 420        memset(range, 0, size);

 421        nr_range = 0;

 422

 423        /* need to go over early_node_map to find out good range for node */

 424        nr_range = add_from_early_node_map(range, count, nr_range, nodeid);

 425#ifdef CONFIG_X86_32

 426        subtract_range(range, count, max_low_pfn, -1ULL);

 427#endif

 428        subtract_early_res(range, count);

 429        nr_range = clean_sort_range(range, count);

 430

 431        /* need to clear it ? */

 432        if (nodeid == MAX_NUMNODES) {

 433                memset(&early_res[0], 0,

 434                         sizeof(struct early_res) * max_early_res);

 435                early_res = NULL;

 436                max_early_res = 0;

 437        }

 438

 439        *rangep = range;

 440        return nr_range;

 441}

 

403行,全局变量max_early_res和early_res[]数组,老熟人了,一个循环得到目前已经分配了的early_res元素的个数count,406行把它的值乘以2,408行再乘以range结构的大小后赋给size。409行,调用get_max_mapped函数:

u64 __init get_max_mapped(void)

{

       u64 end = max_pfn_mapped;

       end <<= PAGE_SHIFT;

       return end;

}

 

该函数返回我们的老熟人——最后一个页框max_pfn_mapped对应的物理地址,赋值给内部变量end;start在396行被赋值为0。然后414行调用find_fw_memmap_area函数,传给它的参数是start、end、size和range结构的大小:

u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)

{

       return find_e820_area(start, end, size, align);

}

 

find_e820_area不用多说了吧,从e820.map[]数组中寻找到一块能够容纳size个字节的内存段,该内存段的首物理地址赋值给get_free_all_memory_range的内部变量mem。418~421行初始化这块区域。随后424行调用add_from_early_node_map函数:

 

int __init add_from_early_node_map(struct range *range, int az,

                               int nr_range, int nid)

{

       int i;

       u64 start, end;

 

       /* need to go over early_node_map to find out good range for node */

       for_each_active_range_index_in_nid(i, nid) {

              start = early_node_map[i].start_pfn;

              end = early_node_map[i].end_pfn;

              nr_range = add_range(range, az, nr_range, start, end);

       }

       return nr_range;

}

 

int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)

{

       if (start >= end)

              return nr_range;

 

       /* Out of slots: */

       if (nr_range >= az)

              return nr_range;

 

       range[nr_range].start = start;

       range[nr_range].end = end;

 

       nr_range++;

 

       return nr_range;

}

 

执行完毕add_from_early_node_map函数之后,range指向的这块区域中,就形成了一个range[nr_range]数组,每个数组元素对应early_node_map[]的数组元素,表示nr_range块空闲内存空间的起始页框号和结束页框号。426行,subtract_range函数检验一下这个range是否有问题,并进行调整:在32位x86上把max_low_pfn以上的页框范围从range中去掉。428行,调用subtract_early_res对产生冲突的地址进行调整:

 

static void __init subtract_early_res(struct range *range, int az)

{

       int i, count;

       u64 final_start, final_end;

       int idx = 0;

 

       count  = 0;

       for (i = 0; i < max_early_res && early_res[i].end; i++)

              count++;

 

       /* need to skip first one ?*/

       if (early_res != early_res_x)

              idx = 1;

 

#define DEBUG_PRINT_EARLY_RES 1

 

#if DEBUG_PRINT_EARLY_RES

       printk(KERN_INFO "Subtract (%d early reservations)\n", count);

#endif

       for (i = idx; i < count; i++) {

              struct early_res *r = &early_res[i];

#if DEBUG_PRINT_EARLY_RES

              printk(KERN_INFO "  #%d [%010llx - %010llx] %15s\n", i,

                     r->start, r->end, r->name);

#endif

              final_start = PFN_DOWN(r->start);

              final_end = PFN_UP(r->end);

              if (final_start >= final_end)

                     continue;

              subtract_range(range, az, final_start, final_end);

       }

 

}

 

熟悉early_res体系的同学对上述代码一定不会困惑,我们看到subtract_early_res对地址进行调整,去掉那些已经被占用了的地址空间。回到get_free_all_memory_range,最后两行,把range赋给结果参数*rangep,并且返回最终的range数组的元素个数nr_range。

 

回到free_all_memory_core_early函数中,内部变量range有了,其元素个数nr_range也有了,那么210~215行执行一个循环,将range数组的每一个元素调用__free_pages_memory进行释放:

 

174static void __init __free_pages_memory(unsigned long start, unsigned long end)

 175{

 176        int i;

 177        unsigned long start_aligned, end_aligned;

 178        int order = ilog2(BITS_PER_LONG);

 179

 180        start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);

 181        end_aligned = end & ~(BITS_PER_LONG - 1);

 182

 183        if (end_aligned <= start_aligned) {

 184                for (i = start; i < end; i++)

 185                        __free_pages_bootmem(pfn_to_page(i), 0);

 186

 187                return;

 188        }

 189

 190        for (i = start; i < start_aligned; i++)

 191                __free_pages_bootmem(pfn_to_page(i), 0);

 192

 193        for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)

 194                __free_pages_bootmem(pfn_to_page(i), order);

 195

 196        for (i = end_aligned; i < end; i++)

 197                __free_pages_bootmem(pfn_to_page(i), 0);

 198}

 

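函数主要执行183~188行代码,通过__free_pages_bootmem函数逐个释放页框号从start到end(不含end)的页框。如果这段范围内包含按BITS_PER_LONG对齐的区间,则走190~197行:两端未对齐的页框仍以order为0逐个释放,中间对齐的部分以order为ilog2(BITS_PER_LONG)成批释放。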

 

下面来看看__free_pages_bootmem

 

637void __meminit __free_pages_bootmem(struct page *page, unsigned int order)

 638{

 639        if (order == 0) {

 640                __ClearPageReserved(page);

 641                set_page_count(page, 0);

 642                set_page_refcounted(page);

 643                __free_page(page);

 644        } else {

 645                int loop;

 646

 647                prefetchw(page);

 648                for (loop = 0; loop < BITS_PER_LONG; loop++) {

 649                        struct page *p = &page[loop];

 650

 651                        if (loop + 1 < BITS_PER_LONG)

 652                                prefetchw(p + 1);

 653                        __ClearPageReserved(p);

 654                        set_page_count(p, 0);

 655                }

 656

 657                set_page_refcounted(page);

 658                __free_pages(page, order);

 659        }

 660}

 

我们传递进来的参数order为0,所以来到643行,针对这个页面page,著名的伙伴算法到来了,我们来看它的定义:

#define __free_page(page) __free_pages((page), 0)

 

释放页框的所有内核宏和函数都依赖于__free_pages()函数。它接收的参数为将要释放的第一个页框的页描述符的地址(page)和将要释放的一组连续页框的数量的对数(order)。该函数执行如下步骤:

1.       检查第一个页框是否真正属于动态内存(它的PG_reserved 标志被清0);如果不是,则终止。

2.       减少page->_count 使用计数器的值;如果它仍然大于或等于0,则终止。

3.       如果order 等于0,那么该函数调用free_hot_page()来释放页框给适当内存管理区的每CPU 热高速缓存。

4.       如果order大于0,那么它将页框加入到本地链表中,并调用free_pages_bulk()函数把它们释放到适当内存管理区的伙伴系统中。

 

我们这里order为0,所以调用free_hot_page(),最终会调用__free_one_page。由于前面的pglist、zone的体系已经建立好,该函数对当前页面page对应的那个zone的free_area数组进行处理。由于这个地方是第一次触及该数组,那么这一次free_hot_page调用的__free_one_page将会找到全部伙伴,等于是初始化了整个伙伴算法系统。好了,怀疑我这句话的同志可以去看看博客“伙伴系统算法”:

http://blog.csdn.net/yunsongice/archive/2010/01/22/5225155.aspx

 

回到mem_init函数中,伙伴系统建立起来以后,free_all_bootmem返回空闲页面的总数给全局参数totalram_pages。随后880~886行代码计算被保留的页面数,保存在内部变量reservedpages中。888行,set_highmem_pages_init函数,通过调用add_highpages_work_fn函数初始化896MB以上的高端页面,并把他们加入伙伴系统,最后计算出包含了这些高端页面的新的可用页面的数量totalram_pages。

 

继续走,890行,让内部变量codesize、datasize、initsize分别等于内核代码段、数据段和初始化相关函数指针空间段的大小。随后894~936行打印相关信息。942~957行是一群调试信息,略去。962行,save_pg_dir()函数,来自同一文件:

char swsusp_pg_dir[PAGE_SIZE]

       __attribute__ ((aligned(PAGE_SIZE)));

static inline void save_pg_dir(void)

{

       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);

}

 

很简单,就是把页全局目录拷贝到全局变量swsusp_pg_dir数组中,做个备份。963行,执行zap_low_mappings(true)函数,这个函数也来自于同一个文件:

 

 

void zap_low_mappings(bool early)

{

       int i;

 

       /*

        * Zap initial low-memory mappings.

        *

        * Note that "pgd_clear()" doesn't do it for

        * us, because pgd_clear() is a no-op on i386.

        */

       for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {

#ifdef CONFIG_X86_PAE

              set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));

#else

              set_pgd(swapper_pg_dir+i, __pgd(0));

#endif

       }

 

       if (early)

              __flush_tlb();

       else

              flush_tlb_all();

}

 

这个函数很简单,就是把前面我们在arch/x86/kernel/head_32.S中设置的页全局目录的前若干项清零。这若干项到底是多少项呢?我们看看KERNEL_PGD_BOUNDARY是什么东西:

#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)

#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

#define PGDIR_SHIFT  22

#define PTRS_PER_PGD     1024

 

不错,0xc0000000>>22 & 1023= 768,这些页全局目录项代表虚拟地址前3G的页面,也就是所谓的用户区,我们在这里把它全清零了。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值