linux 内存管理 - paging_init 函数

源码为 2.6.37内核,x86_64架构,内存模型为Sparse Memory



paging_init函数在setup_arch函数中被调用,用于初始化所有节点的pg_data_t结构,以及各节点对应的管理区(zone)结构和页描述符(page)结构。


调用的大致过程为:

start_kernel()
   --> setup_arch()
      --> paging_init()
         --> free_area_init_nodes()
            --> free_area_init_node()
               --> free_area_init_core()
                  --> memmap_init()(宏,展开为 memmap_init_zone())



具体如下:



paging_init()在setup_arch()中被调用,定义为:


/*
 * paging_init - build the sparse-memory sections and then initialise every
 * node's pg_data_t, zone and page data through free_area_init_nodes().
 * Per the surrounding article it is called from setup_arch().
 */
void __init paging_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES]; /* upper-bound PFN of each zone; the article states MAX_NR_ZONES = 4 for this configuration */

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; /* end of ZONE_DMA (16 MB per the article) */
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; /* end of ZONE_DMA32 (4 GB per the article) */
        max_zone_pfns[ZONE_NORMAL] = max_pfn; /* end of ZONE_NORMAL: highest physical page frame */

        /*
         * Create mem_sections for the physical pages owned by all nodes.
         * The pages of every memory region recorded in early_node_map are
         * registered in a separate array of mem_section entries.
         * The article author presumes mem_section exists to make memory
         * management more efficient.
         * A mem_section covers a fixed number of physically contiguous pages,
         * so from a physical address one can derive both the page and the
         * index of the mem_section that contains it.
         * sparse_memory_present_with_active_regions() sets up the
         * mem_sections for the pages owned by the given node(s); here it is
         * called with MAX_NUMNODES.
         */
        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();

        /*
         * clear the default setting with node 0
         * note: don't use nodes_clear here, that is really clearing when
         *       numa support is not compiled in, and later node_set_state
         *       will not set it back.
         */
        node_clear_state(0, N_NORMAL_MEMORY);

        /* Initialise the pg_data_t, zone and page data of all nodes */
        free_area_init_nodes(max_zone_pfns);
}





paging_init()调用了free_area_init_nodes函数初始化所有结点的pg_data_t和zone、page的数据,并打印了管理区信息:


/**
 * free_area_init_nodes - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
/*
 * Besides initialising each node, this function prints the zone PFN ranges,
 * the per-node ZONE_MOVABLE start PFNs and the early_node_map[] contents.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
        unsigned long nid; 
        int i;

        /* Sort early_node_map as initialisation assumes it is sorted */
        sort_node_map();

        /*
         * The code below fills arch_zone_lowest_possible_pfn[] and
         * arch_zone_highest_possible_pfn[] with the zone boundaries.
         * With the values paging_init() passes in (sizes per the article):
         *    arch_zone_lowest_possible_pfn  = {lowest active pfn, 16M   , 4G     , 0      }
         *    arch_zone_highest_possible_pfn = {     16M         , 4G    , max_pfn, 0      }
         *                                     {     DMA         , DMA32 , NORMAL , MOVABLE}
         */
        /* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
                                sizeof(arch_zone_lowest_possible_pfn));
        memset(arch_zone_highest_possible_pfn, 0,
                                sizeof(arch_zone_highest_possible_pfn));
        /* Zone 0 starts at the lowest PFN found in the active regions */
        arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
        arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; /* caller-supplied end of zone 0 */
        for (i = 1; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
                /* Each zone starts where the previous one ended ... */
                arch_zone_lowest_possible_pfn[i] =
                        arch_zone_highest_possible_pfn[i-1];
                /* ... and never ends below its own start (an empty zone has start == end) */
                arch_zone_highest_possible_pfn[i] =
                        max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
        }
        arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; /* ZONE_MOVABLE has no arch-wide range; both bounds stay 0 */
        arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
        /*
         * zone_movable_pfn[] is cleared and then filled in by
         * find_zone_movable_pfns_for_nodes() with one entry per node: the PFN
         * at which ZONE_MOVABLE starts on that node. An entry left at 0 is
         * treated below as "nothing to print" for that node.
         * NOTE(review): per the article, ZONE_MOVABLE is configured from the
         * boot command line and separates movable pages from the unmovable
         * "kernel core"; the helper is not shown here - confirm.
         */
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
        find_zone_movable_pfns_for_nodes(zone_movable_pfn);

        /* Print the range of every zone except ZONE_MOVABLE */
        /* Print out the zone ranges */
        printk("Zone PFN ranges:\n");
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
                printk("  %-8s ", zone_names[i]);
                if (arch_zone_lowest_possible_pfn[i] ==
                                arch_zone_highest_possible_pfn[i])
                        printk("empty\n");
                else
                        printk("%0#10lx -> %0#10lx\n",
                                arch_zone_lowest_possible_pfn[i],
                                arch_zone_highest_possible_pfn[i]);
        }

        /* Print the ZONE_MOVABLE information */
        /* Print out the PFNs ZONE_MOVABLE begins at in each node */
        printk("Movable zone start PFN for each node\n");
        for (i = 0; i < MAX_NUMNODES; i++) {
                if (zone_movable_pfn[i])
                        printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
        }

        /* Print the contents of the early_node_map array */
        /* Print out the early_node_map[] */
        printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
        for (i = 0; i < nr_nodemap_entries; i++)
                printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
                                                early_node_map[i].start_pfn,
                                                early_node_map[i].end_pfn);

        /* Initialise every node */
        mminit_verify_pageflags_layout();
        setup_nr_node_ids();

        /* Loop over all online nodes */
        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid); /* this node's pg_data_t */

                /*
                 * Initialise the pg_data_t, zone and page data of node nid.
                 * Both size arrays are passed as NULL; the node's start PFN
                 * comes from find_min_pfn_for_node(), which per the article
                 * returns the node's lowest PFN in early_node_map.
                 */
                free_area_init_node(nid, NULL,
                                find_min_pfn_for_node(nid), NULL);

                /* Any memory on that node */
                if (pgdat->node_present_pages)
                        node_set_state(nid, N_HIGH_MEMORY);  /* flag the node as having memory */
                check_for_regular_memory(pgdat);          /* per the article: empty when CONFIG_HIGHMEM is not defined */
        }
}








free_area_init_nodes函数通过循环遍历各个在线节点,并在循环中调用free_area_init_node函数,初始化该节点对应的pg_data_t以及zone、page的数据:


/*
 * free_area_init_node - initialise the pg_data_t, zone and page data of one node.
 * @nid:            node identifier
 * @zones_size:     per-zone sizes; NULL when called from free_area_init_nodes()
 * @node_start_pfn: first PFN of node nid
 * @zholes_size:    per-zone hole sizes; NULL when called from free_area_init_nodes()
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                unsigned long node_start_pfn, unsigned long *zholes_size)
{       
        pg_data_t *pgdat = NODE_DATA(nid); /* this node's pg_data_t */

        pgdat->node_id = nid; /* record the node identifier */
        pgdat->node_start_pfn = node_start_pfn; /* PFN of the node's first page frame */

        /*
         * Work out how many pages the node has. Per the article this sets
         *   pgdat->node_spanned_pages: total page frames, holes included
         *   pgdat->node_present_pages: total page frames, holes excluded
         */
        calculate_node_totalpages(pgdat, zones_size, zholes_size);

        /* Per the article: has no effect unless CONFIG_FLAT_NODE_MEM_MAP is defined */
        alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
        printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
                nid, (unsigned long)pgdat,
                (unsigned long)pgdat->node_mem_map);
#endif

        /* Finish initialising the pg_data_t and set up its zones and page structures */
        free_area_init_core(pgdat, zones_size, zholes_size);
}






free_area_init_node函数接着调用free_area_init_core函数,继续初始化该节点的pg_data_t结构,并初始化zone以及page结构。

free_area_init_core函数是初始化zone的核心:


/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
/*
 * Completes the initialisation of @pgdat and initialises each of its zones
 * and, through memmap_init(), the page structures of every non-empty zone.
 * @zones_size and @zholes_size are forwarded to the zone sizing helpers.
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long *zholes_size)
{
        enum zone_type j;
        int nid = pgdat->node_id;
        unsigned long zone_start_pfn = pgdat->node_start_pfn;
        int ret;

        pgdat_resize_init(pgdat); /* per the article: initialises the pgdat->node_size_lock spinlock */
        pgdat->nr_zones = 0;
        init_waitqueue_head(&pgdat->kswapd_wait); /* wait queue stored in pgdat->kswapd_wait */
        pgdat->kswapd_max_order = 0; /* per the article: order (2^order pages) of the free block kswapd tries to create */
        pgdat_page_cgroup_init(pgdat); /* per the article: an empty function in this configuration */

        /* Walk every zone of the node */
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, memmap_pages;
                enum lru_list l;

                /* size: page frames spanned by the zone, holes included */
                size = zone_spanned_pages_in_node(nid, j, zones_size);
                /* realsize: page frames in the zone, holes excluded */
                realsize = size - zone_absent_pages_in_node(nid, j,
                                                                zholes_size);

                /*
                 * Adjust realsize so that it accounts for how much memory
                 * is used by this zone for memmap. This affects the watermark
                 * and per-cpu initialisations
                 */
                /* i.e. subtract the pages occupied by the struct page array itself */
                memmap_pages =  /* pages needed for the struct page entries of all spanned frames (holes included) */
                        PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
                if (realsize >= memmap_pages) {
                        realsize -= memmap_pages;
                        if (memmap_pages)
                                printk(KERN_DEBUG
                                       "  %s zone: %lu pages used for memmap\n",
                                       zone_names[j], memmap_pages);
                } else /* the zone is too small to hold its own struct page array: warn and leave realsize alone */
                        printk(KERN_WARNING
                                "  %s zone: %lu pages exceeds realsize %lu\n",
                                zone_names[j], memmap_pages, realsize);

                /* Subtract dma_reserve from zone 0 only, and only when it fits */
                /* Account for reserved pages */
                if (j == 0 && realsize > dma_reserve) {
                        realsize -= dma_reserve;
                        printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
                                        zone_names[0], dma_reserve);
                }

                /* Global totals: nr_kernel_pages skips highmem zones, nr_all_pages counts every zone */
                if (!is_highmem_idx(j))
                        nr_kernel_pages += realsize;
                nr_all_pages += realsize;

                zone->spanned_pages = size; /* page frames spanned by the zone, holes included */
                zone->present_pages = realsize; /* frames excluding holes, memmap pages and (zone 0) dma_reserve */
#ifdef CONFIG_NUMA
                zone->node = nid;       /* node this zone belongs to */
                /* sysctl_min_unmapped_ratio percent of realsize */
                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
                                                / 100;
                /* sysctl_min_slab_ratio percent of realsize */
                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
                zone->name = zone_names[j]; /* zone name */

                /* Initialise the zone's locks */
                spin_lock_init(&zone->lock);
                spin_lock_init(&zone->lru_lock);
                zone_seqlock_init(zone);
                zone->zone_pgdat = pgdat; /* back-pointer to the owning node's pg_data_t */

                zone_pcp_init(zone); /* per the article: initialises the per-cpu page cache */

                /* Initialise the LRU lists and reclaim statistics */
                for_each_lru(l) {
                        INIT_LIST_HEAD(&zone->lru[l].list);
                        zone->reclaim_stat.nr_saved_scan[l] = 0;
                }
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
                zone->reclaim_stat.recent_scanned[0] = 0;
                zone->reclaim_stat.recent_scanned[1] = 0;

                zap_zone_vm_stats(zone); /* per the article: zeroes zone->vm_stat */
                zone->flags = 0;
                /* An empty zone needs no free lists or page structures */
                if (!size)
                        continue;

                set_pageblock_order(pageblock_default_order()); /* per the article: pageblock_default_order() returns 9 here */
                setup_usemap(pgdat, zone, size);        /* per the article: empty when CONFIG_SPARSEMEM is defined */

                /*
                 * Per the article, init_currently_empty_zone():
                 *   - sets pgdat->nr_zones and zone->zone_start_pfn
                 *   - initialises zone->free_area
                 *   - initialises the zone->wait_table members
                 * A non-zero return value triggers BUG_ON().
                 */
                ret = init_currently_empty_zone(zone, zone_start_pfn,
                                                size, MEMMAP_EARLY);
                BUG_ON(ret);
                memmap_init(size, nid, j, zone_start_pfn); /* initialise the page structures of this zone */
                zone_start_pfn += size; /* the next zone starts where this one ends */
        }
}







free_area_init_core函数调用memmap_init函数来初始化page结构:

/* memmap_init() forwards its arguments to memmap_init_zone() with the MEMMAP_EARLY context. */
#define memmap_init(size, nid, zone, start_pfn) \
        memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)


/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 */
/*
 * Initialises the struct page of every PFN in [start_pfn, start_pfn + size)
 * for zone index @zone on node @nid. With @context == MEMMAP_EARLY, PFNs that
 * are not valid or do not belong to @nid are skipped.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn, enum memmap_context context)
{
      struct page *page;
      unsigned long end_pfn = start_pfn + size;
    unsigned long pfn;
        struct zone *z;

        if (highest_memmap_pfn < end_pfn - 1)  /* remember the highest PFN handled so far */
                highest_memmap_pfn = end_pfn - 1;

        z = &NODE_DATA(nid)->node_zones[zone]; /* pointer to the zone being initialised */
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
                 * There can be holes in boot-time mem_map[]s
                 * handed to this function.  They do not
                 * exist on hotplugged memory.
                 */
                if (context == MEMMAP_EARLY) {
                        if (!early_pfn_valid(pfn))
                                continue;
                        if (!early_pfn_in_nid(pfn, nid))
                                continue;
                }
                page = pfn_to_page(pfn); /* struct page for this pfn; its contents are not initialised yet */
                set_page_links(page, zone, nid, pfn);/* per the article: stores the zone, node and section bits in page->flags */

 		mminit_verify_page_links(page, zone, nid, pfn);/* per the article: debug-only check */
                init_page_count(page); /* per the article: sets the page->_count reference count to 1 */
                reset_page_mapcount(page); /* per the article: sets page->_mapcount to -1 */
                SetPageReserved(page); /* mark the page reserved; presumably a macro-generated page-flag helper, so it has no plain function definition - confirm */

                /*
                 * Mark the block movable so that blocks are reserved for
                 * movable at startup. This will force kernel allocations
                 * to reserve their blocks rather than leaking throughout
                 * the address space during boot when many long-lived
                 * kernel allocations are made. Later some blocks near
                 * the start are marked MIGRATE_RESERVE by
                 * setup_zone_migrate_reserve()
                 *
                 * bitmap is created for zone's valid pfn range. but memmap
                 * can be created for invalid pages (for alignment)
                 * check here not to call set_pageblock_migratetype() against
                 * pfn out of zone.
                 */
                if ((z->zone_start_pfn <= pfn)
                    && (pfn < z->zone_start_pfn + z->spanned_pages)
                    && !(pfn & (pageblock_nr_pages - 1)))
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);

                INIT_LIST_HEAD(&page->lru); /* initialise the page's lru list head */
#ifdef WANT_PAGE_VIRTUAL
                /* The shift won't overflow because ZONE_NORMAL is below 4G. */
                if (!is_highmem_idx(zone))
                        /* record the kernel virtual address of the page frame (page->virtual per the article) */
                        set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
        }
}




这样经过paging_init函数,pg_data_t、zone、page等结构完成了初始化。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值