Rereading Kernel Memory Management (4): Zone Management

 
 
快乐虾
http://blog.csdn.net/lights_joy/
lights@hb165.com
 
   
This article applies to:
ADI bf561 DSP
uclinux-2008r1-rc8 (移植到vdsp5)
Visual DSP++ 5.0
 
  
Reposting is welcome, but please keep the author information intact.
 
Linux supports the Non-Uniform Memory Access (NUMA) model, in which the time a given CPU needs to access different memory units may differ. The system's physical memory is divided into several nodes. Within a single node, any given CPU takes the same time to access any page; that time may, however, differ from one CPU to another. For each CPU, the kernel tries to minimize accesses to the nodes that are costly for it, which requires carefully choosing where the kernel data structures most frequently referenced by that CPU are placed. The physical memory within each node is in turn divided into several zones.
On the x86 architecture, Linux divides memory into three zones:
ZONE_DMA: pages below 16M, because the DMA processors on the ISA bus are severely limited and can only address the first 16M of RAM.
ZONE_NORMAL: memory above 16M and below 896M.
ZONE_HIGHMEM: memory above 896M.
For uclinux, DMA can reach the entire memory range. Because of anomaly 05000263, however, usable memory is limited to 60M when the ICACHE is enabled. The kernel therefore actually uses only a single zone, ZONE_DMA, which covers the whole memory range.
Below we analyze the kernel's data representation by walking through zone initialization.
After bootmem initialization completes, setup_arch moves on to the next step:
     /*
      * get kmalloc into gear
      */
     paging_init();
This function is implemented in arch/blackfin/mm/init.c:
 
/*
 * paging_init() continues the virtual memory environment setup which
 * was begun by the code in arch/head.S.
 * The parameters are pointers to where to stick the starting and ending
 * addresses of available kernel virtual memory.
 */
void __init paging_init(void)
{
     /*
      * make sure start_mem is page aligned, otherwise bootmem and
      * page_alloc get different views of the world
      */
     unsigned long end_mem = memory_end & PAGE_MASK;
 
     pr_debug("start_mem is %#lx   virtual_end is %#lx\n", PAGE_ALIGN(memory_start), end_mem);
 
     /*
      * initialize the bad page table and bad page to point
      * to a couple of allocated pages
      */
     empty_bad_page_table = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
     empty_bad_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
     empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
     memset((void *)empty_zero_page, 0, PAGE_SIZE);
 
     /*
      * Set up SFC/DFC registers (user data space)
      */
     set_fs(KERNEL_DS);
 
     pr_debug("free_area_init -> start_mem is %#lx   virtual_end is %#lx\n",
             PAGE_ALIGN(memory_start), end_mem);
 
     {
         unsigned long zones_size[MAX_NR_ZONES] = { 0, };
 
         zones_size[ZONE_DMA] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
         zones_size[ZONE_NORMAL] = 0;
#ifdef CONFIG_HIGHMEM
         zones_size[ZONE_HIGHMEM] = 0;
#endif
         free_area_init(zones_size);
     }
}
This function looks quite simple. When the kernel reaches this point, memory_end points to the first byte of the last page of SDRAM; for 64M of SDRAM (effectively limited to 60M) its value is 0x3bff000. alloc_bootmem_pages allocates the requested amount of memory in page (4096-byte) units.
The interesting part is the final call, free_area_init.
This function is implemented in mm/page_alloc.c:
void __init free_area_init(unsigned long *zones_size)
{
     free_area_init_node(0, NODE_DATA(0), zones_size,
              __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
The __init marker indicates that this function is called only once. When the kernel gets here, zones_size has two significant elements, 0x3bff and 0, giving the page counts of ZONE_DMA and ZONE_NORMAL respectively.
In this function, NODE_DATA is defined as:
extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)      (&contig_page_data)
PAGE_OFFSET is defined as 0.
__pa is defined as:
#define __pa(vaddr)         virt_to_phys((void *)(vaddr))
#define virt_to_phys(vaddr) ((unsigned long) (vaddr))
free_area_init_node is implemented in mm/page_alloc.c:
 
void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
         unsigned long *zones_size, unsigned long node_start_pfn,
         unsigned long *zholes_size)
{
     pgdat->node_id = nid;
     pgdat->node_start_pfn = node_start_pfn;
     calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
     alloc_node_mem_map(pgdat);
 
     free_area_init_core(pgdat, zones_size, zholes_size);
}
When the kernel reaches this point, nid is 0; pgdat points to the fixed global variable contig_page_data, whose struct bootmem_data *bdata member has already been initialized; zones_size is a two-element array {0x3bff, 0}, the page counts of ZONE_DMA and ZONE_NORMAL; node_start_pfn is 0; and zholes_size is NULL.
calculate_node_totalpages is implemented in mm/page_alloc.c:
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
         unsigned long *zones_size, unsigned long *zholes_size)
{
     unsigned long realtotalpages, totalpages = 0;
     enum zone_type i;
 
     for (i = 0; i < MAX_NR_ZONES; i++)
         totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
                                     zones_size);
     pgdat->node_spanned_pages = totalpages;
 
     realtotalpages = totalpages;
     for (i = 0; i < MAX_NR_ZONES; i++)
         realtotalpages -=
              zone_absent_pages_in_node(pgdat->node_id, i,
                                     zholes_size);
     pgdat->node_present_pages = realtotalpages;
     printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
                                 realtotalpages);
}
This is straightforward: it sets the two members pgdat->node_spanned_pages and pgdat->node_present_pages. The two helpers called here, zone_spanned_pages_in_node and zone_absent_pages_in_node, are both trivial:
static inline unsigned long zone_spanned_pages_in_node(int nid,
                       unsigned long zone_type,
                       unsigned long *zones_size)
{
     return zones_size[zone_type];
}
 
static inline unsigned long zone_absent_pages_in_node(int nid,
                            unsigned long zone_type,
                            unsigned long *zholes_size)
{
     if (!zholes_size)
         return 0;
 
     return zholes_size[zone_type];
}
So both pgdat->node_spanned_pages and pgdat->node_present_pages end up equal to the number of SDRAM pages; for 64M of memory (effectively limited to 60M) that is 0x3bff.
Next, alloc_node_mem_map is implemented as:
 
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
     /* Skip empty nodes */
     if (!pgdat->node_spanned_pages)
         return;
 
#ifdef CONFIG_FLAT_NODE_MEM_MAP
     /* ia64 gets its own node_mem_map, before this, without bootmem */
     if (!pgdat->node_mem_map) {
         unsigned long size, start, end;
         struct page *map;
 
         /*
          * The zone's endpoints aren't required to be MAX_ORDER
          * aligned but the node_mem_map endpoints must be in order
          * for the buddy allocator to function correctly.
          */
         start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
         end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
         end = ALIGN(end, MAX_ORDER_NR_PAGES);
         size = (end - start) * sizeof(struct page);
         map = alloc_remap(pgdat->node_id, size);
         if (!map)
              map = alloc_bootmem_node(pgdat, size);
         pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
     }
#ifndef CONFIG_NEED_MULTIPLE_NODES
     /*
      * With no DISCONTIG, the global mem_map is just set as node 0's
      */
     if (pgdat == NODE_DATA(0)) {
         mem_map = NODE_DATA(0)->node_mem_map;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
         if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
              mem_map -= pgdat->node_start_pfn;
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
     }
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
When the kernel reaches this function, pgdat points to the global variable
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
Every pglist_data pointer in the kernel can be assumed to point to this global variable.
pgdat->node_spanned_pages holds the number of pages in SDRAM; for 64M (effectively limited to 60M) its value is 0x3bff.
pgdat->node_start_pfn is the starting page frame number of SDRAM, i.e. 0.
alloc_remap simply returns a NULL pointer.
alloc_bootmem_node allocates the requested space in page units.
MAX_ORDER_NR_PAGES is defined as:
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
The function's job is thus simple: allocate memory for pgdat->node_mem_map.
1.1.4   Zone initialization
This function also lives in mm/page_alloc.c; its code is as follows:
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
static void __meminit free_area_init_core(struct pglist_data *pgdat,
         unsigned long *zones_size, unsigned long *zholes_size)
{
     enum zone_type j;
     int nid = pgdat->node_id;
     unsigned long zone_start_pfn = pgdat->node_start_pfn;
     int ret;
 
     // empty statement, does nothing
     pgdat_resize_init(pgdat);
     pgdat->nr_zones = 0;
 
     // initialize the kswapd_wait list head, with spinlock support
     init_waitqueue_head(&pgdat->kswapd_wait);
     pgdat->kswapd_max_order = 0;
    
     for (j = 0; j < MAX_NR_ZONES; j++) {
         struct zone *zone = pgdat->node_zones + j;
         unsigned long size, realsize, memmap_pages;
 
          // size = realsize = number of SDRAM pages; for 64M SDRAM this is 0x3bff
         size = zone_spanned_pages_in_node(nid, j, zones_size);
         realsize = size - zone_absent_pages_in_node(nid, j,
                                     zholes_size);
 
         /*
          * Adjust realsize so that it accounts for how much memory
          * is used by this zone for memmap. This affects the watermark
          * and per-cpu initialisations
          */
         memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
         if (realsize >= memmap_pages) {
              realsize -= memmap_pages;
               printk(KERN_DEBUG
                    " %s zone: %lu pages used for memmap\n",
                    zone_names[j], memmap_pages);
          } else
               printk(KERN_WARNING
                    " %s zone: %lu pages exceeds realsize %lu\n",
                    zone_names[j], memmap_pages, realsize);
 
         /* Account for reserved pages */
          // dma_reserve can be imported from the bootloader; here it is 0
         if (j == 0 && realsize > dma_reserve) {
              realsize -= dma_reserve;
               printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
                       zone_names[0], dma_reserve);
         }
 
          // is_highmem_idx(j) is always 0 here
         if (!is_highmem_idx(j))
              nr_kernel_pages += realsize;
         nr_all_pages += realsize;
 
         zone->spanned_pages = size;
         zone->present_pages = realsize;
         zone->name = zone_names[j];
         spin_lock_init(&zone->lock);
         spin_lock_init(&zone->lru_lock);
          zone_seqlock_init(zone);    // empty statement
         zone->zone_pgdat = pgdat;
 
         zone->prev_priority = DEF_PRIORITY;
 
         zone_pcp_init(zone);
         INIT_LIST_HEAD(&zone->active_list);
         INIT_LIST_HEAD(&zone->inactive_list);
         zone->nr_scan_active = 0;
         zone->nr_scan_inactive = 0;
          zap_zone_vm_stats(zone);    // zero the vm_stat members
         atomic_set(&zone->reclaim_in_progress, 0);
         if (!size)
              continue;
 
         ret = init_currently_empty_zone(zone, zone_start_pfn,
                            size, MEMMAP_EARLY);
         BUG_ON(ret);
         zone_start_pfn += size;
     }
}
 
When execution reaches this point, pgdat->node_id is 0 and pgdat->node_start_pfn is also 0.
From the code above, nr_kernel_pages and nr_all_pages both count usable pages in the range 0 to 60M, excluding the pages occupied by the page array. For 64M of SDRAM (effectively 60M) with MTD disabled, their value is 0x3b6a.
The code also shows that zone->spanned_pages and zone->present_pages both describe the usable SDRAM page count, but present_pages additionally subtracts the pages occupied by the page array. For 64M of SDRAM with MTD disabled (memory effectively limited to 60M), spanned_pages is 0x3bff while present_pages is 0x3b6a.
init_currently_empty_zone is located in mm/page_alloc.c:
__meminit int init_currently_empty_zone(struct zone *zone,
                       unsigned long zone_start_pfn,
                       unsigned long size,
                       enum memmap_context context)
{
     struct pglist_data *pgdat = zone->zone_pgdat;
     int ret;
     ret = zone_wait_table_init(zone, size);
     if (ret)
         return ret;
     pgdat->nr_zones = zone_idx(zone) + 1;
 
     zone->zone_start_pfn = zone_start_pfn;
 
     memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
 
     zone_init_free_lists(pgdat, zone, zone->spanned_pages);
 
     return 0;
}
When this function is called, zone_start_pfn is 0 and size is the page count of the whole SDRAM region, 0x3bff for 64M of memory (effectively limited to 60M). context is MEMMAP_EARLY.
 
The comment on the three wait_table-related members of struct zone already explains them well:
     /*
      * wait_table      -- the array holding the hash table
      * wait_table_hash_nr_entries    -- the size of the hash table array
      * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
      *
      * The purpose of all these is to keep track of the people
      * waiting for a page to become available and make them
      * runnable again when possible. The trouble is that this
      * consumes a lot of space, especially when so few things
      * wait on pages at a given time. So instead of using
      * per-page waitqueues, we use a waitqueue hash table.
      *
      * The bucket discipline is to sleep on the same queue when
      * colliding and wake all in that wait queue when removing.
      * When something wakes, it must check to be sure its page is
      * truly available, a la thundering herd. The cost of a
      * collision is great, but given the expected load of the
      * table, they should be so rare as to be outweighed by the
      * benefits from the saved space.
      *
      * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
      * primary users of these fields, and in mm/page_alloc.c
      * free_area_init_core() performs the initialization of them.
      */
Now let's see how they are initialized:
static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
     int i;
     struct pglist_data *pgdat = zone->zone_pgdat;
     size_t alloc_size;
 
     /*
      * The per-page waitqueue mechanism uses hashed waitqueues
      * per zone.
      */
     zone->wait_table_hash_nr_entries =
          wait_table_hash_nr_entries(zone_size_pages);
     zone->wait_table_bits =
         wait_table_bits(zone->wait_table_hash_nr_entries);
     alloc_size = zone->wait_table_hash_nr_entries
                       * sizeof(wait_queue_head_t);
 
     if (system_state == SYSTEM_BOOTING) {
         zone->wait_table = (wait_queue_head_t *)
              alloc_bootmem_node(pgdat, alloc_size);
     } else {
         /*
          * This case means that a zone whose size was 0 gets new memory
          * via memory hot-add.
          * But it may be the case that a new node was hot-added. In
          * this case vmalloc() will not be able to use this new node's
          * memory - this wait_table must be initialized to use this new
          * node itself as well.
          * To use this new node's memory, further consideration will be
          * necessary.
          */
         zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
     }
     if (!zone->wait_table)
         return -ENOMEM;
 
     for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
         init_waitqueue_head(zone->wait_table + i);
 
     return 0;
}
The interesting part here is the computation of wait_table_hash_nr_entries, done by the function of the same name:
 
/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE 256
 
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
     unsigned long size = 1;
 
     pages /= PAGES_PER_WAITQUEUE;
 
     while (size < pages)
         size <<= 1;
 
     /*
      * Once we have dozens or even hundreds of threads sleeping
      * on IO we've got bigger problems than wait queue collision.
      * Limit the size of the wait table to a reasonable size.
      */
     size = min(size, 4096UL);
 
     return max(size, 4UL);
}
Here pages is the total page count of the zone, 0x3bff for 64M of memory (limited to 60M). The function returns 0x40, so zone->wait_table_hash_nr_entries will be 0x40 and zone->wait_table_bits will be 6.
 
void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
                   unsigned long size)
{
     int order;
     for (order = 0; order < MAX_ORDER ; order++) {
         INIT_LIST_HEAD(&zone->free_area[order].free_list);
         zone->free_area[order].nr_free = 0;
     }
}
#define MAX_ORDER 11
In the buddy algorithm, free pages are kept on 11 block lists holding blocks of 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 and 1024 contiguous pages respectively. These lists are represented in struct zone by
     /*
      * free areas of different sizes
      */
     spinlock_t         lock;
     struct free_area   free_area[MAX_ORDER];
and this function simply initializes that member.
 
The pglist_data structure contains the member:
     struct zonelist node_zonelists[MAX_NR_ZONES];
Let's look at how it is initialized.
start_kernel calls this function:
     build_all_zonelists();
Its implementation is in mm/page_alloc.c:
void __meminit build_all_zonelists(void)
{
     if (system_state == SYSTEM_BOOTING) {
         __build_all_zonelists(NULL);
         cpuset_init_current_mems_allowed();
     } else {
//       /* we have to stop all cpus to guaranntee there is no user
//          of zonelist */
//       stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
//       /* cpuset refresh routine should be here */
         WARN();
     }
     vm_total_pages = nr_free_pagecache_pages();
     printk("Built %i zonelists. Total pages: %ld\n",
              num_online_nodes(), vm_total_pages);
}
This function is called exactly once, from start_kernel, at which point system_state == SYSTEM_BOOTING. (With memory hotplug support there would be other call sites, ignored here.) Only the first branch of the if is therefore taken.
The call to cpuset_init_current_mems_allowed here does nothing.
__build_all_zonelists is implemented as:
/* return values int ....just for stop_machine_run() */
static int __meminit __build_all_zonelists(void *dummy)
{
     int nid;
 
     for_each_online_node(nid) {
         build_zonelists(NODE_DATA(nid));
         build_zonelist_cache(NODE_DATA(nid));
     }
     return 0;
}
Here for_each_online_node iterates only once, because the whole system has exactly one pglist_data!
 
static void __meminit build_zonelists(pg_data_t *pgdat)
{
     int node, local_node;
     enum zone_type i,j;
 
     local_node = pgdat->node_id;
     for (i = 0; i < MAX_NR_ZONES; i++) {
         struct zonelist *zonelist;
 
         zonelist = pgdat->node_zonelists + i;
 
         j = build_zonelists_node(pgdat, zonelist, 0, i);
         /*
          * Now we build the zonelist so that it contains the zones
          * of all the other nodes.
          * We don't want to pressure a particular node, so when
          * building the zones for node N, we make sure that the
          * zones coming right after the local ones are those from
          * node N+1 (modulo N)
          */
         for (node = local_node + 1; node < MAX_NUMNODES; node++) {
              if (!node_online(node))
                   continue;
              j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
         }
         for (node = 0; node < local_node; node++) {
              if (!node_online(node))
                   continue;
              j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
         }
 
         zonelist->zones[j] = NULL;
     }
}
In this function we have
#define NODES_SHIFT     0
 
#define MAX_NUMNODES    (1 << NODES_SHIFT)
and local_node is 0, so the function effectively reduces to:
static void __meminit build_zonelists(pg_data_t *pgdat)
{
     int node, local_node;
     enum zone_type i,j;
 
     local_node = pgdat->node_id;
     for (i = 0; i < MAX_NR_ZONES; i++) {
         struct zonelist *zonelist;
         zonelist = pgdat->node_zonelists + i;
         j = build_zonelists_node(pgdat, zonelist, 0, i);
         zonelist->zones[j] = NULL;
     }
}
In effect, we only need to look at build_zonelists_node.
/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int __meminit build_zonelists_node(pg_data_t *pgdat,
              struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
{
     struct zone *zone;
 
     BUG_ON(zone_type >= MAX_NR_ZONES);
     zone_type++;
 
     do {
         zone_type--;
         zone = pgdat->node_zones + zone_type;
          if (populated_zone(zone)) { // true whenever present_pages is nonzero
              zonelist->zones[nr_zones++] = zone;
               check_highest_zone(zone_type); // empty call
         }
 
     } while (zone_type);
     return nr_zones;
}
The kernel has two zones, ZONE_DMA and ZONE_NORMAL, but ZONE_NORMAL has size 0 and hence present_pages == 0. After initialization the zonelist->zones array therefore contains a single element pointing to ZONE_DMA, i.e. contig_page_data.node_zones[0].
 
build_zonelist_cache is called only once, from __build_all_zonelists:
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
static void __meminit build_zonelist_cache(pg_data_t *pgdat)
{
     int i;
 
     for (i = 0; i < MAX_NR_ZONES; i++)
         pgdat->node_zonelists[i].zlcache_ptr = NULL;
}
Very simple; nothing more to say.
 
nr_free_pagecache_pages is implemented as:
/*
 * Amount of free RAM allocatable within all zones
 */
unsigned int nr_free_pagecache_pages(void)
{
     return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
}
Here gfp_zone(GFP_HIGHUSER) returns the zone index GFP_HIGHUSER maps to; since the kernel only uses ZONE_DMA, the call returns 0, i.e. ZONE_DMA.
Now the implementation of nr_free_zone_pages:
static unsigned int nr_free_zone_pages(int offset)
{
     /* Just pick one node, since fallback list is circular */
     pg_data_t *pgdat = NODE_DATA(numa_node_id());
     unsigned int sum = 0;
 
     struct zonelist *zonelist = pgdat->node_zonelists + offset;
     struct zone **zonep = zonelist->zones;
     struct zone *zone;
 
     for (zone = *zonep++; zone; zone = *zonep++) {
         unsigned long size = zone->present_pages;
         unsigned long high = zone->pages_high;
         if (size > high)
              sum += size - high;
     }
 
     return sum;
}
The argument passed in is 0, and we know zonelist->zones has a single element pointing to pgdat->node_zones[0], the ZONE_DMA descriptor. The function therefore simply returns ZONE_DMA's free page count; for 64M of memory (limited to 60M) that is 0x3b6a.
 
1.1.6   Page initialization
This macro initializes the data of every page in mem_map.
#define memmap_init(size, nid, zone, start_pfn) \
     memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
         unsigned long start_pfn, enum memmap_context context)
{
     struct page *page;
     unsigned long end_pfn = start_pfn + size;
     unsigned long pfn;
 
     for (pfn = start_pfn; pfn < end_pfn; pfn++) {
         /*
          * There can be holes in boot-time mem_map[]s
          * handed to this function. They do not
          * exist on hotplugged memory.
          */
         if (context == MEMMAP_EARLY) {
              if (!early_pfn_valid(pfn))
                   continue;
              if (!early_pfn_in_nid(pfn, nid))
                   continue;
         }
         page = pfn_to_page(pfn);
         set_page_links(page, zone, nid, pfn);
         init_page_count(page);
         reset_page_mapcount(page);
         SetPageReserved(page);
         INIT_LIST_HEAD(&page->lru);
     }
}
When this function is called, size is the page count of the whole SDRAM region, 0x3bff for 64M of memory (limited to 60M); the remaining parameters are all 0.
As the code shows, the function initializes every page descriptor in turn.
pfn_to_page is defined as:
#define pfn_to_page(pfn)    virt_to_page(pfn_to_virt(pfn))
#define pfn_to_virt(pfn)    __va((pfn) << PAGE_SHIFT)
#define __va(paddr)         phys_to_virt((unsigned long)(paddr))
#define phys_to_virt(vaddr) ((void *) (vaddr))
#define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT))
In short, it maps a memory address to the struct page describing the page containing it.
/*
 * Setup the page count before being freed into the page allocator for
 * the first time (boot or memory hotplug)
 */
static inline void init_page_count(struct page *page)
{
     atomic_set(&page->_count, 1);
}
This initializes the _count member to 1.
/*
 * The atomic page->_mapcount, like _count, starts from -1:
 * so that transitions both from it and to it can be tracked,
 * using atomic_inc_and_test and atomic_add_negative(-1).
 */
static inline void reset_page_mapcount(struct page *page)
{
     atomic_set(&(page)->_mapcount, -1);
}
Simple as well: it sets _mapcount to -1, as the comment above describes.
#define SetPageReserved(page)    set_bit(PG_reserved, &(page)->flags)
As can be seen here, after initialization the flags member carries the single flag PG_reserved.
 
 
set_page_links is located in include/linux/mm.h:
static inline void set_page_links(struct page *page, enum zone_type zone,
     unsigned long node, unsigned long pfn)
{
     set_page_zone(page, zone);
     set_page_node(page, node);
     set_page_section(page, pfn_to_section_nr(pfn));
}
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
     page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
     page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
In this function zone is ZONE_DMA, i.e. 0, so it is effectively a no-op.
static inline void set_page_section(struct page *page, unsigned long var_section)
{
     page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
     page->flags |= (var_section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}
#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
#define PFN_SECTION_SHIFT 0
#define SECTIONS_PGSHIFT    (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define SECTIONS_WIDTH      0
#define SECTIONS_MASK       ((1UL << SECTIONS_WIDTH) - 1)
This function is likewise effectively a no-op.
set_page_links is therefore equivalent to an empty function.
 
 
 
The kernel defines a per-CPU page cache for each zone. Each cache holds some pre-allocated pages used to satisfy single-page requests issued by the local CPU. In fact there are two caches per zone per CPU: a hot cache, whose page frames are likely still present in the CPU's hardware cache, and a cold cache.
The kernel uses two watermarks to monitor the size of the hot and cold caches: if the page count drops below the low watermark, the kernel replenishes the cache with batch single pages from the buddy system; if it rises above the high watermark, the kernel releases batch page frames from the cache back to the buddy system.
zone_pcp_init is called from free_area_init_core and implemented in mm/page_alloc.c:
static __meminit void zone_pcp_init(struct zone *zone)
{
     int cpu;
     unsigned long batch = zone_batchsize(zone);
 
     for (cpu = 0; cpu < NR_CPUS; cpu++) {
         setup_pageset(zone_pcp(zone,cpu), batch);
     }
     if (zone->present_pages)
          printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
              zone->name, zone->present_pages, batch);
}
The function's job is also simple: initialize the pageset member of struct zone.
Here we have:
#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
zone_batchsize computes the batch size:
 
static int __devinit zone_batchsize(struct zone *zone)
{
     int batch;
 
     /*
      * The per-cpu-pages pools are set to around 1000th of the
      * size of the zone. But no more than 1/2 of a meg.
      *
      * OK, so we don't know how big the cache is. So guess.
      */
     batch = zone->present_pages / 1024;
     if (batch * PAGE_SIZE > 512 * 1024)
         batch = (512 * 1024) / PAGE_SIZE;
     batch /= 4;        /* We effectively *= 4 below */
     if (batch < 1)
         batch = 1;
 
     /*
      * Clamp the batch to a 2^n - 1 value. Having a power
      * of 2 value was found to be more likely to have
      * suboptimal cache aliasing properties in some cases.
      *
      * For example if 2 tasks are alternately allocating
      * batches of pages, one task can end up with a lot
      * of pages of one half of the possible page colors
      * and the other with pages of the other colors.
      */
     batch = (1 << (fls(batch + batch/2)-1)) - 1;
 
     return batch;
}
For 64M of memory, batch works out to 3.
inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
     struct per_cpu_pages *pcp;
 
     memset(p, 0, sizeof(*p));
 
     pcp = &p->pcp[0];      /* hot */
     pcp->count = 0;
     pcp->high = 6 * batch;
     pcp->batch = max(1UL, 1 * batch);
     INIT_LIST_HEAD(&pcp->list);
 
     pcp = &p->pcp[1];      /* cold*/
     pcp->count = 0;
     pcp->high = 2 * batch;
     pcp->batch = max(1UL, batch/2);
     INIT_LIST_HEAD(&pcp->list);
}
This function is straightforward; nothing more to add.