内核版本:2.6.11.12
1 内核节点的定义
1.1 多节点node_data的定义
选中CONFIG_DISCONTIGMEM,node_data节点定义在arch/i386/mm/discontig.c中:
struct pglist_data *node_data[MAX_NUMNODES];
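宏NODE_DATA的声明在include/asm-i386/mmzone.h中:
extern struct pglist_data *node_data[];
#define NODE_DATA(nid) (node_data[nid])
注意:include/linux/mmzone.h中下面这一段只在未选中CONFIG_DISCONTIGMEM时才生效(单节点的情况,见1.2):
375 #ifndef CONFIG_DISCONTIGMEM
376
377 extern struct pglist_data contig_page_data;
378 #define NODE_DATA(nid) (&contig_page_data)
.......
400 #endif /* !CONFIG_DISCONTIGMEM */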
1.2 单节点node_data定义
未选中CONFIG_DISCONTIGMEM,node_data节点定义在mm/page_alloc.c中:
1721 #ifndef CONFIG_DISCONTIGMEM
1722 static bootmem_data_t contig_bootmem_data;
1723 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
宏NODE_DATA的声明在include/linux/mmzone.h中:
375 #ifndef CONFIG_DISCONTIGMEM
376
377 extern struct pglist_data contig_page_data;
378 #define NODE_DATA(nid) (&contig_page_data)
2 节点初始化
2.1 节点pglist_data结构体
2.1.1结构体pglist_data的定义:
250 typedef struct pglist_data {
251 struct zone node_zones[MAX_NR_ZONES];
252 struct zonelist node_zonelists[GFP_ZONETYPES];
253 int nr_zones;
254 struct page *node_mem_map;
255 struct bootmem_data *bdata;
256 unsigned long node_start_pfn;
257 unsigned long node_present_pages; /* total number of physical pages */
258 unsigned long node_spanned_pages; /* total size of physical page
259 range, including holes */
260 int node_id;
261 struct pglist_data *pgdat_next;
262 wait_queue_head_t kswapd_wait;
263 struct task_struct *kswapd;
264 int kswapd_max_order;
265 } pg_data_t;
2.1.2 结构体bootmem_data
29 typedef struct bootmem_data {
30 unsigned long node_boot_start;
31 unsigned long node_low_pfn;
32 void *node_bootmem_map;
33 unsigned long last_offset;
34 unsigned long last_pos;
35 unsigned long last_success; /* Previous allocation point. To speed
36 * up searching */
37 } bootmem_data_t;
2.2 初始化流程:
start_kernel—>setup_arch----->setup_memory—>init_bootmem—>init_bootmem_core—>paging_init----->zone_sizes_init
---->build_all_zonelists—>build_zonelists
1)初始化bootmem
2)初始化zone大小
3)建立每个zone
2.2.1 初始化节点中的bootmem部分
说明:启动的虚拟机的内存是2G,对应的低端内存是896M,最高低端内存对应的页帧是0x38000。
bootmem内存位表的大小是:mapsize=(0x38000-0+7)/8 =0x7000
mapsize=0x7000
//内存管理位图的大小(mapsize是init_bootmem_core中的局部变量,bootmem_data结构体里并没有这个成员)
contig_page_data->bdata->node_bootmem_map=0xC0401000
//内存位图存放的地址,这个地址由init_pg_tables_end变量决定,该变量在arch/i386/kernel/head.S中赋值
contig_page_data->bdata->node_boot_start =0x0
//起始物理地址(注意是地址不是页帧号,使用时要右移PAGE_SHIFT才得到页帧号)
contig_page_data->bdata->node_low_pfn =0x38000
//结束页帧号
详细的分析参看下面链接:
3 节点中zone
3.1 计算每个zone大小
414 void __init zone_sizes_init(void)
415 {
416 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
417 unsigned int max_dma, high, low;
418
419 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
420 low = max_low_pfn;
421 high = highend_pfn;
422
423 if (low < max_dma)
424 zones_size[ZONE_DMA] = low;
425 else {
426 zones_size[ZONE_DMA] = max_dma;
427 zones_size[ZONE_NORMAL] = low - max_dma;
428 #ifdef CONFIG_HIGHMEM
429 zones_size[ZONE_HIGHMEM] = high - low;
430 #endif
431 }
432 free_area_init(zones_size);
433 }
说明:通过max_dma,max_low_pfn和highend_pfn,得出每个zone的大小。
调试打印结果(tom是调试打印的前缀):max_dma=0x1000 low=0x38000 high=0x7cffd,这样得到:
zones_size[ZONE_DMA] = 0x1000
zones_size[ZONE_NORMAL] = 0x37000
zones_size[ZONE_HIGHMEM] = 0x44ffd
3.2 分配mem_map数组
node_alloc_mem_map函数
1696 void __init node_alloc_mem_map(struct pglist_data *pgdat)
1697 {
1698 unsigned long size;
1699
1700 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1701 pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
1702 #ifndef CONFIG_DISCONTIGMEM
1703 mem_map = contig_page_data.node_mem_map;
1704 #endif
1705 }
说明:就是分配物理页框描述符数组,大小为(页框的总数+1)乘以page结构的大小(32个字节),然后pgdat->node_mem_map和mem_map都指向这个数组的起始位置。
3.3 按zone初始化mem_map数组元素
1546 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1547 unsigned long start_pfn)
1548 {
1549 struct page *start = pfn_to_page(start_pfn);
1550 struct page *page;
1551
1552 for (page = start; page < (start + size); page++) {
1553 set_page_zone(page, NODEZONE(nid, zone));
1554 set_page_count(page, 0);
1555 reset_page_mapcount(page);
1556 SetPageReserved(page);
1557 if((page-start)==1)
1558 printk(KERN_ERR "tom flag=%x _count=%x _mapcount=%x\r\n",page->flags,page->_count,page->_mapcount);
1559 INIT_LIST_HEAD(&page->lru);
1560 #ifdef WANT_PAGE_VIRTUAL
1561
1562 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1563 if (!is_highmem_idx(zone))
1564 set_page_address(page, __va(start_pfn << PAGE_SHIFT));
1565 #endif
1566 start_pfn++;
1567 }
1568 }
主要工作:
初始化物理页描述符page中的flags:flags里记录该页属于哪个node、哪个zone,并把页设置为保留页(SetPageReserved)。
_count,_mapcount字段的值初始化为-1。
3.4 初始化链表
1568 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1569 unsigned long size)
1570 {
1571 int order;
1572 for (order = 0; order < MAX_ORDER ; order++) {
1573 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1574 zone->free_area[order].nr_free = 0;
1575 }
1576 }
主要工作:
1)链表初始化为空链表
2)内存块数量初始化为0
3.5 设置zone结构数据
1589 static void __init free_area_init_core(struct pglist_data *pgdat,
1590 unsigned long *zones_size, unsigned long *zholes_size)
1591 {
1592 unsigned long i, j;
1593 const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
1594 int cpu, nid = pgdat->node_id;
1595 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1596
1597 pgdat->nr_zones = 0;
1598 init_waitqueue_head(&pgdat->kswapd_wait);
1599 pgdat->kswapd_max_order = 0;
1600
1601 for (j = 0; j < MAX_NR_ZONES; j++) {
1602 struct zone *zone = pgdat->node_zones + j;
1603 unsigned long size, realsize;
1604 unsigned long batch;
1605
1606 zone_table[NODEZONE(nid, j)] = zone;
1607 realsize = size = zones_size[j];
1608 if (zholes_size)
1609 realsize -= zholes_size[j];
1610
1611 if (j == ZONE_DMA || j == ZONE_NORMAL)
1612 nr_kernel_pages += realsize;
1613 nr_all_pages += realsize;
1614
1615 zone->spanned_pages = size;
1616 zone->present_pages = realsize;
1617 zone->name = zone_names[j];
1618 spin_lock_init(&zone->lock);
1619 spin_lock_init(&zone->lru_lock);
1620 zone->zone_pgdat = pgdat;
1621 zone->free_pages = 0;
1622
1623 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1624
1625 /*
1626 * The per-cpu-pages pools are set to around 1000th of the
1627 * size of the zone. But no more than 1/4 of a meg - there's
1628 * no point in going beyond the size of L2 cache.
1629 *
1630 * OK, so we don't know how big the cache is. So guess.
1631 */
1632 batch = zone->present_pages / 1024;
1633 if (batch * PAGE_SIZE > 256 * 1024)
1634 batch = (256 * 1024) / PAGE_SIZE;
1635 batch /= 4; /* We effectively *= 4 below */
1636 if (batch < 1)
1637 batch = 1;
1638
1639 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1640 struct per_cpu_pages *pcp;
1641
1642 pcp = &zone->pageset[cpu].pcp[0]; /* hot */
1643 pcp->count = 0;
1644 pcp->low = 2 * batch;
1645 pcp->high = 6 * batch;
1646 pcp->batch = 1 * batch;
1647 INIT_LIST_HEAD(&pcp->list);
1648
1649 pcp = &zone->pageset[cpu].pcp[1]; /* cold */
1650 pcp->count = 0;
1651 pcp->low = 0;
1652 pcp->high = 2 * batch;
1653 pcp->batch = 1 * batch;
1654 INIT_LIST_HEAD(&pcp->list);
1655 }
1656 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1657 zone_names[j], realsize, batch);
1658 INIT_LIST_HEAD(&zone->active_list);
1659 INIT_LIST_HEAD(&zone->inactive_list);
1660 zone->nr_scan_active = 0;
1661 zone->nr_scan_inactive = 0;
1662 zone->nr_active = 0;
1663 zone->nr_inactive = 0;
1664 if (!size)
1665 continue;
1667 /*
1668 * The per-page waitqueue mechanism uses hashed waitqueues
1669 * per zone.
1670 */
1671 zone->wait_table_size = wait_table_size(size);
1672 zone->wait_table_bits =
1673 wait_table_bits(zone->wait_table_size);
1674 zone->wait_table = (wait_queue_head_t *)
1675 alloc_bootmem_node(pgdat, zone->wait_table_size
1676 * sizeof(wait_queue_head_t));
1677
1678 for(i = 0; i < zone->wait_table_size; ++i)
1679 init_waitqueue_head(zone->wait_table + i);
1680
1681 pgdat->nr_zones = j+1;
1682
1683 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1684 zone->zone_start_pfn = zone_start_pfn;
1685
1686 if ((zone_start_pfn) & (zone_required_alignment-1))
1687 printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
1688 printk(KERN_ERR "tom init_core 1=%x 2=%x 3=%x 4=%x\r\n",zone_start_pfn,size,nid,j);
1689
1690 memmap_init(size, nid, j, zone_start_pfn);
1691
1692 zone_start_pfn += size;
1693
1694 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1695 }
1696 }
设置contig_page_data的zone参数,主要设置下面参数:
1)设置每个zone的大小
realsize = size = zones_size[j]
zone->spanned_pages=size;
2)设置zone属于哪个节点
zone->zone_pgdat = pgdat;
3)设置每个zone的起始页帧,每个zone的起始页帧对应的描述符的地址
1683 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1684 zone->zone_start_pfn = zone_start_pfn;
1685
1686 if ((zone_start_pfn) & (zone_required_alignment-1))
1687 printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
1692 zone_start_pfn += size;
4)初始化mem_map数组元素
memmap_init(size, nid, j, zone_start_pfn);
5)初始化zone中的链表
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
整个zone的流程是:如上图所示。
4 zone页分配策略
pg_data_t结构
250 typedef struct pglist_data {
251 struct zone node_zones[MAX_NR_ZONES];
252 struct zonelist node_zonelists[GFP_ZONETYPES];
...
265 } pg_data_t;
1)node_zones有DMA,NORMAL,HIGH三个区,在free_area_init_core函数中,初始化zone对象成员,比如zone的大小,zone的起始页帧号等等。
2)node_zonelists[GFP_ZONETYPES],每个元素是一种分配策略(即按顺序尝试哪些zone),由分配标志(gfp_mask & GFP_ZONEMASK)作为下标来选择;它并不是和DMA,NORMAL,HIGH三个zone一一对应的(见后面的打印结果)。
struct zonelist结构定义
233 struct zonelist {
234 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
235 };
在单节点中就是zones[4]。
比如:
node_zonelists[1].zones[0]=节点3.node_zones[0];
node_zonelists[1].zones[1]=节点2.node_zones[0];
node_zonelists[1].zones[2]=节点1.node_zones[0];
node_zonelists[1].zones[3]=NULL
这个意思是说:如果想在DMA zone中分配页(带__GFP_DMA标志,对应下标1),先尝试节点3的DMA ZONE,
然后尝试节点2的DMA ZONE,最后是节点1的DMA ZONE。
初始化pgdat中的zonelists
static void __init build_zonelists(pg_data_t *pgdat)
1422 {
1423 int i, j, k, node, local_node;
1424
1425 local_node = pgdat->node_id;
1426 for (i = 0; i < GFP_ZONETYPES; i++) {
1427 struct zonelist *zonelist;
1428
1429 zonelist = pgdat->node_zonelists + i;
1430 memset(zonelist, 0, sizeof(*zonelist));
1431
1432 j = 0;
1433 k = ZONE_NORMAL;
1434 if (i & __GFP_HIGHMEM)
1435 k = ZONE_HIGHMEM;
1436 if (i & __GFP_DMA)
1437 k = ZONE_DMA;
1438
1439 j = build_zonelists_node(pgdat, zonelist, j, k);
1440 /*
1441 * Now we build the zonelist so that it contains the zones
1442 * of all the other nodes.
1443 * We don't want to pressure a particular node, so when
1444 * building the zones for node N, we make sure that the
1445 * zones coming right after the local ones are those from
1446 * node N+1 (modulo N)
1447 */
1448 printk(KERN_ERR "tom %s(%s:%u)\n", __FUNCTION__, __FILE__, __LINE__);
1449 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1450 if (!node_online(node))
1451 continue;
1452 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1453 }
1454 for (node = 0; node < local_node; node++) {
1455 if (!node_online(node))
1456 continue;
1457 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1458 }
1459
1460 zonelist->zones[j] = NULL;
1461 }
1462 }
build_zonelists_node函数:
1278 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1279 {
1280 switch (k) {
1281 struct zone *zone;
1282 default:
1283 BUG();
1284 case ZONE_HIGHMEM:
1285 zone = pgdat->node_zones + ZONE_HIGHMEM;
1286 if (zone->present_pages) {
1287 printk(KERN_ERR "tom present_pages=%x k=%x\n",zone->present_pages,k);
1288 #ifndef CONFIG_HIGHMEM
1289 BUG();
1290 #endif
1291 zonelist->zones[j++] = zone;
1292 }
1293 case ZONE_NORMAL:
1294 zone = pgdat->node_zones + ZONE_NORMAL;
1295 printk(KERN_ERR "tom present_pages=%x k=%x\n",zone->present_pages,k);
1296 if (zone->present_pages)
1297 zonelist->zones[j++] = zone;
1298 case ZONE_DMA:
1299 zone = pgdat->node_zones + ZONE_DMA;
1300 printk(KERN_ERR "tom present_pages=%x k=%x\n",zone->present_pages,k);
1301 if (zone->present_pages)
1302 zonelist->zones[j++] = zone;
1303 }
1304
1305 return j;
1306 }
打印信息
tom j=0 k=1 i=0
tom present_pages=37000 k=1 j=0
tom present_pages=1000 k=1 j=1
11111111
tom j=0 k=0 i=1
tom present_pages=1000 k=0 j=0
11111111
tom j=0 k=2 i=2
tom present_pages=44ffd k=2 j=0
tom present_pages=37000 k=2 j=1
tom present_pages=1000 k=2 j=2
11111111
最后策略是:
说明:
pgdat->node_zonelists[0]----------------->策略1
pgdat->node_zonelists[1]----------------->策略2
pgdat->node_zonelists[2]----------------->策略3
分配策略和区不是一一对应的:并不是像之前想的那样,请求哪个区的页就只从哪个区分配;node_zonelists里保存的全是分配策略,这里初始化了3种分配策略。
可以通过GFP_USER & GFP_ZONEMASK标志来分配页。
pgdat->node_zonelists[0].zones[0]=zone+1
pgdat->node_zonelists[0].zones[1]=zone+0
说明:先分配normal zone页,然后dma zone页
pgdat->node_zonelists[1].zones[0]=zone+0
说明:先分配dma zone页。
pgdat->node_zonelists[2].zones[0]=zone+2
pgdat->node_zonelists[2].zones[1]=zone+1
pgdat->node_zonelists[2].zones[2]=zone+0
说明:先分配high zone页,然后normal zone页,最后dma zone页。
5 相关API
5.1 __alloc_bootmem_core
153 static void * __init
154 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
155 unsigned long align, unsigned long goal)
156 {
157 unsigned long offset, remaining_size, areasize, preferred;
158 unsigned long i, start = 0, incr, eidx;
159 void *ret;
160
161 if(!size) {
162 printk("__alloc_bootmem_core(): zero-sized request\n");
163 BUG();
164 }
165 BUG_ON(align & (align-1));
166
167 eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
168 offset = 0;
169 if (align &&
170 (bdata->node_boot_start & (align - 1UL)) != 0)
171 {
172 offset = (align - (bdata->node_boot_start & (align - 1UL)));
173 }
174 offset >>= PAGE_SHIFT;
178 /*
179 * We try to allocate bootmem pages above 'goal'
180 * first, then we try to allocate lower pages.
181 */
182 if (goal && (goal >= bdata->node_boot_start) &&
183 ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
184 preferred = goal - bdata->node_boot_start;
185
188 if (bdata->last_success >= preferred)
189 preferred = bdata->last_success;
190 } else
191 preferred = 0;
193 preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
194 preferred += offset;
195 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
196 incr = align >> PAGE_SHIFT ? : 1;
199
200 restart_scan:
201 for (i = preferred; i < eidx; i += incr) {
202 unsigned long j;
203 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
204 i = ALIGN(i, incr);
205 if (test_bit(i, bdata->node_bootmem_map))
206 continue;
207 for (j = i + 1; j < i + areasize; ++j) {
210 if (j >= eidx)
211 goto fail_block;
212 if (test_bit (j, bdata->node_bootmem_map))
213 goto fail_block;
214 }
215 start = i;
216 goto found;
217 fail_block:
218 i = ALIGN(j, incr);
225
226 if (preferred > offset) {
227 preferred = offset;
228 goto restart_scan;
229 }
230 return NULL;
231
232 found:
233 bdata->last_success = start << PAGE_SHIFT;
234 BUG_ON(start >= eidx);
235
236 /*
237 * Is the next page of the previous allocation-end the start
238 * of this allocation's buffer? If yes then we can 'merge'
239 * the previous partial page with this allocation.
240 */
241 if (align < PAGE_SIZE &&
242 bdata->last_offset && bdata->last_pos+1 == start) {
243 offset = (bdata->last_offset+align-1) & ~(align-1);
244 BUG_ON(offset > PAGE_SIZE);
245 remaining_size = PAGE_SIZE-offset;
246 if (size < remaining_size) {
247 areasize = 0;
248 /* last_pos unchanged */
249 bdata->last_offset = offset+size;
250 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
251 bdata->node_boot_start);
252 } else {
253 remaining_size = size - remaining_size;
254 areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
255 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
256 bdata->node_boot_start);
257 bdata->last_pos = start+areasize-1;
258 bdata->last_offset = remaining_size;
259 }
260 bdata->last_offset &= ~PAGE_MASK;
261 } else {
262 bdata->last_pos = start + areasize - 1;
263 bdata->last_offset = size & ~PAGE_MASK;
264 ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
265 }
266
267 /*
268 * Reserve the area now:
269 */
270 for (i = start; i < start+areasize; i++)
271 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
272 BUG();
273 memset(ret, 0, size);
274 return ret;
275 }
说明:这个函数主要是向bootmem系统请求内存,请求后返回地址,具体步骤:
1)计算出请求内存的大小对应areasize个页
2)然后从内存管理位图node_bootmem_map中找出第一个空闲的页i,再循环检查紧跟其后的areasize-1个页是否也是空闲。
3)如果从第i页开始的连续areasize个页都是空闲的,则把第i页对应的虚拟地址赋给ret并返回,同时在内存管理位图node_bootmem_map中把第i个到第i+areasize-1个物理页标记为已使用(置为1)。
4)同时记录bdata->last_success = start << PAGE_SHIFT,就是这次分配成功的起始页地址;
bdata->last_offset = size & ~PAGE_MASK;就是本次分配在最后一页内的结束偏移量
bdata->last_pos = start + areasize - 1,bootmem内存上次分配的坐标,方便下一次分配。
在include/linux/nodemask.h中
#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask) \
for ((node) = first_node(mask); \
(node) < MAX_NUMNODES; \
(node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
#define for_each_node_mask(node, mask) \
if (!nodes_empty(mask)) \
for ((node) = 0; (node) < 1; (node)++)
#endif /* MAX_NUMNODES */
#define first_node(src) __first_node(&(src))
static inline int __first_node(const nodemask_t *srcp)
{
return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}
在include/asm-i386/bitops.h中:
/* Returns the bit-number of the first set bit, not the number of
* the byte containing a bit.
 */
/*
 * Returns the bit index of the first set bit in the bitmap at addr, scanning
 * (size + 31) >> 5 32-bit words. If every scanned word is zero the result is
 * 32 * (number of words scanned), i.e. size rounded up to a multiple of 32.
 */
static inline int find_first_bit(const unsigned long *addr, unsigned size)
{
int d0, d1;
int res;
/* This looks at memory. Mark it volatile to tell gcc not to move it around */
/*
 * Operands (see the constraint lists): %ecx = word count, %edi = addr used
 * as the scan cursor, %ebx = addr kept as the base, result in %eax.
 *   xorl        eax = 0, the value scasl compares each word against
 *   repe scasl  advance edi past words equal to eax (zero) while ecx lasts
 *   jz 1f       no non-zero word found (also taken when the count is 0,
 *               because xorl left ZF set): eax stays 0, edi = end of scan
 *   leal -4     step edi back onto the non-zero word just passed
 *   bsfl        eax = index of the lowest set bit inside that word
 *   1: subl, shll $3, addl
 *               eax += (edi - addr) * 8, turning the byte offset into bits
 */
__asm__ __volatile__(
"xorl %%eax,%%eax\n\t"
"repe; scasl\n\t"
"jz 1f\n\t"
"leal -4(%%edi),%%edi\n\t"
"bsfl (%%edi),%%eax\n"
"1:\tsubl %%ebx,%%edi\n\t"
"shll $3,%%edi\n\t"
"addl %%edi,%%eax"
:"=a" (res), "=&c" (d0), "=&D" (d1)
:"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
return res;
}
在include/asm-i386/numnodes.h中:
#ifdef CONFIG_X86_NUMAQ
#define NODES_SHIFT 4
#elif defined(CONFIG_ACPI_SRAT)
#define NODES_SHIFT 3
#endif
#endif
流程:
paging_init-->zone_sizes_init-->
对node节点的初始化在arch/i386/mm/discontig.c中
for_each_online_node(nid) {
392 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
393 unsigned long *zholes_size;
394 unsigned int max_dma;
395
396 unsigned long low = max_low_pfn;
397 unsigned long start = node_start_pfn[nid];
398 unsigned long high = node_end_pfn[nid];
399
400 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
401
402 if (start > low) {
403 #ifdef CONFIG_HIGHMEM
404 BUG_ON(start > high);
405 zones_size[ZONE_HIGHMEM] = high - start;
406 #endif
407 } else {
408 if (low < max_dma)
409 zones_size[ZONE_DMA] = low;
410 else {
411 BUG_ON(max_dma > low);
412 BUG_ON(low > high);
413 zones_size[ZONE_DMA] = max_dma;
414 zones_size[ZONE_NORMAL] = low - max_dma;
415 #ifdef CONFIG_HIGHMEM
416 zones_size[ZONE_HIGHMEM] = high - low;
417 #endif
418 }
419 }
}
在mm/page_alloc.c中
nodemask_t node_online_map = { { [0] = 1UL } };
node_online_map是个结构体,结构体含有一个数组成员unsigned long bits[1],这里把bits[0]赋值为1(即只有节点0在线)。
arch/i386/mm/discontig.c
说明:
根据流程图,有三处对node进行初始化。
1)arch/i386/kernel/numaq.c
/*
 * Rebuild node_online_map from the quads_present31_0 bitmap of the system
 * configuration data, and fill node_start_pfn[] / node_end_pfn[] for every
 * node that is reported present.
 */
static void __init smp_dump_qct(void)
{
int node;
struct eachquadmem *eq;
struct sys_cfg_data *scd =
(struct sys_cfg_data*)__va(SYS_CFG_DATA_PRIV_ADDR);
/* Start from an empty online map; only nodes found present below are set. */
nodes_clear(node_online_map);
for_each_node(node) {
/* Bit number `node` of quads_present31_0 decides whether the node is used. */
if (scd->quads_present31_0 & (1 << node)) {
node_set_online(node);
eq = &scd->eq[node];
/* Convert to pages */
/* Range start: hi_shrd_mem_start - priv_mem_size. */
node_start_pfn[node] = MB_TO_PAGES(
eq->hi_shrd_mem_start - eq->priv_mem_size);
/* Range end: hi_shrd_mem_start + hi_shrd_mem_size. */
node_end_pfn[node] = MB_TO_PAGES(
eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
}
}
}
2)arch/i386/mm/discontig.c
unsigned long __init setup_memory(void)
{
---
for_each_online_node(nid) {
----
}
----
}
3)arch/i386/mm/discontig.c
void __init zone_sizes_init(void)
{
....
for_each_online_node(nid) {
.....
}
}
考虑到,2.6.11这个版本numa无法编译通过,所以暂时不去分析这个版本numa.
c语言结构体的数组成员的赋值:
#include <stdio.h>
/* Minimal aggregate whose only member is a one-element array. */
struct student
{
	int aa[1];
};

/*
 * Shows two designated-initializer spellings for an array member nested in
 * a struct, then prints each stored value in hexadecimal on its own line.
 * Returns 0.
 */
int
main(void)
{
	/* Nested braces plus an array designator for the inner array. */
	struct student a = { { [0] = 5 } };
	/* Member designator chained with an index designator. */
	struct student b = { .aa[0] = 100 };

	/* %x expects an unsigned int, so convert explicitly. */
	printf("aa=%x\n", (unsigned int)a.aa[0]);
	printf(".aa=%x\n", (unsigned int)b.aa[0]);
	return 0;
}
位图知识
#测试代码
tom@tom-linuxer:~/study$ cat test5.c
#include <stdio.h>
/*
 * User-space copies of the kernel helpers needed by the node-mask iterators
 * below.
 *
 * BITS_PER_LONG is hard-coded to 32, so the bitmap sizing only matches the
 * real width of unsigned long on a 32-bit build.
 */
#define BITS_PER_LONG 32
/* log2 of the number of nodes; defined once (a second copy was redundant). */
#define NODES_SHIFT 4
#define MAX_NUMNODES (1 << NODES_SHIFT)
/* Number of longs needed to hold `bits` bits, rounded up. */
#define BITS_TO_LONGS(bits) \
	(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
/* Declares `name` as an array of longs large enough for `bits` bits. */
#define DECLARE_BITMAP(name,bits) \
	unsigned long name[BITS_TO_LONGS(bits)]
/* One bit per possible node. */
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
/*
 * Typed min/max: both arguments are converted to `type` and evaluated exactly
 * once. Uses a GNU statement expression.
 */
#define min_t(type,x,y) \
	({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
#define max_t(type,x,y) \
	({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
/*
 * find_first_bit - locate the lowest set bit in a bitmap.
 * @addr: bitmap stored as an array of unsigned long words
 * @size: number of valid bits in the bitmap
 *
 * Scans the whole words that cover @size bits and returns the index of the
 * first set bit found. When all of those words are zero, the return value is
 * the bit count rounded up to a whole word; that is always >= @size, so a
 * caller can clamp it to its own limit. With @size == 0 nothing is read and
 * 0 is returned.
 *
 * Written in portable C instead of i386 inline assembly so that it also runs
 * on 64-bit builds; with a 32-bit unsigned long the results are the same as
 * the assembly version's.
 */
int find_first_bit(const unsigned long *addr, unsigned size)
{
	/* Assumes 8-bit bytes, which holds on every platform this test targets. */
	const unsigned word_bits = (unsigned)(sizeof(unsigned long) * 8);
	const unsigned nwords = (size + word_bits - 1) / word_bits;
	unsigned w;

	for (w = 0; w < nwords; w++) {
		unsigned long word = addr[w];
		unsigned bit = 0;

		if (!word)
			continue;
		/* Shift until the lowest set bit reaches position 0. */
		while (!(word & 1UL)) {
			word >>= 1;
			bit++;
		}
		return (int)(w * word_bits + bit);
	}
	return (int)(nwords * word_bits);
}
/*
 * find_next_bit - locate the next set bit at or after a given position.
 * @addr:   bitmap stored as an array of unsigned long words
 * @size:   number of valid bits in the bitmap (must not be negative)
 * @offset: bit index to start searching from (must not be negative)
 *
 * Returns the index of the first set bit whose index is >= @offset, looking
 * at the whole words that cover @size bits. When there is no such bit the
 * return value is >= @size (the bit count rounded up to a whole word, or
 * @offset itself if the search starts beyond that), so a caller can clamp it.
 *
 * Written in portable C: the word width comes from sizeof(unsigned long)
 * rather than a hard-coded 32, so no inline assembly or -m32 is required.
 */
int find_next_bit(const unsigned long *addr, int size, int offset)
{
	/* Assumes 8-bit bytes, which holds on every platform this test targets. */
	const unsigned word_bits = (unsigned)(sizeof(unsigned long) * 8);
	const unsigned nwords = ((unsigned)size + word_bits - 1) / word_bits;
	const unsigned limit = nwords * word_bits;
	const unsigned start = (unsigned)offset;
	unsigned long word;
	unsigned w;

	/* Starting at or past the last covered word: nothing left to read. */
	if (start >= limit)
		return offset;

	/* In the first word, ignore the bits below the starting position. */
	w = start / word_bits;
	word = addr[w] & (~0UL << (start % word_bits));

	for (;;) {
		if (word) {
			unsigned bit = 0;

			while (!(word & 1UL)) {
				word >>= 1;
				bit++;
			}
			return (int)(w * word_bits + bit);
		}
		if (++w == nwords)
			return (int)limit;
		word = addr[w];
	}
}
#define first_node(src) __first_node(&(src))
int __first_node(const nodemask_t *srcp)
{
return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}
#define next_node(n, src) __next_node((n), &(src))
int __next_node(int n, const nodemask_t *srcp)
{
return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
/*
 * Loop header: `node` starts at first_node(mask), is advanced with
 * next_node(node, mask), and the loop ends once it reaches MAX_NUMNODES.
 * `node` must be an assignable integer variable; the statement that follows
 * the macro is the loop body.
 */
#define for_each_node_mask(node, mask) \
for ((node) = first_node(mask); \
(node) < MAX_NUMNODES; \
(node) = next_node((node), (mask)))
/* Same loop over whatever object named node_online_map is in scope at the use site. */
#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
int
main()
{
nodemask_t node_online_map = { { [0] = 7UL } };
int nid;
for_each_online_node(nid){
printf("nid=%d\n",nid);
}
return 0;
}
打印结果是:
nid=0
nid=1
nid=2
说明:
使用32位x86内联汇编版本的find_first_bit/find_next_bit时,编译要加-m32选项,否则在64位环境下:
- 编译报错:register type mismatch for `bsf',报错在find_next_bit函数的汇编中
- 即使编译通过,执行时也会出错,出错在find_first_bit函数的汇编中
内核结点的数组
- 关于node_data的定义:
在include/asm-i386/mmzone.h中
extern struct pglist_data *node_data[];
在文件arch/i386/mm/discontig.c中
struct pglist_data *node_data[MAX_NUMNODES];
没打开CONFIG_DISCONTIGMEM时,是不会用到discontig.c这个文件的。
目前理解:头文件里只是extern声明,真正的定义在discontig.c中;这个全局数组没有显式初始化,所以默认被初始化为0(全是空指针)。
- 关于NODE_DATA的作用
#define NODE_DATA(nid) (node_data[nid])
- for_each_online_node(node)的作用,
就是循环,看有多少个节点,主要使用整数的node,因为定义了
nodemask_t node_online_map = { { [0] = 1UL } };也就是node=0,然后根据node这个数,结合NODE_DATA(node)来操作node_data这个数组。
下面分析怎么初始化一个节点
setup_memory---->init_bootmem—>init_bootmem_core
#参考
初始化内存域和节点