内核的节点初始化

内核版本:2.6.11.12

1 内核节点的定义

1.1 多节点node_data的定义

选中CONFIG_DISCONTIGMEM,node_data节点定义在arch/i386/mm/discontig.c中:

struct pglist_data *node_data[MAX_NUMNODES];

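宏NODE_DATA的声明如下(注意:下面列出的是#ifndef CONFIG_DISCONTIGMEM分支,也就是未选中CONFIG_DISCONTIGMEM时的单节点定义;选中CONFIG_DISCONTIGMEM时使用的多节点定义在省略号所代表的部分中):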

375 #ifndef CONFIG_DISCONTIGMEM                                                     
376                                                                                 
377 extern struct pglist_data contig_page_data;                                     
378 #define NODE_DATA(nid)          (&contig_page_data)  
.......
400 #endif /* !CONFIG_DISCONTIGMEM */

1.2 单节点node_data定义

未选中CONFIG_DISCONTIGMEM时,没有node_data数组,唯一的节点contig_page_data定义在mm/page_alloc.c中:

1721 #ifndef CONFIG_DISCONTIGMEM                                                     
1722 static bootmem_data_t contig_bootmem_data;                                      
1723 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 

宏NODE_DATA的声明在include/linux/mmzone.h中:

375 #ifndef CONFIG_DISCONTIGMEM                                                     
376                                                                                 
377 extern struct pglist_data contig_page_data;                                                                                                      
378 #define NODE_DATA(nid)          (&contig_page_data) 

2 节点初始化

2.1 节点pglist_data结构体

2.1.1结构体pglist_data的定义:

250 typedef struct pglist_data {                                                    
251         struct zone node_zones[MAX_NR_ZONES];                                   
252         struct zonelist node_zonelists[GFP_ZONETYPES];                          
253         int nr_zones;                                                           
254         struct page *node_mem_map;                                              
255         struct bootmem_data *bdata;                                             
256         unsigned long node_start_pfn;                                           
257         unsigned long node_present_pages; /* total number of physical pages */                                                                   
258         unsigned long node_spanned_pages; /* total size of physical page        
259                                              range, including holes */          
260         int node_id;                                                            
261         struct pglist_data *pgdat_next;                                         
262         wait_queue_head_t kswapd_wait;                                          
263         struct task_struct *kswapd;                                             
264         int kswapd_max_order;                                                   
265 } pg_data_t; 

2.1.2 结构体bootmem_data

 29 typedef struct bootmem_data {                                                   
 30         unsigned long node_boot_start;                                          
 31         unsigned long node_low_pfn;                                             
 32         void *node_bootmem_map;                                                 
 33         unsigned long last_offset;                                              
 34         unsigned long last_pos;                                                 
 35         unsigned long last_success;     /* Previous allocation point.  To speed 
 36                                          * up searching */                      
 37 } bootmem_data_t;

2.2 初始化流程:

start_kernel -> setup_arch -> setup_memory -> init_bootmem -> init_bootmem_core
start_kernel -> setup_arch -> paging_init -> zone_sizes_init
start_kernel -> build_all_zonelists -> build_zonelists
1)初始化bootmem
2)初始化zone大小
3)建立每个zone的分配策略(zonelist)

2.2.1 初始化节点中的bootmem部分

说明:启动的虚拟机的内存是2G,对应的低端内存是896M,最高低端内存对应的页帧是0x38000。
bootmem内存位表的大小是:mapsize=(0x38000-0+7)/8 =0x7000

  mapsize=0x7000
  //内存位图的大小(字节)。注意:bootmem_data结构体中没有mapsize成员,该值只是按上面公式算出的位图大小
  contig_page_data.bdata->node_bootmem_map=0xC0401000
  //内存位图存放的地址,这个由init_pg_tables_end变量指定的,在kernel/head.S赋值
  contig_page_data.bdata->node_boot_start=0x0
  //起始物理地址(不是页帧号,__alloc_bootmem_core中用node_boot_start >> PAGE_SHIFT把它换算成页帧号)
  contig_page_data.bdata->node_low_pfn=0x38000
  //结束页帧号

详细的分析参看下面链接:

bootmem分配器

3 节点中zone

3.1 计算每个zone大小

414 void __init zone_sizes_init(void)                                               
415 {                                                                               
416         unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};                     
417         unsigned int max_dma, high, low;                                        
418                                                                                 
419         max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;          
420         low = max_low_pfn;                                                      
421         high = highend_pfn;                                                     
422                                                                                 
423         if (low < max_dma)                                                                                                                                                                                    
424                 zones_size[ZONE_DMA] = low;                                     
425         else {                                                                  
426                 zones_size[ZONE_DMA] = max_dma;                                 
427                 zones_size[ZONE_NORMAL] = low - max_dma;                        
428 #ifdef CONFIG_HIGHMEM                                                           
429                 zones_size[ZONE_HIGHMEM] = high - low;                          
430 #endif                                                                          
431         }                                                                       
432         free_area_init(zones_size);                                             
433 } 

说明:通过max_dma、max_low_pfn和highend_pfn,得出每个zone的大小。
调试打印(以tom为前缀)得到max_dma=0x1000 low=0x38000 high=0x7cffd,这样得到:

                 zones_size[ZONE_DMA] = 0x1000                               
                 zones_size[ZONE_NORMAL] = 0x37000                      
                 zones_size[ZONE_HIGHMEM] = 0x44ffd

3.2 分配mem_map数组

node_alloc_mem_map函数

1696 void __init node_alloc_mem_map(struct pglist_data *pgdat)                       
1697 {                                                                               
1698         unsigned long size;                                                     
1699                                                                                 
1700         size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);           
1701         pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);                  
1702 #ifndef CONFIG_DISCONTIGMEM                                                                                                                                   
1703         mem_map = contig_page_data.node_mem_map;                                
1704 #endif                                                                          
1705 } 

说明:就是分配物理页框描述符数组,大小为(节点跨越的页框总数node_spanned_pages加1)乘以page结构的大小(32个字节),然后pgdat->node_mem_map和mem_map都指向这个数组的起始位置(mem_map只在未选中CONFIG_DISCONTIGMEM时赋值)。

3.3 按zone初始化mem_map数组元素

1546 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,   
1547                 unsigned long start_pfn)                                        
1548 {                                                                                                                                                                                                            
1549         struct page *start = pfn_to_page(start_pfn);                            
1550         struct page *page;                                                      
1551                                                                                 
1552         for (page = start; page < (start + size); page++) {                     
1553                 set_page_zone(page, NODEZONE(nid, zone));                       
1554                 set_page_count(page, 0);                                        
1555                 reset_page_mapcount(page);                                      
1556                 SetPageReserved(page);                                          
1557                 if((page-start)==1)                                             
1558                        printk(KERN_ERR "tom flag=%x _count=%x _mapcount=%x\r\n",page->flags,page->_count,page->_mapcount);
1559                 INIT_LIST_HEAD(&page->lru);                                     
1560 #ifdef WANT_PAGE_VIRTUAL                                                        
1561                      
1562                 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 
1563                 if (!is_highmem_idx(zone))                                      
1564                         set_page_address(page, __va(start_pfn << PAGE_SHIFT));  
1565 #endif                                                                          
1566                 start_pfn++;                                                    
1567         }                                                                       
1568 }   

主要工作:
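初始化物理页描述符page中的flags:flags记录该页属于哪个node、哪个zone,并把页设置为保留页(SetPageReserved)。
set_page_count(page, 0)和reset_page_mapcount(page)执行后,_count、_mapcount字段的值均为-1(该版本内核中_count保存的是引用计数减1)。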

3.4 初始化链表

1568 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,                                                                                                                                      
1569                                 unsigned long size)                             
1570 {                                                                               
1571         int order;                                                              
1572         for (order = 0; order < MAX_ORDER ; order++) {                          
1573                 INIT_LIST_HEAD(&zone->free_area[order].free_list);              
1574                 zone->free_area[order].nr_free = 0;                             
1575         }                                                                       
1576 } 

主要工作:
1)链表初始化为空链表
2)内存块数量初始化为0

3.5 设置zone结构数据

1589 static void __init free_area_init_core(struct pglist_data *pgdat,               
1590                 unsigned long *zones_size, unsigned long *zholes_size)          
1591 {                                                                               
1592         unsigned long i, j;                                                     
1593         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);     
1594         int cpu, nid = pgdat->node_id;                                          
1595         unsigned long zone_start_pfn = pgdat->node_start_pfn;                   
1596                                                                                 
1597         pgdat->nr_zones = 0;                                                    
1598         init_waitqueue_head(&pgdat->kswapd_wait);                               
1599         pgdat->kswapd_max_order = 0;                                            
1600                                                                                 
1601         for (j = 0; j < MAX_NR_ZONES; j++) {                                    
1602                 struct zone *zone = pgdat->node_zones + j;                      
1603                 unsigned long size, realsize;                                   
1604                 unsigned long batch;                                            
1605                                                                                 
1606                 zone_table[NODEZONE(nid, j)] = zone;                            
1607                 realsize = size = zones_size[j];                                
1608                 if (zholes_size)                                                
1609                         realsize -= zholes_size[j];                             
1610                                                                                 
1611                 if (j == ZONE_DMA || j == ZONE_NORMAL)                          
1612                         nr_kernel_pages += realsize;                            
1613                 nr_all_pages += realsize;                                       
1614                                                                                 
1615                 zone->spanned_pages = size;                                     
1616                 zone->present_pages = realsize;                                 
1617                 zone->name = zone_names[j];                                     
1618                 spin_lock_init(&zone->lock);                                    
1619                 spin_lock_init(&zone->lru_lock);      
1620                 zone->zone_pgdat = pgdat;                                       
1621                 zone->free_pages = 0;                                           
1622                                                                                 
1623                 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;       
1624                                                                                 
1625                 /*                                                              
1626                  * The per-cpu-pages pools are set to around 1000th of the      
1627                  * size of the zone.  But no more than 1/4 of a meg - there's   
1628                  * no point in going beyond the size of L2 cache.               
1629                  *                                                              
1630                  * OK, so we don't know how big the cache is.  So guess.        
1631                  */                                                             
1632                 batch = zone->present_pages / 1024;                             
1633                 if (batch * PAGE_SIZE > 256 * 1024)                             
1634                         batch = (256 * 1024) / PAGE_SIZE;                       
1635                 batch /= 4;             /* We effectively *= 4 below */         
1636                 if (batch < 1)                                                  
1637                         batch = 1;                                                                                                                                                                           
1638                                                                                 
1639                 for (cpu = 0; cpu < NR_CPUS; cpu++) {                           
1640                         struct per_cpu_pages *pcp;      
1641                                                                                 
1642                         pcp = &zone->pageset[cpu].pcp[0];       /* hot */       
1643                         pcp->count = 0;                                         
1644                         pcp->low = 2 * batch;                                   
1645                         pcp->high = 6 * batch;                                  
1646                         pcp->batch = 1 * batch;                                 
1647                         INIT_LIST_HEAD(&pcp->list);                             
1648                                                                                 
1649                         pcp = &zone->pageset[cpu].pcp[1];       /* cold */      
1650                         pcp->count = 0;                                         
1651                         pcp->low = 0;                                           
1652                         pcp->high = 2 * batch;                                  
1653                         pcp->batch = 1 * batch;                                 
1654                         INIT_LIST_HEAD(&pcp->list);                             
1655                 }                                                               
1656                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",     
1657                                 zone_names[j], realsize, batch);                
1658                 INIT_LIST_HEAD(&zone->active_list);                             
1659                 INIT_LIST_HEAD(&zone->inactive_list);                           
1660                 zone->nr_scan_active = 0;                                       
1661                 zone->nr_scan_inactive = 0;                                     
1662                 zone->nr_active = 0;                                            
1663                 zone->nr_inactive = 0;                                          
1664                 if (!size)                                                      
1665                         continue;                                     
1667                 /*                                                              
1668                  * The per-page waitqueue mechanism uses hashed waitqueues      
1669                  * per zone.                                                    
1670                  */                                                             
1671                 zone->wait_table_size = wait_table_size(size);                  
1672                 zone->wait_table_bits =                                         
1673                         wait_table_bits(zone->wait_table_size);                 
1674                 zone->wait_table = (wait_queue_head_t *)                        
1675                         alloc_bootmem_node(pgdat, zone->wait_table_size         
1676                                                 * sizeof(wait_queue_head_t));   
1677                                                                                 
1678                 for(i = 0; i < zone->wait_table_size; ++i)                      
1679                         init_waitqueue_head(zone->wait_table + i);              
1680                                                                                 
1681                 pgdat->nr_zones = j+1;                                          
1682                                                                                 
1683                 zone->zone_mem_map = pfn_to_page(zone_start_pfn);               
1684                 zone->zone_start_pfn = zone_start_pfn;                          
1685                                                                                 
1686                 if ((zone_start_pfn) & (zone_required_alignment-1))             
1687                         printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
1688                 printk(KERN_ERR "tom init_core 1=%x 2=%x 3=%x 4=%x\r\n",zone_start_pfn,size,nid,j);
1689                                                                                 
1690                 memmap_init(size, nid, j, zone_start_pfn);                      
1691                                                                                 
1692                 zone_start_pfn += size;                                         
1693                                                                                 
1694                 zone_init_free_lists(pgdat, zone, zone->spanned_pages);         
1695         }                                                                       
1696 }  

设置contig_page_data的zone参数,主要设置下面参数:
1)设置每个zone的大小

realsize = size = zones_size[j]
zone->spanned_pages=size;

2)设置zone属于哪个节点

zone->zone_pgdat = pgdat;

3)设置每个zone的起始页帧,每个zone的起始页帧对应的描述符的地址

1683                 zone->zone_mem_map = pfn_to_page(zone_start_pfn);               
1684                 zone->zone_start_pfn = zone_start_pfn;                                                                                                                                                       
1685                                                                                 
1686                 if ((zone_start_pfn) & (zone_required_alignment-1))             
1687                         printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
                                                                 
1692                 zone_start_pfn += size;   

4)初始化mem_map数组元素

memmap_init(size, nid, j, zone_start_pfn);
5)初始化zone中的链表
zone_init_free_lists(pgdat, zone, zone->spanned_pages);

整个zone的流程是:如上图所示。

zone页分配策略

pg_data_t结构

250 typedef struct pglist_data {                                                    
251         struct zone node_zones[MAX_NR_ZONES];                                   
252         struct zonelist node_zonelists[GFP_ZONETYPES]; 
...
265 } pg_data_t;

1)node_zones有DMA,NORMAL,HIGH三个区,在free_area_init_core函数中,初始化zone对象成员,比如zone的大小,zone的起始页帧号等等。

2)node_zonelists[],按GFP区域标志索引,每个元素是一种分配页的策略,本文分析其中3个元素;从下文的打印可以看到,下标0、1、2对应的起始zone分别是NORMAL、DMA、HIGHMEM。

struct zonelist结构定义

233 struct zonelist {                                                               
234         struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited                                                                   
235 };

在单节点中就是zones[4]。

比如:
node_zonelists[0].zones[0]=节点3.node_zones[0];
node_zonelists[0].zones[1]=节点2.node_zones[0];
node_zonelists[0].zones[2]=节点1.node_zones[0];
node_zonelists[0].zones[3]=NULL
这个假设的多节点例子的意思是:使用这个zonelist分配页时,先尝试节点3的DMA ZONE,
然后尝试节点2的DMA ZONE,最后是节点1的DMA ZONE,遇到NULL表示列表结束。

初始化pgdat中的zonelists

static void __init build_zonelists(pg_data_t *pgdat)                            
1422 {                                                                               
1423         int i, j, k, node, local_node;                                          
1424                                                                                 
1425         local_node = pgdat->node_id;                                                                                                            
1426         for (i = 0; i < GFP_ZONETYPES; i++) {                                   
1427                 struct zonelist *zonelist;                                      
1428                                                                                 
1429                 zonelist = pgdat->node_zonelists + i;                           
1430                 memset(zonelist, 0, sizeof(*zonelist));                         
1431                                                                                 
1432                 j = 0;                                                          
1433                 k = ZONE_NORMAL;                                                
1434                 if (i & __GFP_HIGHMEM)                                          
1435                         k = ZONE_HIGHMEM;                                       
1436                 if (i & __GFP_DMA)                                              
1437                         k = ZONE_DMA;                                           
1438                                                                                 
1439                 j = build_zonelists_node(pgdat, zonelist, j, k);                
1440                 /*                                                              
1441                  * Now we build the zonelist so that it contains the zones      
1442                  * of all the other nodes.                                      
1443                  * We don't want to pressure a particular node, so when                                                                         
1444                  * building the zones for node N, we make sure that the         
1445                  * zones coming right after the local ones are those from       
1446                  * node N+1 (modulo N)                                          
1447                  */                                                             
1448                 printk(KERN_ERR "tom %s(%s:%u)\n", __FUNCTION__, __FILE__, __LINE__);
1449                 for (node = local_node + 1; node < MAX_NUMNODES; node++) {      
1450                         if (!node_online(node))                                 
1451                                 continue;                                       
1452                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1453                 }                                                               
1454                 for (node = 0; node < local_node; node++) {                     
1455                         if (!node_online(node))                                 
1456                                 continue;                                       
1457                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1458                 }                                                               
1459                                                                                 
1460                 zonelist->zones[j] = NULL;                                      
1461         }                                                                       
1462 }        

build_zonelists_node函数:

1278 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1279 {                                                                               
1280         switch (k) {                                                            
1281                 struct zone *zone;                                              
1282         default:                                                                
1283                 BUG();                                                                                                                          
1284         case ZONE_HIGHMEM:                                                      
1285                 zone = pgdat->node_zones + ZONE_HIGHMEM;                        
1286                 if (zone->present_pages) {                                      
1287         printk(KERN_ERR "tom present_pages=%x k=%x\n",zone->present_pages,k);   
1288 #ifndef CONFIG_HIGHMEM                                                          
1289                         BUG();                                                  
1290 #endif                                                                          
1291                         zonelist->zones[j++] = zone;                            
1292                 }                                                               
1293         case ZONE_NORMAL:                                                       
1294                 zone = pgdat->node_zones + ZONE_NORMAL;                         
1295         printk(KERN_ERR "tom present_pages=%x k=%x\n",zone->present_pages,k);   
1296                 if (zone->present_pages)                                        
1297                         zonelist->zones[j++] = zone;                            
1298         case ZONE_DMA:                                                          
1299                 zone = pgdat->node_zones + ZONE_DMA;                            
1300         printk(KERN_ERR "tom present_pages=%x k=%x\n",zone->present_pages,k);   
1301                 if (zone->present_pages)                                        
1302                         zonelist->zones[j++] = zone;                            
1303         }                                                                       
1304                                                                                 
1305         return j;                                                               
1306 }  

打印信息

tom j=0 k=1 i=0
tom present_pages=37000 k=1 j=0
tom present_pages=1000 k=1 j=1
11111111
tom j=0 k=0 i=1
tom present_pages=1000 k=0 j=0
11111111
tom j=0 k=2 i=2
tom present_pages=44ffd k=2 j=0
tom present_pages=37000 k=2 j=1
tom present_pages=1000 k=2 j=2
11111111

最后策略是:

说明:
pgdat->node_zonelists[0]----------------->策略1
pgdat->node_zonelists[1]----------------->策略2
pgdat->node_zonelists[2]----------------->策略3

分配策略和区不是一一对应的:并不是之前想的请求哪个区的页就只从哪个区分配,而是按分配策略(zonelist)依次尝试其中的各个zone;这里初始化了3种分配策略。
可以通过GFP_USER & GFP_ZONEMASK这样的标志位来选择分配页时使用的策略。

pgdat->node_zonelists[0].zones[0]=zone+1  
pgdat->node_zonelists[0].zones[1]=zone+0
说明:先分配normal zone页,然后dma zone页

pgdat->node_zonelists[1].zones[0]=zone+0
说明:只从dma zone分配页。

pgdat->node_zonelists[2].zones[0]=zone+2  
pgdat->node_zonelists[2].zones[1]=zone+1
pgdat->node_zonelists[2].zones[2]=zone+0
说明:先分配high zone页,然后normal zone页,最后dma zone页。

4 相关API

4.1 __alloc_bootmem_core

153 static void * __init                                                                                                                                                                                          
154 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,            
155                 unsigned long align, unsigned long goal)                        
156 {                                                                               
157         unsigned long offset, remaining_size, areasize, preferred;              
158         unsigned long i, start = 0, incr, eidx;                                 
159         void *ret;                                                              
160                                                                                 
161         if(!size) {                                                             
162                 printk("__alloc_bootmem_core(): zero-sized request\n");         
163                 BUG();                                                          
164         }                                                                       
165         BUG_ON(align & (align-1));                                              
166                                                                                 
167         eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);    
168         offset = 0;                                                             
169         if (align &&                                                            
170             (bdata->node_boot_start & (align - 1UL)) != 0)                      
171         {                                                                       
172                 offset = (align - (bdata->node_boot_start & (align - 1UL)));    
173         }                                                                       
174         offset >>= PAGE_SHIFT;                                                  
                          
178         /*                                                                      
179          * We try to allocate bootmem pages above 'goal'                        
180          * first, then we try to allocate lower pages.                          
181          */                                                                     
182         if (goal && (goal >= bdata->node_boot_start) &&                         
183             ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {                     
184                 preferred = goal - bdata->node_boot_start;                      
185                                                                                 

188                 if (bdata->last_success >= preferred)                           
189                         preferred = bdata->last_success;                        
190         } else                                                                  
191                 preferred = 0;                   
193         preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;     
194         preferred += offset;                                                    
195         areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;                                
196         incr = align >> PAGE_SHIFT ? : 1;                                       

199                                                                                 
200 restart_scan:                                                                   
201         for (i = preferred; i < eidx; i += incr) {                              
202                 unsigned long j;                                                
203                 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);       
204                 i = ALIGN(i, incr);                                             
205                 if (test_bit(i, bdata->node_bootmem_map))                       
206                         continue;                                               
207                 for (j = i + 1; j < i + areasize; ++j) {                        
     
210                         if (j >= eidx)                                          
211                                 goto fail_block;                                
212                         if (test_bit (j, bdata->node_bootmem_map))              
213                                 goto fail_block;                                
214                 }                                                               
215                 start = i;                                                      
216                 goto found;                                                     
217         fail_block:                                                             
218                 i = ALIGN(j, incr);                                             
                                                               
225                                                                                 
226         if (preferred > offset) {                                               
227                 preferred = offset;                                             
228                 goto restart_scan;                                              
229         }                                                                                                                                                                                                     
230         return NULL;            
231                                                                                 
232 found:                                                                          
233         bdata->last_success = start << PAGE_SHIFT;                              
234         BUG_ON(start >= eidx);                                                  
235                                                                                 
236         /*                                                                      
237          * Is the next page of the previous allocation-end the start            
238          * of this allocation's buffer? If yes then we can 'merge'              
239          * the previous partial page with this allocation.                      
240          */                                                                     
241         if (align < PAGE_SIZE &&                                                
242             bdata->last_offset && bdata->last_pos+1 == start) {                 
243                 offset = (bdata->last_offset+align-1) & ~(align-1);             
244                 BUG_ON(offset > PAGE_SIZE);                                     
245                 remaining_size = PAGE_SIZE-offset;                              
246                 if (size < remaining_size) {                                    
247                         areasize = 0;                                           
248                         /* last_pos unchanged */                                
249                         bdata->last_offset = offset+size;                       
250                         ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 
251                                                 bdata->node_boot_start);        
252                 } else {                                                        
253                         remaining_size = size - remaining_size;                 
254                         areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;      
255                         ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 
256                                                 bdata->node_boot_start);        
257                         bdata->last_pos = start+areasize-1;                     
258                         bdata->last_offset = remaining_size;                    
259                 }                                                               
260                 bdata->last_offset &= ~PAGE_MASK;                               
261         } else {                                                                
262                 bdata->last_pos = start + areasize - 1;                         
263                 bdata->last_offset = size & ~PAGE_MASK;                         
264                 ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); 
265         }                                                                       
266                                                                                 
267         /*                                                                                                                                                                                                    
268          * Reserve the area now:                                                
269          */                                                                     
270         for (i = start; i < start+areasize; i++)                                
271                 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))     
272                         BUG();                                                       
273         memset(ret, 0, size);                                                   
274         return ret;                                                             
275 }   

说明:这个函数主要是向bootmem系统请求内存,请求后返回地址,具体步骤:
1)计算出请求内存的大小对应areasize个页
2)然后从内存管理位图node_bootmem_map找出第i个空闲的页,然后接着这个页,循环看接下来的areasize个页是否也是空闲。
3)如果第i个接下来的areasize个页也是空闲的,则把i对应的虚拟地址给ret,返回,同时在内存管理位图node_bootmem_map记录第i个到i+areasize-1个物理页已经使用,置为1.
4)同时在bdata->last_success = start << PAGE_SHIFT,就是这次分配成功的开始页地址;
bdata->last_offset = size & ~PAGE_MASK;即size对页大小取余,表示本次分配在最后一页内结束处的页内偏移量
bdata->last_pos = start + areasize - 1,bootmem内存上次分配的坐标,方便下一次分配。

在include/linux/nodemask.h中

/* Iterate `node` over the bits set in the global node_online_map. */
#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
/* Node mask: a bitmap named `bits` with room for MAX_NUMNODES bits. */
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
#if MAX_NUMNODES > 1
/*
 * Multi-node build: start at first_node(mask), advance with next_node(),
 * and stop as soon as the value reaches MAX_NUMNODES.
 */
#define for_each_node_mask(node, mask)                  \
         for ((node) = first_node(mask);                 \
                 (node) < MAX_NUMNODES;                  \
                 (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
/*
 * Single-node build: the body runs exactly once with node == 0, and only
 * when the mask is not empty.
 */
#define for_each_node_mask(node, mask)                  \
         if (!nodes_empty(mask))                         \
                 for ((node) = 0; (node) < 1; (node)++)
#endif /* MAX_NUMNODES */

#define first_node(src) __first_node(&(src))                                         static inline int __first_node(const nodemask_t *srcp)                          
  {                                                                               
  return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));                               
  } 
                                                                            

在include/asm-i386/bitops.h中:

/* Returns the bit-number of the first set bit, not the number of
 * the byte containing a bit.
 */

/*
 * find_first_bit - locate the first set bit in a bitmap (i386 inline asm)
 * @addr: start of the bitmap; it is scanned as consecutive 32-bit words
 * @size: bitmap length in bits; (size + 31) >> 5 words are scanned
 *
 * Returns the zero-based index of the first set bit. If every scanned word
 * is zero the result is 32 * ((size + 31) >> 5), i.e. a value >= size.
 */
static inline int find_first_bit(const unsigned long *addr, unsigned size)
{
        int d0, d1;     /* dummy outputs: ECX and EDI are modified by the asm */
        int res;

        /* This looks at memory. Mark it volatile to tell gcc not to move it around */
         __asm__ __volatile__(
                 "xorl %%eax,%%eax\n\t"          /* EAX = 0: the value scasl compares against */
                 "repe; scasl\n\t"               /* skip words equal to 0 while ECX != 0 */
                 "jz 1f\n\t"                     /* all words were zero: keep EAX == 0 */
                 "leal -4(%%edi),%%edi\n\t"      /* step EDI back onto the non-zero word */
                 "bsfl (%%edi),%%eax\n"          /* EAX = lowest set bit inside that word */
                 "1:\tsubl %%ebx,%%edi\n\t"      /* EDI = byte offset from addr (EBX) */
                 "shll $3,%%edi\n\t"             /* bytes -> bits */
                 "addl %%edi,%%eax"              /* bit offset of the word + bit inside it */
                 :"=a" (res), "=&c" (d0), "=&D" (d1)
                 /* inputs: ECX = word count, EDI = addr (tied to outputs 1 and 2), EBX = addr */
                 :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
         return res;
}

在include/asm-i386/numnodes.h中:

#ifdef CONFIG_X86_NUMAQ
#define NODES_SHIFT     4
#elif defined(CONFIG_ACPI_SRAT)
#define NODES_SHIFT     3
#endif
#endif

流程:

paging_init-->zone_sizes_init-->

对node节点的初始化在arch/i386/mm/discontig.c中

for_each_online_node(nid) { 
392                 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
393                 unsigned long *zholes_size;                                     
394                 unsigned int max_dma;                                           
395                                                                                 
396                 unsigned long low = max_low_pfn;                                
397                 unsigned long start = node_start_pfn[nid];                      
398                 unsigned long high = node_end_pfn[nid];                         
399                                                                                 
400                 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;  
401                                                                                 
402                 if (start > low) {                                              
403 #ifdef CONFIG_HIGHMEM                                                           
404                         BUG_ON(start > high);                                   
405                         zones_size[ZONE_HIGHMEM] = high - start;                
406 #endif                                                                          
407                 } else {                                                        
408                         if (low < max_dma)                                      
409                                 zones_size[ZONE_DMA] = low;                     
410                         else {                                                  
411                                 BUG_ON(max_dma > low);                          
412                                 BUG_ON(low > high);                             
413                                 zones_size[ZONE_DMA] = max_dma;                 
414                                 zones_size[ZONE_NORMAL] = low - max_dma;        
415 #ifdef CONFIG_HIGHMEM                                                           
416                                 zones_size[ZONE_HIGHMEM] = high - low;          
417 #endif                                                                          
418                         }                                                       
419                 }  



}

在mm/page_alloc.c中

/*
 * Initial online-node mask: word 0 of the bitmap is set to 1, so only
 * bit 0 (node 0) is marked.
 */
nodemask_t node_online_map = { { [0] = 1UL } };

node_online_map是个结构体(类型为nodemask_t),结构体含有一个数组成员unsigned long bits[1],这里把bits[0]赋值为1,即只有第0位(节点0)被置位。

arch/i386/mm/discontig.c

                                       
Created with Raphaël 2.2.0 Start start_kernel setup_arch ifdef CONFIG_DISCONTIGMEM? arch/i386/kernel/setup.c->setup_memory arch/i386/mm/init.c->paging_init ifdef CONFIG_DISCONTIGMEM? arch/i386/mm/discontig.c->zone_sizes_init End arch/i386/kernel/setup.c->zone_sizes_init free_area_init_node free_area_init_core arch/i386/mm/discontig.c->setup_memory get_memcfg_numa get_memcfg_numaq arch/i386/kernel/numaq.c->smp_dump_qct yes no yes no

说明:
根据流程图,有三处对node进行初始化。
1)arch/i386/kernel/numaq.c

  /*
   * Rebuild node_online_map from the system configuration data mapped at
   * SYS_CFG_DATA_PRIV_ADDR: every node whose bit is set in
   * quads_present31_0 is marked online and its start/end page frame
   * numbers are recorded in node_start_pfn[] / node_end_pfn[].
   */
  static void __init smp_dump_qct(void)
  {
          struct sys_cfg_data *cfg =
                  (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
          int nid;

          /* Start from an empty map; only flagged nodes are added back. */
          nodes_clear(node_online_map);

          for_each_node(nid) {
                  struct eachquadmem *quad;

                  /* Bit `nid` clear in the presence mask: nothing to record. */
                  if (!(cfg->quads_present31_0 & (1 << nid)))
                          continue;

                  node_set_online(nid);
                  quad = &cfg->eq[nid];

                  /* Convert to pages */
                  node_start_pfn[nid] = MB_TO_PAGES(
                          quad->hi_shrd_mem_start - quad->priv_mem_size);
                  node_end_pfn[nid] = MB_TO_PAGES(
                          quad->hi_shrd_mem_start + quad->hi_shrd_mem_size);
          }
  }

2)arch/i386/mm/discontig.c

unsigned long __init setup_memory(void)
{
---
for_each_online_node(nid) {
----
}
----
}

3)arch/i386/mm/discontig.c

void __init zone_sizes_init(void)
{
....
for_each_online_node(nid) {
.....
}

}

考虑到,2.6.11这个版本numa无法编译通过,所以暂时不去分析这个版本numa.

c语言结构体的数组成员的赋值:

#include <stdio.h>                                                              
/* Minimal struct with a one-element array member, used to demonstrate
 * designated initializers that target an array element. */
struct student
{
    int aa[1];
};

/*
 * Initializes the array member in two equivalent ways and prints both
 * values in hexadecimal. Returns 0.
 */
int
main(void)
{
    /* Outer braces: the struct; inner braces: the array; [0] picks the element. */
    struct student a = {{[0] = 5}};
    /* Chained designator: member name followed by the element index. */
    struct student b = {.aa[0] = 100};

    /* %x expects an unsigned int, so convert explicitly. */
    printf("aa=%x", (unsigned int)a.aa[0]);
    printf(".aa=%x", (unsigned int)b.aa[0]);
    return 0;
}

位图知识

#测试代码

tom@tom-linuxer:~/study$ cat test5.c
#include <stdio.h>
/* Bits per bitmap word assumed by the helpers below (32-bit build). */
#define BITS_PER_LONG 32
/* log2 of the number of node slots. */
#define NODES_SHIFT 4
#define MAX_NUMNODES    (1 << NODES_SHIFT)
/* Number of unsigned longs needed to hold `bits` bits, rounded up. */
#define BITS_TO_LONGS(bits) \
	(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
/* Declare `name` as an unsigned long array large enough for `bits` bits. */
#define DECLARE_BITMAP(name,bits) \
	unsigned long name[BITS_TO_LONGS(bits)]
/* Node mask: a bitmap named `bits` with room for MAX_NUMNODES bits. */
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;

/*
 * min/max with both operands first converted to `type`. Each argument is
 * evaluated exactly once (GNU statement expression).
 */
#define min_t(type,x,y) \
	({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
#define max_t(type,x,y) \
	({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })



/*
 * find_first_bit - locate the first set bit in a bitmap (i386 inline asm)
 * @addr: start of the bitmap; it is scanned as consecutive 32-bit words
 * @size: bitmap length in bits; (size + 31) >> 5 words are scanned
 *
 * Returns the zero-based index of the first set bit. If every scanned word
 * is zero the result is 32 * ((size + 31) >> 5), i.e. a value >= size.
 */
int find_first_bit(const unsigned long *addr, unsigned size)
{
	int d0, d1;	/* dummy outputs: ECX and EDI are modified by the asm */
	int res;

	/* This looks at memory. Mark it volatile to tell gcc not to move it around */
	__asm__ __volatile__(
		"xorl %%eax,%%eax\n\t"		/* EAX = 0: the value scasl compares against */
		"repe; scasl\n\t"		/* skip words equal to 0 while ECX != 0 */
		"jz 1f\n\t"			/* all words were zero: keep EAX == 0 */
		"leal -4(%%edi),%%edi\n\t"	/* step EDI back onto the non-zero word */
		"bsfl (%%edi),%%eax\n"		/* EAX = lowest set bit inside that word */
		"1:\tsubl %%ebx,%%edi\n\t"	/* EDI = byte offset from addr (EBX) */
		"shll $3,%%edi\n\t"		/* bytes -> bits */
		"addl %%edi,%%eax"		/* bit offset of the word + bit inside it */
		:"=a" (res), "=&c" (d0), "=&D" (d1)
		/* inputs: ECX = word count, EDI = addr (tied to outputs 1 and 2), EBX = addr */
		:"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
	return res;
}
/*
 * find_next_bit - locate the next set bit at or after a given position
 * @addr:   start of the bitmap
 * @size:   bitmap length in bits
 * @offset: bit index at which the search starts
 *
 * Returns the index of the first set bit whose index is >= offset. When no
 * such bit is found, the returned sum depends on find_first_bit()'s
 * "nothing found" value; __next_node() clamps it with min_t().
 */
int find_next_bit(const unsigned long *addr, int size, int offset)
{
	/* Word holding bit `offset`: the index math treats every element as 32 bits. */
	const unsigned long *p = addr + (offset >> 5);
	/* `bit` is the position of `offset` inside that word. */
	int set = 0, bit = offset & 31, res;

	if (bit) {
		/*
		 * Look for nonzero in the first 32 bits:
		 * the word is shifted right by `bit` so that bits below
		 * `offset` are discarded. bsfl stores the lowest set bit in
		 * `set`; if the shifted word is zero, the jne falls through
		 * and `set` becomes 32.
		 */
		__asm__("bsfl %1,%0\n\t"
			"jne 1f\n\t"
			"movl $32, %0\n"
			"1:"
			: "=r" (set)
			: "r" (*p >> bit));
		/* Hit inside the partial word: convert back to an absolute index. */
		if (set < (32 - bit))
			return set + offset;
		/* Miss: account for the bits skipped up to the next word boundary. */
		set = 32 - bit;
		p++;
	}
	/*
	 * No set bit yet, search remaining full words for a bit
	 */
	res = find_first_bit (p, size - 32 * (p - addr));
	return (offset + set + res);
}
#define first_node(src) __first_node(&(src))
int __first_node(const nodemask_t *srcp)
{
	return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

#define next_node(n, src) __next_node((n), &(src))
int __next_node(int n, const nodemask_t *srcp)
{
	return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
/*
 * Loop over the set bits of `mask`, assigning each index to `node`.
 * first_node()/next_node() never return more than MAX_NUMNODES, and
 * reaching MAX_NUMNODES is what ends the loop.
 */
#define for_each_node_mask(node, mask)			\
	for ((node) = first_node(mask);			\
		(node) < MAX_NUMNODES;			\
		(node) = next_node((node), (mask)))
/* Same loop over whatever object named node_online_map is visible at the call site. */
#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
int
main()
{

	nodemask_t node_online_map = { { [0] = 7UL } };
	int nid;
	for_each_online_node(nid){
	    printf("nid=%d\n",nid);
	}
	return 0;
}

打印结果是:
nid=0
nid=1
nid=2

说明:
编译要加-m32选项,要不然会报错:

  1. register type mismatch for `bsf’
    报错在find_next_bit函数的汇编中
  2. 执行时会报错,报错在find_first_bit函数的汇编中

内核结点的数组

  1. 关于node_data的定义:
    在include/asm-i386/mmzone.h中
extern struct pglist_data *node_data[];

在文件arch/i386/mm/discontig.c中

struct pglist_data *node_data[MAX_NUMNODES];

没打开CONFIG_DISCONTIGMEM时,是不会用到discontig.c这个文件的。

目前理解:就是使用外部node_data,外部没定义,就默认是初始化。

  1. 关于NODE_DATA的作用
    #define NODE_DATA(nid) (node_data[nid])

  2. for_each_online_node(node)的作用,
    就是循环,看有多少个节点,主要使用整数的node,因为定义了
    nodemask_t node_online_map = { { [0] = 1UL } };也就是node=0,然后根据node这个数,结合NODE_DATA(node)来操作node_data这个数组。

下面分析怎么初始化一个节点

setup_memory---->init_bootmem—>init_bootmem_core

#参考
初始化内存域和节点

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值