Linux伙伴系统分配器

最新推荐文章于 2024-03-01 11:19:53 发布

velanjun

最新推荐文章于 2024-03-01 11:19:53 发布

阅读量1.3k

点赞数

分类专栏： linux

linux 专栏收录该内容

124 篇文章 1 订阅

订阅专栏

一、Linux伙伴系统分配器

伙伴系统分配器大体上分为两类。__get_free_pages()类函数返回分配的第一个页面的线性地址；alloc_pages()类函数返回页面描述符地址。不管以哪种函数进行分配，最终会调用alloc_pages()进行分配页面。

为了清楚地了解其分配机制，先给出伙伴系统数据的存储框图。

也就是说，每个order对应一个free_area结构，free_area再按不同的迁移类型（migratetype）分别用链表来管理这些空闲内存块。

二、主分配函数

下面我们来看这个函数（在UMA模式下）

1. #define alloc_pages(gfp_mask, order) \

2. alloc_pages_node(numa_node_id(), gfp_mask, order)

1. static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,

2. unsigned int order)

3. {

4. /* Unknown node is current node */

5. if (nid < 0)

6. nid = numa_node_id();

8. return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));

9. }

1. static inline struct page *

2. __alloc_pages(gfp_t gfp_mask, unsigned int order,

3. struct zonelist *zonelist)

4. {

5. return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);

6. }

上层分配函数__alloc_pages_nodemask()

1. /*

2. * This is the 'heart' of the zoned buddy allocator.

3. */

4. /*上层分配函数：先尝试快速分配，失败后再进入慢速分配路径*/

5. struct page *

6. __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,

7. struct zonelist *zonelist, nodemask_t *nodemask)

8. {

9. enum zone_type high_zoneidx = gfp_zone(gfp_mask);

10. struct zone *preferred_zone;

11. struct page *page;

12.

13. /* Convert GFP flags to their corresponding migrate type */

14. int migratetype = allocflags_to_migratetype(gfp_mask);

15.

16. gfp_mask &= gfp_allowed_mask;

17. /*调试用*/

18. lockdep_trace_alloc(gfp_mask);

19. /*如果设置了__GFP_WAIT标志，说明本次分配可能睡眠；might_sleep_if()在此处标记一个可能睡眠点（主要用于调试检查）*/

20. might_sleep_if(gfp_mask & __GFP_WAIT);

21. /*故障注入检查：如果没有配置对应的宏（CONFIG_FAIL_PAGE_ALLOC），该函数直接返回0*/

22. if (should_fail_alloc_page(gfp_mask, order))

23. return NULL;

24.

25. /*

26. * Check the zones suitable for the gfp_mask contain at least one

27. * valid zone. It's possible to have an empty zonelist as a result

28. * of GFP_THISNODE and a memoryless node

29. */

30. if (unlikely(!zonelist->_zonerefs->zone))

31. return NULL;

32.

33. /* The preferred zone is used for statistics later */

34. /* 如上面的英文注释所说，preferred_zone稍后用于统计 */

35. first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);

36. if (!preferred_zone)

37. return NULL;

38.

39. /* First allocation attempt */

40. /*从pcp和伙伴系统中正常的分配内存空间*/

41. page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,

42. zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,

43. preferred_zone, migratetype);

44. if (unlikely(!page))/*如果上面没有分配到空间，调用下面函数慢速分配，允许等待和回收*/

45. page = __alloc_pages_slowpath(gfp_mask, order,

46. zonelist, high_zoneidx, nodemask,

47. preferred_zone, migratetype);

48. /*调试用*/

49. trace_mm_page_alloc(page, order, gfp_mask, migratetype);

50. return page;

51. }

三、从pcp和伙伴系统中正常的分配内存空间

函数get_page_from_freelist()

1. /*

2. * get_page_from_freelist goes through the zonelist trying to allocate

3. * a page.

4. */

5. /*遍历zonelist中的每个zone，尝试分配指定大小的内存空间*/

6. static struct page *

7. get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,

8. struct zonelist *zonelist, int high_zoneidx, int alloc_flags,

9. struct zone *preferred_zone, int migratetype)

10. {

11. struct zoneref *z;

12. struct page *page = NULL;

13. int classzone_idx;

14. struct zone *zone;

15. nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */

16. int zlc_active = 0; /* set if using zonelist_cache */

17. int did_zlc_setup = 0; /* just call zlc_setup() one time */

18. /*zone对应的下标*/

19. classzone_idx = zone_idx(preferred_zone);

20. zonelist_scan:

21. /*

22. * Scan zonelist, looking for a zone with enough free.

23. * See also cpuset_zone_allowed() comment in kernel/cpuset.c.

24. */

25. /*遍历每个zone，进行分配*/

26. for_each_zone_zonelist_nodemask(zone, z, zonelist,

27. high_zoneidx, nodemask) {

28. if (NUMA_BUILD && zlc_active &&/*在UMA模式下NUMA_BUILD为0，此条件不成立*/

29. !zlc_zone_worth_trying(zonelist, z, allowednodes))

30. continue;

31. if ((alloc_flags & ALLOC_CPUSET) &&

32. !cpuset_zone_allowed_softwall(zone, gfp_mask))

33. goto try_next_zone;

34.

35. BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);

36. /*需要关注水位*/

37. if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {

38. unsigned long mark;

39. int ret;

40. /*从flags中取的mark*/

41. mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];

42. /*如果水位正常，从本zone中分配*/

43. if (zone_watermark_ok(zone, order, mark,

44. classzone_idx, alloc_flags))

45. goto try_this_zone;

46.

47. if (zone_reclaim_mode == 0)/*如果上面检查的水位低于正常值，且没有设置页面回收值*/

48. goto this_zone_full;

49. /*在UMA模式下下面函数直接返回0*/

50. ret = zone_reclaim(zone, gfp_mask, order);

51. switch (ret) {

52. case ZONE_RECLAIM_NOSCAN:

53. /* did not scan */

54. goto try_next_zone;

55. case ZONE_RECLAIM_FULL:

56. /* scanned but unreclaimable */

57. goto this_zone_full;

58. default:

59. /* did we reclaim enough */

60. if (!zone_watermark_ok(zone, order, mark,

61. classzone_idx, alloc_flags))

62. goto this_zone_full;

63. }

64. }

65.

66. try_this_zone:/*本zone正常水位*/

67. /*先从pcp中分配，然后不行的话再从伙伴系统中分配*/

68. page = buffered_rmqueue(preferred_zone, zone, order,

69. gfp_mask, migratetype);

70. if (page)

71. break;

72. this_zone_full:

73. if (NUMA_BUILD)/*UMA模式为0*/

74. zlc_mark_zone_full(zonelist, z);

75. try_next_zone:

76. if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {

77. /*

78. * we do zlc_setup after the first zone is tried but only

79. * if there are multiple nodes make it worthwhile

80. */

81. allowednodes = zlc_setup(zonelist, alloc_flags);

82. zlc_active = 1;

83. did_zlc_setup = 1;

84. }

85. }

86.

87. if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {

88. /* Disable zlc cache for second zonelist scan */

89. zlc_active = 0;

90. goto zonelist_scan;

91. }

92. return page;/*返回页面*/

93. }

主分配函数buffered_rmqueue()

1. /*

2. * Really, prep_compound_page() should be called from __rmqueue_bulk(). But

3. * we cheat by calling it from here, in the order > 0 path. Saves a branch

4. * or two.

5. */

6. /*先考虑从pcp中分配空间，当order大于0时再考虑从伙伴系统中分配*/

7. static inline

8. struct page *buffered_rmqueue(struct zone *preferred_zone,

9. struct zone *zone, int order, gfp_t gfp_flags,

10. int migratetype)

11. {

12. unsigned long flags;

13. struct page *page;

14. int cold = !!(gfp_flags & __GFP_COLD);/*如果分配参数指定了__GFP_COLD标志，则设置cold标志*/

15. int cpu;

16.

17. again:

18. cpu = get_cpu();

19. if (likely(order == 0)) {/*分配一个页面时，使用pcp*/

20. struct per_cpu_pages *pcp;

21. struct list_head *list;

22. /*找到zone对应的pcp*/

23. pcp = &zone_pcp(zone, cpu)->pcp;

24. list = &pcp->lists[migratetype];/*pcp中对应类型的list*/

25.

26. /* 这里需要关中断，因为内存回收过程可能发送核间中断，强制每个核从每CPU

27. 缓存中释放页面。而且中断处理函数也会分配单页。 */

28. local_irq_save(flags);

29. if (list_empty(list)) {/*如果pcp中没有页面,需要补充*/

30. /*从伙伴系统中获得batch个页面

31. batch为一次分配的页面数*/

32. pcp->count += rmqueue_bulk(zone, 0,

33. pcp->batch, list,

34. migratetype, cold);

35. /*如果链表仍然为空，申请失败返回*/

36. if (unlikely(list_empty(list)))

37. goto failed;

38. }

39. /* 如果分配的页面不需要考虑硬件缓存(注意不是每CPU页面缓存)

40. ，则取出链表的最后一个节点返回给上层*/

41. if (cold)

42. page = list_entry(list->prev, struct page, lru);

43. else/* 如果要考虑硬件缓存，则取出链表的第一个页面，这个页面是最近刚释放到每CPU

44. 缓存的，缓存热度更高 */

45. page = list_entry(list->next, struct page, lru);

46.

47. list_del(&page->lru);/*从pcp中脱离*/

48. pcp->count--;/*pcp计数减一*/

49. }

50. else {/*当order大于0时，不从pcp中分配，直接从伙伴系统中分配*/

51. if (unlikely(gfp_flags & __GFP_NOFAIL)) {

52. /*

53. * __GFP_NOFAIL is not to be used in new code.

54. *

55. * All __GFP_NOFAIL callers should be fixed so that they

56. * properly detect and handle allocation failures.

57. *

58. * We most definitely don't want callers attempting to

59. * allocate greater than order-1 page units with

60. * __GFP_NOFAIL.

61. */

62. WARN_ON_ONCE(order > 1);

63. }

64. /* 关中断，并获得管理区的锁*/

65. spin_lock_irqsave(&zone->lock, flags);

66. /*从伙伴系统中相应类型的相应链表中分配空间*/

67. page = __rmqueue(zone, order, migratetype);

68. /* 已经分配了1 << order个页面，这里进行管理区空闲页面统计计数*/

69. __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));

70. spin_unlock(&zone->lock);/* 这里仅仅释放自旋锁，待后面统计计数设置完毕后再开中断*/

71. if (!page)

72. goto failed;

73. }

74. /*事件统计计数，调试*/

75. __count_zone_vm_events(PGALLOC, zone, 1 << order);

76. zone_statistics(preferred_zone, zone);

77. local_irq_restore(flags);/*恢复中断*/

78. put_cpu();

79.

80. VM_BUG_ON(bad_range(zone, page));

81.

82. /* 这里进行安全性检查，并进行一些善后工作。

83. 如果页面标志破坏，返回的页面出现了问题，则返回试图分配其他页面*/

84. if (prep_new_page(page, order, gfp_flags))

85. goto again;

86. return page;

87.

88. failed:

89. local_irq_restore(flags);

90. put_cpu();

91. return NULL;

92. }

3.1 pcp缓存补充

rmqueue_bulk()函数从伙伴系统中获得batch个页面来补充pcp，batch为一次批量分配的页面数。

1. /*

2. * Obtain a specified number of elements from the buddy allocator, all under

3. * a single hold of the lock, for efficiency. Add them to the supplied list.

4. * Returns the number of new pages which were placed at *list.

5. */

6. /*该函数每次从伙伴系统中取出一个包含2^order个页面的块，

7. 最多重复count次。目前只看到在pcp的处理中调用，

8. 此时order为0，所以返回值就是实际取得的页面数，

9. 这些页面被加入到调用者传入的pcp链表中*/

10. static int rmqueue_bulk(struct zone *zone, unsigned int order,

11. unsigned long count, struct list_head *list,

12. int migratetype, int cold)

13. {

14. int i;

15.

16. spin_lock(&zone->lock);/* 上层函数已经关了中断，这里需要操作管理区，获取管理区的自旋锁 */

17. for (i = 0; i < count; ++i) {/* 重复指定的次数，从伙伴系统中分配页面*/

18. /* 从伙伴系统中取出页面 */

19. struct page *page = __rmqueue(zone, order, migratetype);

20. if (unlikely(page == NULL))/*分配失败*/

21. break;

22.

23. /*

24. * Split buddy pages returned by expand() are received here

25. * in physical page order. The page is added to the callers and

26. * list and the list head then moves forward. From the callers

27. * perspective, the linked list is ordered by page number in

28. * some conditions. This is useful for IO devices that can

29. * merge IO requests if the physical pages are ordered

30. * properly.

31. */

32. if (likely(cold == 0))/*根据调用者的要求，将页面放到每CPU缓存链表的头部或者尾部*/

33. list_add(&page->lru, list);

34. else

35. list_add_tail(&page->lru, list);

36. set_page_private(page, migratetype);/*设置private属性为页面的迁移类型*/

37. list = &page->lru;

38. }

39. /*递减管理区的空闲页面计数*/

40. __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));

41. spin_unlock(&zone->lock);/*释放管理区的自旋锁*/

42. return i;

43. }

3.2 从伙伴系统中取出页面

__rmqueue()函数

1. /*

2. * Do the hard work of removing an element from the buddy allocator.

3. * Call me with the zone->lock already held.

4. */

5. /*依次尝试指定迁移类型、备用类型和保留类型的链表，试着分配2^order个page*/

6. static struct page *__rmqueue(struct zone *zone, unsigned int order,

7. int migratetype)

8. {

9. struct page *page;

10.

11. retry_reserve:

12. /*从指定order开始从小到大遍历,优先从指定的迁移类型链表中分配页面*/

13. page = __rmqueue_smallest(zone, order, migratetype);

14.

15. /*

16. * 如果满足以下两个条件,就从备用链表中分配页面:

17. * 快速流程没有分配到页面,需要从备用迁移链表中分配.

18. * 当前不是从保留的链表中分配.因为保留的链表是最后可用的链表,

19. * 不能从该链表分配的话,说明本管理区真的没有可用内存了.

20. */

21. if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {

22. /*order从大到小遍历，从备用链表中分配页面*/

23. page = __rmqueue_fallback(zone, order, migratetype);

24.

25. /*

26. * Use MIGRATE_RESERVE rather than fail an allocation. goto

27. * is used because __rmqueue_smallest is an inline function

28. * and we want just one call site

29. */

30. if (!page) {/* 备用链表中没有分配到页面,从保留链表中分配页面了 */

31. migratetype = MIGRATE_RESERVE;

32. goto retry_reserve;/* 跳转到retry_reserve,从保留的链表中分配页面*/

33. }

34. }

35. /*调试代码*/

36. trace_mm_page_alloc_zone_locked(page, order, migratetype);

37. return page;

38. }

3.2.1 从指定的迁移类型链表中分配页面

__rmqueue_smallest(zone, order, migratetype)从指定order开始从小到大遍历，优先从指定的迁移类型链表中分配页面。

1. /*

2. * Go through the free lists for the given migratetype and remove

3. * the smallest available page from the freelists

4. */

5. /*从给定的order开始，从小到大遍历；

6. 找到后返回页面基址，并把拆分后多余的空间放回伙伴系统*/

7. static inline

8. struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,

9. int migratetype)

10. {

11. unsigned int current_order;

12. struct free_area * area;

13. struct page *page;

14.

15. /* Find a page of the appropriate size in the preferred list */

16. for (current_order = order; current_order < MAX_ORDER; ++current_order) {

17. area = &(zone->free_area[current_order]);/*得到指定order的area*/

18. /*如果area指定类型的伙伴系统链表为空*/

19. if (list_empty(&area->free_list[migratetype]))

20. continue;/*查找下一个order*/

21. /*对应的链表不空，得到链表中数据*/

22. page = list_entry(area->free_list[migratetype].next,

23. struct page, lru);

24. list_del(&page->lru);/*从伙伴系统中删除；*/

25. rmv_page_order(page);/*移除page中order的变量*/

26. area->nr_free--;/*空闲块数减一*/

27. /*拆分大块，把多余的部分放回伙伴系统*/

28. expand(zone, page, order, current_order, area, migratetype);

29. return page;

30. }

31.

32. return NULL;

33. }

伙伴系统内存块的拆分

看一个辅助函数expand()，它把较大的内存块逐级对半拆分，并把多余的部分放回伙伴系统中对应order的空闲链表。

1. /*

2. * The order of subdivision here is critical for the IO subsystem.

3. * Please do not alter this order without good reasons and regression

4. * testing. Specifically, as large blocks of memory are subdivided,

5. * the order in which smaller blocks are delivered depends on the order

6. * they're subdivided in this function. This is the primary factor

7. * influencing the order in which pages are delivered to the IO

8. * subsystem according to empirical testing, and this is also justified

9. * by considering the behavior of a buddy system containing a single

10. * large block of memory acted on by a series of small allocations.

11. * This behavior is a critical factor in sglist merging's success.

12. *

13. * -- wli

14. */

15. /*此函数主要用于下面这种情况:

16. 分配函数从high中分割出去了low大小的内存；

17. 然后要将high留下的内存块合并放到伙伴系统中；*/

18. static inline void expand(struct zone *zone, struct page *page,

19. int low, int high, struct free_area *area,

20. int migratetype)

21. {

22. unsigned long size = 1 << high;

23.

24. while (high > low) {/*每次循环把当前块对半拆分，后一半挂回低一阶的空闲链表；

25. 循环结束时，page开头剩下的正好是2^low个页面*/

26. area--;/*减一到order减一的area*/

27. high--;/*order减一*/

28. size >>= 1;/*大小除以2*/

29. VM_BUG_ON(bad_range(zone, &page[size]));

30. /*加到指定的伙伴系统中*/

31. list_add(&page[size].lru, &area->free_list[migratetype]);

32. area->nr_free++;/*空闲块加一*/

33. set_page_order(&page[size], high);/*设置相关order*/

34. }

35. }

3.2.2 从备用链表中分配页面

1. /* Remove an element from the buddy allocator from the fallback list */

2. static inline struct page *

3. __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)

4. {

5. struct free_area * area;

6. int current_order;

7. struct page *page;

8. int migratetype, i;

10. /* Find the largest possible block of pages in the other list */

11.

12. /* 从最高阶搜索,这样可以尽量的将其他迁移列表中的大块分割,避免形成过多的碎片 */

13. for (current_order = MAX_ORDER-1; current_order >= order;

14. --current_order) {

15. for (i = 0; i < MIGRATE_TYPES - 1; i++) {

16. /*回退（fallback）到下一个migratetype*/

17. migratetype = fallbacks[start_migratetype][i];

18.

19. /* MIGRATE_RESERVE handled later if necessary */

20.

21. /* 本函数不处理MIGRATE_RESERVE类型的迁移链表,如果本函数返回NULL,

22. 则上层函数直接从MIGRATE_RESERVE中分配 */

23. if (migratetype == MIGRATE_RESERVE)

24. continue;/*访问下一个类型*/

25.

26. area = &(zone->free_area[current_order]);

27. /*如果指定order和类型的链表为空*/

28. if (list_empty(&area->free_list[migratetype]))

29. continue;/*访问下一个类型*/

30. /*得到指定类型和order的页面基址*/

31. page = list_entry(area->free_list[migratetype].next,

32. struct page, lru);

33. area->nr_free--;/*空闲块数减一*/

34.

35. /*

36. * If breaking a large block of pages, move all free

37. * pages to the preferred allocation list. If falling

38. * back for a reclaimable kernel allocation, be more

39. * agressive about taking ownership of free pages

40. */

41. if (unlikely(current_order >= (pageblock_order >> 1)) ||/* 要分割的页面是一个大页面,则将整个页面全部迁移到当前迁移类型的链表中,

42. 这样可以避免过多的碎片 */

43. start_migratetype == MIGRATE_RECLAIMABLE ||/* 目前分配的是可回收页面,这类页面有突发的特点,将页面全部迁移到可回收链表中,

44. 可以避免将其他迁移链表分割成太多的碎片 */

45. page_group_by_mobility_disabled) {/* 禁用了按可移动性分组,总是将被分割的页面迁移 */

46.

47. unsigned long pages;

48. /*移动到先前类型的伙伴系统中*/

49. pages = move_freepages_block(zone, page,

50. start_migratetype);

51.

52. /* Claim the whole block if over half of it is free */

53.

54. /* pages是移动的页面数,如果可移动的页面数量较多,

55. 则将整个大内存块的迁移类型修改 */

56. if (pages >= (1 << (pageblock_order-1)) ||

57. page_group_by_mobility_disabled)

58. /*设置页面标示*/

59. set_pageblock_migratetype(page,

60. start_migratetype);

61.

62. migratetype = start_migratetype;

63. }

64.

65. /* Remove the page from the freelists */

66. list_del(&page->lru);

67. rmv_page_order(page);

68.

69. /* Take ownership for orders >= pageblock_order */

70. if (current_order >= pageblock_order)//不小于pageblock_order的块，把它覆盖的页面块都设置相应标示

71. /*这种情况较少：pageblock_order取决于内核配置，只有分割最高几阶的大块时才会走到这里*/

72. change_pageblock_range(page, current_order,

73. start_migratetype);

74. /*拆分大块，把多余的部分放回伙伴系统*/

75. expand(zone, page, order, current_order, area, migratetype);

76.

77. trace_mm_page_alloc_extfrag(page, order, current_order,

78. start_migratetype, migratetype);

79.

80. return page;

81. }

82. }

83.

84. return NULL;

85. }

备用链表

1. /*

2. * This array describes the order lists are fallen back to when

3. * the free lists for the desirable migrate type are depleted

4. */

5. /*指定类型的链表为空时，这个数组规定

6. 依次回退到哪个类型的链表*/

7. static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {

8. [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },

9. [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },

10. [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },

11. [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */

12. };

移动到指定类型的伙伴系统中

1. /*将page所在的整个页面块中的空闲页面移动到指定类型的

2. 伙伴系统中，其实就是将页面的类型做了

3. 更改，但是是采用移动的方式

5. 功能和下面的move_freepages()类似，但是要求以

6. 页面块方式对齐*/

7. static int move_freepages_block(struct zone *zone, struct page *page,

8. int migratetype)

9. {

10. unsigned long start_pfn, end_pfn;

11. struct page *start_page, *end_page;

12.

13. /*如下是对齐操作，其中pageblock_nr_pages为1 << pageblock_order，即一个页面块包含的页面数*/

14. start_pfn = page_to_pfn(page);

15. start_pfn = start_pfn & ~(pageblock_nr_pages-1);

16. start_page = pfn_to_page(start_pfn);

17. end_page = start_page + pageblock_nr_pages - 1;

18. end_pfn = start_pfn + pageblock_nr_pages - 1;

19.

20. /* Do not cross zone boundaries */

21. if (start_pfn < zone->zone_start_pfn)

22. start_page = page;

23. /*结束边界检查*/

24. if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)

25. return 0;

26. /*调用move_freepages()完成实际的移动*/

27. return move_freepages(zone, start_page, end_page, migratetype);

28. }

1. /*

2. * Move the free pages in a range to the free lists of the requested type.

3. * Note that start_page and end_pages are not aligned on a pageblock

4. * boundary. If alignment is required, use move_freepages_block()

5. */

6. /*将指定区域段的页面移动到指定类型的

7. 伙伴系统中，其实就是将页面的类型做了更改，但是是采用移动的方式*/

8. static int move_freepages(struct zone *zone,

9. struct page *start_page, struct page *end_page,

10. int migratetype)

11. {

12. struct page *page;

13. unsigned long order;

14. int pages_moved = 0;

15.

16. #ifndef CONFIG_HOLES_IN_ZONE

17. /*

18. * page_zone is not safe to call in this context when

19. * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant

20. * anyway as we check zone boundaries in move_freepages_block().

21. * Remove at a later date when no bug reports exist related to

22. * grouping pages by mobility

23. */

24. BUG_ON(page_zone(start_page) != page_zone(end_page));

25. #endif

26.

27. for (page = start_page; page <= end_page;) {

28. /* Make sure we are not inadvertently changing nodes */

29. VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));

30.

31. if (!pfn_valid_within(page_to_pfn(page))) {

32. page++;

33. continue;

34. }

35.

36. if (!PageBuddy(page)) {

37. page++;

38. continue;

39. }

40.

41. order = page_order(page);

42. list_del(&page->lru);/*将页面块从原来的伙伴系统链表中删除*/

43. /*注意，这里移动的不是一个页面，

44. *而是以该页面为首的整个伙伴块*/

45. list_add(&page->lru,/*添加到指定order和类型下的伙伴系统链表*/

46. &zone->free_area[order].free_list[migratetype]);

47. page += 1 << order;/*跳过整个伙伴块，定位到下一个待检查的页面*/

48. pages_moved += 1 << order;/*移动的页面数*/

49. }

50.

51. return pages_moved;

52. }

四、慢速分配，允许等待和回收

1. /**

2. * 当无法快速分配页面时，如果调用者允许等待

3. ，则通过本函数进行慢速分配。

4. * 此时允许进行内存回收。

5. */

6. static inline struct page *

7. __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,

8. struct zonelist *zonelist, enum zone_type high_zoneidx,

9. nodemask_t *nodemask, struct zone *preferred_zone,

10. int migratetype)

11. {

12. const gfp_t wait = gfp_mask & __GFP_WAIT;

13. struct page *page = NULL;

14. int alloc_flags;

15. unsigned long pages_reclaimed = 0;

16. unsigned long did_some_progress;

17. struct task_struct *p = current;

18.

19. /*

20. * In the slowpath, we sanity check order to avoid ever trying to

21. * reclaim >= MAX_ORDER areas which will never succeed. Callers may

22. * be using allocators in order of preference for an area that is

23. * too large.

24. *//*参数合法性检查*/

25. if (order >= MAX_ORDER) {

26. WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));

27. return NULL;

28. }

29.

30. /*

31. * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and

32. * __GFP_NOWARN set) should not cause reclaim since the subsystem

33. * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim

34. * using a larger set of nodes after it has established that the

35. * allowed per node queues are empty and that nodes are

36. * over allocated.

37. */

38. /**

39. * 调用者指定了GFP_THISNODE标志，表示不能进行内存回收。

40. * 上层调用者应当在指定了GFP_THISNODE失败后，使用其他标志进行分配。

41. */

42. if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)

43. goto nopage;

44.

45. restart:/*唤醒kswapd线程进行后台内存回收。*/

46. wake_all_kswapd(order, zonelist, high_zoneidx);

47.

48. /*

49. * OK, we're below the kswapd watermark and have kicked background

50. * reclaim. Now things get more complex, so set up alloc_flags according

51. * to how we want to proceed.

52. */

53. /*根据分配标志确定内部标志，主要是用于水线 */

54. alloc_flags = gfp_to_alloc_flags(gfp_mask);

55.

56. /**

57. * 与快速分配流程相比，这里的分配标志使用了低的水线。

58. * 在进行内存回收操作前，我们使用低水线再尝试分配一下。

59. * 当然，不管是否允许ALLOC_NO_WATERMARKS标志，我们都将它清除。

60. */

61. /* This is the last chance, in general, before the goto nopage. */

62. page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,

63. high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,

64. preferred_zone, migratetype);

65. if (page)/*分配成功，找到页面*/

66. goto got_pg;

67.

68. rebalance:

69. /* Allocate without watermarks if the context allows */

70. /* 某些上下文，如内存回收进程及被杀死的任务，都允许它完全突破水线的限制分配内存。 */

71. if (alloc_flags & ALLOC_NO_WATERMARKS) {

72. page = __alloc_pages_high_priority(gfp_mask, order,

73. zonelist, high_zoneidx, nodemask,

74. preferred_zone, migratetype);

75. if (page)/* 在不考虑水线的情况下，分配到了内存 */

76. goto got_pg;

77. }

78.

79. /* Atomic allocations - we can't balance anything */

80. /* 调用者希望原子分配内存，此时不能等待内存回收，返回NULL */

81. if (!wait)

82. goto nopage;

83.

84. /* Avoid recursion of direct reclaim */

85. /* 调用者本身就是内存回收进程，不能进入后面的内存回收处理流程，否则死锁 */

86. if (p->flags & PF_MEMALLOC)

87. goto nopage;

88.

89. /* Avoid allocations with no watermarks from looping endlessly */

90. /**

91. * 当前线程正在被杀死，它可以完全突破水线分配内存。这里向上层返回NULL，是为了避免系统进入死循环。

92. * 当然，如果上层调用不允许失败，则死循环继续分配，等待其他线程释放一点点内存。

93. */

94. if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))

95. goto nopage;

96.

97. /* Try direct reclaim and then allocating */

98. /**

99. * 直接在内存分配上下文中进行内存回收操作。

100. */

101. page = __alloc_pages_direct_reclaim(gfp_mask, order,

102. zonelist, high_zoneidx,

103. nodemask,

104. alloc_flags, preferred_zone,

105. migratetype, &did_some_progress);

106. if (page)/* 庆幸，回收了一些内存后，满足了上层分配需求 */

107. goto got_pg;

108.

109. /*

110. * If we failed to make any progress reclaiming, then we are

111. * running out of options and have to consider going OOM

112. */

113. /* 内存回收过程没有回收到内存，系统真的内存不足了 */

114. if (!did_some_progress) {

115. /**

116. * 调用者不是文件系统的代码，允许进行文件系统操作，并且允许重试。

117. * 这里需要__GFP_FS标志可能是进入OOM流程后会杀进程或进入panic，需要文件操作。

118. */

119. if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {

120. if (oom_killer_disabled)/* 系统禁止了OOM，向上层返回NULL */

121. goto nopage;

122. /**

123. * 杀死其他进程后再尝试分配内存

124. */

125. page = __alloc_pages_may_oom(gfp_mask, order,

126. zonelist, high_zoneidx,

127. nodemask, preferred_zone,

128. migratetype);

129. if (page)

130. goto got_pg;

131.

132. /*

133. * The OOM killer does not trigger for high-order

134. * ~__GFP_NOFAIL allocations so if no progress is being

135. * made, there are no other options and retrying is

136. * unlikely to help.

137. */ /* 要求的页面数量较多，再试意义不大 */

138. if (order > PAGE_ALLOC_COSTLY_ORDER &&

139. !(gfp_mask & __GFP_NOFAIL))

140. goto nopage;

141.

142. goto restart;

143. }

144. }

145.

146. /* Check if we should retry the allocation */

147. /* 内存回收过程回收了一些内存，接下来判断是否有必要继续重试 */

148. pages_reclaimed += did_some_progress;

149. if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {

150. /* Wait for some write requests to complete then retry */

151. congestion_wait(BLK_RW_ASYNC, HZ/50);

152. goto rebalance;

153. }

154.

155. nopage:

156. /* 内存分配失败了，打印内存分配失败的警告 */

157. if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {

158. printk(KERN_WARNING "%s: page allocation failure."

159. " order:%d, mode:0x%x\n",

160. p->comm, order, gfp_mask);

161. dump_stack();

162. show_mem();

163. }

164. return page;

165. got_pg:

166. /* 运行到这里，说明成功分配了内存，这里进行内存检测调试 */

167. if (kmemcheck_enabled)

168. kmemcheck_pagealloc_alloc(page, order, gfp_mask);

169. return page;

170.

171. }

总结：Linux伙伴系统主要分配流程为

正常分配（或叫快速分配）流程：

1，如果分配的是单个页面，考虑从per CPU缓存中分配空间，如果缓存中没有页面，从伙伴系统中提取页面做补充。

2，分配多个页面时，从指定类型中分配，如果指定类型中没有足够的页面，从备用类型链表中分配。最后会试探保留类型链表。

慢速（允许等待和页面回收）分配：

3，当上面两种分配方案都不能满足要求时，考虑在页面回收、杀死进程等操作后再试。

velanjun

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Linux伙伴系统分配器

一、Linux伙伴系统分配器伙伴系统分配器大体上分为两类。__get_free_pages()类函数返回分配的第一个页面的线性地址；alloc_pages()类函数返回页面描述符地址。不管以哪种函数进行分配，最终会调用alloc_pages()进行分配页面。为清楚了解其分配制度，先给个伙伴系统数据的存储框图也就是每个order对应一个free_area结构，free_a
复制链接

扫一扫