本文档主要介绍page allocator,page allocator是buddy system的前端,在请求页框分配的时候会先调用page allocator里的函数,page allocator再去调用buddy system里的函数分配实际的页框。page allocator分配时对外的接口是alloc_pages函数:
- 98static inline struct page *
- 99alloc_pages(unsigned int gfp_mask, unsigned int order)
- 100{
- 101 if (unlikely(order >= MAX_ORDER))
- 102 return NULL;
- 103
- 104 return alloc_pages_current(gfp_mask, order);
- 105}
- 先判断一下order,order是将要分配的一组连续页的对数,可以分配2^order个page。这里的MAX_ORDER为11,因为buddy system维护11个空闲块链表(order为0~10),所以合法的最大order是10,order >= MAX_ORDER时直接返回NULL
- 调用alloc_pages_current
- 783struct page *alloc_pages_current(unsigned gfp, unsigned order)
- 784{
- 785 struct mempolicy *pol = current->mempolicy;
- 786
- 787 if (!pol || in_interrupt())
- 788 pol = &default_policy;
- 789 if (pol->policy == MPOL_INTERLEAVE)
- 790 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- 791 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
- 792}
- 691/*
- 692 * This is the 'heart' of the zoned buddy allocator.
- 693 */
- 694struct page * fastcall
- 695__alloc_pages(unsigned int gfp_mask, unsigned int order,
- 696 struct zonelist *zonelist)
- 697{
- 698 const int wait = gfp_mask & __GFP_WAIT;
- 699 struct zone **zones, *z;
- 700 struct page *page;
- 701 struct reclaim_state reclaim_state;
- 702 struct task_struct *p = current;
- 703 int i;
- 704 int classzone_idx;
- 705 int do_retry;
- 706 int can_try_harder;
- 707 int did_some_progress;
- 708
- 709 might_sleep_if(wait);
- 710
- 711 /*
- 712 * The caller may dip into page reserves a bit more if the caller
- 713 * cannot run direct reclaim, or is the caller has realtime scheduling
- 714 * policy
- 715 */
- 716 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
- 717
- 718 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
- 719
- 720 if (unlikely(zones[0] == NULL)) {
- 721 /* Should this ever happen?? */
- 722 return NULL;
- 723 }
- 724
- 725 classzone_idx = zone_idx(zones[0]);
- 726
- 727 restart:
- 728 /* Go through the zonelist once, looking for a zone with enough free */
- 729 for (i = 0; (z = zones[i]) != NULL; i++) {
- 730
- 731 if (!zone_watermark_ok(z, order, z->pages_low,
- 732 classzone_idx, 0, 0))
- 733 continue;
- 734
- 735 page = buffered_rmqueue(z, order, gfp_mask);
- 736 if (page)
- 737 goto got_pg;
- 738 }
- 739
- 740 for (i = 0; (z = zones[i]) != NULL; i++)
- 741 wakeup_kswapd(z, order);
- 742
- 743 /*
- 744 * Go through the zonelist again. Let __GFP_HIGH and allocations
- 745 * coming from realtime tasks to go deeper into reserves
- 746 */
- 747 for (i = 0; (z = zones[i]) != NULL; i++) {
- 748 if (!zone_watermark_ok(z, order, z->pages_min,
- 749 classzone_idx, can_try_harder,
- 750 gfp_mask & __GFP_HIGH))
- 751 continue;
- 752
- 753 page = buffered_rmqueue(z, order, gfp_mask);
- 754 if (page)
- 755 goto got_pg;
- 756 }
- 757
- 758 /* This allocation should allow future memory freeing. */
- 759 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
- 760 /* go through the zonelist yet again, ignoring mins */
- 761 for (i = 0; (z = zones[i]) != NULL; i++) {
- 762 page = buffered_rmqueue(z, order, gfp_mask);
- 763 if (page)
- 764 goto got_pg;
- 765 }
- 766 goto nopage;
- 767 }
- 768
- 769 /* Atomic allocations - we can't balance anything */
- 770 if (!wait)
- 771 goto nopage;
- 772
- 773rebalance:
- 774 cond_resched();
- 775
- 776 /* We now go into synchronous reclaim */
- 777 p->flags |= PF_MEMALLOC;
- 778 reclaim_state.reclaimed_slab = 0;
- 779 p->reclaim_state = &reclaim_state;
- 780
- 781 did_some_progress = try_to_free_pages(zones, gfp_mask, order);
- 782
- 783 p->reclaim_state = NULL;
- 784 p->flags &= ~PF_MEMALLOC;
- 785
- 786 cond_resched();
- 787
- 788 if (likely(did_some_progress)) {
- 789 /*
- 790 * Go through the zonelist yet one more time, keep
- 791 * very high watermark here, this is only to catch
- 792 * a parallel oom killing, we must fail if we're still
- 793 * under heavy pressure.
- 794 */
- 795 for (i = 0; (z = zones[i]) != NULL; i++) {
- 796 if (!zone_watermark_ok(z, order, z->pages_min,
- 797 classzone_idx, can_try_harder,
- 798 gfp_mask & __GFP_HIGH))
- 799 continue;
- 800
- 801 page = buffered_rmqueue(z, order, gfp_mask);
- 802 if (page)
- 803 goto got_pg;
- 804 }
- 805 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- 806 /*
- 807 * Go through the zonelist yet one more time, keep
- 808 * very high watermark here, this is only to catch
- 809 * a parallel oom killing, we must fail if we're still
- 810 * under heavy pressure.
- 811 */
- 812 for (i = 0; (z = zones[i]) != NULL; i++) {
- 813 if (!zone_watermark_ok(z, order, z->pages_high,
- 814 classzone_idx, 0, 0))
- 815 continue;
- 816
- 817 page = buffered_rmqueue(z, order, gfp_mask);
- 818 if (page)
- 819 goto got_pg;
- 820 }
- 821
- 822 out_of_memory(gfp_mask);
- 823 goto restart;
- 824 }
- 825
- 826 /*
- 827 * Don't let big-order allocations loop unless the caller explicitly
- 828 * requests that. Wait for some write requests to complete then retry.
- 829 *
- 830 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
- 831 * <= 3, but that may not be true in other implementations.
- 832 */
- 833 do_retry = 0;
- 834 if (!(gfp_mask & __GFP_NORETRY)) {
- 835 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
- 836 do_retry = 1;
- 837 if (gfp_mask & __GFP_NOFAIL)
- 838 do_retry = 1;
- 839 }
- 840 if (do_retry) {
- 841 blk_congestion_wait(WRITE, HZ/50);
- 842 goto rebalance;
- 843 }
- 844
- 845nopage:
- 846 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- 847 printk(KERN_WARNING "%s: page allocation failure."
- 848 " order:%d, mode:0x%x\n",
- 849 p->comm, order, gfp_mask);
- 850 dump_stack();
- 851 }
- 852 return NULL;
- 853got_pg:
- 854 zone_statistics(zonelist, z);
- 855 return page;
- 856}
- can_try_harder置位的条件:是实时进程并且不在中断上下文,或者__GFP_WAIT没有被置位
-
- #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
- 然后以pages_low为阀值,这样水位线就是z->pages_low + z->lowmem_reserve[classzone_idx](lowmem_reserve在《kernel hacker修炼之道之内存管理-结点与管理区初始化(下)》文档中已经讨论过),如果zone_watermark_ok无法得到满足,也就是说free pages不足,就查看下一个管理区
- 如果成功就从buddy system分配pages,否则唤醒kswapd守护进程,对各个管理区进行页回收,将暂时不用的page交换出去
- 再次进行查找,此时降低阀值为pages_min,如果还无法成功分配,说明系统内存已经到非常紧张的情况了
- 此时检查几个标志是否置位。PF_MEMALLOC被置位,表明分配器自身需要更多的内存;TIF_MEMDIE被置位,表明线程刚好被OOM killer机制选中。这时,忽略阀值,直接从伙伴系统分配,也就是分配到了lowmem_reserve的内存
- 在忽略水位线的情况下如果还分配不到,只能放弃,打印出错信息
- 如果刚才那几个标志没有被设置,那么尝试先回收内存再分配,走slow path,此时会进入一个循环中
- 如果是原子分配,那么只能放弃
- 由于要先回收内存再分配,消耗太多时间,此时其他进程可能饥饿,所以调用cond_resched函数主动让出CPU
- 由于try_to_free_pages自身可能需要分配新内存,所以在调用前设置、调用后清除PF_MEMALLOC标志,带有这个标志的进程在内存管理方面享有最高优先级
- 如果try_to_free_pages成功回收内存,则会返回回收的页框数
- 那么reclaim_state又是什么呢?
- struct reclaim_state {
- unsigned long reclaimed_slab;
- };
- 如果当前进程正在执行内存回收,reclaim_state的reclaimed_slab字段就被适当的增加,于是刚被释放的页就能够通过页框回收算法被记录下来
- 如果回收了一些页框,那么以pages_min为阀值分配试试
- 如果没有回收到页框,则查看__GFP_FS和__GFP_NORETRY标志,如果__GFP_FS被置位(杀死一个进程需要进行文件系统操作)且__GFP_NORETRY没有被置位,则调用out_of_memory函数杀死一个进程。此处zone_watermark_ok的阀值为pages_high,很有可能失败,为什么定这么高呢?这是为了检测是否有另一个进程刚刚被OOM killer并行地杀掉并释放了内存:如果是,free pages就能满足这么高的水位线,从而避免再杀死一个无辜的进程
- 如果__GFP_NORETRY没有被置位,此时检查:若请求页小于等于8个(order <= 3)或者__GFP_REPEAT被置位,或者__GFP_NOFAIL被置位,则进入重试循环
- 先调用blk_congestion_wait等待块设备写请求完成,然后进入循环
总结一下:在分配页时,会调用zone_watermark_ok检查管理区是否满足水位值,不满足就检查下一个管理区。第一次检查传递的阀值是pages_low,如果无法满足,就唤醒kswapd回收一些页,并且降低阀值为pages_min。如果仍然无法分配,并且PF_MEMALLOC或TIF_MEMDIE被置位,则忽略水位线直接从buddy system分配(动用lowmem reserve)的内存,如果还失败就放弃。如果那些标志没有被置位,且请求页小于等于8个或者__GFP_REPEAT被置位,或者__GFP_NOFAIL被置位,则进入一个循环:在这个循环中回收页并试图分配请求的pages;如果没有回收到页,则使用OOM killer杀死一个进程,然后重新开始最初的分配尝试;如果回收到了就尝试分配,没有分配到则重新循环。
- 912fastcall void free_pages(unsigned long addr, unsigned int order)
- 913{
- 914 if (addr != 0) {
- 915 BUG_ON(!virt_addr_valid((void *)addr));
- 916 __free_pages(virt_to_page((void *)addr), order);
- 917 }
- 918}
调用__free_pages看其实现:
- 900fastcall void __free_pages(struct page *page, unsigned int order)
- 901{
- 902 if (!PageReserved(page) && put_page_testzero(page)) {
- 903 if (order == 0)
- 904 free_hot_page(page);
- 905 else
- 906 __free_pages_ok(page, order);
- 907 }
- 908}
这里判断是释放一个page还是释放多个page,如果只释放一个page(order为0)就调用free_hot_page将其放入per-CPU的热页缓存,否则调用__free_pages_ok函数,看其实现:
- 330void __free_pages_ok(struct page *page, unsigned int order)
- 331{
- 332 LIST_HEAD(list);
- 333 int i;
- 334
- 335 arch_free_page(page, order);
- 336
- 337 mod_page_state(pgfree, 1 << order);
- 338
- 339#ifndef CONFIG_MMU
- 340 if (order > 0)
- 341 for (i = 1 ; i < (1 << order) ; ++i)
- 342 __put_page(page + i);
- 343#endif
- 344
- 345 for (i = 0 ; i < (1 << order) ; ++i)
- 346 free_pages_check(__FUNCTION__, page + i);
- 347 list_add(&page->lru, &list);
- 348 kernel_map_pages(page, 1<<order, 0);
- 349 free_pages_bulk(page_zone(page), 1, &list, order);
- 350}
- 减少页的引用计数
- 调用free_pages_bulk函数向buddy system释放页