14.3 balance_pgdat函数

最新推荐文章于 2021-11-05 17:00:00 发布

byd yes

最新推荐文章于 2021-11-05 17:00:00 发布

阅读量943

点赞数 1

分类专栏： linux.mm

本文链接：https://blog.csdn.net/dai_xiangjun/article/details/118864009

版权

linux.mm 专栏收录该内容

59 篇文章 25 订阅

订阅专栏

balance_pgdat()函数是回收页面的主函数。这个函数比较长，首先看一个框架，主体函数是一个很长的while循环。

代码如下：

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * @pgdat: the node whose zones are scanned and reclaimed.
 * @order: allocation order kswapd is reclaiming for; it is reset to 0 inside
 *         the loop once at least (2UL << order) pages have been reclaimed.
 * @classzone_idx: in/out. Read for the compaction watermark check and for
 *                 pgdat_balanced(); overwritten with end_zone before return.
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                            int *classzone_idx)
{
    int i;
    int end_zone = 0;   /* Inclusive.  0 = ZONE_DMA */
    unsigned long nr_soft_reclaimed;
    unsigned long nr_soft_scanned;

    /*
     * struct scan_control carries the parameters that steer page reclaim:
     * the number of pages to reclaim (nr_to_reclaim), the allocation mask
     * (gfp_mask), the allocation order (2^order pages), the LRU scan
     * priority, and so on. priority determines how many pages are scanned
     * per pass (total_size >> priority). It starts at DEF_PRIORITY (12
     * according to the original annotation; the definition is not shown
     * here) and is decremented by the loop below. The lower the value, the
     * more pages are scanned per pass.
     */
    struct scan_control sc = {
        .gfp_mask = GFP_KERNEL,
        .order = order,
        .priority = DEF_PRIORITY,
        .may_writepage = !laptop_mode,
        .may_unmap = 1,
        .may_swap = 1,
    };
    count_vm_event(PAGEOUTRUN);
    /*
     * This do-while loop is the core of the reclaim framework. It can be
     * read in three parts, marked (1), (2) and (3) below.
     */
    do {
        unsigned long nr_attempted = 0;
        bool raise_priority = true;
        bool pgdat_needs_compaction = (order > 0);

        sc.nr_reclaimed = 0;

        /*
         * Scan in the highmem->dma direction for the highest
         * zone which needs scanning
         */
        /*
         * (1) Walk from the highest zone index down to 0 and stop at the
         * first zone that is not balanced, i.e. for which zone_balanced()
         * reports the high-watermark check failed. Its index is stored in
         * end_zone.
         */
        for (i = pgdat->nr_zones - 1; i >= 0; i--) {
            struct zone *zone = pgdat->node_zones + i;

            if (!populated_zone(zone))
                continue;

            if (sc.priority != DEF_PRIORITY &&
                !zone_reclaimable(zone))
                continue;

            /*
             * Do some background aging of the anon list, to give
             * pages a chance to be referenced before reclaiming.
             */
            age_active_anon(zone, &sc);

            /*
             * If the number of buffer_heads in the machine
             * exceeds the maximum allowed level and this node
             * has a highmem zone, force kswapd to reclaim from
             * it to relieve lowmem pressure.
             */
            if (buffer_heads_over_limit && is_highmem_idx(i)) {
                end_zone = i;
                break;
            }
        
            /* Is this zone above its high watermark? */
            if (!zone_balanced(zone, order, 0, 0)) {
                end_zone = i;
                break;
            } else {
                /*
                 * If balanced, clear the dirty and congested
                 * flags
                 */
                clear_bit(ZONE_CONGESTED, &zone->flags);
                clear_bit(ZONE_DIRTY, &zone->flags);
            }
        }

        /* The loop ran to completion: no zone needs scanning. */
        if (i < 0)
            goto out;
        /*
         * (2) Work from the lowest zone (index 0) up to end_zone.
         * This first pass only decides whether compaction is still needed:
         * if any zone passes the low-watermark check for this order,
         * compaction is not required.
         */
        for (i = 0; i <= end_zone; i++) {
            struct zone *zone = pgdat->node_zones + i;

            /*
             * Skip zones that populated_zone() rejects (per the original
             * annotation this tests zone->present_pages; helper not shown).
             */
            if (!populated_zone(zone))
                continue;

            /*
             * If any zone is currently balanced then kswapd will
             * not call compaction as it is expected that the
             * necessary pages are already available.
             */
            /*
             * pgdat_needs_compaction starts out true only for order > 0.
             * It is cleared as soon as one zone passes zone_watermark_ok()
             * at its low watermark.
             */
            if (pgdat_needs_compaction &&
                    zone_watermark_ok(zone, order,
                        low_wmark_pages(zone),
                        *classzone_idx, 0))
                pgdat_needs_compaction = false;
        }
        
        /*
         * If we're getting trouble reclaiming, start doing writepage
         * even in laptop mode.
         */
        if (sc.priority < DEF_PRIORITY - 2)
            sc.may_writepage = 1;

        /*
         * Now scan the zone in the dma->highmem direction, stopping
         * at the last zone which needs scanning.
         *
         * We do this because the page allocator works in the opposite
         * direction.  This prevents the page allocator from allocating
         * pages behind kswapd's direction of progress, which would
         * cause too much scanning of the lower zones.
         */
        /*
         * This loop again runs from zone index 0 up to end_zone. The
         * original annotation adds a second motivation: because the buddy
         * allocator walks zones from high to low, reclaiming in the
         * opposite direction is said to reduce contention on
         * zone->lru_lock between direct reclaim and kswapd (not verifiable
         * from this function alone).
         */
        for (i = 0; i <= end_zone; i++) {
            struct zone *zone = pgdat->node_zones + i;

            if (!populated_zone(zone))
                continue;

            if (sc.priority != DEF_PRIORITY &&
                !zone_reclaimable(zone))
                continue;

            sc.nr_scanned = 0;

            nr_soft_scanned = 0;
            /*
             * Call soft limit reclaim before calling shrink_zone.
             */
            nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
                            order, sc.gfp_mask,
                            &nr_soft_scanned);
            sc.nr_reclaimed += nr_soft_reclaimed;

            /*
             * There should be no need to raise the scanning
             * priority if enough pages are already being scanned
             * that that high watermark would be met at 100%
             * efficiency.
             */
            /*
             * kswapd_shrink_zone() performs the actual scan and reclaim;
             * its inputs and results travel in sc. It returns true when
             * the zone was already balanced or when at least
             * sc.nr_to_reclaim pages were scanned; in either case the
             * priority does not need to be raised on behalf of this zone.
             */
            if (kswapd_shrink_zone(zone, end_zone,
                           &sc, &nr_attempted))
                raise_priority = false;
        }

        /*
         * If the low watermark is met there is no need for processes
         * to be throttled on pfmemalloc_wait as they should now be
         * able to safely make forward progress. Wake them
         */
        /*
         * Tasks may be sleeping on pgdat->pfmemalloc_wait. After this
         * round of reclaim they are woken only if
         * pfmemalloc_watermark_ok(pgdat) holds; otherwise they stay asleep
         * while kswapd keeps reclaiming. According to the original
         * annotation, kswapd wakes all of them before it goes to sleep if
         * it cannot balance the node (that code is outside this function).
         */
        if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                pfmemalloc_watermark_ok(pgdat))
            wake_up_all(&pgdat->pfmemalloc_wait);

        /*
         * Fragmentation may mean that the system cannot be rebalanced
         * for high-order allocations in all zones. If twice the
         * allocation size has been reclaimed and the zones are still
         * not balanced then recheck the watermarks at order-0 to
         * prevent kswapd reclaiming excessively. Assume that a
         * process requested a high-order can direct reclaim/compact.
         */
        /*
         * sc.nr_reclaimed is the number of pages reclaimed in this pass.
         * Once it reaches 2UL << order (twice 2^order), order is dropped
         * to 0 so that kswapd does not reclaim too aggressively. Without
         * this, a node that keeps failing pgdat_balanced() at the high
         * order would keep looping until sc.priority falls below 1.
         * Leaving the loop early still requires pgdat_balanced() to
         * report the node as balanced.
         */
        if (order && sc.nr_reclaimed >= 2UL << order)
            order = sc.order = 0;

        /* Check if kswapd should be suspending */
        /* Leave the loop if the thread was frozen or asked to stop. */
        if (try_to_freeze() || kthread_should_stop())
            break;

        /*
         * Compact if necessary and kswapd is reclaiming at least the
         * high watermark number of pages as requested
         */
        /* Compact the node to reduce fragmentation when still required. */
        if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
            compact_pgdat(pgdat, order);

        /*
         * Raise priority if scanning rate is too low or there was no
         * progress in reclaiming pages
         */
        /*
         * raise_priority starts as true and is cleared only when
         * kswapd_shrink_zone() returned true for some zone. If nothing at
         * all was reclaimed in this pass the priority is raised anyway.
         * "Raising" means decrementing sc.priority, which makes each pass
         * scan more pages.
         */
        if (raise_priority || !sc.nr_reclaimed)
            sc.priority--;

    /*
     * (3) The loop keeps increasing the scan intensity and re-checks
     * whether the zones from index 0 up to *classzone_idx are balanced.
     */
    } while (sc.priority >= 1 &&
         !pgdat_balanced(pgdat, order, *classzone_idx));
    /*
     * Note on the classzone_idx argument of pgdat_balanced(): according
     * to the original annotation it is the index of the most suitable zone
     * computed on the page allocation path and is handed down through
     * wake_all_kswapds() (caller not shown here).
     */

out:
    /*
     * Return the order we were reclaiming at so prepare_kswapd_sleep()
     * makes a decision on the order we were last reclaiming at. However,
     * if another caller entered the allocator slow path while kswapd
     * was awake, order will remain at the higher level
     */
    *classzone_idx = end_zone;
    return order;
}

此函数看完之后我们需要查看kswapd_shrink_zone()函数，在后面。

struct scan_control定义如下：

[mm/vmscan.c]

/*
 * Parameters and running counters for one page reclaim invocation.
 * balance_pgdat() fills in gfp_mask, order, priority, may_writepage,
 * may_unmap and may_swap, and reads back nr_scanned / nr_reclaimed.
 */
struct scan_control {
    /* How many pages shrink_list() should reclaim */
    unsigned long nr_to_reclaim;
    /* This context's GFP mask (the allocation flags of the request) */
    gfp_t gfp_mask;
    /*
     * Allocation order. Per the original annotation this is the order of
     * the allocation whose failure triggered the scan.
     */
    int order;
    /*
     * Nodemask of nodes allowed by the caller. If NULL, all nodes
     * are scanned.
     */
    nodemask_t  *nodemask;
    /*
     * The memory cgroup that hit its limit and as a result is the
     * primary target of this reclaim invocation.
     */
    struct mem_cgroup *target_mem_cgroup;
    /*
     * Scan (total_size >> priority) pages at once.
     * A lower priority value means more pages scanned per pass; a higher
     * value means fewer. The default is DEF_PRIORITY (12 according to the
     * original annotation; the definition is not shown here).
     */
    int priority;
    /*
     * May dirty pages be written back? (The original annotation relates
     * this to __GFP_IO and __GFP_FS; not verifiable from this excerpt.)
     */
    unsigned int may_writepage:1;
    /*
     * Can mapped pages be reclaimed? Unmapping means clearing every page
     * table entry that maps the page.
     */
    unsigned int may_unmap:1;
    /*
     * Can pages be swapped as part of reclaim? Per the original
     * annotation, when this is 0 the anonymous LRU lists are not scanned.
     */
    unsigned int may_swap:1;
    /* Can cgroups be reclaimed below their normal consumption range? */
    unsigned int may_thrash:1;
    unsigned int hibernation_mode:1;
    /*
     * One of the zones is ready for compaction. Per the original
     * annotation it is set at the end of a scan so reclaim can decide
     * whether compaction should run.
     */
    unsigned int compaction_ready:1;
    /* Incremented by the number of inactive pages that were scanned */
    unsigned long nr_scanned;
    /* Number of pages freed so far during a call to shrink_zones() */
    unsigned long nr_reclaimed;
};

回到balance_pgdat()函数

pgdat_balanced()函数实现:判断一个内存节点上的物理页面是否处于平衡状态，返回true,则表示该内存节点处于平衡状态。

什么是平衡状态？

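对于order为0的情况，只有从最低端zone到classzone_idx的所有zone都处于平衡状态，才认为该内存节点是平衡的；对于order大于0的情况，只要处于平衡状态的zone所管理的页面数不少于这些zone管理页面总数的25%，就认为该内存节点是平衡的。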

[kswapd()->balance_pgdat()->pgdat_balanced()]

/*
 * pgdat_balanced() - decide whether a node counts as balanced.
 *
 * @pgdat: node to inspect.
 * @order: allocation order being reclaimed for.
 * @classzone_idx: highest zone index to consider (inclusive); the original
 *                 annotation notes it is passed down from the page
 *                 allocation path.
 *
 * Order-0: every populated, reclaimable zone in [0, classzone_idx] must pass
 * zone_balanced(); the first one that does not makes the node unbalanced.
 *
 * Higher orders: only the zones that pass contribute their managed_pages to
 * the balanced total, and the node is balanced once that total is at least
 * 25% of the managed pages of all populated zones in range. Requiring every
 * zone to be balanced for high orders can cause excessive reclaim when some
 * zones are imbalanced.
 *
 * Rationale for 25%, as given by the original comment:
 *   o a balanced 16M DMA zone will not balance a node on any reasonably
 *     sized machine
 *   o on all other machines the top zone must be a reasonable percentage of
 *     the middle zones. For example, on 32-bit x86 highmem would need to be
 *     at least 256M to balance a whole node, and on x86-64 the Normal zone
 *     would need to be at least 1G to balance a node on its own.
 *
 * Returns true when the node is considered balanced.
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
    unsigned long total = 0;    /* managed pages of all populated zones */
    unsigned long settled = 0;  /* managed pages of zones counted as balanced */
    int idx;

    for (idx = 0; idx <= classzone_idx; idx++) {
        struct zone *z = &pgdat->node_zones[idx];

        if (!populated_zone(z))
            continue;

        total += z->managed_pages;

        /*
         * Unreclaimable zones are skipped by balance_pgdat() after
         * DEF_PRIORITY, which effectively treats them as balanced, so
         * they are counted as balanced here too. Otherwise the zone
         * counts only if zone_balanced() accepts it.
         */
        if (!zone_reclaimable(z) || zone_balanced(z, order, 0, idx)) {
            settled += z->managed_pages;
            continue;
        }

        /* Order-0 tolerates no unbalanced zone at all. */
        if (!order)
            return false;
    }

    /* Order-0 and no zone failed: the node is balanced. */
    if (!order)
        return true;

    /* High order: at least a quarter of the managed pages must be balanced. */
    return settled >= total / 4;
}
回到balance_pgdat()函数

zone_balanced()函数实现:zone的balanced由此函数来判断，这是针对于order来说的。

此函数有两个条件：

(1) zone内的空闲内存高于高水位

水位是在内存初始化的时候根据每个zone的内存大小自动计算出来的，每个zone可能有不同的水位。具体计算水位的算法在各个kernel版本中不尽相同，比如某个版本是这样计算的：对于非高端内存来说(64位机器上已经不存在高端内存了)，min_watermark根据各个zone的内存占比，瓜分1024个page；low_watermark在此基础上增加25%；high_watermark在此基础上增加50%。（可以通过/proc/zoneinfo看到系统中每一个zone，及其free_pages和watermark的情况。）这里的高水位对于现在的大内存机器来说，其实只是九牛一毛。由这个高水位来作为判断zone_balanced的基础，可见内核在内存balance的问题上还是很注重系统性能的。

(2) 要求zone内的内存在0到给定order之间平衡分布

例如：总的内存超过高水位、order-1及以上的内存超过高水位的1/2、order-2及以上的内存超过高水位的1/4、......、一直到所要求的order。

为什么针对order的内存balanced不仅仅关心order阶的内存，而是关心0-order阶的所有内存呢？因为高order的连续内存是稀缺资源。如果内存分布不平衡，低order的内存请求可能因为低order内存的暂时缺货不得不将高order所对应的连续内存进行分拆。这种浪费是尽量避免的。并且这样的分拆可能导致高order内存耗尽，而导致满足不了对指定order的内存分配需求。

那么为什么针对order的内存balanced又仅仅关心0到order阶的所有内存、而不关心大于order阶的内存呢？当我们需要检查针对于order的zone_balanced时，其实是说明我们需要这个zone内2^order个连续页面。由于连续页面回收不易，也不是系统内最普遍的需求(给用户空间使用的内存基本上都是order-0的，不考虑hugepage这样的特殊情况)，所以更高的order就不需要考虑了。后面会看到，kswapd只针对order-0进行回收。

/*
 * zone_balanced() - check whether a zone is balanced for a given order.
 *
 * @zone: zone to test.
 * @order: allocation order to test for.
 * @balance_gap: extra pages added on top of the high watermark.
 * @classzone_idx: passed through to the watermark and compaction helpers.
 *
 * The zone is balanced when zone_watermark_ok_safe() passes against
 * (high watermark + balance_gap) and, for order > 0 with CONFIG_COMPACTION
 * enabled, compaction_suitable() does not return COMPACT_SKIPPED.
 */
static bool zone_balanced(struct zone *zone, int order,
              unsigned long balance_gap, int classzone_idx)
{
    unsigned long target = high_wmark_pages(zone) + balance_gap;

    if (!zone_watermark_ok_safe(zone, order, target, classzone_idx, 0))
        return false;

    /* The compaction test only applies to high-order requests. */
    if (!IS_ENABLED(CONFIG_COMPACTION) || !order)
        return true;

    return compaction_suitable(zone, order, 0, classzone_idx) !=
           COMPACT_SKIPPED;
}

/*
 * zone_watermark_ok_safe() - watermark check that re-reads the free page
 * count when the quick reading is too close to trust.
 *
 * @z: zone to test.
 * @order, @mark, @classzone_idx, @alloc_flags: forwarded unchanged to
 *     __zone_watermark_ok().
 *
 * The free page count is first read with zone_page_state(). If the zone has
 * a non-zero percpu_drift_mark and the count is below it, the count is read
 * again through zone_page_state_snapshot() (presumably a more precise
 * reading; that helper is not shown here). The resulting count is handed to
 * __zone_watermark_ok(), whose result is returned.
 */
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
            unsigned long mark, int classzone_idx, int alloc_flags)
{
    long nr_free = zone_page_state(z, NR_FREE_PAGES);
    bool below_drift_mark = z->percpu_drift_mark &&
                nr_free < z->percpu_drift_mark;

    if (below_drift_mark)
        nr_free = zone_page_state_snapshot(z, NR_FREE_PAGES);

    return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                   nr_free);
}

回到pgdat_balanced()函数

kswapd_shrink_zone()函数实现:页面回收的真正函数

[kswapd()->balance_pgdat()->kswapd_shrink_zone()]


/*
 * kswapd_shrink_zone() - reclaim from one zone on behalf of kswapd, aiming
 * at the number of pages needed to reach the high watermark.
 *
 * @zone: zone to shrink.
 * @classzone_idx: zone index used for the balance checks; balance_pgdat()
 *                 passes its end_zone here.
 * @sc: scan control prepared by balance_pgdat(). nr_to_reclaim is set here,
 *      and sc->nr_scanned is read after shrink_zone() returns.
 * @nr_attempted: accumulates sc->nr_to_reclaim for every zone that was
 *                actually shrunk.
 *
 * Returns true if the zone was already balanced (nothing reclaimed), or if
 * at least sc->nr_to_reclaim pages were scanned, whether or not any of them
 * were freed. The result only tells balance_pgdat() whether the scanning
 * priority needs to be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
                   int classzone_idx,
                   struct scan_control *sc,
                   unsigned long *nr_attempted)
{
    int check_order = sc->order;
    unsigned long gap;
    bool bh_pressure;

    /*
     * Reclaim above the high watermark: the target for this pass is the
     * larger of SWAP_CLUSTER_MAX and the zone's high watermark. (Per the
     * original annotation SWAP_CLUSTER_MAX is 32 pages and the high
     * watermark is computed in __setup_per_zone_wmarks(); neither is
     * shown here.)
     */
    sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));

    /*
     * Kswapd reclaims only single pages with compaction enabled. Trying
     * too hard to reclaim until contiguous free pages appear can evict
     * too much useful data, so once compaction could run for this order
     * the balance checks below are done at order 0.
     */
    if (IS_ENABLED(CONFIG_COMPACTION) && sc->order) {
        if (compaction_suitable(zone, sc->order, 0, classzone_idx) !=
            COMPACT_SKIPPED)
            check_order = 0;
    }

    /*
     * Equal pressure is put on every zone unless one already has far too
     * many free pages. "Too many" is the high watermark plus a gap, where
     * the gap is the smaller of the low watermark and
     * managed_pages / KSWAPD_ZONE_BALANCE_GAP_RATIO rounded up (1% of the
     * zone according to the original comment). The gap makes the
     * pre-reclaim balance test slightly stricter than the plain
     * high-watermark test.
     */
    gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
            zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));

    /*
     * Skip reclaim when the zone is already balanced against
     * (high watermark + gap), unless buffer heads are over their limit
     * and this is a highmem zone.
     */
    bh_pressure = (buffer_heads_over_limit && is_highmem(zone));
    if (!bh_pressure) {
        if (zone_balanced(zone, check_order, gap, classzone_idx))
            return true;
    }

    /* shrink_zone() does the real reclaim work for this zone. */
    shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);

    /* Account for the number of pages attempted to reclaim */
    *nr_attempted += sc->nr_to_reclaim;

    clear_bit(ZONE_WRITEBACK, &zone->flags);

    /*
     * If the zone now reaches its high watermark, stop treating it as
     * congested. There may still be dirty pages backed by congested BDIs,
     * but as pressure is relieved congestion waits are speculatively
     * avoided.
     */
    if (zone_reclaimable(zone) &&
        zone_balanced(zone, check_order, 0, classzone_idx)) {
        clear_bit(ZONE_CONGESTED, &zone->flags);
        clear_bit(ZONE_DIRTY, &zone->flags);
    }

    /* Enough pages scanned means no priority raise is needed for this zone. */
    return sc->nr_scanned >= sc->nr_to_reclaim;
}

页面分配路径page allocator和页面回收路径kswapd之间有很多交互的地方，如下图：

当页面分配路径page allocator在低水位中分配内存失败时，会唤醒kswapd内核线程，把order和preferred_zone传递给kswapd，这两个参数是他们之间的纽带。
页面分配路径page allocator和页面回收路径kswapd在扫描zone时的方向是相反的，页面分配路径page allocator从ZONE_HIGHMEM往ZONE_NORMAL方向扫描zone，kswapd则相反。
如何判断kswapd应该停止页面回收呢？一个重要的条件是从zone_normal到preferred_zone处于平衡状态时，那么就认为这个内存节点处于平衡状态，可以停止页面回收。
页面分配路径page allocator和页面回收路径kswapd采用zone的水位标不同，page allocator采用低水位，即在低水位中无法分配内存，就唤醒kswapd；而kswapd判断是否停止页面回收采用的高水位。

byd yes

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
1
评论
14.3 balance_pgdat函数

balance_pgdat()函数是回收页面的主函数。这个函数比较长，首先看一个框架，主体函数是一个很长的while循环。代码如下：/* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * * Returns the final order kswapd was reclaiming at * ...
复制链接

扫一扫