17 Memory Compaction

    The buddy system manages memory in units of pages, and memory fragmentation is likewise page-based: it is the result of large numbers of discrete, non-contiguous free pages. From the memory subsystem's point of view fragmentation is bad news; some physical devices need large stretches of contiguous physical memory, and if the kernel cannot supply them the consequences can be as severe as a kernel panic. Fragmentation is like a column of troops on a long march: after a while the ranks fall into disorder and have to be re-formed. Hence the name of this chapter, memory compaction (some literature calls it memory defragmentation); the feature exists to combat memory fragmentation.

    The kernel's basic anti-fragmentation strategy is to group pages by their mobility. Migrating the physical pages used by the kernel itself would be both difficult and complex, so the current kernel does not migrate kernel-owned pages. Pages used by application processes, however, are accessed through user page tables; the mapping can be changed to point at a different physical page without affecting the process, which is why memory compaction is built on page migration.
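
Page mobility is tracked per pageblock through a migration type. As a reference point, here is a sketch of the migratetype enum from include/linux/mmzone.h in kernels of this generation (the exact members vary with CONFIG_CMA and CONFIG_MEMORY_ISOLATION):

enum {
    MIGRATE_UNMOVABLE,      /* kernel allocations; cannot be migrated */
    MIGRATE_RECLAIMABLE,    /* e.g. slab caches; reclaimable but not movable */
    MIGRATE_MOVABLE,        /* user pages mapped via page tables; movable */
    MIGRATE_PCPTYPES,       /* number of types kept on per-cpu lists */
    MIGRATE_RESERVE = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
    MIGRATE_CMA,            /* contiguous memory allocator regions */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
    MIGRATE_ISOLATE,        /* temporarily isolated; can't allocate from here */
#endif
    MIGRATE_TYPES
};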

Memory compaction implementation:

    An important application scenario for memory compaction is a high-order allocation (order > 0) that fails at the WMARK_LOW watermark and still cannot be satisfied even after the kswapd kernel thread has been woken up. In that case __alloc_pages_direct_compact() is called to compact memory and try to produce the contiguous pages the request needs. Let's follow the kernel path alloc_pages()->...->__alloc_pages_direct_compact() to see how memory compaction works.

[mm/page_alloc.c]

[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_compact()]

/* Try memory compaction for high-order allocations before reclaim */
/* The mode parameter is the migration_mode, normally passed down from __alloc_pages_slowpath() with the value MIGRATE_ASYNC */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        int alloc_flags, const struct alloc_context *ac,
        enum migrate_mode mode, int *contended_compaction,
        bool *deferred_compaction)
{
    unsigned long compact_result;
    struct page *page;

    /* Memory compaction targets high-order allocations, so an order-0 request never triggers it. */
    if (!order)
        return NULL;

    current->flags |= PF_MEMALLOC;
    /* try_to_compact_pages() must run with the current task's PF_MEMALLOC flag set;
    the flag is consulted during page migration to avoid deadlocks on the page lock
    (PG_locked). Its implementation is examined below */
    compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
                        mode, contended_compaction);
    current->flags &= ~PF_MEMALLOC;

    switch (compact_result) {
    case COMPACT_DEFERRED:
        *deferred_compaction = true;
        /* fall-through */
    case COMPACT_SKIPPED:
        return NULL;
    default:
        break;
    }

    /*
     * At least in one zone compaction wasn't deferred or skipped, so let's
     * count a compaction stall
     */
    count_vm_event(COMPACTSTALL);

    /* Once compaction has finished, call get_page_from_freelist() to retry the allocation; on success it returns the page data structure of the first page */
    page = get_page_from_freelist(gfp_mask, order,
                    alloc_flags & ~ALLOC_NO_WATERMARKS, ac);

    if (page) {
        struct zone *zone = page_zone(page);

        zone->compact_blockskip_flush = false;
        compaction_defer_reset(zone, order, true);
        count_vm_event(COMPACTSUCCESS);
        return page;
    }

    /*
     * It's bad if compaction run occurs and fails. The most likely reason
     * is that pages exist, but not enough to satisfy watermarks.
     */
    count_vm_event(COMPACTFAIL);

    cond_resched();

    return NULL;
}
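
The COMPACT_* result codes seen in the switch above are plain #defines; as a sketch of include/linux/compaction.h from kernels of this generation:

#define COMPACT_DEFERRED    0   /* didn't start, deferred due to past failures */
#define COMPACT_SKIPPED     1   /* not possible, or direct reclaim more suitable */
#define COMPACT_CONTINUE    2   /* should continue to another pageblock */
#define COMPACT_PARTIAL     3   /* partially compacted; suitable pages exist */
#define COMPACT_COMPLETE    4   /* the full zone was compacted */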

Implementation of try_to_compact_pages():

[__alloc_pages_direct_compact()->try_to_compact_pages()]

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @mode: The migration mode for async, sync light, or sync migration
 * @contended: Return value that determines if compaction was aborted due to
 *         need_resched() or lock contention
 *
 * This is the main entry point for direct page compaction.
 */
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
            int alloc_flags, const struct alloc_context *ac,
            enum migrate_mode mode, int *contended)
{
    int may_enter_fs = gfp_mask & __GFP_FS;
    int may_perform_io = gfp_mask & __GFP_IO;
    struct zoneref *z;
    struct zone *zone;
    int rc = COMPACT_DEFERRED;
    int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */

    *contended = COMPACT_CONTENDED_NONE;

    /* Check if the GFP flags allow compaction */
    if (!order || !may_enter_fs || !may_perform_io)
        return COMPACT_SKIPPED;

    trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);

    /* Compact each zone in the list */
    /* the for_each_zone_zonelist_nodemask macro determines from the allocation mask which zones need to be scanned and traversed */
    for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                ac->nodemask) {
        int status;
        int zone_contended;

        if (compaction_deferred(zone, order))
            continue;
        /* compact_zone_order() runs compaction on a single zone; its implementation is examined below */
        status = compact_zone_order(zone, order, gfp_mask, mode,
                &zone_contended, alloc_flags,
                ac->classzone_idx);
        rc = max(status, rc);
        /*
         * It takes at least one zone that wasn't lock contended
         * to clear all_zones_contended.
         */
        all_zones_contended &= zone_contended;

        /* If a normal allocation would succeed, stop compacting */
        /* zone_watermark_ok() checks whether the zone is now above the WMARK_LOW watermark; if so, break out of the loop */
        if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
                    ac->classzone_idx, alloc_flags)) {
            /*
             * We think the allocation will succeed in this zone,
             * but it is not certain, hence the false. The caller
             * will repeat this with true if allocation indeed
             * succeeds in this zone.
             */
            compaction_defer_reset(zone, order, false);
            /*
             * It is possible that async compaction aborted due to
             * need_resched() and the watermarks were ok thanks to
             * somebody else freeing memory. The allocation can
             * however still fail so we better signal the
             * need_resched() contention anyway (this will not
             * prevent the allocation attempt).
             */
            if (zone_contended == COMPACT_CONTENDED_SCHED)
                *contended = COMPACT_CONTENDED_SCHED;

            goto break_loop;
        }

        if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
            /*
             * We think that allocation won't succeed in this zone
             * so we defer compaction there. If it ends up
             * succeeding after all, it will be reset.
             */
            defer_compaction(zone, order);
        }

        /*
         * We might have stopped compacting due to need_resched() in
         * async compaction, or due to a fatal signal detected. In that
         * case do not try further zones and signal need_resched()
         * contention.
         */
        if ((zone_contended == COMPACT_CONTENDED_SCHED)
                    || fatal_signal_pending(current)) {
            *contended = COMPACT_CONTENDED_SCHED;
            goto break_loop;
        }

        continue;
break_loop:
        /*
         * We might not have tried all the zones, so  be conservative
         * and assume they are not all lock contended.
         */
        all_zones_contended = 0;
        break;
    }

    /*
     * If at least one zone wasn't deferred or skipped, we report if all
     * zones that were tried were lock contended.
     */
    if (rc > COMPACT_SKIPPED && all_zones_contended)
        *contended = COMPACT_CONTENDED_LOCK;

    return rc;
}
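
The contended codes combined above come from mm/internal.h; roughly, in this kernel generation:

/* Sketch from mm/internal.h: why compaction bailed out early */
enum compact_contended {
    COMPACT_CONTENDED_NONE = 0, /* no contention detected */
    COMPACT_CONTENDED_SCHED,    /* need_resched() true or fatal signal pending */
    COMPACT_CONTENDED_LOCK,     /* zone lock or lru_lock was contended */
};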
Back to __alloc_pages_direct_compact().

Implementation of compact_zone_order():

[__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()]

As with the kswapd code, a control data structure, struct compact_control cc, is defined here to pass parameters. cc.migratepages is the list of pages that will be migrated; cc.freepages is the list of free pages serving as the migration destination.
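
For orientation, here is an abridged sketch of the relevant struct compact_control fields from mm/internal.h of this kernel generation (the field set varies between versions):

struct compact_control {
    struct list_head freepages;     /* isolated free pages: migration destinations */
    struct list_head migratepages;  /* isolated pages waiting to be migrated */
    unsigned long nr_freepages;     /* number of isolated free pages */
    unsigned long nr_migratepages;  /* number of pages to migrate */
    unsigned long free_pfn;         /* isolate_freepages search base */
    unsigned long migrate_pfn;      /* isolate_migratepages search base */
    enum migrate_mode mode;         /* async or sync migration mode */
    int order;                      /* order a direct compactor needs */
    gfp_t gfp_mask;                 /* gfp mask of the direct compactor */
    int alloc_flags;                /* alloc flags of the direct compactor */
    int classzone_idx;              /* zone index of the direct compactor */
    struct zone *zone;
    int contended;                  /* signals need_resched() or lock contention */
};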

static unsigned long compact_zone_order(struct zone *zone, int order,
        gfp_t gfp_mask, enum migrate_mode mode, int *contended,
        int alloc_flags, int classzone_idx)
{
    unsigned long ret;
    struct compact_control cc = {
        .nr_freepages = 0,
        .nr_migratepages = 0,
        .order = order,
        .gfp_mask = gfp_mask,
        .zone = zone,
        .mode = mode,
        .alloc_flags = alloc_flags,
        .classzone_idx = classzone_idx,
    };
    INIT_LIST_HEAD(&cc.freepages);
    INIT_LIST_HEAD(&cc.migratepages);

    /* examined below */
    ret = compact_zone(zone, &cc);

    VM_BUG_ON(!list_empty(&cc.freepages));
    VM_BUG_ON(!list_empty(&cc.migratepages));

    *contended = cc.contended;
    return ret;
}
Back to try_to_compact_pages().

Implementation of compact_zone():

[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()->compact_zone()]

static int compact_zone(struct zone *zone, struct compact_control *cc)
{
    int ret;
    unsigned long start_pfn = zone->zone_start_pfn;
    unsigned long end_pfn = zone_end_pfn(zone);
    const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
    const bool sync = cc->mode != MIGRATE_ASYNC;
    unsigned long last_migrated_pfn = 0;

    /* decide from the current watermarks whether compaction should run at all; its implementation is examined below */
    ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                            cc->classzone_idx);
    switch (ret) {
    case COMPACT_PARTIAL:
    case COMPACT_SKIPPED:
        /* Compaction is likely to fail */
        return ret;
    case COMPACT_CONTINUE:
        /* Fall through to compaction */
        ;
    }

    /*
     * Clear pageblock skip if there were failures recently and compaction
     * is about to be retried after being deferred. kswapd does not do
     * this reset as it'll reset the cached information when going to sleep.
     */
    if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
        __reset_isolation_suitable(zone);

    /*
     * Setup to move all movable pages to the end of the zone. Used cached
     * information on where the scanners should start but check that it
     * is initialised by ensuring the values are within zone boundaries.
     */
    /* Set up cc->migrate_pfn and cc->free_pfn. Roughly speaking, cc->migrate_pfn starts
    at the first pfn of the zone (zone->zone_start_pfn): the migrate scanner searches
    from the start of the zone for pages that can be migrated. cc->free_pfn starts at the
    last pfn of the zone: the free scanner searches from the end of the zone for free
    pages to use as migration destinations. Cached positions from a previous run are
    reused when they are still within the zone boundaries. */
    cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
    cc->free_pfn = zone->compact_cached_free_pfn;
    if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
        cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
        zone->compact_cached_free_pfn = cc->free_pfn;
    }
    if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
        cc->migrate_pfn = start_pfn;
        zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
        zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
    }

    trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                cc->free_pfn, end_pfn, sync);

    migrate_prep_local();

    /* The while loop scans from the start of the zone for suitable pages to migrate and
    tries to move them into free pages at the end of the zone, until the zone rises above
    the WMARK_LOW watermark. compact_finished() decides whether compaction can stop; its
    implementation is examined below */
    while ((ret = compact_finished(zone, cc, migratetype)) ==
                        COMPACT_CONTINUE) {
        int err;
        unsigned long isolate_start_pfn = cc->migrate_pfn;

        /* isolate_migratepages() scans the zone for migratable pages and adds them to
        the cc->migratepages list; its implementation is examined below */
        switch (isolate_migratepages(zone, cc)) {
        case ISOLATE_ABORT:
            ret = COMPACT_PARTIAL;
            putback_movable_pages(&cc->migratepages);
            cc->nr_migratepages = 0;
            goto out;
        case ISOLATE_NONE:
            /*
             * We haven't isolated and migrated anything, but
             * there might still be unflushed migrations from
             * previous cc->order aligned block.
             */
            goto check_drain;
        case ISOLATE_SUCCESS:
            ;
        }

        /* the core page-migration routine: take pages off the cc->migratepages list and try to migrate them; its implementation is examined below */
        err = migrate_pages(&cc->migratepages, compaction_alloc,
                compaction_free, (unsigned long)cc, cc->mode,
                MR_COMPACTION);

        trace_mm_compaction_migratepages(cc->nr_migratepages, err,
                            &cc->migratepages);

        /* All pages were either migrated or will be released */
        cc->nr_migratepages = 0;

        /* handle migration failures: pages that were not migrated are put back on the appropriate LRU lists */
        if (err) {
            putback_movable_pages(&cc->migratepages);
            /*
             * migrate_pages() may return -ENOMEM when scanners meet
             * and we want compact_finished() to detect it
             */
            if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
                ret = COMPACT_PARTIAL;
                goto out;
            }
        }

        /*
         * Record where we could have freed pages by migration and not
         * yet flushed them to buddy allocator. We use the pfn that
         * isolate_migratepages() started from in this loop iteration
         * - this is the lowest page that could have been isolated and
         * then freed by migration.
         */
        if (!last_migrated_pfn)
            last_migrated_pfn = isolate_start_pfn;

check_drain:
        /*
         * Has the migration scanner moved away from the previous
         * cc->order aligned block where we migrated from? If yes,
         * flush the pages that were freed, so that they can merge and
         * compact_finished() can detect immediately if allocation
         * would succeed.
         */
        if (cc->order > 0 && last_migrated_pfn) {
            int cpu;
            unsigned long current_block_start =
                cc->migrate_pfn & ~((1UL << cc->order) - 1);

            if (last_migrated_pfn < current_block_start) {
                cpu = get_cpu();
                lru_add_drain_cpu(cpu);
                drain_local_pages(zone);
                put_cpu();
                /* No more flushing until we migrate again */
                last_migrated_pfn = 0;
            }
        }

    }

out:
    /*
     * Release free pages and update where the free scanner should restart,
     * so we don't leave any returned pages behind in the next attempt.
     */
    if (cc->nr_freepages > 0) {
        unsigned long free_pfn = release_freepages(&cc->freepages);

        cc->nr_freepages = 0;
        VM_BUG_ON(free_pfn == 0);
        /* The cached pfn is always the first in a pageblock */
        free_pfn &= ~(pageblock_nr_pages-1);
        /*
         * Only go back, not forward. The cached pfn might have been
         * already reset to zone end in compact_finished()
         */
        if (free_pfn > zone->compact_cached_free_pfn)
            zone->compact_cached_free_pfn = free_pfn;
    }

    trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                cc->free_pfn, end_pfn, sync, ret);

    return ret;
}
Back to compact_zone_order().

Implementation of compaction_suitable(), which decides from the current watermarks whether compaction should run:

unsigned long compaction_suitable(struct zone *zone, int order,
                    int alloc_flags, int classzone_idx)
{
    unsigned long ret;

    ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
    trace_mm_compaction_suitable(zone, order, ret);
    if (ret == COMPACT_NOT_SUITABLE_ZONE)
        ret = COMPACT_SKIPPED;

    return ret;
}
/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static unsigned long __compaction_suitable(struct zone *zone, int order,
                    int alloc_flags, int classzone_idx)
{
    int fragindex;
    unsigned long watermark;

    /*
     * order == -1 is expected when compacting via
     * /proc/sys/vm/compact_memory
     */
    if (order == -1)
        return COMPACT_CONTINUE;

    /* the WMARK_LOW watermark is the baseline for the three checks below */
    watermark = low_wmark_pages(zone);
    /*
     * If watermarks for high-order allocation are already met, there
     * should be no need for compaction at all.
     */
    /* (1) check whether the zone is above WMARK_LOW at the requested order; if so,
        return COMPACT_PARTIAL: no compaction is needed */
    if (zone_watermark_ok(zone, order, watermark, classzone_idx,
                                alloc_flags))
        return COMPACT_PARTIAL;

    /*
     * Watermarks for order-0 must be met for compaction. Note the 2UL.
     * This is because during migration, copies of pages need to be
     * allocated and for a short time, the footprint is higher
     */
    /* (2) next check, at order 0, whether the zone is above WMARK_LOW + (2 << order);
    if not, the zone has too few free pages to hold the temporary migration copies,
    so return COMPACT_SKIPPED to skip this zone */
    watermark += (2UL << order);
    if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
        return COMPACT_SKIPPED;

    /* (3) finally consult the fragmentation index: if the failure is due to lack of
    memory rather than fragmentation, report the zone as unsuitable; otherwise return
    COMPACT_CONTINUE, meaning the zone can be compacted. */
    /*
     * fragmentation index determines if allocation failures are due to
     * low memory or external fragmentation
     *
     * index of -1000 would imply allocations might succeed depending on
     * watermarks, but we already failed the high-order watermark check
     * index towards 0 implies failure is due to lack of memory
     * index towards 1000 implies failure is due to fragmentation
     *
     * Only compact if a failure would be due to fragmentation.
     */
    fragindex = fragmentation_index(zone, order);
    if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
        return COMPACT_NOT_SUITABLE_ZONE;

    return COMPACT_CONTINUE;
}
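
fragmentation_index() lives in mm/vmstat.c and distills the zone's free-list state into a score between 0 (failure due to lack of memory) and 1000 (failure due to fragmentation). A minimal sketch of its core from kernels of this generation, assuming struct contig_page_info holds the totals gathered by fill_contig_page_info():

static int __fragmentation_index(unsigned int order,
                struct contig_page_info *info)
{
    unsigned long requested = 1UL << order;

    if (!info->free_blocks_total)
        return 0;

    /* The index only makes sense when the request would fail */
    if (info->free_blocks_suitable)
        return -1000;

    /*
     *    0 => allocation would fail due to lack of memory
     * 1000 => allocation would fail due to fragmentation
     */
    return 1000 - div_u64((1000 +
            div_u64(info->free_pages * 1000ULL, requested)),
            info->free_blocks_total);
}

sysctl_extfrag_threshold defaults to 500 and can be tuned through /proc/sys/vm/extfrag_threshold; a fragindex at or below the threshold makes __compaction_suitable() skip the zone.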
Back to compact_zone().

Implementation of compact_finished():

static int compact_finished(struct zone *zone, struct compact_control *cc,
                const int migratetype)
{
    int ret;

    ret = __compact_finished(zone, cc, migratetype);
    trace_mm_compaction_finished(zone, cc->order, ret);
    if (ret == COMPACT_NO_SUITABLE_PAGE)
        ret = COMPACT_CONTINUE;

    return ret;
}


static int __compact_finished(struct zone *zone, struct compact_control *cc,
                const int migratetype)
{
    unsigned int order;
    unsigned long watermark;

    if (cc->contended || fatal_signal_pending(current))
        return COMPACT_PARTIAL;

    /* Compaction run completes if the migrate and free scanner meet */
    /* There are two termination conditions:
    (1) the cc->migrate_pfn and cc->free_pfn scanners meet; they run toward each other
        from the two ends of the zone.
    (2) judged at the requested order, the zone's watermark is above WMARK_LOW.
    When the zone is above WMARK_LOW, the buddy free lists must also be checked: ideally
    the movable free list at the requested order
    (zone->free_area[order].free_list[MIGRATE_MOVABLE]) has a free block, or a
    higher-order free list does, or a free list at or above pageblock_order has any
    free block at all.
    */
    if (cc->free_pfn <= cc->migrate_pfn) {
        /* Let the next compaction start anew. */
        zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
        zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
        zone->compact_cached_free_pfn = zone_end_pfn(zone);

        /*
         * Mark that the PG_migrate_skip information should be cleared
         * by kswapd when it goes to sleep. kswapd does not set the
         * flag itself as the decision to be clear should be directly
         * based on an allocation request.
         */
        if (!current_is_kswapd())
            zone->compact_blockskip_flush = true;

        return COMPACT_COMPLETE;
    }

    /*
     * order == -1 is expected when compacting via
     * /proc/sys/vm/compact_memory
     */
    if (cc->order == -1)
        return COMPACT_CONTINUE;

    /* Compaction run is not finished if the watermark is not met */
    watermark = low_wmark_pages(zone);

    if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
                            cc->alloc_flags))
        return COMPACT_CONTINUE;

    /* Direct compactor: Is a suitable page free? */
    for (order = cc->order; order < MAX_ORDER; order++) {
        struct free_area *area = &zone->free_area[order];

        /* Job done if page is free of the right migratetype */
        if (!list_empty(&area->free_list[migratetype]))
            return COMPACT_PARTIAL;

        /* Job done if allocation would set block type */
        if (order >= pageblock_order && area->nr_free)
            return COMPACT_PARTIAL;
    }

    return COMPACT_NO_SUITABLE_PAGE;
}
Back to compact_zone().

Implementation of isolate_migratepages():

This function scans for pages suitable for migration, starting from the start of the zone and stepping through it in units of pageblock_nr_pages. The Linux kernel manages page mobility at pageblock granularity. The migration types include MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_PCPTYPES and MIGRATE_CMA, and two helper functions manage a pageblock's type: get_pageblock_migratetype() and set_pageblock_migratetype(). During initialization every page is initially marked MIGRATE_MOVABLE; see memmap_init_zone() in mm/page_alloc.c. pageblock_nr_pages is usually 1024 pages (1UL << (MAX_ORDER-1)).
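
The filter isolate_migratepages() applies below for asynchronous compaction is a one-liner; a sketch of migrate_async_suitable() as found in mm/compaction.c of this kernel generation:

/* Returns true if the pageblock's migratetype allows async compaction */
static inline bool migrate_async_suitable(int migratetype)
{
    return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}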

[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()->compact_zone()->isolate_migratepages()]

/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
                    struct compact_control *cc)
{
    unsigned long low_pfn, end_pfn;
    struct page *page;
    /* choose the isolation mode; normally isolate_mode is ISOLATE_ASYNC_MIGRATE */
    const isolate_mode_t isolate_mode =
        (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);

    /*
     * Start at where we last stopped, or beginning of the zone as
     * initialized by compact_zone()
     */
    low_pfn = cc->migrate_pfn;

    /* Only scan within a pageblock boundary */
    end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

    /*
     * Iterate over whole pageblocks until we find the first suitable.
     * Do not cross the free scanner.
     */
    /* scan from cc->migrate_pfn at the start of the zone toward the end, in units of pageblock_nr_pages. */
    for (; end_pfn <= cc->free_pfn;
            low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {

        /*
         * This can potentially iterate a massively long zone with
         * many pageblocks unsuitable, so periodically check if we
         * need to schedule, or even abort async compaction.
         */
        if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
                        && compact_should_abort(cc))
            break;

        page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
        if (!page)
            continue;

        /* If isolation recently failed, do not retry */
        if (!isolation_suitable(cc, page))
            continue;

        /*
         * For async compaction, also only scan in MOVABLE blocks.
         * Async compaction is optimistic to see if the minimum amount
         * of work satisfies the allocation.
         */
        /* check whether the pageblock is of type MIGRATE_MOVABLE or MIGRATE_CMA, the two
        migratable types. cc->mode is the migration type passed down from
        __alloc_pages_slowpath(); migration_mode is normally asynchronous,
        i.e. MIGRATE_ASYNC */
        if (cc->mode == MIGRATE_ASYNC &&
            !migrate_async_suitable(get_pageblock_migratetype(page)))
            continue;

        /* Perform the isolation */
        /* scan the pageblock and isolate the pages suitable for migration; its implementation is examined below */
        low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
                                isolate_mode);

        if (!low_pfn || cc->contended) {
            acct_isolated(zone, cc);
            return ISOLATE_ABORT;
        }

        /*
         * Either we isolated something and proceed with migration. Or
         * we failed and compact_zone should decide if we should
         * continue or not.
         */
        break;
    }

    acct_isolated(zone, cc);
    /*
     * Record where migration scanner will be restarted. If we end up in
     * the same pageblock as the free scanner, make the scanners fully
     * meet so that compact_finished() terminates compaction.
     */
    cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;

    return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
Back to compact_zone().

Implementation of isolate_migratepages_block():

[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()->compact_zone()->isolate_migratepages()->isolate_migratepages_block()]

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *                a single pageblock
 * @cc:     Compaction control structure.
 * @low_pfn:    The first PFN to isolate
 * @end_pfn:    The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be both less, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
            unsigned long end_pfn, isolate_mode_t isolate_mode)
{
    struct zone *zone = cc->zone;
    unsigned long nr_scanned = 0, nr_isolated = 0;
    struct list_head *migratelist = &cc->migratepages;
    struct lruvec *lruvec;
    unsigned long flags = 0;
    bool locked = false;
    struct page *page = NULL, *valid_page = NULL;
    unsigned long start_pfn = low_pfn;

    /*
     * Ensure that there are not too many pages isolated from the LRU
     * list by either parallel reclaimers or compaction. If there are,
     * delay for some time until fewer pages are isolated
     */
    /* too_many_isolated() checks whether too many pages are currently isolated from the
    LRU lists; if so, it is best to sleep for up to 100 ms (congestion_wait) and retry.
    If the migration mode is asynchronous (MIGRATE_ASYNC), bail out immediately. */
    while (unlikely(too_many_isolated(zone))) {
        /* async migration should just abort */
        if (cc->mode == MIGRATE_ASYNC)
            return 0;

        congestion_wait(BLK_RW_ASYNC, HZ/10);

        if (fatal_signal_pending(current))
            return 0;
    }

    if (compact_should_abort(cc))
        return 0;

    /* Time to isolate some pages for migration */
    /* this loop scans the pageblock looking for migratable pages */
    for (; low_pfn < end_pfn; low_pfn++) {
        /*
         * Periodically drop the lock (if held) regardless of its
         * contention, to give chance to IRQs. Abort async compaction
         * if contended.
         */
        if (!(low_pfn % SWAP_CLUSTER_MAX)
            && compact_unlock_should_abort(&zone->lru_lock, flags,
                                &locked, cc))
            break;

        if (!pfn_valid_within(low_pfn))
            continue;
        nr_scanned++;

        page = pfn_to_page(low_pfn);

        if (!valid_page)
            valid_page = page;

        /*
         * Skip if free. We read page order here without zone lock
         * which is generally unsafe, but the race window is small and
         * the worst thing that can happen is that we skip some
         * potential isolation targets.
         */
        /* a page still in the buddy system is not a migration candidate; skip it.
        page_order_unsafe() reads the free block's order so the for loop can step
        over the whole block at once */
        if (PageBuddy(page)) {
            unsigned long freepage_order = page_order_unsafe(page);

            /*
             * Without lock, we cannot be sure that what we got is
             * a valid page order. Consider only values in the
             * valid order range to prevent low_pfn overflow.
             */
            if (freepage_order > 0 && freepage_order < MAX_ORDER)
                low_pfn += (1UL << freepage_order) - 1;
            continue;
        }

        /*
         * Check may be lockless but that's ok as we recheck later.
         * It's possible to migrate LRU pages and balloon pages
         * Skip any other type of page
         */
        /* pages on an LRU list and balloon pages are migratable; every other kind of page is skipped. */
        if (!PageLRU(page)) {
            if (unlikely(balloon_page_movable(page))) {
                if (balloon_page_isolate(page)) {
                    /* Successfully isolated */
                    goto isolate_success;
                }
            }
            continue;
        }

        /*
         * PageLRU is set. lru_lock normally excludes isolation
         * splitting and collapsing (collapsing has already happened
         * if PageLRU is set) but the lock is not necessarily taken
         * here and it is wasteful to take it just to check transhuge.
         * Check TransHuge without lock and skip the whole pageblock if
         * it's either a transhuge or hugetlbfs page, as calling
         * compound_order() without preventing THP from splitting the
         * page underneath us may return surprising results.
         */
        if (PageTransHuge(page)) {
            if (!locked)
                low_pfn = ALIGN(low_pfn + 1,
                        pageblock_nr_pages) - 1;
            else
                low_pfn += (1 << compound_order(page)) - 1;

            continue;
        }

        /*
         * Migration will fail if an anonymous page is pinned in memory,
         * so avoid taking lru_lock and isolating it unnecessarily in an
         * admittedly racy check.
         */
        /* PageBuddy pages and pages not on an LRU list have been excluded, so the
        remaining pages are reasonable candidates, but a few special cases must still be
        filtered out. page_mapping() returning NULL suggests an anonymous page. For an
        anonymous page, normally page_count(page) == page_mapcount(page), i.e.
        page->_count == page->_mapcount + 1; if they differ, someone in the kernel is
        holding an extra reference behind our back, so the page is not safe to migrate
        either. */
        if (!page_mapping(page) &&
            page_count(page) > page_mapcount(page))
            continue;

        /* If we already hold the lock, we can skip some rechecking */

        /* take zone->lru_lock and recheck that the page is still on an LRU list */
        if (!locked) {
            locked = compact_trylock_irqsave(&zone->lru_lock,
                                &flags, cc);
            if (!locked)
                break;

            /* Recheck PageLRU and PageTransHuge under lock */
            if (!PageLRU(page))
                continue;
            if (PageTransHuge(page)) {
                low_pfn += (1 << compound_order(page)) - 1;
                continue;
            }
        }

        lruvec = mem_cgroup_page_lruvec(page, zone);

        /* Try isolate the page */
        /* __isolate_lru_page() isolates pages in ISOLATE_ASYNC_MIGRATE mode. The
        function was analyzed earlier: a page under writeback is not a qualified
        candidate, and neither is a dirty page whose mapping->a_ops->migratepage()
        function pointer is not defined. On success it also takes a reference on
        page->_count and clears the PG_lru flag */
        if (__isolate_lru_page(page, isolate_mode) != 0)
            continue;

        VM_BUG_ON_PAGE(PageTransCompound(page), page);

        /* Successfully isolated */
        /* remove the page from the LRU list */
        del_page_from_lru_list(page, lruvec, page_lru(page));

        /* the page is a qualified, migratable candidate; add it to the cc->migratepages list */
isolate_success:
        list_add(&page->lru, migratelist);
        cc->nr_migratepages++;
        nr_isolated++;

        /* Avoid isolating too much */
        if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
            ++low_pfn;
            break;
        }
    }

    /*
    To summarize, the pages suitable for migration by memory compaction:
    (1) must be on an LRU list; pages still in the buddy system are not suitable.
    (2) pages under writeback, i.e. marked PG_writeback, are not suitable.
    (3) pages marked PG_unevictable are not suitable.
    (4) dirty pages without a mapping->a_ops->migratepage() method are not suitable.
    */

    /*
     * The PageBuddy() check could have potentially brought us outside
     * the range to be scanned.
     */
    if (unlikely(low_pfn > end_pfn))
        low_pfn = end_pfn;

    if (locked)
        spin_unlock_irqrestore(&zone->lru_lock, flags);

    /*
     * Update the pageblock-skip information and cached scanner pfn,
     * if the whole pageblock was scanned without isolating any page.
     */
    if (low_pfn == end_pfn)
        update_pageblock_skip(cc, valid_page, nr_isolated, true);

    trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
                        nr_scanned, nr_isolated);

    count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
    if (nr_isolated)
        count_compact_events(COMPACTISOLATED, nr_isolated);

    return low_pfn;
}
Back to compact_zone().

Implementation of migrate_pages(): the core page-migration routine; it takes pages off the cc->migratepages list and tries to migrate them. compaction_alloc() searches for free pages starting from the end of the zone and adds them to the cc->freepages list.

    migrate_pages() was already covered in the page-migration section. Here its get_new_page callback is compaction_alloc(), its put_new_page callback is compaction_free(), the migration mode is MIGRATE_ASYNC, and the reason is MR_COMPACTION.

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
/* compaction_alloc() supplies the destination pages: it looks for free pages starting
from the end of the zone. The heavy lifting is done by isolate_freepages(), which closely
mirrors the isolate_migratepages() seen earlier. compaction_alloc() returns one free
page. */
static struct page *compaction_alloc(struct page *migratepage,
                    unsigned long data,
                    int **result)
{
    struct compact_control *cc = (struct compact_control *)data;
    struct page *freepage;

    /*
     * Isolate free pages if necessary, and if we are not aborting due to
     * contention.
     */
    if (list_empty(&cc->freepages)) {
        if (!cc->contended)
            isolate_freepages(cc);

        if (list_empty(&cc->freepages))
            return NULL;
    }

    freepage = list_entry(cc->freepages.next, struct page, lru);
    list_del(&freepage->lru);
    cc->nr_freepages--;

    return freepage;
}
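
For completeness, the matching put_new_page callback is tiny; a sketch of compaction_free() from mm/compaction.c of this kernel generation, which returns an unused destination page to the isolated free list:

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is
 * no special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
    struct compact_control *cc = (struct compact_control *)data;

    list_add(&page->lru, &cc->freepages);
    cc->nr_freepages++;
}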
