14.2 kswapd内核线程

最新推荐文章于 2023-02-20 14:36:09 发布

byd yes

最新推荐文章于 2023-02-20 14:36:09 发布

阅读量719

点赞数 1

分类专栏： linux.mm

本文链接：https://blog.csdn.net/dai_xiangjun/article/details/118863989

版权

linux.mm 专栏收录该内容

59 篇文章 25 订阅

订阅专栏

Linux内核中的kswapd内核线程负责在内存不足时回收页面，每个NUMA节点初始化一个。kswapd通过kswapd_max_order和classzone_idx参数协调页面分配器，当内存分配失败时被唤醒进行页面回收。平衡过程由balance_pgdat函数执行，确保系统在高负载下仍能保持可用内存。

摘要由CSDN通过智能技术生成

Linux内核中有一个非常重要的内核线程kswapd，负责在内存不足的情况下回收页面。初始化时，内核会为系统中每个NUMA内存节点创建一个名为"kswapd%d"的kswapd内核线程。

/*
 * Boot-time initialisation of background reclaim.
 *
 * Calls swap_setup(), starts one kswapd thread for every node in the
 * N_MEMORY state, then registers cpu_callback as a hot-CPU notifier.
 * The return value of kswapd_run() is not checked here.
 *
 * Always returns 0.
 */
static int __init kswapd_init(void)
{
    int node;

    swap_setup();

    for_each_node_state(node, N_MEMORY) {
        kswapd_run(node);
    }

    hotcpu_notifier(cpu_callback, 0);

    return 0;
}

/*
 * Start the kswapd kernel thread for node @nid.
 *
 * The thread runs kswapd() with the node's pg_data_t as its argument and is
 * named "kswapd<nid>". If the node already has a kswapd task recorded in
 * pgdat->kswapd, nothing is done.
 *
 * Returns 0 on success (or when a thread already exists), otherwise the
 * error code extracted from the kthread_run() result. On failure
 * pgdat->kswapd is reset to NULL.
 */
int kswapd_run(int nid)
{
    pg_data_t *node = NODE_DATA(nid);
    int err;

    /* This node already has its reclaim thread. */
    if (node->kswapd)
        return 0;

    node->kswapd = kthread_run(kswapd, node, "kswapd%d", nid);
    if (!IS_ERR(node->kswapd))
        return 0;

    /* failure at boot is fatal */
    BUG_ON(system_state == SYSTEM_BOOTING);
    pr_err("Failed to start kswapd on node %d\n", nid);

    /* Extract the errno before dropping the ERR_PTR value. */
    err = PTR_ERR(node->kswapd);
    node->kswapd = NULL;

    return err;
}

在NUMA系统中，每个node节点有一个pg_data_t数据结构来描述物理内存的布局。pg_data_t数据结构定义在include/linux/mmzone.h头文件中，kswapd传递的参数就是pg_data_t数据结构。

/*
 * Per-node memory descriptor (pg_data_t).
 *
 * One instance exists per NUMA node and is what kswapd() receives as its
 * thread argument. The fields most relevant to background reclaim are
 * kswapd_wait, kswapd, kswapd_max_order and classzone_idx.
 */
typedef struct pglist_data {
    /* Zones of this node, indexed by zone type (presumably; confirm in mmzone.h). */
    struct zone node_zones[MAX_NR_ZONES];
    struct zonelist node_zonelists[MAX_ZONELISTS];
    /* kswapd() uses nr_zones - 1 as the highest zone index of the node. */
    int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
    struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
    struct page_ext *node_page_ext;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
    struct bootmem_data *bdata;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
    /*
     * Must be held any time you expect node_start_pfn, node_present_pages
     * or node_spanned_pages stay constant.  Holding this will also
     * guarantee that any pfn_valid() stays that way.
     *
     * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
     * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
     *
     * Nests above zone->lock and zone->span_seqlock
     */
    spinlock_t node_size_lock;
#endif
    unsigned long node_start_pfn;
    unsigned long node_present_pages; /* total number of physical pages */
    unsigned long node_spanned_pages; /* total size of physical page
                         range, including holes */
    int node_id;
    /* Wait queue that wakeup_kswapd() signals to wake this node's kswapd. */
    wait_queue_head_t kswapd_wait;
    wait_queue_head_t pfmemalloc_wait;
    /* Task created by kswapd_run(); NULL when no thread is running. */
    struct task_struct *kswapd; /* Protected by
                       mem_hotplug_begin/end() */
    /*
     * Pending reclaim request. wakeup_kswapd() raises kswapd_max_order to
     * the largest requested order and lowers classzone_idx to the smallest
     * requested zone index; kswapd() reads both and then resets them to
     * 0 and nr_zones - 1 respectively.
     */
    int kswapd_max_order;
    enum zone_type classzone_idx;
#ifdef CONFIG_NUMA_BALANCING
    /* Lock serializing the migrate rate limiting window */
    spinlock_t numabalancing_migrate_lock;

    /* Rate limiting time interval */
    unsigned long numabalancing_migrate_next_window;

    /* Number of pages migrated during the rate limiting time interval */
    unsigned long numabalancing_migrate_nr_pages;
#endif
} pg_data_t;

和kswapd相关的参数有kswapd_max_order、kswapd_wait和classzone_idx等。kswapd_wait是一个等待队列，每个pg_data_t数据结构都有这样一个等待队列，它是在free_area_init_core()函数中初始化。页面分配路径上唤醒函数wakeup_kswapd()把kswapd_max_order和classzone_idx作为参数传递给kswapd内核线程。在分配内存路径上，如果在低水位(ALLOC_WMARK_LOW)的情况下无法成功分配内存，那么就通过wakeup_kswapd()函数唤醒kswapd内核线程来回收页面，以便释放一些内存。

wakeup_kswapd()函数定义在mm/vmscan.c文件中。

[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->wake_all_kswapds()->wakeup_kswapd()]

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 *
 * @zone:          zone on whose behalf kswapd is being woken
 * @order:         order of the allocation that could not be satisfied
 * @classzone_idx: zone index the allocator computed for the request
 *
 * The request is recorded in the node's pg_data_t even when no wakeup is
 * issued, so kswapd sees it the next time it reads the request fields.
 */
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
    pg_data_t *node;

    /* Skip zones that are unpopulated or rejected by the cpuset check. */
    if (!populated_zone(zone) ||
        !cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
        return;

    node = zone->zone_pgdat;

    /*
     * Assign kswapd_max_order and classzone_idx here. kswapd_max_order must
     * not be smaller than the order being allocated; classzone_idx is the
     * index of the first suitable zone, as computed in
     * __alloc_pages_nodemask(). Both values are handed to the kswapd thread.
     * classzone_idx is the key to understanding how the page allocator and
     * kswapd page reclaim cooperate.
     */
    if (order > node->kswapd_max_order) {
        node->kswapd_max_order = order;
        node->classzone_idx = min(node->classzone_idx, classzone_idx);
    }

    /* Nobody is waiting on the queue, or the zone is already balanced. */
    if (!waitqueue_active(&node->kswapd_wait) ||
        zone_balanced(zone, order, 0, 0))
        return;

    trace_mm_vmscan_wakeup_kswapd(node->node_id, zone_idx(zone), order);
    wake_up_interruptible(&node->kswapd_wait);
}

这里假设以GFP_HIGHUSER_MOVABLE为分配掩码分配内存，并且在__alloc_pages_nodemask()->first_zones_zonelist()中计算出来的preferred_zone为ZONE_HIGHMEM，那么ac.classzone_idx的值为1。当内存分配失败时，页面分配器会唤醒kswapd内核线程，并且传递ac.classzone_idx值到kswapd内核线程，最后传递给zone_balanced()函数的classzone_idx参数。

/*
 * This is the 'heart' of the zoned buddy allocator.
 *
 * @gfp_mask: allocation flags; masked with gfp_allowed_mask before use
 * @order:    order of the requested allocation
 * @zonelist: zonelist to allocate from
 * @nodemask: optional node restriction; when NULL,
 *            cpuset_current_mems_allowed is used to pick the preferred zone
 *
 * Tries get_page_from_freelist() first with the ALLOC_WMARK_LOW watermark
 * and falls back to __alloc_pages_slowpath() if that fails.
 *
 * Returns the allocated page, or NULL on failure.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    struct zoneref *preferred_zoneref;
    struct page *page = NULL;
    unsigned int cpuset_mems_cookie;
    /* The first attempt is made against the low watermark. */
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    /* Request context handed to the freelist and slow-path helpers. */
    struct alloc_context ac = {
        .high_zoneidx = gfp_zone(gfp_mask),
        .nodemask = nodemask,
        .migratetype = gfpflags_to_migratetype(gfp_mask),
    };

    gfp_mask &= gfp_allowed_mask;

    lockdep_trace_alloc(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;

    if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;

retry_cpuset:
    cpuset_mems_cookie = read_mems_allowed_begin();

    /* We set it here, as __alloc_pages_slowpath might have changed it */
    ac.zonelist = zonelist;
    /* The preferred zone is used for statistics later */
    preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                ac.nodemask ? : &cpuset_current_mems_allowed,
                &ac.preferred_zone);
    if (!ac.preferred_zone)
        goto out;
    /*
     * Index of the preferred zone; carried in ac to the slow path, which
     * is where kswapd gets woken with this value.
     */
    ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);

    /* First allocation attempt */
    alloc_mask = gfp_mask|__GFP_HARDWALL;
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (unlikely(!page)) {
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        alloc_mask = memalloc_noio_flags(gfp_mask);

        page = __alloc_pages_slowpath(alloc_mask, order, &ac);
    }

    if (kmemcheck_enabled && page)
        kmemcheck_pagealloc_alloc(page, order, gfp_mask);

    trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
        goto retry_cpuset;

    return page;
}

kswapd内核线程的执行函数如下：

[mm/vmscan.c]

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 *
 * @p: the pg_data_t of the node this thread serves (passed by kswapd_run()).
 *
 * Returns 0 once kthread_should_stop() reports that the thread must exit.
 */
static int kswapd(void *p)
{
    /*
     * order/classzone_idx:                   request currently being served
     * new_order/new_classzone_idx:           latest request read from pgdat
     * balanced_order/balanced_classzone_idx: what balance_pgdat() reported
     */
    unsigned long order, new_order;
    unsigned balanced_order;
    int classzone_idx, new_classzone_idx;
    int balanced_classzone_idx;
    pg_data_t *pgdat = (pg_data_t*)p;
    struct task_struct *tsk = current;

    struct reclaim_state reclaim_state = {
        .reclaimed_slab = 0,
    };
    const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

    lockdep_set_current_reclaim_state(GFP_KERNEL);

    /* Restrict the thread to the node's CPUs when the node has any. */
    if (!cpumask_empty(cpumask))
        set_cpus_allowed_ptr(tsk, cpumask);
    current->reclaim_state = &reclaim_state;

    /*
     * Tell the memory management that we're a "memory allocator",
     * and that if we need more memory we should get access to it
     * regardless (see "__alloc_pages()"). "kswapd" should
     * never get caught in the normal page freeing logic.
     *
     * (Kswapd normally doesn't need memory anyway, but sometimes
     * you need a small amount of memory in order to be able to
     * page out something else, and this flag essentially protects
     * us from recursively trying to free more memory as we're
     * trying to free the first piece of memory in the first place).
     */
    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
    set_freezable();

    order = new_order = 0;
    balanced_order = 0;
    classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
    balanced_classzone_idx = classzone_idx; /* i.e. pgdat->nr_zones - 1, the node's highest zone index */

    /*
     * The for loop is the core of this function. Many local variables
     * steer its control flow; the most important inputs are
     * pgdat->kswapd_max_order and pgdat->classzone_idx.
     */
    for ( ; ; ) {
        bool ret;

        /*
         * If the last balance_pgdat was unsuccessful it's unlikely a
         * new request of a similar or harder type will succeed soon
         * so consider going to sleep on the basis we reclaimed at
         */
        /*
         * Only when the last balance result matches the request do we
         * pick up the next pending request and reset the pgdat fields.
         */
        if (balanced_classzone_idx >= new_classzone_idx &&
                    balanced_order == new_order) {
            new_order = pgdat->kswapd_max_order;
            new_classzone_idx = pgdat->classzone_idx;
            pgdat->kswapd_max_order =  0;
            pgdat->classzone_idx = pgdat->nr_zones - 1;
        }

        if (order < new_order || classzone_idx > new_classzone_idx) {
            /*
             * Don't sleep if someone wants a larger 'order'
             * allocation or has tighter zone constraints
             */
            order = new_order;
            classzone_idx = new_classzone_idx;
        } else {
            /*
             * No harder request is pending: try to sleep, then re-read
             * the request fields and reset them.
             */
            kswapd_try_to_sleep(pgdat, balanced_order,
                        balanced_classzone_idx);
            order = pgdat->kswapd_max_order;
            classzone_idx = pgdat->classzone_idx;
            new_order = order;
            new_classzone_idx = classzone_idx;
            pgdat->kswapd_max_order = 0;
            pgdat->classzone_idx = pgdat->nr_zones - 1;
        }

        ret = try_to_freeze();
        if (kthread_should_stop())
            break;

        /*
         * We can speed up thawing tasks if we don't call balance_pgdat
         * after returning from the refrigerator
         */
        if (!ret) {
            trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
            /* balance_pgdat() may update balanced_classzone_idx via the pointer. */
            balanced_classzone_idx = classzone_idx;
            balanced_order = balance_pgdat(pgdat, order,
                        &balanced_classzone_idx);
        }
    }

    /* Thread is stopping: undo the flags and reclaim state set up above. */
    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
    current->reclaim_state = NULL;
    lockdep_clear_current_reclaim_state();

    return 0;
}

kswapd内核线程在系统启动、完成初始化之后，会在kswapd_try_to_sleep()函数中睡眠并且让出CPU控制权。当系统内存紧张时，例如alloc_pages()在低水位(ALLOC_WMARK_LOW)下无法分配出内存，这时分配内存函数会调用wakeup_kswapd()来唤醒kswapd内核线程。由于kswapd内核线程是在kswapd_try_to_sleep()中睡眠的，因此唤醒点也在kswapd_try_to_sleep()函数中。kswapd内核线程被唤醒之后，调用balance_pgdat()来回收页面。调用逻辑如下：

alloc_pages
    __alloc_pages_nodemask()
        ->if fail on ALLOC_WMARK_LOW
            ->__alloc_pages_slowpath()
                ->wakeup_kswapd()
                    ->wake_up_interruptible(kswapd_wait)
                        kswapd内核线程被唤醒
                            ->balance_pgdat()