Linux memory management: mem cgroup analysis

memcg is one of the foundational technologies underlying containers: it implements memory isolation and limiting. In cloud-native environments we frequently run into inaccurate memcg memory statistics, memcg memory reclaim issues, and memcg OOMs. This article analyzes the kernel implementation of memcg accounting (charge/uncharge), task migration, soft limit reclaim, and memcg reclaim.

1. Key memcg structures

struct mem_cgroup represents one memory cgroup:

struct mem_cgroup {
    /* cgroup controller state */
    struct cgroup_subsys_state css;

    /* cgroup memory counters */
    struct page_counter memory;
    struct page_counter swap;

    /* memory + swap counter */
    struct page_counter memsw;

    /* whether task migration moves (re-charges) memory to the destination cgroup */
    unsigned long move_charge_at_immigrate;
    /* taken only while moving_account > 0 */

    /* memcg memory and event statistics, broken down by category */
    struct mem_cgroup_stat_cpu __percpu *stat_cpu;
    atomic_long_t       stat[MEMCG_NR_STAT];
    atomic_long_t       events[NR_VM_EVENT_ITEMS];
    atomic_long_t       memory_events[MEMCG_NR_MEMORY_EVENTS];
    /* per-node LRU lists for pages charged to this cgroup */
    struct mem_cgroup_per_node *nodeinfo[0];
    /* WARNING: nodeinfo must be the last member here */
};

memory_cgrp_subsys implements the memcg operation set:

css_alloc: called when a cgroup is created.

can_attach: called when a task is migrated into a cgroup; it first checks whether charging the task's memory to the destination cgroup would exceed its limit.

struct cgroup_subsys memory_cgrp_subsys = {
    .css_alloc = mem_cgroup_css_alloc,
    .css_online = mem_cgroup_css_online,
    .css_offline = mem_cgroup_css_offline,
    .css_released = mem_cgroup_css_released,
    .css_free = mem_cgroup_css_free,
    .css_reset = mem_cgroup_css_reset,
    .can_attach = mem_cgroup_can_attach,
    .cancel_attach = mem_cgroup_cancel_attach,
    .post_attach = mem_cgroup_move_task,
    .bind = mem_cgroup_bind,
    .dfl_cftypes = memory_files,
    .legacy_cftypes = mem_cgroup_legacy_files,
    .early_init = 0, 
};
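
As a quick illustration of when these callbacks fire, the sketch below creates and removes a memory cgroup directory, which is what triggers css_alloc/css_online and css_offline respectively. It assumes a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory and is run as root; the group name "demo" is made up.

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
    const char *grp = "/sys/fs/cgroup/memory/demo";

    /* mkdir on cgroupfs -> mem_cgroup_css_alloc() + mem_cgroup_css_online() */
    if (mkdir(grp, 0755) && errno != EEXIST) {
        perror("mkdir");
        return 1;
    }

    /* rmdir -> mem_cgroup_css_offline(); css_free runs once all references drop */
    if (rmdir(grp))
        perror("rmdir");
    return 0;
}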

2. Memory accounting (charge/uncharge)

When a process allocates memory it is charged to the cgroup the process belongs to; kernel threads are charged to the root memcg. The kernel calls try_charge during page fault handling, file reads/writes, and task migration.
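
The charge path can be observed from user space with a minimal sketch (assumptions: cgroup v1 memory controller at /sys/fs/cgroup/memory, a pre-created group named "demo", run as root). It joins the group, faults in anonymous memory, and watches memory.usage_in_bytes grow, which is exactly the counter updated by try_charge below.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CG "/sys/fs/cgroup/memory/demo"     /* assumed, pre-created group */

static long read_usage(void)
{
    long v = -1;
    FILE *f = fopen(CG "/memory.usage_in_bytes", "r");

    if (f) {
        fscanf(f, "%ld", &v);
        fclose(f);
    }
    return v;
}

int main(void)
{
    FILE *procs = fopen(CG "/cgroup.procs", "w");
    size_t sz = 64UL << 20;              /* 64 MiB */
    char *buf;

    if (!procs)
        return 1;
    fprintf(procs, "%d\n", getpid());    /* move ourselves into the memcg */
    fclose(procs);

    printf("usage before: %ld\n", read_usage());

    buf = malloc(sz);
    if (!buf)
        return 1;
    memset(buf, 1, sz);                  /* fault in -> do_anonymous_page -> try_charge */

    printf("usage after:  %ld\n", read_usage());
    free(buf);
    return 0;
}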

1. usage accounting

try_charge mainly accounts the cgroup's memory usage and, if needed, triggers memory reclaim and the OOM killer. A common charge call path:

try_charge+0x0/0x100 [kernel]
mem_cgroup_try_charge+0x0/0x250 [kernel]
do_anonymous_page+0x267/0x610 [kernel]
__handle_mm_fault+0x673/0xaa0 [kernel]
handle_mm_fault+0x10d/0x200 [kernel]
__do_page_fault+0x1c3/0x4e0 [kernel]
do_page_fault+0x32/0x140 [kernel]
async_page_fault+0x1e/0x30 [kernel]

The main work of try_charge:

1. Charge the memory page counter and the memsw page counter.

2. If usage exceeds the limit, call try_to_free_mem_cgroup_pages to reclaim memory from the cgroup.

3. If usage still exceeds the limit and not enough memory can be reclaimed, call mem_cgroup_oom to trigger a cgroup-level OOM kill.

static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
              unsigned int nr_pages)
{
    /* abridged: local declarations and retry bookkeeping omitted */

    /* the root memcg is not limited, so charging it always succeeds */
    if (mem_cgroup_is_root(memcg))
        return 0;
retry:
    /* per-cpu stock fast path to speed up charging */
    if (consume_stock(memcg, nr_pages))
        return 0;

    if (!do_memsw_account() ||  /* charge memory + swap */
        page_counter_try_charge(&memcg->memsw, batch, &counter)) {
        /* charge memory, i.e. the value shown by memory.usage_in_bytes */
        if (page_counter_try_charge(&memcg->memory, batch, &counter))
            goto done_restock;
        if (do_memsw_account())
            page_counter_uncharge(&memcg->memsw, batch);
        mem_over_limit = mem_cgroup_from_counter(counter, memory);
    } else {
        mem_over_limit = mem_cgroup_from_counter(counter, memsw);
        may_swap = false;
    }

    /* usage is over the limit: try cgroup-level reclaim */
    nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                            gfp_mask, may_swap);
    /* after reclaim, check the margin again */
    if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
        goto retry;
    /* not enough memory could be reclaimed: trigger the OOM killer */
    oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
               get_order(nr_pages * PAGE_SIZE));
    /* ... */
}
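
The page_counter logic at the heart of try_charge can be modelled in a few lines of user-space C. This is only an illustrative model, not the kernel implementation: charge an atomic usage counter, and roll back and report failure when the limit would be exceeded.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* simplified model of struct page_counter: usage in pages plus a limit */
struct counter_model {
    atomic_long usage;
    long limit;
};

/* charge nr_pages; on overshoot roll back and fail, like page_counter_try_charge */
static bool try_charge_model(struct counter_model *c, long nr_pages)
{
    long after = atomic_fetch_add(&c->usage, nr_pages) + nr_pages;

    if (after > c->limit) {
        atomic_fetch_sub(&c->usage, nr_pages);  /* roll back */
        return false;                           /* caller would reclaim / OOM */
    }
    return true;
}

static void uncharge_model(struct counter_model *c, long nr_pages)
{
    atomic_fetch_sub(&c->usage, nr_pages);
}

int main(void)
{
    struct counter_model memory = { .limit = 1024 };   /* limit: 1024 pages */

    atomic_init(&memory.usage, 0);
    printf("charge 1000 pages: %d\n", try_charge_model(&memory, 1000)); /* succeeds */
    printf("charge  100 pages: %d\n", try_charge_model(&memory, 100));  /* over limit */
    uncharge_model(&memory, 500);
    printf("charge  100 pages: %d\n", try_charge_model(&memory, 100));  /* succeeds */
    return 0;
}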

2. memcg stat accounting

The memory used by a memcg is also broken down by type: RSS, CACHE, SHMEM, RSS_HUGE, and so on.

mem_cgroup_commit_charge mainly does the following:

1. Associates the page with the memcg.

2. Accounts the memcg's per-type usage (rss, cache, ...).

3. Accounts memcg events; in particular, when usage exceeds the soft limit the memcg is linked into mem_cgroup_tree_per_node for later soft limit reclaim.

void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
                  bool lrucare, bool compound)
{
    unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

    /* associate the page with the memcg */
    commit_charge(page, memcg, lrucare);

    local_irq_disable();
    /* account the memcg's rss/cache usage */
    mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
    /* account memcg events */
    memcg_check_events(memcg, page);
    local_irq_enable();
}
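
The per-type counters updated by mem_cgroup_charge_statistics show up in the cgroup v1 memory.stat file as the rss, cache, rss_huge and shmem lines. A minimal reader sketch (the /sys/fs/cgroup/memory/demo path is an assumption):

#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/sys/fs/cgroup/memory/demo/memory.stat", "r");
    char key[64];
    unsigned long long val;

    if (!f)
        return 1;
    while (fscanf(f, "%63s %llu", key, &val) == 2) {
        /* the per-type charges discussed above */
        if (!strcmp(key, "rss") || !strcmp(key, "cache") ||
            !strcmp(key, "rss_huge") || !strcmp(key, "shmem"))
            printf("%-9s %llu bytes\n", key, val);
    }
    fclose(f);
    return 0;
}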

When usage exceeds the soft limit, mem_cgroup_update_tree is eventually called to insert the memcg into the global mem_cgroup_tree_per_node red-black tree:

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
    unsigned long excess;
    struct mem_cgroup_per_node *mz;
    struct mem_cgroup_tree_per_node *mctz;

    /* look up the per-node tree from the page's node id */
    mctz = soft_limit_tree_from_page(page);
    if (!mctz)
        return;

    for (; memcg; memcg = parent_mem_cgroup(memcg)) {
        /* the memcg's per-node info */
        mz = mem_cgroup_page_nodeinfo(memcg, page);
        /* how far usage exceeds the soft limit */
        excess = soft_limit_excess(memcg);
        /*
         * We have to update the tree if mz is on RB-tree or
         * mem is over its softlimit.
         */
        if (excess || mz->on_tree) {
            unsigned long flags;

            spin_lock_irqsave(&mctz->lock, flags);
            /* insert into the global per-node tree */
            __mem_cgroup_insert_exceeded(mz, mctz, excess);
            spin_unlock_irqrestore(&mctz->lock, flags);
        }
    }
}

3. memcg LRU accounting

The kernel describes all the memory of a NUMA node with pglist_data; when cgroups are not enabled, allocated pages are linked onto the corresponding LRU list of the global pglist_data's lruvec:

enum lru_list {
    LRU_INACTIVE_ANON = LRU_BASE,
    LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
    LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
    LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
    LRU_UNEVICTABLE,
    NR_LRU_LISTS
};

When cgroups are enabled, allocated pages are instead linked onto the per-node lruvec lists of their memcg:

struct mem_cgroup_per_node {
    /* LRU lists */
    struct lruvec       lruvec;
    /* per-LRU memory statistics */
    struct lruvec_stat __percpu *lruvec_stat_cpu;
    atomic_long_t       lruvec_stat[NR_VM_NODE_STAT_ITEMS];
    /* LRU sizes broken down by zone */
    unsigned long       lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
    /* reclaim iterators, one per reclaim priority */
    struct mem_cgroup_reclaim_iter  iter[DEF_PRIORITY + 1];
    /* when usage exceeds the soft limit, this node is linked into the global
       mem_cgroup_tree_per_node red-black tree for soft limit reclaim */
    struct rb_node      tree_node;  /* RB tree node */
    /* by how much usage exceeds the soft limit */
    unsigned long       usage_in_excess;/* Set to the value by which */
                        /* the soft limit is exceeded*/
    bool            on_tree;
    bool            congested;  /* memcg has many dirty pages */
                        /* backed by a congested BDI */
    /* owning memory cgroup */
    struct mem_cgroup   *memcg;     /* Back pointer, we cannot */
                        /* use container_of    */
};

LRU accounting also happens during page fault handling, via do_anonymous_page -> lru_cache_add_active_or_unevictable; the pages are eventually drained onto the LRU through pagevec_lru_move_fn:

static void pagevec_lru_move_fn(struct pagevec *pvec,
    void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
    void *arg)
{
    int i;
    struct pglist_data *pgdat = NULL;
    struct lruvec *lruvec;
    unsigned long flags = 0;

    for (i = 0; i < pagevec_count(pvec); i++) {
        struct page *page = pvec->pages[i];
        struct pglist_data *pagepgdat = page_pgdat(page);

        /* switch to the lru_lock of the node this page belongs to */
        if (pagepgdat != pgdat) {
            if (pgdat)
                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
            pgdat = pagepgdat;
            spin_lock_irqsave(&pgdat->lru_lock, flags);
        }

        /* look up the lruvec the page belongs to (the memcg's per-node lruvec) */
        lruvec = mem_cgroup_page_lruvec(page, pgdat);
        /* add the page to the LRU list */
        (*move_fn)(page, lruvec, arg);
    }
    if (pgdat)
        spin_unlock_irqrestore(&pgdat->lru_lock, flags);
    release_pages(pvec->pages, pvec->nr);
    pagevec_reinit(pvec);
}

The move_fn callback calls add_page_to_lru_list to put the page on the LRU, and update_lru_size updates the LRU size of both the node and the memcg:

static __always_inline void add_page_to_lru_list(struct page *page,
                struct lruvec *lruvec, enum lru_list lru)
{
    update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
    list_add(&page->lru, &lruvec->lists[lru]);
}

update_lru_size ends up calling __update_lru_size to update the global per-node LRU counters, and mem_cgroup_update_lru_size to update the memcg's LRU size:

static __always_inline void update_lru_size(struct lruvec *lruvec,
                enum lru_list lru, enum zone_type zid,
                int nr_pages)
{       
    __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
    mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif  
}      
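
The memcg LRU sizes maintained by mem_cgroup_update_lru_size surface in memory.stat as the inactive_anon/active_anon/inactive_file/active_file/unevictable lines. A small reader sketch under the same assumptions as before (cgroup v1, group "demo"):

#include <stdio.h>
#include <string.h>

int main(void)
{
    static const char *lru_keys[] = {
        "inactive_anon", "active_anon",
        "inactive_file", "active_file", "unevictable",
    };
    FILE *f = fopen("/sys/fs/cgroup/memory/demo/memory.stat", "r");
    char key[64];
    unsigned long long val;
    size_t i;

    if (!f)
        return 1;
    while (fscanf(f, "%63s %llu", key, &val) == 2)
        for (i = 0; i < sizeof(lru_keys) / sizeof(lru_keys[0]); i++)
            if (!strcmp(key, lru_keys[i]))
                printf("%-14s %llu bytes\n", key, val);
    fclose(f);
    return 0;
}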

4. Memory uncharging (uncharge)

When a process exits, memory is unmapped, page cache is reclaimed, or a task is migrated to another cgroup, the usage, stat, and LRU accounting of the source cgroup must be uncharged.

Anonymous memory uncharge

Taking the uncharge flow at process exit as an example: after unmap_vmas tears down a range of virtual addresses, release_pages is called to free the pages and eventually reaches uncharge_page:

 uncharge_page+0x0/0x1f0 [kernel]
 mem_cgroup_uncharge_list+0x59/0x80 [kernel]
 release_pages+0x17b/0x450 [kernel]
 tlb_flush_mmu_free+0x36/0x50 [kernel]
 zap_pte_range+0x590/0x7b0 [kernel]
 unmap_page_range+0x32f/0x4e0 [kernel]
 unmap_vmas+0x42/0x90 [kernel]
 exit_mmap+0xa2/0x170 [kernel]
 mmput+0x53/0x120 [kernel]
 do_exit+0x26e/0xc60 [kernel]
 do_group_exit+0x39/0xa0 [kernel]
 __x64_sys_exit_group+0x14/0x20 [kernel]
 do_syscall_64+0x5b/0x1b0 [kernel]
 entry_SYSCALL_64_after_hwframe+0x44/0xa9 

uncharge_page eventually calls uncharge_batch, which undoes the usage, stat, and event accounting:

static void uncharge_batch(const struct uncharge_gather *ug)
{
    unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
    unsigned long flags;

    if (!mem_cgroup_is_root(ug->memcg)) {
        /* decrease the memory and memsw page counters */
        page_counter_uncharge(&ug->memcg->memory, nr_pages);
        if (do_memsw_account())
            page_counter_uncharge(&ug->memcg->memsw, nr_pages);
    }

    local_irq_save(flags);
    /* decrease the per-type stat counters and count the events */
    __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
    __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
    __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
    __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
    __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
    __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
    local_irq_restore(flags);
}
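
The uncharge path can be observed the same way as charging. The sketch below (same assumptions: cgroup v1 at /sys/fs/cgroup/memory, existing group "demo", run as root) maps and touches anonymous memory, then munmaps it; memory.usage_in_bytes rises on the fault path and falls again when release_pages/uncharge_page run.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define CG "/sys/fs/cgroup/memory/demo"

static long usage(void)
{
    long v = -1;
    FILE *f = fopen(CG "/memory.usage_in_bytes", "r");

    if (f) {
        fscanf(f, "%ld", &v);
        fclose(f);
    }
    return v;
}

int main(void)
{
    size_t sz = 64UL << 20;                    /* 64 MiB */
    FILE *procs = fopen(CG "/cgroup.procs", "w");
    char *p;

    if (!procs)
        return 1;
    fprintf(procs, "%d\n", getpid());          /* join the memcg */
    fclose(procs);

    p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;
    memset(p, 1, sz);                          /* charge ~64 MiB of anon pages */
    printf("after charge:   %ld\n", usage());

    munmap(p, sz);                             /* uncharge via release_pages() */
    printf("after uncharge: %ld\n", usage());
    return 0;
}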

File page cache

File cache falls into two categories:

One is cache produced by direct read/write, with no virtual address mapping; it is uncharged directly when the memory is reclaimed:

shrink_page_list->uncharge_page

The other is files mapped via mmap:

If only one process has the file mapped, the virtual addresses are unmapped when that process exits; the cache counter in memory.stat stays non-zero, but mapped_file drops to 0.

If several processes have the file mapped and one of them exits, the cache stays charged to the current memcg, and both cache and mapped_file remain non-zero.
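
This cache vs. mapped_file behaviour can be checked with a small sketch (assumptions: cgroup v1 group "demo", a pre-existing test file at /var/tmp/memcg-test-file, run as root): after mmap plus touching the pages, both cache and mapped_file grow; after munmap, mapped_file drops while the cache charge stays until the pages are reclaimed.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#define CG "/sys/fs/cgroup/memory/demo"

static void show(const char *when)
{
    FILE *f = fopen(CG "/memory.stat", "r");
    char key[64];
    unsigned long long val;

    if (!f)
        return;
    while (fscanf(f, "%63s %llu", key, &val) == 2)
        if (!strcmp(key, "cache") || !strcmp(key, "mapped_file"))
            printf("%-9s %-11s %llu\n", when, key, val);
    fclose(f);
}

int main(void)
{
    FILE *procs = fopen(CG "/cgroup.procs", "w");
    int fd = open("/var/tmp/memcg-test-file", O_RDONLY);
    struct stat st;
    volatile char sum = 0;
    char *p;
    off_t i;

    if (!procs || fd < 0 || fstat(fd, &st))
        return 1;
    fprintf(procs, "%d\n", getpid());
    fclose(procs);

    p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
        return 1;
    for (i = 0; i < st.st_size; i += 4096)
        sum += p[i];                 /* fault in file pages -> cache + mapped_file */
    show("mapped");

    munmap(p, st.st_size);           /* mapped_file drops, cache charge remains */
    show("unmapped");
    close(fd);
    return 0;
}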

5. Common problems

Inaccurate memcg usage / memory.stat:

1. A task exits without freeing shared memory.

2. move_charge_at_immigrate was not set when tasks were migrated, so memory stays charged to the source memcg.

3. Tasks in different cgroups mmap the same file/memory; the kernel charges it to the memcg that mapped it first.

4. A parent's usage is larger than the sum of all its children's usage; some child memcgs may have been deleted but are stuck in the DYING state.

3. Soft limit reclaim

When a memcg's usage exceeds its soft_limit, its per-node entries are inserted into the global soft_limit_tree red-black tree; when soft limit reclaim is triggered, memory is reclaimed from the memcgs on that tree in order, starting with the one that exceeds its soft limit the most.

The reclaim logic of mem_cgroup_soft_limit_reclaim (a schematic model of the per-memcg loop follows this list):

1. If order is greater than 0, soft limit reclaim is not attempted.

2. If reclaiming from the largest-excess memcg yields nothing twice, the loop exits.

3. As soon as memory is reclaimed from the largest-excess memcg, the loop exits.

4. While reclaiming from the largest-excess memcg, if its usage drops back below the soft limit, reclaim of that memcg stops.

5. While reclaiming from the largest-excess memcg, once the hierarchy iterator has failed to pick a victim memcg twice:

1. If the memory reclaimed so far is at least excess/4 (excess >> 2), stop reclaiming from this memcg.

2. After 100 iterations in total, stop reclaiming from this memcg.

3. If no memory has been reclaimed at all, stop reclaiming from this memcg.
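
The termination conditions above can be summarized as a schematic user-space model. The numbers (two iterator wrap-arounds, excess/4, 100 loops) follow the description in the list; shrink_one_child is a made-up stand-in for a real shrink pass, and this is not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_LOOPS 100

/* hypothetical helper: how many pages one shrink pass reclaims */
static unsigned long shrink_one_child(int pass) { return pass < 3 ? 0 : 16; }

static unsigned long soft_reclaim_model(unsigned long excess)
{
    unsigned long total = 0;
    int loop = 0, pass = 0;

    while (1) {
        /* every fourth pass models the hierarchy iterator wrapping around
         * without finding a victim, which bumps "loop" */
        bool wrapped = (++pass % 4 == 0);

        if (wrapped) {
            loop++;
            if (loop >= 2) {
                if (!total)                    /* nothing reclaimable at all */
                    break;
                if (total >= excess / 4 ||     /* reclaimed "enough" */
                    loop > MAX_RECLAIM_LOOPS)  /* give up after many rounds */
                    break;
            }
            continue;
        }
        total += shrink_one_child(pass);
        if (total >= excess)                   /* usage back under the soft limit */
            break;
    }
    return total;
}

int main(void)
{
    printf("reclaimed %lu pages\n", soft_reclaim_model(256));
    return 0;
}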

4. memcg memory reclaim

During memcg memory reclaim the kernel recursively reclaims from the child cgroups, and mem_cgroup_protected comes into play: if a memcg's usage is below memory.min, the memcg is not reclaimed at all; if usage is below memory.low, it is reclaimed only when nothing else can be; above memory.low it is a normal reclaim target.

The effective protection emin is computed as

emin = min(emin, parent_emin * min_usage / siblings_min_usage)

where:

emin starts as min(memcg->memory.min, parent_emin);

parent_emin is the parent memcg's effective min;

min_usage = min(usage, memcg->memory.min) is this memcg's protected usage;

siblings_min_usage is the sum of min(usage, memory.min) over all of the parent's children.

If we need to protect a cgroup from memory reclaim, we can set its memory.min higher, so the cgroup will not be picked as a reclaim target; a worked example of the formula is sketched below.
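
A worked example of the effective-min calculation (all values are made up; min_ul stands in for the kernel's min()):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

/* emin = min(memory.min, parent_emin), then scaled by this memcg's share
 * of the siblings' protected usage, as described above */
static unsigned long effective_min(unsigned long memory_min,
                                   unsigned long usage,
                                   unsigned long parent_emin,
                                   unsigned long siblings_min_usage)
{
    unsigned long emin = min_ul(memory_min, parent_emin);
    unsigned long min_usage = min_ul(usage, memory_min);

    if (emin && parent_emin && min_usage && siblings_min_usage)
        emin = min_ul(emin, parent_emin * min_usage / siblings_min_usage);
    return emin;
}

int main(void)
{
    /* parent_emin = 512 MiB of protection shared by children whose
     * min(usage, memory.min) sums to 768 MiB; this child contributes 256 MiB */
    unsigned long mb = 1UL << 20;
    unsigned long emin = effective_min(512 * mb, 256 * mb, 512 * mb, 768 * mb);

    printf("effective min: %lu MiB\n", emin / mb);  /* 512 * 256 / 768 = 170 MiB */
    return 0;
}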

 
