memcg是容器底层技术基石之一,实现了内存资源的隔离与限制功能。在云原生场景下,经常出现memcg内存统计不准确,memcg内存回收,以及memcg oom等问题,这里主要分析memcg的计数(charge/uncharge),进程迁移,softlimit reclaim,memcg reclaim的内核实现。
1.memcg主要结构体
struct mem_cgroup 表示一个memory cgroup(注意内核中结构体名为 mem_cgroup,而非 memory_cgroup)
/*
 * One memory cgroup: owns the usage counters, statistics and per-node LRU
 * lists for every page charged to it.
 */
struct mem_cgroup {
	/* embeds this memcg into the cgroup hierarchy */
	struct cgroup_subsys_state css;

	/* memory usage counter (what memory.usage_in_bytes reports) */
	struct page_counter memory;
	struct page_counter swap;
	/* combined page + swap usage counter */
	struct page_counter memsw;

	/* whether charges follow a task migrated into this cgroup */
	unsigned long move_charge_at_immigrate;	/* taken only while moving_account > 0 */

	/* per-cpu and aggregated memory statistics / event counters */
	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
	atomic_long_t stat[MEMCG_NR_STAT];
	atomic_long_t events[NR_VM_EVENT_ITEMS];
	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];

	/* per-node LRU lists for pages charged to this cgroup */
	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};
memory_cgrp_subsys实现了memcg操作集
css_alloc: 创建cgroup时调用
can_attach: 进程迁移到cgroup时,会先检查进程内存charge到cgroup后是否超过limit。
/*
 * Operation table of the memory controller, invoked by the cgroup core on
 * cgroup creation/destruction and task attach.
 */
struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,	/* called when a cgroup is created */
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	/* checks the destination limit before a task is attached */
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	.dfl_cftypes = memory_files,		/* cgroup v2 control files */
	.legacy_cftypes = mem_cgroup_legacy_files,	/* cgroup v1 control files */
	.early_init = 0,
};
2 内存记账(charge)
进程分配内存会计数到所属的cgroup,内核线程都会记账到根组memcg。系统在缺页处理,文件读写,进程迁移时会调用try_charge进行记账。
1.usage记账
try_charge主要计算cgroup内存usage,以及触发内存回收和oom killer,常见的调用charge流程
try_charge+0x0/0x100 [kernel] mem_cgroup_try_charge+0x0/0x250 [kernel] do_anonymous_page+0x267/0x610 [kernel] __handle_mm_fault+0x673/0xaa0 [kernel] handle_mm_fault+0x10d/0x200 [kernel] __do_page_fault+0x1c3/0x4e0 [kernel] do_page_fault+0x32/0x140 [kernel] async_page_fault+0x1e/0x30 [kernel]
try_charge主要功能
1.计数memory page counter和memsw page counter
2.检查内存usage是否超过limit,调用try_to_free_mem_cgroup_pages进行cgroup内存回收
3.usage超过limit,且无法回收足够的内存时,调用mem_cgroup_oom触发cgroup级别的oom kill
/*
 * Charge nr_pages to @memcg, reclaiming and finally OOM-killing if the
 * cgroup is over its limit.
 *
 * NOTE(review): abridged excerpt of the kernel function — the declarations
 * of batch, counter, mem_over_limit, may_swap, nr_reclaimed and oom_status,
 * as well as several retry/error paths, are omitted here.
 */
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	/* the root memcg cannot be limited, so return right away */
	if (mem_cgroup_is_root(memcg))
		return 0;
retry:
	/* fast path: consume pre-charged pages from the per-cpu stock */
	if (consume_stock(memcg, nr_pages))
		return 0;
	if (!do_memsw_account() ||
	    /* charge memory + swap first */
	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		/* charge memory, i.e. what memory.usage_in_bytes reports */
		if (page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		/* memory charge failed: roll the memsw charge back */
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		may_swap = false;	/* memsw is full, swapping cannot help */
	}
	/* usage exceeded the limit: trigger cgroup-level reclaim */
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);
	/* re-check the headroom after reclaim */
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;
	/* not enough memory could be reclaimed: invoke the OOM killer */
	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
				    get_order(nr_pages * PAGE_SIZE));
}
2 memcg stat记账
把memcg 使用的内存按照RSS, CACHE,SHMEM和RSS_HUGE等分类来统计。
mem_cgroup_commit_charge主要做了
1.关联page和memcg
2.记账memcg内存type使用(rss,cache....)
3.记账memcg的event,特别是当usage超过softlimit时,会把memcg链接到mem_cgroup_tree_per_node,以供后续softlimit 内存回收。
/*
 * Second half of a charge: bind the page to its memcg and update the
 * per-type statistics and event counters.
 */
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      bool lrucare, bool compound)
{
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

	/* associate the page with its memcg */
	commit_charge(page, memcg, lrucare);

	local_irq_disable();
	/* account the page under the proper type (rss/cache/...) */
	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
	/* update event counters; may queue the memcg on the soft-limit tree */
	memcg_check_events(memcg, page);
	local_irq_enable();
}
当usage超过softlimit时,最终会调用mem_cgroup_update_tree把memcg插入全局的mem_cgroup_tree_per_node红黑树。
/*
 * Link @memcg (and any over-limit ancestors) into the global per-node
 * soft-limit RB-tree, keyed by how far usage exceeds the soft limit.
 */
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	/* look up the per-node soft-limit tree from the page's node id */
	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/* walk up the hierarchy: ancestors may exceed their limits too */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		/* this memcg's per-node info for the page's node */
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		/* how far usage is above the soft limit (0 if below) */
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* (re)insert into the global per-node RB-tree */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}
3 memcg lru记账
系统用pglist_data来描述一个node的所有内存信息,当系统没有使能cgroup时,分配出去的内存会链接到全局pglist_data的lruvec对应链表。
/* Index of each LRU list inside a lruvec (anon/file x active/inactive). */
enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
	LRU_UNEVICTABLE,
	NR_LRU_LISTS
};
当使能cgroup后,分配的page都会链接到memcg的per node lruvec链表上。
/*
 * Per-node state of a memcg: its LRU lists, per-LRU statistics and the
 * hook into the global soft-limit RB-tree.
 */
struct mem_cgroup_per_node {
	/* LRU lists for pages charged to this memcg on this node */
	struct lruvec lruvec;

	/* per-LRU-type statistics (per-cpu and aggregated) */
	struct lruvec_stat __percpu *lruvec_stat_cpu;
	atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
	/* LRU sizes broken down by zone */
	unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

	/* reclaim iterators, one per reclaim priority level */
	struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];

	/*
	 * When usage exceeds the soft limit, this node is linked into the
	 * global mem_cgroup_tree_per_node RB-tree for soft-limit reclaim.
	 */
	struct rb_node tree_node;	/* RB tree node */
	/* amount by which usage exceeds the soft limit */
	unsigned long usage_in_excess;	/* Set to the value by which */
					/* the soft limit is exceeded */
	bool on_tree;
	bool congested;		/* memcg has many dirty pages */
				/* backed by a congested BDI */

	/* owning memory cgroup */
	struct mem_cgroup *memcg;	/* Back pointer, we cannot */
					/* use container_of */
};
lru记账的时机还是在缺页处理do_anonymous_page->lru_cache_add_active_or_unevictable
/*
 * Apply @move_fn to every page in the pagevec, resolving each page's
 * lruvec (memcg per-node or global) first.
 *
 * NOTE(review): abridged excerpt — the kernel function switches
 * pgdat->lru_lock when a page belongs to a different node
 * (pagepgdat != pgdat); that path is omitted here, which is why
 * pagepgdat is computed but unused in this excerpt.
 */
static void pagevec_lru_move_fn(struct pagevec *pvec,
				void (*move_fn)(struct page *page,
						struct lruvec *lruvec, void *arg),
				void *arg)
{
	int i;
	struct pglist_data *pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		/* find the lruvec this page is accounted on */
		lruvec = mem_cgroup_page_lruvec(page, pgdat);
		/* link the page onto the target LRU list */
		(*move_fn)(page, lruvec, arg);
	}
	if (pgdat)
		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
	/* drop the pagevec's page references */
	release_pages(pvec->pages, pvec->nr);
	pagevec_reinit(pvec);
}
其中move_fn会调用add_page_to_lru_list把page添加到lru,而update_lru_size会更新对应的node和memcg的lru size
/* Add a page to the given LRU list, updating the LRU size accounting first. */
static __always_inline void add_page_to_lru_list(struct page *page,
						 struct lruvec *lruvec,
						 enum lru_list lru)
{
	/* update node- and memcg-level LRU sizes before linking the page */
	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
	list_add(&page->lru, &lruvec->lists[lru]);
}
update_lru_size最终会调用__update_lru_size更新全局的node lru信息和memcg的lru大小
/* Update both the global per-node and (if enabled) the memcg LRU sizes. */
static __always_inline void update_lru_size(struct lruvec *lruvec,
					    enum lru_list lru,
					    enum zone_type zid,
					    int nr_pages)
{
	/* global per-node LRU accounting */
	__update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
	/* memcg per-zone LRU accounting */
	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}
4 内存销账(uncharge)
进程退出,unmap内存,cache被回收或迁移到其他cgroup时,需要对源cgroup的usage,stat,lru进行销账。
anon uncharge
以进程退出时的uncharge流程来分析,通过unmap_vmas去unmap一段虚拟地址后,会调用release_pages函数释放page,最终调用uncharge_page
uncharge_page+0x0/0x1f0 [kernel] mem_cgroup_uncharge_list+0x59/0x80 [kernel] release_pages+0x17b/0x450 [kernel] tlb_flush_mmu_free+0x36/0x50 [kernel] zap_pte_range+0x590/0x7b0 [kernel] unmap_page_range+0x32f/0x4e0 [kernel] unmap_vmas+0x42/0x90 [kernel] exit_mmap+0xa2/0x170 [kernel] mmput+0x53/0x120 [kernel] do_exit+0x26e/0xc60 [kernel] do_group_exit+0x39/0xa0 [kernel] __x64_sys_exit_group+0x14/0x20 [kernel] do_syscall_64+0x5b/0x1b0 [kernel] entry_SYSCALL_64_after_hwframe+0x44/0xa9
uncharge_page最终调用uncharge_batch函数对usage,stat,lru统计信息进行uncharge
/*
 * Uncharge a gathered batch of pages: undo the page_counter charges and
 * decrement the per-type statistics.
 *
 * NOTE(review): abridged excerpt — the kernel function also calls
 * memcg_check_events() and local_irq_restore(); those lines are omitted
 * here.
 */
static void uncharge_batch(const struct uncharge_gather *ug)
{
	unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
	unsigned long flags;

	if (!mem_cgroup_is_root(ug->memcg)) {
		/* undo the memory (and memsw) page_counter charges */
		page_counter_uncharge(&ug->memcg->memory, nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
	}

	local_irq_save(flags);
	/* decrement the per-type statistics */
	__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
	__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
	__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
}
文件pagecache
文件cache分两种:
一种是直接读写的cache,没有虚拟地址映射,在内存回收时直接uncharge
shrink_page_list->uncharge_page
mmap方式映射的文件
如果只有一个进程映射了文件,进程退出时会unmap对应的虚拟地址,memory.stat的cache计数不为0,但mapped_file为0。
如果有多个进程映射了文件,一个进程退出后,对应的cache继续计数到当前memcg,此时cache和mapped_file都不为0。
5 常见问题
memcg usage/memory.stat统计不准
1.task退出时没有释放共享内存
2.task迁移时没有设置move_charge_at_immigrate,导致内存记账在源memcg
3.不同cgroup的task mmap同一段文件/内存,内核记账到第一次映射的memcg
4.parent usage大于所有子Cgroup的usage,有可能有子memcg已经被删除,但一直处于DYING状态。
3 softlimit 回收
当一个memcg的usage大于soft_limit时,系统会把memcg的per node结构加入到全局的soft_limit_tree红黑树;这样当系统出现内存压力触发回收时,会从soft_limit_tree中依次挑选超限最多的memcg回收其内存
mem_cgroup_soft_limit_reclaim回收逻辑
1.order大于0时,不触发soft_limit reclaim
2.当两次从largest memcg回收不到内存时,直接退出循环
3.当从largest memcg回收到了内存时,直接退出循环
4.当对largest memcg回收时,如果usage小于soft limit了,直接退出循环
5.当对largest memcg回收时,如果累计2次挑选memcg失败:
1.累计回收内存 > excess/2时,退出回收largest memcg
2.累计循环了100次,退出回收largest memcg
3.累计2次没有回收到内存,退出回收largest memcg
4 memcg内存回收
memcg内存回收时,会递归回收子cgroup的内存,其中涉及到mem_cgroup_protected:如果memcg的usage小于memory.min,则不会回收此memcg;usage小于memory.low时,只有在其他memcg无法回收出足够内存的情况下才会回收它;usage大于memory.low时则是正常的回收对象。
emin(有效min保护值)的计算:先取 emin = memcg->memory.min,再受父memcg约束:emin = min(emin, parent_emin);当父memcg的保护需要在子memcg之间按比例分摊时,进一步取 emin = min(emin, parent_emin * min_usage / siblings_min_usage)。
其中:
parent_emin是父memcg的emin;min_usage = min(usage, memcg->memory.min),表示该memcg受min保护的usage;
而siblings_min_usage是父memcg下所有子memcg的min_usage累加值(即parent->memory.children_min_usage)。
如果我们需要保护某个cgroup不被内存回收,可以把memory.min设置得高点,这样就不会选择这个cgroup