// 每处理器记账缓存一次从内存控制组批量申请32页, 然后把内存控制组的内存使用量加上32页#defineCHARGE_BATCH32U// 在内存控制组记账(charge)时, 先查看当前处理器的memcg_stock_pcp// 如果memcg_stock_pcp保存的内存控制组(memcg_stock_pcp->cached)正是准备记账的内存控制组// 并且预留页数(memcg_stock_pcp->nr_pages)大于或等于准备记账的页数, 则将预留页数减去准备记账的页数structmemcg_stock_pcp{// 内存控制组(非根控制组)structmem_cgroup*cached;/* this never be root cgroup */// 预留页数unsignedint nr_pages;// 工作任务structwork_struct work;unsignedlong flags;#defineFLUSHING_CACHED_CHARGE0};// 为减少处理器之间的竞争, 提高内存记账效率, 定义一个每处理器记账缓存staticDEFINE_PER_CPU(structmemcg_stock_pcp, memcg_stock);staticDEFINE_MUTEX(percpu_charge_mutex);
2. mem_cgroup_try_charge
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
/* non-zero when memory+swap accounting is enabled */
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/*
 * mem_cgroup_try_charge - try to charge one page to a memory cgroup.
 * @page:     the page to charge
 * @mm:       mm of the task allocating the page (used to find its memcg)
 * @gfp_mask: allocation mask the page was allocated with
 * @memcgp:   output; the memcg the page was charged to
 * @compound: charge as a compound (huge) page rather than a single page
 *
 * Returns 0 on success, or a negative errno from try_charge().
 */
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
			  bool compound)
{
	struct mem_cgroup *memcg = NULL;
	/* compound pages are charged for all their tail pages at once */
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
	int ret = 0;

	/* memcg disabled: nothing to account, report success */
	if (mem_cgroup_disabled())
		goto out;

	/* page sits in the swap cache */
	if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The USED bit is protected by
		 * the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
		 */
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		/* already charged (page->mem_cgroup set): nothing to do */
		if (compound_head(page)->mem_cgroup)
			goto out;

		if (do_swap_account) {
			/* swap entry is stashed in page->private */
			swp_entry_t ent = { .val = page_private(page), };
			/* map the swap entry back to the memcg id recorded
			 * at swap-out time */
			unsigned short id = lookup_swap_cgroup_id(ent);

			rcu_read_lock();
			/* look the memcg up by id; take a css reference if
			 * it is still online, otherwise fall through to the
			 * mm-based lookup below */
			memcg = mem_cgroup_from_id(id);
			if (memcg && !css_tryget_online(&memcg->css))
				memcg = NULL;
			rcu_read_unlock();
		}
	}

	/* no memcg found via swap: use the one @mm's task belongs to
	 * (get_mem_cgroup_from_mm takes a reference) */
	if (!memcg)
		memcg = get_mem_cgroup_from_mm(mm);

	/* the actual charging work happens in try_charge() */
	ret = try_charge(memcg, gfp_mask, nr_pages);

	/* drop the lookup reference; try_charge pinned the memcg itself
	 * on success */
	css_put(&memcg->css);
out:
	/* tell the caller which memcg (if any) the page was charged to */
	*memcgp = memcg;
	return ret;
}
2.1 try_charge
/*
 * try_charge - charge @nr_pages to @memcg, reclaiming and retrying on
 * failure.
 *
 * Charges in batches of CHARGE_BATCH where possible and parks the surplus
 * in the per-CPU stock.  Returns 0 on success (including forced charges),
 * -ENOMEM when the charge cannot be satisfied and the allocation may fail.
 */
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	/* charge the larger of CHARGE_BATCH and the request, so the
	 * surplus can refill the per-CPU stock */
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	/* bounded number of reclaim retries before declaring OOM */
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	/* the memcg whose (hard) limit was hit, target of reclaim */
	struct mem_cgroup *mem_over_limit;
	/* the page counter that rejected the charge */
	struct page_counter *counter;
	/* pages freed by the last reclaim pass */
	unsigned long nr_reclaimed;
	bool may_swap = true;
	bool drained = false;

	/* the root cgroup has no limit, no accounting needed */
	if (mem_cgroup_is_root(memcg))
		return 0;
retry:
	/*
	 * If this CPU's stock holds enough pages pre-charged to @memcg,
	 * consume them and return without touching the page counters.
	 */
	if (consume_stock(memcg, nr_pages))
		return 0;

	/*
	 * Without memsw accounting: charge memcg->memory directly.
	 * With memsw accounting: charge memcg->memsw first, then
	 * memcg->memory on top of it.
	 */
	if (!do_memsw_account() ||
	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		/* neither this memcg nor any hierarchical ancestor is over
		 * its memory limit — success */
		if (page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		/* memory charge failed: undo the memsw charge made above */
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, batch);
		/* remember which memcg exceeded its memory limit */
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		/* memory+swap limit hit: swapping out cannot help, so
		 * reclaim below must not swap */
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		may_swap = false;
	}

	/* the batch overshoot failed; retry with just the pages we need */
	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Unlike in global OOM situations, memcg is not in a physical
	 * memory shortage.  Allow dying and OOM-killed tasks to
	 * bypass the last charges so that they can exit quickly and
	 * free their memory.
	 */
	if (unlikely(tsk_is_oom_victim(current) ||
		     fatal_signal_pending(current) ||
		     current->flags & PF_EXITING))
		goto force;

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory. This might exceed the limits temporarily,
	 * but we prefer facilitating memory reclaim and getting back
	 * under the limit over triggering OOM kills in these cases.
	 */
	if (unlikely(current->flags & PF_MEMALLOC))
		goto force;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	/* caller cannot block: no reclaim possible, fail the charge */
	if (!gfpflags_allow_blocking(gfp_mask))
		goto nomem;

	/* record the limit-hit event in the per-cpu statistics */
	mem_cgroup_event(mem_over_limit, MEMCG_MAX);

	/* reclaim from the memcg that is over its limit; returns the
	 * number of pages freed */
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);

	/* enough headroom opened up under the limit — retry the charge */
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	/* once per call: return all per-CPU stock reserves to the
	 * over-limit hierarchy, then retry */
	if (!drained) {
		drain_all_stock(mem_over_limit);
		drained = true;
		goto retry;
	}

	/* caller asked not to retry on failure */
	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	/* retry up to MEM_CGROUP_RECLAIM_RETRIES times in total */
	if (nr_retries--)
		goto retry;

	/* allocation must not fail: force the charge over the limit */
	if (gfp_mask & __GFP_NOFAIL)
		goto force;

	if (fatal_signal_pending(current))
		goto force;

	/* record the OOM event in the per-cpu statistics */
	mem_cgroup_event(mem_over_limit, MEMCG_OOM);

	/* mark the task as being in memcg OOM for the over-limit group */
	mem_cgroup_oom(mem_over_limit, gfp_mask,
		       get_order(nr_pages * PAGE_SIZE));
nomem:
	/* allocation is allowed to fail — report -ENOMEM */
	if (!(gfp_mask & __GFP_NOFAIL))
		return -ENOMEM;
force:
	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed very soon.  Allow memory usage go over the limit
	 * temporarily by force charging it.
	 */
	page_counter_charge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_charge(&memcg->memsw, nr_pages);
	/* pin the memcg once per charged page */
	css_get_many(&memcg->css, nr_pages);

	return 0;

done_restock:
	/* pin the memcg once per page actually charged (the whole batch) */
	css_get_many(&memcg->css, batch);
	/* park the batch surplus in this CPU's stock for future charges */
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);

	/*
	 * If the hierarchy is above the normal consumption range, schedule
	 * reclaim on returning to userland.  We can perform reclaim here
	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
	 * not recorded as it most likely matches current's and won't
	 * change in the meantime.  As high limit is checked again before
	 * reclaim, the cost of mismatch is negligible.
	 */
	do {
		if (page_counter_read(&memcg->memory) > memcg->high) {
			/* Don't bother a random interrupted task */
			if (in_interrupt()) {
				schedule_work(&memcg->high_work);
				break;
			}
			current->memcg_nr_pages_over_high += batch;
			set_notify_resume(current);
			break;
		}
	} while ((memcg = parent_mem_cgroup(memcg)));

	return 0;
}
/*
 * page_counter_try_charge - try to hierarchically charge pages.
 * @counter:  the page counter to charge (one of the mem_cgroup counters:
 *            memory, memsw, kmem, tcpmem or swap)
 * @nr_pages: number of pages to charge
 * @fail:     output; on failure, points to the counter that hit its limit
 *
 * Returns true on success (all counters up to the hierarchy root charged),
 * false if any counter in the chain would exceed its hard limit.
 */
bool page_counter_try_charge(struct page_counter *counter,
			     unsigned long nr_pages,
			     struct page_counter **fail)
{
	struct page_counter *c;

	/* walk this counter and every hierarchical parent counter */
	for (c = counter; c; c = c->parent) {
		long new;

		/* optimistically add the charge; @new is the resulting
		 * usage of this counter */
		new = atomic_long_add_return(nr_pages, &c->count);
		if (new > c->limit) {
			/* hard limit exceeded: back the charge out again */
			atomic_long_sub(nr_pages, &c->count);
			/*
			 * This is racy, but we can live with some
			 * inaccuracy in the failcnt.
			 */
			/* count the limit hit and report which counter
			 * failed */
			c->failcnt++;
			*fail = c;
			goto failed;
		}
		/*
		 * Just like with failcnt, we can live with some
		 * inaccuracy in the watermark.
		 */
		/* track the historical maximum usage */
		if (new > c->watermark)
			c->watermark = new;
	}
	return true;

failed:
	/* undo the charges applied to the ancestors below the one that
	 * rejected the charge */
	for (c = counter; c != *fail; c = c->parent)
		page_counter_cancel(c, nr_pages);

	return false;
}
/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid adding running more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/*
	 * Notify other cpus that system-wide "drain" is running
	 * We do not care about races with the cpu hotplug because cpu down
	 * as well as workers from this path always operate on the local
	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
	 */
	curcpu = get_cpu();
	for_each_online_cpu(cpu) {
		/* this CPU's charge cache */
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;

		/* memcg the cache's reserve belongs to */
		memcg = stock->cached;
		/* skip empty caches, and caches whose memcg reference
		 * cannot be taken (css_tryget failed) */
		if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
			continue;
		/* only drain stocks belonging to @root_memcg's subtree */
		if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
			css_put(&memcg->css);
			continue;
		}
		/* claim the stock for flushing; skip if a flush is
		 * already in flight */
		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			/* drain the local CPU directly; remote CPUs are
			 * drained via their own workqueue worker */
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
		css_put(&memcg->css);
	}
	put_cpu();
	mutex_unlock(&percpu_charge_mutex);
}
2.5.1 drain_local_stock
/*
 * drain_local_stock - workqueue callback that returns this CPU's stock
 * reserve to its memcg.  @dummy is the embedded work_struct (unused).
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;

	/*
	 * The only protection from memory hotplug vs. drain_stock races is
	 * that we always operate on local CPU stock here with IRQ disabled
	 */
	local_irq_save(flags);

	/* this CPU's charge cache */
	stock = this_cpu_ptr(&memcg_stock);
	/* return the cached reserve to its memcg and drop the reference */
	drain_stock(stock);
	/* allow drain_all_stock() to schedule another flush */
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

	local_irq_restore(flags);
}
/*
 * mem_cgroup_commit_charge - commit a previously successful try_charge.
 * @page:     the page being charged
 * @memcg:    the memcg the page was charged to
 * @lrucare:  the page may already be on an LRU list
 * @compound: the page is a compound (huge) page
 */
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      bool lrucare, bool compound)
{
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

	VM_BUG_ON_PAGE(!page->mapping, page);
	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	/* bind the page to @memcg (and fix up LRU placement if needed) */
	commit_charge(page, memcg, lrucare);

	/* statistics and event checks must run with IRQs off */
	local_irq_disable();
	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
	memcg_check_events(memcg, page);
	local_irq_enable();

	if (do_memsw_account() && PageSwapCache(page)) {
		swp_entry_t entry = { .val = page_private(page) };
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it.  The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry, nr_pages);
	}
}
3.1 commit_charge
/*
 * commit_charge - point @page at @memcg, optionally migrating it between
 * LRU lists.  With @lrucare the page is isolated from its current LRU
 * before the memcg pointer is switched, and re-added afterwards.
 */
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
			  bool lrucare)
{
	int isolated;

	VM_BUG_ON_PAGE(page->mem_cgroup, page);
	/*
	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
	 * may already be on some other mem_cgroup's LRU. Take care of it.
	 */
	/* take the page off its current LRU list before re-binding it */
	if (lrucare)
		lock_page_lru(page, &isolated);
	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point:
	 *
	 * - the page is uncharged
	 *
	 * - the page is off-LRU
	 *
	 * - an anonymous fault has exclusive page access, except for
	 *   a locked page table
	 *
	 * - a page cache insertion, a swapin fault, or a migration
	 *   have the page locked
	 */
	/* the page now belongs to @memcg */
	page->mem_cgroup = memcg;

	/* put the page back, now on @memcg's LRU list */
	if (lrucare)
		unlock_page_lru(page, isolated);
}
/*
 * mem_cgroup_charge_statistics - update @memcg's per-cpu page statistics
 * after charging (@nr_pages > 0) or uncharging (@nr_pages < 0) @page.
 * Caller must have IRQs disabled (see mem_cgroup_commit_charge).
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		/* anonymous page count */
		__this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
	else {
		/* page cache count */
		__this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
		/* shmem/tmpfs pages are additionally counted as shared */
		if (PageSwapBacked(page))
			__this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
	}

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		/* page entered the cgroup */
		__this_cpu_inc(memcg->stat->events[PGPGIN]);
	else {
		/* page left the cgroup */
		__this_cpu_inc(memcg->stat->events[PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
4. mem_cgroup_cancel_charge
/*
 * mem_cgroup_cancel_charge - undo a successful try_charge when the page
 * ends up not being used (the commit never happens).
 * @page:     the page whose charge is being cancelled
 * @memcg:    the memcg it was charged to (may be NULL, see below)
 * @compound: whether the charge was taken as a compound page
 */
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
			      bool compound)
{
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	/* uncharge the page counters and drop the css references */
	cancel_charge(memcg, nr_pages);
}