structmem_cgroup{// 所有资源控制器的基类structcgroup_subsys_state css;/* Private memcg ID. Used to ID objects that outlive the cgroup */structmem_cgroup_id id;/* Accounted resources */// _MEM类型的内存计数器: 记录内存的限制和当前使用量[见5.1节]structpage_counter memory;structpage_counter swap;/* Legacy consumer-oriented counters */// _MEMSWAP类型的内存计数器: 记录内存+交换分区的限制和当前使用量structpage_counter memsw;// _KMEM类型的内核内存计数器: 记录内核内存的限制和当前使用量structpage_counter kmem;// _TCP类型的tcp缓冲区计数器: 记录tcp缓冲区的限制和当前使用量structpage_counter tcpmem;/* Normal memory consumption range */// 内存使用低界限unsignedlong low;// 内存使用高界限unsignedlong high;/* Range enforcement for interrupt charges */structwork_struct high_work;// 内存使用软限制unsignedlong soft_limit;/* vmpressure notifications */structvmpressure vmpressure;/*
* Should the accounting and control be hierarchical, per subtree?
*/// 是否使用分层记账
bool use_hierarchy;/* protected by memcg_oom_lock */
bool oom_lock;int under_oom;int swappiness;/* OOM-Killer disable */int oom_kill_disable;/* handle for "memory.events" */structcgroup_file events_file;/* protect arrays of thresholds */structmutex thresholds_lock;/* thresholds for memory usage. RCU-protected */structmem_cgroup_thresholds thresholds;/* thresholds for mem+swap usage. RCU-protected */structmem_cgroup_thresholds memsw_thresholds;/* For oom notifier event fd */structlist_head oom_notify;/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/unsignedlong move_charge_at_immigrate;/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/atomic_t moving_account;/* taken only while moving_account > 0 */spinlock_t move_lock;structtask_struct*move_lock_task;unsignedlong move_lock_flags;/*
* percpu counter.
*/// 每cpu变量: 统计内存控制组状态(包括内存使用量和内存事件)[见5.3节]structmem_cgroup_stat_cpu __percpu *stat;unsignedlong socket_pressure;/* Legacy tcp memory accounting */
bool tcpmem_active;int tcpmem_pressure;#ifndefCONFIG_SLOB/* Index in the kmem_cache->memcg_params.memcg_caches array */int kmemcg_id;enummemcg_kmem_state kmem_state;structlist_head kmem_caches;#endifint last_scanned_node;#ifMAX_NUMNODES >1nodemask_t scan_nodes;atomic_t numainfo_events;atomic_t numainfo_updating;#endif#ifdefCONFIG_CGROUP_WRITEBACKstructlist_head cgwb_list;structwb_domain cgwb_domain;#endif/* List of events which userspace want to receive */structlist_head event_list;spinlock_t event_list_lock;// 每个节点对应一个mem_cgroup_per_node实例[见5.2节]structmem_cgroup_per_node*nodeinfo[0];/* WARNING: nodeinfo must be the last member here */};
/*
* per-zone information in memory controller.
*/structmem_cgroup_per_node{// 内存控制组私有的lru链表// 当进程加入内存控制组后, 给进程分配的页面不再加入node的lru链表, 而是加入内存控制组私有的lru链表structlruvec lruvec;structlruvec_stat __percpu *lruvec_stat;unsignedlong lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];structmem_cgroup_reclaim_iter iter[DEF_PRIORITY +1];structrb_node tree_node;/* RB tree node */// 内存使用量超过软限制的数值 = mem_cgroup.memory.count - mem_cgroup.soft_limitunsignedlong usage_in_excess;/* Set to the value by which *//* the soft limit is exceeded*/// 表示内存控制组是否在软限制树种// 当内存使用量超过软限制时, 通过成员tree_node把mem_cgroup_per_node实例加入软限制树
bool on_tree;// 指向mem_cgroup_per_node实例所属的内存控制组structmem_cgroup*memcg;/* Back pointer, we cannot *//* use container_of */};
/* Cgroup-specific page state, on top of the universal node stat items */
enum memcg_stat_item {
	MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,	/* page cache */
	MEMCG_RSS,				/* anonymous pages */
	MEMCG_RSS_HUGE,				/* anonymous huge pages */
	MEMCG_SWAP,				/* swap cache */
	MEMCG_SOCK,
	/* XXX: why are these zone and not node counters? */
	MEMCG_KERNEL_STACK_KB,
	MEMCG_NR_STAT,
};
/* 4.5 memcg_event_item */
/* Cgroup-specific events, on top of universal VM events */enummemcg_event_item{
MEMCG_LOW = NR_VM_EVENT_ITEMS,
MEMCG_HIGH,
MEMCG_MAX,
MEMCG_OOM,
MEMCG_NR_EVENTS,};
/*
 * Return the current memory usage of @memcg in pages; when @swap is
 * true, swap usage is included as well.
 */
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val = 0;

	if (mem_cgroup_is_root(memcg)) {
		struct mem_cgroup *iter;

		/*
		 * The root group keeps no counter of its own: walk every
		 * group in its subtree and sum their stats (see 6.2).
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			/* number of page-cache pages in this group */
			val += memcg_page_state(iter, MEMCG_CACHE);
			/* number of anonymous pages in this group */
			val += memcg_page_state(iter, MEMCG_RSS);
			/* with swap enabled, swap pages count too */
			if (swap)
				val += memcg_page_state(iter, MEMCG_SWAP);
		}
	} else {
		if (!swap)
			/* read the _MEM page_counter->usage */
			val = page_counter_read(&memcg->memory);
		else
			/* read the _MEMSWAP page_counter->usage */
			val = page_counter_read(&memcg->memsw);
	}
	return val;
}
/* 5.2 memcg_page_state */
/* idx can be of type enum memcg_stat_item or node_stat_item */staticinlineunsignedlongmemcg_page_state(structmem_cgroup*memcg,int idx){// 遍历每个cpu上的mem_cgroup_stat_cpu, 并统计该控制组内由idx指定状态的页面数量for_each_possible_cpu(cpu)
val +=per_cpu(memcg->stat->count[idx], cpu);if(val <0)
val =0;return val;}
/* 6. mem_cgroup_write */
staticssize_tmem_cgroup_write(structkernfs_open_file*of,char*buf,size_t nbytes,loff_t off){// 根据接口文件找到其对应的控制组structmem_cgroup*memcg =mem_cgroup_from_css(of_css(of));unsignedlong nr_pages;int ret;
buf =strstrip(buf);// 解析写入的参数:将参数(可能带有后缀K, M, G, T, P, E)解析为页面数量
ret =page_counter_memparse(buf,"-1",&nr_pages);if(ret)return ret;// 解析资源类型switch(MEMFILE_ATTR(of_cft(of)->private)){case RES_LIMIT:// 如前面所说: 根控制组不能设置使用限制if(mem_cgroup_is_root(memcg)){/* Can't set limit on root */
ret =-EINVAL;break;}// 解析资源属性: 写入*.limit_in_bytes文件即设置硬限制switch(MEMFILE_TYPE(of_cft(of)->private)){case _MEM:// 设置内存使用硬限制[见7.1节]
ret =mem_cgroup_resize_limit(memcg, nr_pages);break;case _MEMSWAP:
ret =mem_cgroup_resize_memsw_limit(memcg, nr_pages);break;case _KMEM:
ret =memcg_update_kmem_limit(memcg, nr_pages);break;case _TCP:
ret =memcg_update_tcp_limit(memcg, nr_pages);break;}break;// 设置软限制case RES_SOFT_LIMIT:
memcg->soft_limit = nr_pages;
ret =0;break;}return ret ?: nbytes;}
/* 6.1 mem_cgroup_resize_limit */
staticintmem_cgroup_resize_limit(structmem_cgroup*memcg,unsignedlong limit){unsignedlong curusage;unsignedlong oldusage;
bool enlarge = false;int retry_count;int ret;/*
* For keeping hierarchical_reclaim simple, how long we should retry
* is depends on callers. We set our retry-count to be function
* of # of children which we should visit in this loop.
*/
retry_count = MEM_CGROUP_RECLAIM_RETRIES *mem_cgroup_count_children(memcg);// 返回当前内存使用硬限制
oldusage =page_counter_read(&memcg->memory);do{if(signal_pending(current)){
ret =-EINTR;break;}mutex_lock(&memcg_limit_mutex);// 新的限制不能超过内存+交换分区的限制if(limit > memcg->memsw.limit){mutex_unlock(&memcg_limit_mutex);
ret =-EINVAL;break;}// 增大硬限制if(limit > memcg->memory.limit)
enlarge = true;// 更新硬限制
ret =page_counter_limit(&memcg->memory, limit);mutex_unlock(&memcg_limit_mutex);// 更新成功则跳过跳出循环if(!ret)break;// 否则代表当前使用量已经超过硬限制, 需要针对该控制组进行内存回收try_to_free_mem_cgroup_pages(memcg,1, GFP_KERNEL, true);// 回收完之后再次读取内存使用量
curusage =page_counter_read(&memcg->memory);/* Usage is reduced ? */// 如果内存使用量仍然比之前大, 则进行重试if(curusage >= oldusage)
retry_count--;else// 否则进行重试, 直到使用量小于硬限制
oldusage = curusage;}while(retry_count);if(!ret && enlarge)memcg_oom_recover(memcg);return ret;}