Codebase: Android4.4
Kernel: 3.1.0
概念:
OOM killer,即out of memory killer,是Linux在内存耗尽时的一种处理机制。当可用内存不足时,OOM killer会遍历整个进程链表,根据各进程的内存使用情况及其oom score值找出得分最高的进程,然后向它发送kill信号将其杀掉。
伙伴系统在分配内存时会做判断,当内存不足时,会调用核心函数out_of_memory(),该函数位于文件kernel/mm/oom_kill.c中。
下面先分析out_of_memory()。
out_of_memory():
/*
 * out_of_memory() - OOM killer entry point (excerpt).
 *
 * NOTE: the "~~snip" lines mark code elided by the article; totalpages,
 * constraint and mpol_mask are presumably assigned in the elided parts.
 *
 * Visible flow: bail out if the caller is already dying, apply the
 * panic_on_oom policy, optionally kill the allocating task itself, and
 * otherwise ask select_bad_process() for a victim and kill it.
 *
 * @gfp_mask, @order: forwarded to the dump/kill helpers.
 * @nodemask:         forwarded to oom_unkillable_task()/oom_kill_process().
 * @force_kill:       forwarded to select_bad_process().
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
unsigned int points;
enum oom_constraint constraint = CONSTRAINT_NONE;
int killed = 0;
~~snip
/*
 * If the current task already has a fatal signal pending, mark it with
 * TIF_MEMDIE and return immediately: the OOM killer ultimately frees
 * memory by delivering SIGKILL, and this task is already on its way out.
 */
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
~~snip
/*
 * User space can change the OOM behaviour via /proc/sys/vm/panic_on_oom:
 * per the article, 1 means panic right away on OOM, 0 means only kill the
 * "best" process and let the system keep running.
 */
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
read_lock(&tasklist_lock);
/*
 * Likewise, when /proc/sys/vm/oom_kill_allocating_task is true, the task
 * that is currently allocating is killed directly (provided it is killable
 * and has an mm). This path leaves 'killed' at 0, so no sleep follows.
 */
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
current->mm) {
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
nodemask,
"Out of memory (oom_kill_allocating_task)");
goto out;
}
/*
 * Pick the task with the highest point value, computed from its memory
 * usage and OOM score information.
 */
p = select_bad_process(&points, totalpages, NULL, mpol_mask,
force_kill);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
if (PTR_ERR(p) != -1UL) {
/*
 * A real victim was returned (not the ERR_PTR(-1UL) marker that
 * select_bad_process() uses to abort the scan): kill it.
 */
oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
nodemask, "Out of memory");
killed = 1;
}
out:
read_unlock(&tasklist_lock);
/*
 * Give "p" a good chance of killing itself before we
 * retry to allocate memory unless "p" is current
 */
if (killed && !test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
select_bad_process():
/*
 * select_bad_process() - scan all threads and pick the OOM victim.
 *
 * @ppoints:    out parameter; reset to 0, then set to the score of the
 *              chosen task.
 * @totalpages: forwarded to oom_badness() for score normalisation.
 * @memcg, @nodemask: forwarded to oom_unkillable_task()/oom_badness().
 * @force_kill: when true, do not abort the scan because another task is
 *              already dying or exiting.
 *
 * Returns the chosen task, NULL if no task scored above 0, or
 * ERR_PTR(-1UL) when the scan is aborted because another task is already
 * being killed or is exiting.
 */
static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long totalpages, struct mem_cgroup *memcg,
const nodemask_t *nodemask, bool force_kill)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
*ppoints = 0;
/* Walk every thread in the system. */
do_each_thread(g, p) {
unsigned int points;
/* Tasks that already have an exit state are ignored. */
if (p->exit_state)
continue;
/*
 * Skip tasks that oom_unkillable_task() reports as unkillable
 * (per the article: core tasks such as init and kernel threads).
 */
if (oom_unkillable_task(p, memcg, nodemask))
continue;
/*
 * This task is already marked TIF_MEMDIE, i.e. it is being OOM-killed.
 * Thaw it if it is frozen, and unless force_kill is set abort the whole
 * scan with ERR_PTR(-1UL) instead of picking another victim.
 */
if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
if (unlikely(frozen(p)))
__thaw_task(p);
if (!force_kill)
return ERR_PTR(-1UL);
}
/* Threads without an mm are not candidates. */
if (!p->mm)
continue;
if (p->flags & PF_EXITING) {
if (p == current) {
/* The exiting task is the caller itself: pre-select it with 1000 points. */
chosen = p;
*ppoints = 1000;
} else if (!force_kill) {
/*
 * If this task is not being ptraced on exit,
 * then wait for it to finish before killing
 * some other task unnecessarily.
 */
if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
return ERR_PTR(-1UL);
}
}
/* Compute the score (points) for this task. */
points = oom_badness(p, memcg, nodemask, totalpages);
/* Remember this task if it scores strictly higher than the best so far. */
if (points > *ppoints) {
chosen = p;
*ppoints = points;
}
} while_each_thread(g, p);
return chosen;
}
oom_badness():
unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
{
long points;
if (oom_unkillable_task(p, memcg, nodemask))
return 0;
p = find_lock_task_mm(p);
if (!p)
return 0;
/*oom_score_adj为-1000的不做处理,此值可以通过/proc/pid_num/oom_score_adj设置,范围为-1000 ~ 1000,值越大越容易被oom kill掉。*/
if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
}
/*
* The memory controller may have a limit of 0 bytes, so avoid a divide
* by zero, if necessary.
*/
if (!totalpages)
totalpages = 1;
/* get_mm_rss获取当前用户空间使用文件和匿名页占有内存数,nr_ptes 获取
当前保存页表使用的内存。*/
points = get_mm_rss(p->mm) + p->mm->nr_ptes;
/*获取交换内存使用的内存数*/
points += get_mm_counter(p->mm, MM_SWAPENTS);
/*每个task同等计算,可不管。*/
points *= 1000;
points /= totalpages;
task_unlock(p);
/*当该进程具有CAP_SYS_ADMIN能力,那么Point降低,因为具有ADMIN权限的
Task是被认为表现良好的。 */
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
points -= 30;
/*加上oom_score_adj,范围从-1000 ~ 1000. */
points += p->signal->oom_score_adj;
/*
* Never return 0 for an eligible task that may be killed since it's
* possible that no single user task uses more than 0.1% of memory and
* no single admin tasks uses more than 3.0%.
*/
if (points <= 0)
return 1;
/*1000封顶*/
return (points < 1000) ? points : 1000;
}
oom_kill_process():
/*
 * oom_kill_process() - kill the selected task, or one of its children.
 *
 * @p:          task selected by the caller.
 * @gfp_mask, @order: forwarded to dump_header() for the report.
 * @points:     score of @p, printed in the log message.
 * @totalpages, @memcg, @nodemask: forwarded to oom_badness() when scoring
 *              the children.
 * @message:    prefix for the "Kill process" log line.
 *
 * If @p is already exiting it is only flagged TIF_MEMDIE. Otherwise the
 * highest-scoring child with a different mm (if any) replaces @p as the
 * victim, every other process sharing the victim's mm is sent SIGKILL, and
 * finally the victim itself is flagged TIF_MEMDIE and sent SIGKILL.
 */
static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int points, unsigned long totalpages,
struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
struct task_struct *t = p;
struct mm_struct *mm;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
/*
 * If the task is already exiting, don't alarm the sysadmin or kill
 * its children or threads, just set TIF_MEMDIE so it can die quickly
 */
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
return;
}
/* Rate-limit the OOM report so repeated kills do not flood the log. */
if (__ratelimit(&oom_rs))
dump_header(p, gfp_mask, order, memcg, nodemask);
task_lock(p);
pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
task_unlock(p);
/*
 * Among the children (of every thread of p) whose mm differs from the
 * parent's, find the one with the highest points and kill it in place of
 * the parent. So when a process has several children that use a lot of
 * memory, a child may be killed while the parent stays alive.
 */
do {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
if (child->mm == p->mm)
continue;
/*
 * oom_badness() returns 0 if the thread is unkillable
 */
child_points = oom_badness(child, memcg, nodemask,
totalpages);
if (child_points > victim_points) {
victim = child;
victim_points = child_points;
}
}
} while_each_thread(p, t);
/* Re-resolve the victim to a thread that still has an mm; returns it locked. */
victim = find_lock_task_mm(victim);
if (!victim)
return;
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
K(get_mm_counter(victim->mm, MM_FILEPAGES)));
task_unlock(victim);
/*
 * Every process that shares the same mm (i.e. shares memory with the
 * victim) is killed together with the victim, except kernel threads and
 * processes whose oom_score_adj is OOM_SCORE_ADJ_MIN. From here on 'mm'
 * is only compared by pointer value, never dereferenced.
 */
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
continue;
task_lock(p); /* Protect ->comm from prctl() */
pr_err("Kill process %d (%s) sharing same memory\n",
task_pid_nr(p), p->comm);
task_unlock(p);
/* Send SIGKILL. */
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
set_tsk_thread_flag(victim, TIF_MEMDIE);
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
}
所以,out_of_memory()做的任务就是遍历系统全部进程,然后根据内存使用情况以及oom_score_adj的值计算得到一个point, 最终将最高point的task给kill掉。
相关知识:
1. Malloc会引起OOM killer,可参考:
http://blog.dccmx.com/2011/04/oom-killer-on-linux
2. OOM killer只管理、计算low memory部分:即使high memory还有很多空闲内存,low memory不足时仍然可能触发OOM。
3. 进程rss的计算可参考此文:
http://filwmm1314.blog.163.com/blog/static/2182591920121016541582/
4. 影响到oom killer行为的文件有:
/proc/sys/vm/overcommit_memory
/proc/sys/vm/panic_on_oom
/proc/sys/vm/oom_kill_allocating_task
/proc/pid_xxx/oom_score_adj