记一次内核crash排查
进行问题定位
进入crash目录/var/crash,执行crash vmcore /usr/lib/debug/boot/vmlinux-3.2.0-136-custom,然后执行bt查看堆栈
crash> bt
PID: 0 TASK: ffff88084c04dc00 CPU: 8 COMMAND: "swapper/8"
#0 [ffff88107fc03730] machine_kexec at ffffffff8103adda
#1 [ffff88107fc037a0] crash_kexec at ffffffff810b7a28
#2 [ffff88107fc03870] oops_end at ffffffff81671b18
#3 [ffff88107fc038a0] no_context at ffffffff816566a7
#4 [ffff88107fc038e0] __bad_area_nosemaphore at ffffffff8165687f
#5 [ffff88107fc03940] bad_area_nosemaphore at ffffffff816568b1
#6 [ffff88107fc03950] do_page_fault at ffffffff816747eb
#7 [ffff88107fc03a60] page_fault at ffffffff81671055
[exception RIP: pid_nr_ns+22]
RIP: ffffffff81089f96 RSP: ffff88107fc03b10 RFLAGS: 00010206
RAX: 0000000000000000 RBX: ffff881017164e80 RCX: 000000000000781b
RDX: 0000000000000000 RSI: ffffffff81c28240 RDI: 000000000007d165
RBP: ffff88107fc03b10 R8: ffff88107fc15650 R9: ffff881017164e80
R10: 0000000000000002 R11: 0000000000000001 R12: ffff880825d838f4
R13: 000000000000718e R14: 000000000000718e R15: ffff88104b8f6000
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#8 [ffff88107fc03b18] task_tgid_nr_ns at ffffffff81089fdc
....省略
#20 [ffff88107fc03df8] call_timer_fn at ffffffff81078a36
#21 [ffff88107fc03e48] run_timer_softirq at ffffffff8107a3ba
#22 [ffff88107fc03ec8] __do_softirq at ffffffff8107118c
#23 [ffff88107fc03f48] call_softirq at ffffffff8167afec
#24 [ffff88107fc03f60] do_softirq at ffffffff81017565
#25 [ffff88107fc03f80] irq_exit at ffffffff810715ce
#26 [ffff88107fc03f90] smp_apic_timer_interrupt at ffffffff8167b99e
#27 [ffff88107fc03fb0] apic_timer_interrupt at ffffffff8167985e
--- <IRQ stack> ---
#28 [ffff88084c055e28] apic_timer_interrupt at ffffffff8167985e
[exception RIP: mwait_idle+149]
RIP: ffffffff8101e295 RSP: ffff88084c055ed8 RFLAGS: 00000246
RAX: 0000000000000000 RBX: ffffffff8107b2cd RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88084c055fd8 RDI: ffffffff81de1e68
RBP: ffff88084c055ef8 R8: 0000000000000000 R9: 0000000000000000
R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000282
R13: ffff88107fc0cac0 R14: 000000051ea97143 R15: ffff88104c328000
ORIG_RAX: ffffffffffffff10 CS: 0010 SS: 0018
#29 [ffff88084c055f00] cpu_idle at ffffffff81014236
初步看了下堆栈,是挂在了task_tgid_nr_ns函数调用的pid_nr_ns里面。反汇编查看pid_nr_ns函数,分析pid_nr_ns+22(即相对函数起始地址偏移22字节处的那条指令,而不是第22行)出错的原因
crash> dis pid_nr_ns
0xffffffff81089f80 <pid_nr_ns>: push %rbp
0xffffffff81089f81 <pid_nr_ns+1>: mov %rsp,%rbp
0xffffffff81089f84 <pid_nr_ns+4>: nopl 0x0(%rax,%rax,1)
0xffffffff81089f89 <pid_nr_ns+9>: xor %eax,%eax
0xffffffff81089f8b <pid_nr_ns+11>: test %rdi,%rdi
0xffffffff81089f8e <pid_nr_ns+14>: je 0xffffffff81089faa <pid_nr_ns+42>
0xffffffff81089f90 <pid_nr_ns+16>: mov 0x820(%rsi),%edx
0xffffffff81089f96 <pid_nr_ns+22>: cmp 0x4(%rdi),%edx
0xffffffff81089f99 <pid_nr_ns+25>: ja 0xffffffff81089faa <pid_nr_ns+42>
0xffffffff81089f9b <pid_nr_ns+27>: shl $0x5,%rdx
0xffffffff81089f9f <pid_nr_ns+31>: lea 0x30(%rdi,%rdx,1),%rdx
0xffffffff81089fa4 <pid_nr_ns+36>: cmp 0x8(%rdx),%rsi
0xffffffff81089fa8 <pid_nr_ns+40>: je 0xffffffff81089fb0 <pid_nr_ns+48>
0xffffffff81089faa <pid_nr_ns+42>: pop %rbp
0xffffffff81089fab <pid_nr_ns+43>: retq
0xffffffff81089fac <pid_nr_ns+44>: nopl 0x0(%rax)
0xffffffff81089fb0 <pid_nr_ns+48>: mov (%rdx),%eax
0xffffffff81089fb2 <pid_nr_ns+50>: pop %rbp
0xffffffff81089fb3 <pid_nr_ns+51>: retq
0xffffffff81089fb4 <pid_nr_ns+52>: data32 data32 nopw %cs:0x0(%rax,%rax,1)
c语言程序
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
struct upid *upid;
pid_t nr = 0;
if (pid && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
}
return nr;
}
rdi寄存器存放函数的第一个参数,rsi寄存器存放的是函数的第二个参数。+22处的汇编是cmp 0x4(%rdi),%edx,即读取rdi寄存器所指地址偏移4个字节处的值(对应c代码是pid->level);+16处的汇编是mov 0x820(%rsi),%edx,即读取rsi寄存器所指地址偏移0x820个字节处的值(对应c代码是ns->level)。+22处的指令整体对应的c代码是 ns->level <= pid->level。+16处的指令通过rsi寄存器(即函数的第二个参数ns)访问内存时没有出现异常,所以问题就在rdi寄存器(即函数的第一个参数pid)。查看崩溃时RDI寄存器的值是
RDX: 0000000000000000 RSI: ffffffff81c28240 RDI: 000000000007d165
rdi的值是一个非法值(0x7d165不是合法的内核地址,正常的内核指针应形如上面寄存器中的ffff88...)。现在已经定位到rdi是非法值了,下一步是判断为什么它变成非法了。回到我们的函数调用栈中,查看该函数的调用者task_tgid_nr_ns
内核3.2.0代码
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_tgid(tsk), ns);
}
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
struct upid *upid;
pid_t nr = 0;
if (pid && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
}
return nr;
}
此处加了rcu锁,但是在使用task->pids[type].pid时,并没有调用rcu_dereference来保护该pid指针的读取。初步判断这是一个缺少同步导致的问题:在父进程销毁、还没向上关联到1号进程时,访问了父进程的pid,导致了崩溃。为了进一步确认这个问题,最省心的办法是查看高版本内核的更新日志以及相关代码
4.13内核代码
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns){
return __task_pid_nr_ns(task_tgid(tsk), ns);
}
struct pid *__task_pid_nr_ns(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
if (type != PIDTYPE_PID)
task = task->group_leader;
pid = pid_nr_ns(rcu_dereference(task->pids[type].pid));
rcu_read_unlock();
return pid;
}
可以看到最新版内核已经加上了rcu_dereference,task_tgid_nr_ns已经换成inline函数了
https://lkml.org/lkml/2015/11/24/680 这个patch在4.4-rc3已经合入代码中了
https://elixir.bootlin.com/linux/v4.4-rc2/source/include/linux/sched.h#L1912 这个patch在4.13-rc7正式启用inline函数版的task_tgid_nr_ns
Subject [PATCH] pidns: fix NULL dereference in __task_pid_nr_ns()
From Eric Dumazet <>
Date Tue, 24 Nov 2015 11:39:54 -0800
share
From: Eric Dumazet <edumazet@google.com>
I got a crash during a "perf top" session that was caused
by a race in __task_pid_nr_ns() :
pid_nr_ns() was inlined, but apparently compiler chose to read
task->pids[type].pid twice, and the pid->level dereference
crashed because we got a NULL pointer at the second read :
if (pid && ns->level <= pid->level) { // CRASH
Just use RCU API properly to solve this race, and not worry
about "perf top" crashing hosts :(
get_task_pid() can benefit from same fix.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
kernel/pid.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index ca368793808e..78b3d9f80d44 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -467,7 +467,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
rcu_read_lock();
if (type != PIDTYPE_PID)
task = task->group_leader;
- pid = get_pid(task->pids[type].pid);
+ pid = get_pid(rcu_dereference(task->pids[type].pid));
rcu_read_unlock();
return pid;
}
@@ -528,7 +528,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
- nr = pid_nr_ns(task->pids[type].pid, ns);
+ nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
最后的结论是:在4.13以下的内核中不要调用task_pid_nr_ns函数;对task->pids数组内容的访问必须加上rcu锁,并通过rcu_dereference读取其中的pid指针