记一次内核crash排查

最新推荐文章于 2023-10-26 09:30:40 发布

network-fire

最新推荐文章于 2023-10-26 09:30:40 发布

阅读量466

点赞数

分类专栏： kernel

本文链接：https://blog.csdn.net/going369/article/details/103984338

版权

kernel 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

记一次内核crash排查

有同事反馈有机器宕机，并且是我们组的模块造成的

进行问题定位

进入crash目录/var/crash 执行了crash vmcore /usr/lib/debug/boot/vmlinux-3.2.0-136-custom
执行bt查看堆栈

crash> bt
PID: 0      TASK: ffff88084c04dc00  CPU: 8   COMMAND: "swapper/8"
 #0 [ffff88107fc03730] machine_kexec at ffffffff8103adda
 #1 [ffff88107fc037a0] crash_kexec at ffffffff810b7a28
 #2 [ffff88107fc03870] oops_end at ffffffff81671b18
 #3 [ffff88107fc038a0] no_context at ffffffff816566a7
 #4 [ffff88107fc038e0] __bad_area_nosemaphore at ffffffff8165687f
 #5 [ffff88107fc03940] bad_area_nosemaphore at ffffffff816568b1
 #6 [ffff88107fc03950] do_page_fault at ffffffff816747eb
 #7 [ffff88107fc03a60] page_fault at ffffffff81671055
    [exception RIP: pid_nr_ns+22]
    RIP: ffffffff81089f96  RSP: ffff88107fc03b10  RFLAGS: 00010206
    RAX: 0000000000000000  RBX: ffff881017164e80  RCX: 000000000000781b
    RDX: 0000000000000000  RSI: ffffffff81c28240  RDI: 000000000007d165
    RBP: ffff88107fc03b10   R8: ffff88107fc15650   R9: ffff881017164e80
    R10: 0000000000000002  R11: 0000000000000001  R12: ffff880825d838f4
    R13: 000000000000718e  R14: 000000000000718e  R15: ffff88104b8f6000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #8 [ffff88107fc03b18] task_tgid_nr_ns at ffffffff81089fdc
 ....省略
#20 [ffff88107fc03df8] call_timer_fn at ffffffff81078a36
#21 [ffff88107fc03e48] run_timer_softirq at ffffffff8107a3ba
#22 [ffff88107fc03ec8] __do_softirq at ffffffff8107118c
#23 [ffff88107fc03f48] call_softirq at ffffffff8167afec
#24 [ffff88107fc03f60] do_softirq at ffffffff81017565
#25 [ffff88107fc03f80] irq_exit at ffffffff810715ce
#26 [ffff88107fc03f90] smp_apic_timer_interrupt at ffffffff8167b99e
#27 [ffff88107fc03fb0] apic_timer_interrupt at ffffffff8167985e
--- <IRQ stack> ---
#28 [ffff88084c055e28] apic_timer_interrupt at ffffffff8167985e
    [exception RIP: mwait_idle+149]
    RIP: ffffffff8101e295  RSP: ffff88084c055ed8  RFLAGS: 00000246
    RAX: 0000000000000000  RBX: ffffffff8107b2cd  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: ffff88084c055fd8  RDI: ffffffff81de1e68
    RBP: ffff88084c055ef8   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000001  R11: 0000000000000001  R12: 0000000000000282
    R13: ffff88107fc0cac0  R14: 000000051ea97143  R15: ffff88104c328000
    ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
#29 [ffff88084c055f00] cpu_idle at ffffffff81014236

初步看了下堆栈，是挂在了task_tgid_nr_ns函数里面的pid_nr_ns
反汇编pid_nr_ns函数，查看pid_nr_ns+22（即距函数起始地址偏移22字节处的那条指令）出错的原因

crash> dis pid_nr_ns
0xffffffff81089f80 <pid_nr_ns>: push   %rbp
0xffffffff81089f81 <pid_nr_ns+1>:       mov    %rsp,%rbp
0xffffffff81089f84 <pid_nr_ns+4>:       nopl   0x0(%rax,%rax,1)
0xffffffff81089f89 <pid_nr_ns+9>:       xor    %eax,%eax
0xffffffff81089f8b <pid_nr_ns+11>:      test   %rdi,%rdi
0xffffffff81089f8e <pid_nr_ns+14>:      je     0xffffffff81089faa <pid_nr_ns+42>
0xffffffff81089f90 <pid_nr_ns+16>:      mov    0x820(%rsi),%edx
0xffffffff81089f96 <pid_nr_ns+22>:      cmp    0x4(%rdi),%edx
0xffffffff81089f99 <pid_nr_ns+25>:      ja     0xffffffff81089faa <pid_nr_ns+42>
0xffffffff81089f9b <pid_nr_ns+27>:      shl    $0x5,%rdx
0xffffffff81089f9f <pid_nr_ns+31>:      lea    0x30(%rdi,%rdx,1),%rdx
0xffffffff81089fa4 <pid_nr_ns+36>:      cmp    0x8(%rdx),%rsi
0xffffffff81089fa8 <pid_nr_ns+40>:      je     0xffffffff81089fb0 <pid_nr_ns+48>
0xffffffff81089faa <pid_nr_ns+42>:      pop    %rbp
0xffffffff81089fab <pid_nr_ns+43>:      retq   
0xffffffff81089fac <pid_nr_ns+44>:      nopl   0x0(%rax)
0xffffffff81089fb0 <pid_nr_ns+48>:      mov    (%rdx),%eax
0xffffffff81089fb2 <pid_nr_ns+50>:      pop    %rbp
0xffffffff81089fb3 <pid_nr_ns+51>:      retq   
0xffffffff81089fb4 <pid_nr_ns+52>:      data32 data32 nopw %cs:0x0(%rax,%rax,1)

c语言程序
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *upid;
	pid_t nr = 0;

	if (pid && ns->level <= pid->level) {
		upid = &pid->numbers[ns->level];
		if (upid->ns == ns)
			nr = upid->nr;
	}
	return nr;
}

rdi寄存器存放函数的第一个参数，rsi寄存器存放的是函数的第二个参数
+22处的汇编是cmp 0x4(%rdi),%edx，读取rdi寄存器所指地址偏移4个字节处的值(对应c代码是pid->level)
+16处的汇编是mov 0x820(%rsi),%edx，读取rsi寄存器所指地址偏移0x820个字节处的值(对应c代码是ns->level)
+22处的指令整体对应的c函数代码是 ns->level <= pid->level
+16行代码中对rsi寄存器(即函数的第二个参数ns)取值访问没有出现异常，所以问题就在rdi寄存器(即函数的第一个参数pid)
查看崩溃时RDI寄存器的值：

RDX: 0000000000000000  RSI: ffffffff81c28240  RDI: 000000000007d165

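rdi的值0x7d165是一个非法值：它不是NULL，所以通过了+11/+14处对pid的判空检查；但它也不是合法的内核地址（x86_64上内核地址位于以ffff开头的高地址区间），因此在+22处读取pid->level（0x4(%rdi)）时触发了page fault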
现在已经定位到rdi是非法值了，下一步是判断，为什么它变成非法了。
回到我们的函数调用栈中查看该函数的调用者task_tgid_nr_ns

内核3.2.0代码

pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        return pid_nr_ns(task_tgid(tsk), ns);
}
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *upid;
	pid_t nr = 0;

	if (pid && ns->level <= pid->level) {
		upid = &pid->numbers[ns->level];
		if (upid->ns == ns)
			nr = upid->nr;
	}
	return nr;
}

此处加了rcu锁，但是对task->pids[type].pid 使用时，并没有调用rcu_dereference保护该pid指针
初步判断这是一个缺少同步导致的问题：在父进程销毁、子进程还没有被重新关联到1号进程时，访问了父进程的pid，导致了崩溃
为了进一步确认这个问题，最省心的是查看高版本内核的更新日志，以及相关代码

4.13内核代码
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns);
}
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
                        struct pid_namespace *ns)
{
        pid_t nr = 0;

        rcu_read_lock();
        if (!ns)
                ns = task_active_pid_ns(current);
        if (likely(pid_alive(task))) {
                if (type != PIDTYPE_PID) {
                        if (type == __PIDTYPE_TGID)
                                type = PIDTYPE_PID;
                        task = task->group_leader;
                }
                nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
        }
        rcu_read_unlock();

        return nr;
}

可以看到最新版内核已经加上了rcu_dereference，task_tgid_nr_ns已经换成inline函数了


https://lkml.org/lkml/2015/11/24/680 这个patch在4.4-rc3已经合入代码中了

https://elixir.bootlin.com/linux/v4.4-rc2/source/include/linux/sched.h#L1912 这个patch在4.13-rc7正式启用inline函数版的task_tgid_nr_ns



Subject	[PATCH] pidns: fix NULL dereference in __task_pid_nr_ns()
From	Eric Dumazet <>
Date	Tue, 24 Nov 2015 11:39:54 -0800
share
From: Eric Dumazet <edumazet@google.com>

I got a crash during a "perf top" session that was caused
by a race in __task_pid_nr_ns() :

pid_nr_ns() was inlined, but apparently compiler chose to read
task->pids[type].pid twice, and the pid->level dereference
crashed because we got a NULL pointer at the second read :

if (pid && ns->level <= pid->level) { // CRASH

Just use RCU API properly to solve this race, and not worry
about "perf top" crashing hosts :(

get_task_pid() can benefit from same fix.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 kernel/pid.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index ca368793808e..78b3d9f80d44 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -467,7 +467,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 	rcu_read_lock();
 	if (type != PIDTYPE_PID)
 		task = task->group_leader;
-	pid = get_pid(task->pids[type].pid);
+	pid = get_pid(rcu_dereference(task->pids[type].pid));
 	rcu_read_unlock();
 	return pid;
 }
@@ -528,7 +528,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	if (likely(pid_alive(task))) {
 		if (type != PIDTYPE_PID)
 			task = task->group_leader;
-		nr = pid_nr_ns(task->pids[type].pid, ns);
+		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
 	}
 	rcu_read_unlock();