linux宕机时Oops分析及问题定位

以下面这个例子说明(下面这个例子就是造一个野指针所引发的错误):

/*
 * test-debug-scr.c
 *
 * Copyright (C) 2012 - 2021 Reuuimlla Limited
 * 
 * Adapt to support xxx
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/ide.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/vmalloc.h>

/*-------------------------------------------------------------------------------*/
/*    DEFINITION                                         */
/*-------------------------------------------------------------------------------*/

/*
 * @description	: Driver entry function. Deliberately stores a value through a
 *                wild pointer (a hard-coded, arbitrary address) so that
 *                inserting the module triggers the kernel Oops analysed in
 *                this article. The faulting store is intentional; do not
 *                "fix" it.
 * @param 		: none
 * @return 		: 0; the printk and return are only reached if the store
 *                through the wild pointer does not fault
 */
static int __init test_init(void)
{
     int *p = 0x1231223;
     *p = 0x1231223;
    printk("module loaded.\n");
    return 0;
}

/*
 * @description	: Driver exit function; only logs that the module was unloaded.
 * @param 		: none
 * @return 		: none
 */
static void __exit test_exit(void)
{
    printk("module unloaded.\n");
}

/* Register test_init / test_exit as the module's load and unload handlers. */
module_init(test_init);
module_exit(test_exit);

/* Module metadata: license, description and author strings. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("test-debug-scr generic test driver");
MODULE_AUTHOR("xxxx");


很明显,人为制造错误的地方就是test_init函数内的 “ *p = 0x1231223 ”语句。

然后,把这个模块编译出来,再用insmod来插入到内核空间,Oops出现了。

# insmod test-debug.ko 
[   61.715630] Internal error: Oops: 805 [#1] PREEMPT SMP ARM
[   61.721656] Modules linked in: test_debug(O+) shb_uart(O) shb_lcd(O) adc(O) i2c(O)
[   61.721904] CPU: 2 PID: 1177 Comm: insmod Tainted: G           O 3.10.65 #493
[   61.721904] task: e6265500 ti: e5d5a000 task.ti: e5d5a000
[   61.721904] PC is at test_init+0x1c/0x44 [test_debug]
[   61.721904] LR is at do_one_initcall+0xa8/0x144
[   61.721904] pc : [<bf01a01c>]    lr : [<c000a4d8>]    psr: 600c0013
[   61.721904] sp : e5d5be40  ip : e5d5be50  fp : e5d5be4c
[   61.721904] r10: e5ff7124  r9 : 00000001  r8 : 00000000
[   61.721904] r7 : bf0180d4  r6 : c0967100  r5 : bf01a000  r4 : e5d5a000
[   61.721904] r3 : 01231000  r2 : 00000023  r1 : 00000012  r0 : bf018044
[   61.721904] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
[   61.721904] Control: 10c53c7d  Table: 66b0c06a  DAC: 00000015
[   61.721904] 
[   61.721904] LR: 0xc000a458:
[   61.721904] a458  e3c4403f e5d6301c e5948004 e3530000 0a000019 e594300c e1a01000 e59f00dc
[   61.721904] a478  e593215c eb1754da e24b002c eb017781 e14b22dc e14b23f4 e12fff35 e1a07000
[   61.721904] a498  e24b002c eb01777b e14b22dc e14b03d4 e0520000 e1a02520 e0c31001 e1a03521
[   61.721904] a4b8  e59f009c e1822b01 e1a01005 e88d000c e1a02007 eb1754c6 ea000001 e12fff30
[   61.721904] a4d8  e1a07000 e3a03000 e5c6301d e5943004 e1580003 0a000003 e59f1068 e59f0068
[   61.721904] a4f8  eb09c6d5 e5848004 e10f3000 e3130080 0a000004 e3a02040 e59f1050 e59f0048
[   61.721904] a518  eb09c895 f1080080 e5d6301d e3530000 0a000006 e59f3030 e30012c5 e59f2030
[   61.721904] a538  e59f0030 e58d3000 e1a03005 eb0065a6 e1a00007 e24bd020 e89da9f0 c0967100
[   61.721904] 
[   61.721904] SP: 0xe5d5bdc0:
[   61.721904] bdc0  8040003f c00c4e38 ffffffff c00b262c 00000001 c00b2868 bf01a01c 600c0013
[   61.721904] bde0  ffffffff e5d5be2c e5d5be4c e5d5bdf8 c05e5b98 c000a16c bf018044 00000012
[   61.721904] be00  00000023 01231000 e5d5a000 bf01a000 c0967100 bf0180d4 00000000 00000001
[   61.721904] be20  e5ff7124 e5d5be4c e5d5be50 e5d5be40 c000a4d8 bf01a01c 600c0013 ffffffff
[   61.721904] be40  e5d5be8c e5d5be50 c000a4d8 bf01a00c c00f58cc e5d5bf48 00000001 bf01808c
[   61.721904] be60  bf0180d4 e5ff7100 e5d5be8c e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100
[   61.721904] be80  e5d5bf44 e5d5be90 c007b494 c000a43c bf018098 00007fff c0077394 e5d5bea8
[   61.721904] bea0  00024dd6 bf0181f0 e80133ec e5d5bf48 c05eef2c c0078c30 e5ff7108 00000000
[   61.721904] 
[   61.721904] IP: 0xe5d5bdd0:
[   61.721904] bdd0  00000001 c00b2868 bf01a01c 600c0013 ffffffff e5d5be2c e5d5be4c e5d5bdf8
[   61.721904] bdf0  c05e5b98 c000a16c bf018044 00000012 00000023 01231000 e5d5a000 bf01a000
[   61.721904] be10  c0967100 bf0180d4 00000000 00000001 e5ff7124 e5d5be4c e5d5be50 e5d5be40
[   61.721904] be30  c000a4d8 bf01a01c 600c0013 ffffffff e5d5be8c e5d5be50 c000a4d8 bf01a00c
[   61.721904] be50  c00f58cc e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100 e5d5be8c e5d5bf48
[   61.721904] be70  00000001 bf01808c bf0180d4 e5ff7100 e5d5bf44 e5d5be90 c007b494 c000a43c
[   61.721904] be90  bf018098 00007fff c0077394 e5d5bea8 00024dd6 bf0181f0 e80133ec e5d5bf48
[   61.721904] beb0  c05eef2c c0078c30 e5ff7108 00000000 bf018098 e5d5a000 e7ff8000 0001b43c
[   61.721904] 
[   61.721904] FP: 0xe5d5bdcc:
[   61.721904] bdcc  c00b262c 00000001 c00b2868 bf01a01c 600c0013 ffffffff e5d5be2c e5d5be4c
[   61.721904] bdec  e5d5bdf8 c05e5b98 c000a16c bf018044 00000012 00000023 01231000 e5d5a000
[   61.721904] be0c  bf01a000 c0967100 bf0180d4 00000000 00000001 e5ff7124 e5d5be4c e5d5be50
[   61.721904] be2c  e5d5be40 c000a4d8 bf01a01c 600c0013 ffffffff e5d5be8c e5d5be50 c000a4d8
[   61.721904] be4c  bf01a00c c00f58cc e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100 e5d5be8c
[   61.721904] be6c  e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100 e5d5bf44 e5d5be90 c007b494
[   61.721904] be8c  c000a43c bf018098 00007fff c0077394 e5d5bea8 00024dd6 bf0181f0 e80133ec
[   61.721904] beac  e5d5bf48 c05eef2c c0078c30 e5ff7108 00000000 bf018098 e5d5a000 e7ff8000
[   61.721904] 
[   61.721904] R4: 0xe5d59f80:
[   61.721904] 9f80  5f617461 30313030 3138373a 3e3e3e20 646f6320 32333d65 20203333 35323120
[   61.721904] 9fa0  5f39320a 38333431 e7203430 83e8b594 6d2f20bd 682f746e 2f736667 6f632f45
[   61.721904] 9fc0  745f6564 61622f33 6d5f6573 72657465 2f33745f 2f637273 2e636969 20707063
[   61.721904] 9fe0  5f706d63 61746164 3130305f 38373a30 3e3e2032 6f63203e 333d6564 20383332
[   61.721904] a000  00000000 00000001 00000000 e6265500 c0919278 00000002 00000015 c1983e00
[   61.721904] a020  e6265500 e687b740 e5d5a000 e611f6c0 c090f820 00000000 e5d5be7c e5d5bde8
[   61.721904] a040  c05e4030 00000000 00000000 00000000 00000000 00000000 01000000 00000000
[   61.721904] a060  b6f496d0 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[   61.721904] 
[   61.721904] R6: 0xc0967080:
[   61.721904] 7080  c09e2d3c c054e6a0 c054e7f0 c09e2d48 c054c0c0 c054c1a0 c09e2d54 c056b84c
[   61.721904] 70a0  c056ba40 c09e2d54 c056a360 c056a568 c09e2d54 c0567424 c05677e4 c09e2d54
[   61.721904] 70c0  c0560e94 c0560f54 c09e2d54 c054d858 c054d864 c09e2d54 c054bc34 c054bce8
[   61.721904] 70e0  c09e2d60 c0578ca4 c0578cb4 c09e2d60 c054f768 c054f778 c09e2d60 00000000
[   61.721904] 7100  00000000 c19754b8 c07c24ff 00000000 00000000 c1975280 c1975440 00000000
[   61.721904] 7120  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[   61.721904] 7140  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[   61.721904] 7160  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[   61.721904] 
[   61.721904] R10: 0xe5ff70a4:
[   61.721904] 70a4  30936305 e5ce5b54 00000000 00000000 81240002 000024bc 00000000 00000001
[   61.721904] 70c4  00000000 e63d17c0 e63d15c0 e63d1691 e5ff7090 e63d1890 00000000 00000000
[   61.721904] 70e4  2fe0e316 e5ce5b78 00000000 00000000 81240002 000024bd 00000000 e5ff7140
[   61.721904] 7104  00000001 e63d1640 00000124 00000024 bf018068 c0078b04 00000000 00000000
[   61.721904] 7124  00000000 00000000 00000000 00000000 00000000 00000000 00000000 e5ff7180
[   61.721904] 7144  e5ff7144 e5ff7144 bf0180d4 00000000 c092d01c e5ff7200 00000001 00000003
[   61.721904] 7164  00000000 00000000 00000000 00000000 00000000 00000000 00000000 65746f6e
[   61.721904] 7184  00000073 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[   61.721904] Process insmod (pid: 1177, stack limit = 0xe5d5a238)
[   61.721904] Stack: (0xe5d5be40 to 0xe5d5c000)
[   61.721904] be40: e5d5be8c e5d5be50 c000a4d8 bf01a00c c00f58cc e5d5bf48 00000001 bf01808c
[   61.721904] be60: bf0180d4 e5ff7100 e5d5be8c e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100
[   61.721904] be80: e5d5bf44 e5d5be90 c007b494 c000a43c bf018098 00007fff c0077394 e5d5bea8
[   61.721904] bea0: 00024dd6 bf0181f0 e80133ec e5d5bf48 c05eef2c c0078c30 e5ff7108 00000000
[   61.721904] bec0: bf018098 e5d5a000 e7ff8000 0001b43c 00001171 00000000 0b300007 00000000
[   61.721904] bee0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[   61.721904] bf00: 00000000 00000000 00000000 00000000 00000000 00000000 000000e0 00000000
[   61.721904] bf20: 00024dd6 00000003 0000017b c000fac8 e5d5a000 00000000 e5d5bfa4 e5d5bf48
[   61.721904] bf40: c007bb94 c0079eec e7ff8000 0001b43c e8012e74 e800cae3 e800cf94 00000204
[   61.721904] bf60: 000002c4 00000000 00000000 00000000 00000023 00000024 0000000f 00000000
[   61.721904] bf80: 0000000c 00000000 00000000 00000000 00000000 00eb8d78 00000000 e5d5bfa8
[   61.721904] bfa0: c000f880 c007bb30 00000000 00000000 00000003 00024dd6 00000000 00eb8d78
[   61.721904] bfc0: 00000000 00000000 00eb8d78 0000017b 00eb8060 00000000 00000002 beb26d84
[   61.721904] bfe0: beb26bb8 beb26ba8 0001e120 b6ea7da2 800c0030 00000003 fad1ffdf 7eb9fb6e
[   61.721904] [<bf01a01c>] (test_init+0x1c/0x44 [test_debug]) from [<c000a4d8>] (do_one_initcall+0xa8/0x144)
[   61.721904] [<c000a4d8>] (do_one_initcall+0xa8/0x144) from [<c007b494>] (load_module+0x15b4/0x1b68)
[   61.721904] [<c007b494>] (load_module+0x15b4/0x1b68) from [<c007bb94>] (SyS_finit_module+0x70/0x78)
[   61.721904] [<c007bb94>] (SyS_finit_module+0x70/0x78) from [<c000f880>] (ret_fast_syscall+0x0/0x30)
[   61.721904] Code: e59f3028 e3a02023 e3a01012 e59f0020 (e5c32223) 
[   62.833595] ---[ end trace a662172d624f693f ]---
Segmentation fault

这里需要关注几个地方:

  • PC指针的位置:  [   61.721904] PC is at test_init+0x1c/0x44 [test_debug]
  • Oops的错误代码:[   61.715630] Internal error: Oops: 805 [#1] PREEMPT SMP ARM
  • 栈的回溯过程(最后几行):SyS_finit_module -> load_module -> do_one_initcall -> test_init

(1) 先说Oops的错误代码: Oops: 805 [#1]

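这里,805是Oops的错误代码(十六进制),[#1]表示这是系统启动以来发生的第1次Oops。错误代码的含义与体系结构相关:下面列出的是x86平台页错误(page fault)错误码的位定义,仅供参考;而本文的例子运行在ARM平台上,805实际上是故障状态寄存器(FSR)的值,即0x805:bit 11为1表示这是一次写操作,故障状态位FS=0b00101表示地址转换错误(translation fault),也就是被访问的地址没有有效的映射。如果发现自己遇到的Oops和下面的定义无法对应,最好去内核代码里查找(ARM平台可参考 arch/arm/mm/fault.c):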

 * error_code:
 *      bit 0 == 0 means no page found, 1 means protection fault
 *      bit 1 == 0 means read, 1 means write
 *      bit 2 == 0 means kernel, 1 means user-mode
 *      bit 3 == 0 means data, 1 means instruction

(2)栈的回溯过程: SyS_finit_module -> load_module -> do_one_initcall -> test_init

从栈的回溯顺序可以看到,最后出现错误的地方就是 “test_init”函数中。

(3)PC指针的位置: PC is at test_init+0x1c/0x44 [test_debug]

从栈的回溯过程可以得到出错的函数,但是具体出错在函数的哪一行还不知道。

但是从“test_init+0x1c/0x44” 这段信息可以知道:该函数长度为 0x44,错误发生在 test_init 函数的  0x1c 偏移处。

因此这里需要借用 objdump 工具,如果是交叉编译的,可以使用具体环境下的objdump工具:

我这里使用的是:arm-linux-gnueabihf-objdump

arm-linux-gnueabihf-objdump -S test-debug.o : 参数 -S 表示尽可能的把原来的代码和反汇编出来的代码一起呈现出来,-S 参数需要结合arm-linux-gcc编译参数 -g,才能达到反汇编时同时输出原来的代码。

通过将编译出的 “.o” 文件反汇编,找出对应的代码偏移处。反汇编后的代码如下:

# arm-linux-gnueabihf-objdump -S test-debug.o 

test-debug.o:     file format elf32-littlearm


Disassembly of section .init.text:

00000000 <init_module>:
 * @description	: 驱动入口函数
 * @param 		: 无
 * @return 		: 无
 */
static int __init test_init(void)
{
   0:	e1a0c00d 	mov	ip, sp
   4:	e92dd800 	push	{fp, ip, lr, pc}
   8:	e24cb004 	sub	fp, ip, #4
     int *p = 0x1231223;
     *p = 0x1231223;
   c:	e59f3028 	ldr	r3, [pc, #40]	; 3c <init_module+0x3c>
  10:	e3a02023 	mov	r2, #35	; 0x23
  14:	e3a01012 	mov	r1, #18
    printk("module loaded.\n");
  18:	e59f0020 	ldr	r0, [pc, #32]	; 40 <init_module+0x40>
 * @return 		: 无
 */
static int __init test_init(void)
{
     int *p = 0x1231223;
     *p = 0x1231223;
  1c:	e5c32223 	strb	r2, [r3, #547]	; 0x223
  20:	e5c32225 	strb	r2, [r3, #549]	; 0x225
  24:	e3a02001 	mov	r2, #1
  28:	e5c31224 	strb	r1, [r3, #548]	; 0x224
  2c:	e5c32226 	strb	r2, [r3, #550]	; 0x226
    printk("module loaded.\n");
  30:	ebfffffe 	bl	0 <printk>
    return 0;
}
  34:	e3a00000 	mov	r0, #0
  38:	e89da800 	ldm	sp, {fp, sp, pc}
  3c:	01231000 	.word	0x01231000
  40:	00000000 	.word	0x00000000

Disassembly of section .exit.text:

00000000 <cleanup_module>:
 * @description	: 驱动入口函数
 * @param 		: 无
 * @return 		: 无
 */
static int __init test_init(void)
{
   0:	e1a0c00d 	mov	ip, sp
   4:	e92dd800 	push	{fp, ip, lr, pc}
   8:	e24cb004 	sub	fp, ip, #4
     int *p = 0x1231223;
     *p = 0x1231223;
   c:	e59f0004 	ldr	r0, [pc, #4]	; 18 <cleanup_module+0x18>
  10:	ebfffffe 	bl	0 <printk>
  14:	e89da800 	ldm	sp, {fp, sp, pc}
    printk("module loaded.\n");
  18:	00000010 	.word	0x00000010

重点观察 test_init 函数的 1c 处,即发生错误的行。

至此:

  • 我们人为制造出的错误行: *p = 0x1231223;
  • Oops给出的偏移位置: PC is at test_init+0x1c/0x44 [test_debug]
  • 反汇编出的代码偏移位置:  1c:    e5c32223     strb    r2, [r3, #547]    ; 0x223

三者基本吻合,由此可以断定发生的错误就在: *p = 0x1231223 处。

在Oops的帮助下我们很快就解决了问题。

=========================================================================

摘抄一段:

        在Oops发生以后没有造成宕机的情况下,我们就可以从dmesg中查看到完整的信息。但更多

的情况是Oops发生的同时系统也会宕机,此时这些出错信息是来不及存入文件中的,关掉电源后

就无法再看到了,我们只能通过其他的方式来记录:手抄或者拍照。

        还有更坏的情况,如果Oops信息过多的话,一页屏幕显示不全,我们怎么来查看完整的内容

呢?第1种方法,在grub里用vga参数指定更高的分辨率以使屏幕可以显示更多的内容。很明显,

这个方法其实解决不了太多的问题;第2种方法,使用两台机器,把调试机的Oops信息通过串口

打印到宿主机的屏幕上。但现在大部分的笔记本电脑是没有串口的,这个解决方法也有很大的局限

性;第3种方法,使用内核转储工具 kdump 把发生Oops时的内存和CPU寄存器的内容dump到一个

文件里,之后我们再用gdb或crash工具来分析问题(重点在第3种方法)。

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值