一次linux oops分析

移植microarrayfp指纹模块,在probe进行spi写、读操作时内核出现oops的崩溃问题。

[   18.404728] Unable to handle kernel paging request at virtual address ffffffe0fd4533c0
[   18.412693] Mem abort info:
[   18.415498]   ESR = 0x96000145
[   18.418585]   Exception class = DABT (current EL), IL = 32 bits
[   18.424571]   SET = 0, FnV = 0
[   18.427633]   EA = 0, S1PTW = 0
[   18.430822] Data abort info:
[   18.433726]   ISV = 0, ISS = 0x00000145
[   18.437591]   CM = 1, WnR = 1
[   18.440581] swapper pgtable: 4k pages, 39-bit VAs, pgdp = ffffffa0fd568000
[   18.447475] [ffffffe0fd4533c0] pgd=0000000000000000, pud=0000000000000000
[   18.454317] Internal error: Oops: 96000145 [#1] PREEMPT SMP
[   18.459894] Modules linked in:
[   18.462951] Process swapper/0 (pid: 1, stack limit = 0xffffff8008058000)
[   18.469655] CPU: 6 PID: 1 Comm: swapper/0 Tainted: G        W         4.19.157 #2
[   18.477141] Hardware name: Qualcomm Technologies, Inc. BENGAL IDP (DT)
[   18.483671] pstate: 00400005 (nzcv daif +PAN -UAO)
[   18.488471] pc : __dma_inv_area+0x28/0x58
[   18.492485] lr : fast_smmu_map_page+0x10c/0x270
[   18.497020] sp : ffffff800805b6e0
[   18.500333] x29: ffffff800805b6f0 x28: 000000157d4533d0
[   18.505641] x27: 00000000000003d0 x26: ffffffbf00000000
[   18.510950] x25: 0000000000000000 x24: 0000000000000002
[   18.516259] x23: 0000000000000000 x22: 000000000000000a
[   18.521566] x21: ffffffccd68c9b80 x20: ffffffa0fc26d000
[   18.526874] x19: 0000000000001000 x18: 0000000000000078
[   18.532182] x17: ffffffa0fcbfd000 x16: 0000000000000000
[   18.537489] x15: ffffffa0fb81a5f0 x14: ffffffa0fc4bc018
[   18.542797] x13: 0000000000000002 x12: 0000000000000000
[   18.548105] x11: 0000000000000000 x10: 0000000000000000
[   18.553413] x9 : 000000000157d453 x8 : ffffffe0fd453000
[   18.558720] x7 : 0000000000000000 x6 : 000000000000003f
[   18.564028] x5 : 0000000000000000 x4 : 0000000000000002
[   18.569336] x3 : 000000000000003f x2 : 0000000000000040
[   18.574644] x1 : ffffffe0fd4533c0 x0 : ffffffe0fd4533d0
[   18.579953] Call trace:
[   18.582398]  __dma_inv_area+0x28/0x58
[   18.586067]  geni_se_iommu_map_buf+0xa4/0x100
[   18.590421]  spi_geni_prepare_message+0x15c/0x4a8           
[   18.595129]  __spi_pump_messages+0x408/0x718                
[   18.599397]  __spi_sync+0x2bc/0x310                         
[   18.602882]  spi_sync+0x30/0x50                            
[   18.606024]  mas_sync+0xd4/0x118                          
[   18.609252]  init_connect+0x40/0x250
[   18.612833]  mas_probe+0x43c/0x650
[   18.616232]  spi_drv_probe+0x88/0xb0
[   18.619804]  really_probe+0x540/0x720
[   18.623470]  driver_probe_device+0x74/0x148
[   18.627651]  __driver_attach+0x114/0x1c8
[   18.631577]  bus_for_each_dev+0x84/0xd0
[   18.635416]  driver_attach+0x2c/0x38
[   18.638986]  bus_add_driver+0x144/0x268
[   18.642825]  driver_register+0x78/0x110

Unable to handle kernel paging request at virtual address ffffffe0fd4533c0

其中地址ffffffe0fd4533c0是一个非常大的值(看起来像内核空间地址),因此首先可以排除空指针和hang的可能。

从栈信息可知调用顺序为:

mas_probe -> init_connect -> mas_sync -> spi_sync -> ……,直到调用__dma_inv_area时崩溃。

使用objdump对vmlinux文件进行反汇编:

运行 ./aarch64-linux-androidkernel-objdump -D vmlinux > obj_log,将反汇编结果导出到obj_log文件。

在obj_log文件中搜索__dma_inv_area,可以看到如下汇编代码:

ffffff80080ad650 <__dma_inv_area>:
ffffff80080ad650:       8b000021        add     x1, x1, x0
ffffff80080ad654:       d53b0023        mrs     x3, ctr_el0
ffffff80080ad658:       d503201f        nop
ffffff80080ad65c:       d3504c63        ubfx    x3, x3, #16, #4
ffffff80080ad660:       d2800082        mov     x2, #0x4                        // #4
ffffff80080ad664:       9ac32042        lsl     x2, x2, x3
ffffff80080ad668:       d1000443        sub     x3, x2, #0x1
ffffff80080ad66c:       ea03003f        tst     x1, x3
ffffff80080ad670:       8a230021        bic     x1, x1, x3
ffffff80080ad674:       54000040        b.eq    ffffff80080ad67c <__dma_inv_area+0x2c>
ffffff80080ad678:       d50b7e21        dc      civac, x1
ffffff80080ad67c:       ea03001f        tst     x0, x3
ffffff80080ad680:       8a230000        bic     x0, x0, x3
ffffff80080ad684:       54000060        b.eq    ffffff80080ad690 <__dma_inv_area+0x40>
ffffff80080ad688:       d50b7e20        dc      civac, x0
ffffff80080ad68c:       14000002        b       ffffff80080ad694 <__dma_inv_area+0x44>
ffffff80080ad690:       d5087620        dc      ivac, x0
ffffff80080ad694:       8b020000        add     x0, x0, x2
ffffff80080ad698:       eb01001f        cmp     x0, x1
ffffff80080ad69c:       54ffffa3        b.cc    ffffff80080ad690 <__dma_inv_area+0x40>
ffffff80080ad6a0:       d5033f9f        dsb     sy
ffffff80080ad6a4:       d65f03c0        ret

根据Call trace中的信息 __dma_inv_area+0x28/0x58:

0x28代表崩溃时pc相对于__dma_inv_area起始地址的偏移量(0x58为该函数的总长度)。由__dma_inv_area的基地址ffffff80080ad650加上偏移0x28,可知是在

ffffff80080ad678: d50b7e21 dc civac, x1 处崩溃,而这条指令唯一的操作数是寄存器x1。

从以下的寄存器状态可知,X1的值刚好对应崩溃的地址ffffffe0fd4533c0

[   18.488471] pc : __dma_inv_area+0x28/0x58
[   18.492485] lr : fast_smmu_map_page+0x10c/0x270
[   18.497020] sp : ffffff800805b6e0
[   18.500333] x29: ffffff800805b6f0 x28: 000000157d4533d0
[   18.505641] x27: 00000000000003d0 x26: ffffffbf00000000
[   18.510950] x25: 0000000000000000 x24: 0000000000000002
[   18.516259] x23: 0000000000000000 x22: 000000000000000a
[   18.521566] x21: ffffffccd68c9b80 x20: ffffffa0fc26d000
[   18.526874] x19: 0000000000001000 x18: 0000000000000078
[   18.532182] x17: ffffffa0fcbfd000 x16: 0000000000000000
[   18.537489] x15: ffffffa0fb81a5f0 x14: ffffffa0fc4bc018
[   18.542797] x13: 0000000000000002 x12: 0000000000000000
[   18.548105] x11: 0000000000000000 x10: 0000000000000000
[   18.553413] x9 : 000000000157d453 x8 : ffffffe0fd453000
[   18.558720] x7 : 0000000000000000 x6 : 000000000000003f
[   18.564028] x5 : 0000000000000000 x4 : 0000000000000002
[   18.569336] x3 : 000000000000003f x2 : 0000000000000040
[   18.574644] x1 : ffffffe0fd4533c0 x0 : ffffffe0fd4533d0

因此需要找出地址ffffffe0fd4533c0是从哪里来的。

沿着调用栈查看驱动中所调用的函数,最后看到mas_sync():

/*
 * mas_sync() - run one synchronous SPI transfer on smas->spi.
 * @txb: buffer assigned to the transfer's tx_buf
 * @rxb: buffer assigned to the transfer's rx_buf
 * @len: value assigned to the transfer's len
 *
 * Builds a single-transfer spi_message on the stack and submits it with
 * spi_sync() while holding dev_lock.
 *
 * Return: the value returned by spi_sync().
 */
int mas_sync(u8 *txb, u8 *rxb, int len) 
{
    int ret = 0;
	struct spi_message m;
	/*
	 * tx_buf/rx_buf are the pointers that spi_geni_map_buf() later passes
	 * to geni_se_iommu_map_buf() for DMA mapping.
	 * NOTE(review): the oops analysed in this article went away once the
	 * caller's buffers were allocated with kmalloc instead of being static
	 * arrays -- callers should presumably pass DMA-safe memory; confirm.
	 */
	struct spi_transfer t = {
		.tx_buf = txb,
		.rx_buf = rxb,
		.len = len,
	    .delay_usecs = 1,
        .bits_per_word = 8,
		.speed_hz = smas->spi->max_speed_hz,
	};

    /* dev_lock is held across message setup and the synchronous transfer. */
    mutex_lock(&dev_lock);
    spi_message_init(&m);
	/* Link t into m's transfers list so the message carries the buffers. */
	spi_message_add_tail(&t, &m);
	ret = spi_sync(smas->spi, &m);
	mutex_unlock(&dev_lock);

 	return ret;
}

在spi_sync(smas->spi, &m)中,smas->spi是驱动注册时产生的,只有struct spi_message m是额外定义的局部变量;m本身只经过spi_message_init()初始化,并没有直接赋值数据。

其中要传输的数据包含在

/* Size in bytes of each static SPI buffer below (32 KiB). */
#define FBUF 	(32*1024)

/*
 * File-scope transmit/receive buffers. Their printed addresses match the
 * pointers seen in the crash registers, and the crash stopped once they were
 * replaced by kmalloc'ed memory (see the end of this article).
 */
static u8 stxb[FBUF];
static u8 srxb[FBUF];

struct spi_transfer t = {
		.tx_buf = txb,
		.rx_buf = rxb,
		.len = len,
	    .delay_usecs = 1,
        .bits_per_word = 8,
		.speed_hz = smas->spi->max_speed_hz,
	};

由下面的定义可知,spi_message_add_tail(&t, &m)会把struct spi_transfer类型的变量t插入到struct spi_message类型的变量m的链表成员transfers中,由此建立变量t与变量m的联系。

/*
 * Append transfer @t to message @m by linking t->transfer_list at the tail
 * of the m->transfers list.
 */
static inline void
spi_message_add_tail(struct spi_transfer *t, struct spi_message *m)
{
	list_add_tail(&t->transfer_list, &m->transfers);
}

/*
 * One SPI message: a list of spi_transfer entries plus bookkeeping fields.
 * Only the fields whose use is visible in this article are annotated.
 */
struct spi_message {
	/* List head that spi_message_add_tail() appends transfers to. */
	struct list_head	transfers;

	/* Target device; __spi_sync() stores its spi argument here. */
	struct spi_device	*spi;

	unsigned		is_dma_mapped:1;

	void			(*complete)(void *context);
	void			*context;
	unsigned		frame_length;
	unsigned		actual_length;
	int			status;

	struct list_head	queue;
	void			*state;

	struct list_head        resources;
};

以下调用均是linux内核函数,这里列举其调用关系。最终可以看到struct spi_transfer *xfer的成员tx_buf、rx_buf被传入geni_se_iommu_map_buf(),而由调用栈可知内核正是在此之后崩溃。

因此锁定成员tx_buf、rx_buf进行分析。

/*
 * Abridged excerpt (elided code is marked with //...): records @spi in the
 * message and then calls __spi_pump_messages() on the device's controller.
 */
static int __spi_sync(struct spi_device *spi, struct spi_message *message)
{
    struct spi_controller *ctlr = spi->controller;
    //...
    message->spi = spi;
    //...
    __spi_pump_messages(ctlr, false);
    //...
}

/*
 * Abridged excerpt (elided code is marked with //...): invokes the
 * controller's prepare_message hook on ctlr->cur_msg. In the call trace of
 * this article that hook is spi_geni_prepare_message().
 */
static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
{
    //...
    ctlr->prepare_message(ctlr, ctlr->cur_msg);
    //...
}

/*
 * Abridged excerpt (elided code is marked with //...): fetches the GENI
 * master's driver data and calls spi_geni_map_buf() for the message.
 */
static int spi_geni_prepare_message(struct spi_master *spi,
					struct spi_message *spi_msg)
{
    struct spi_geni_master *mas = spi_master_get_devdata(spi);
    //...
    ret = spi_geni_map_buf(mas, spi_msg);
    //...
}

/*
 * Abridged excerpt (elided code is marked with //...): walks every transfer
 * queued on @msg and hands its rx_buf/tx_buf, when set, to
 * geni_se_iommu_map_buf() for DMA mapping. This is where the buffers given
 * to mas_sync() reach the DMA code that crashes in __dma_inv_area().
 */
static int spi_geni_map_buf(struct spi_geni_master *mas,
				struct spi_message *msg)
{
	/* The iterator has to be declared before list_for_each_entry() uses it. */
	struct spi_transfer *xfer;
	//...
	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
		if (xfer->rx_buf) {
			ret = geni_se_iommu_map_buf(mas->wrapper_dev,
						&xfer->rx_dma, xfer->rx_buf,
						xfer->len, DMA_FROM_DEVICE);
		}

		if (xfer->tx_buf) {
			/* tx_buf is const-qualified in spi_transfer; the cast drops it. */
			ret = geni_se_iommu_map_buf(mas->wrapper_dev,
						&xfer->tx_dma,
						(void *)xfer->tx_buf,
						xfer->len, DMA_TO_DEVICE);
		}
	}
	//...
}

打印 static u8 stxb[FBUF]; 和 static u8 srxb[FBUF]; 的

地址,得到:

stxb: ffffffe0fd4533c0

srxb: ffffffe0fd4533d0

与崩溃时寄存器x1、x0中的指针一致。

从vmlinux看kernel代码空间地址在ffffff80xxxxxxxx

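驱动中静态变量stxb、srxb共占64KB空间,可能超出了可用的地址空间。

最后将这两个缓冲区由static数组改为用kmalloc从内核堆中申请,内核未再发生崩溃。(补充:内核DMA-API文档指出,内核镜像data/bss段中的地址(即静态/全局数组)以及栈地址都不能用于DMA映射,DMA缓冲区应来自kmalloc等分配器;这很可能才是本次崩溃的真正原因,而不只是空间大小的问题。)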

  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值