kernel cannot startup

38 篇文章 10 订阅

引子:

通过OTG连接USB 键盘,启动到这里就不再输出log也不能通过串口输入,但此时系统
没有死掉,还响应中断。
[   15.809686:0] ALSA device list:
[   15.812931:0]   #0: ns115-audio

假设系统死锁在某个地方,如果此时能触发一个crash,就能分析出那个进程被中断打断,

就是这个进程的进入了死锁状态;

[25437091669] [RU]  PID: 5      TASK: ee073c00  CPU: 0   COMMAND: "kworker/u:0"
[25436661169] [IN]  PID: 23     TASK: ee0d2800  CPU: 0   COMMAND: "kworker/0:1"
[25418302669] [IN]  PID: 388    TASK: ee2a0000  CPU: 0   COMMAND: "irq/47-ricoh61x"
[25202033252] [IN]  PID: 3      TASK: ee02a400  CPU: 0   COMMAND: "ksoftirqd/0"
[24215598335] [IN]  PID: 7      TASK: ee073400  CPU: 0   COMMAND: "watchdog/0"

/*根据前面的时间信息:查看back trace,并没有想象的gic_handler?

 *为什么那?注意这里的crash触发位于workqueue中,已经从中断中退出,或者说不是在中断上下文中;
 *如果crash发生在tasklet中如下;
 *通过这个对比,加深了中断的下半部tasklet,workqueue之间的区别。
 **/

crash> bt ee2a0000
PID: 388    TASK: ee2a0000  CPU: 0   COMMAND: "irq/47-ricoh61x"
 #0 [<c05529b0>] (__schedule) from [<c0553154>]
 #1 [<c05530cc>] (schedule) from [<c007fb2c>]
 #2 [<c007fa30>] (irq_thread) from [<c00483c8>]
 #3 [<c0048334>] (kthread) from [<c002d80c>]
crash> bt ee0d2800
PID: 23     TASK: ee0d2800  CPU: 0   COMMAND: "kworker/0:1"
 #0 [<c05529b0>] (__schedule) from [<c0553154>]
 #1 [<c05530cc>] (schedule) from [<c0042310>]
 #2 [<c004205c>] (worker_thread) from [<c00483c8>]
 #3 [<c0048334>] (kthread) from [<c002d80c>]
crash> bt ee073c00
PID: 5      TASK: ee073c00  CPU: 0   COMMAND: "kworker/u:0"
 #0 [<c0398fdc>] (ricoh61x_irq_work) from [<c0041cd4>]
 #1 [<c0041a64>] (process_one_work) from [<c004221c>]
 #2 [<c004205c>] (worker_thread) from [<c00483c8>]
 #3 [<c0048334>] (kthread) from [<c002d80c>]
crash> bt ee02a400
PID: 3      TASK: ee02a400  CPU: 0   COMMAND: "ksoftirqd/0"
 #0 [<c05529b0>] (__schedule) from [<c0553154>]
 #1 [<c05530cc>] (schedule) from [<c0553510>]
 #2 [<c05534f4>] (schedule_preempt_disabled) from [<c0030a38>]
 #3 [<c00309c8>] (run_ksoftirqd) from [<c00483c8>]
 #4 [<c0048334>] (kthread) from [<c002d80c>]
crash> bt ee073400
PID: 7      TASK: ee073400  CPU: 0   COMMAND: "watchdog/0"
 #0 [<c05529b0>] (__schedule) from [<c0553154>]
 #1 [<c05530cc>] (schedule) from [<c007e99c>]
 #2 [<c007e91c>] (watchdog) from [<c00483c8>]
 #3 [<c0048334>] (kthread) from [<c002d80c>]

/*crash发生在tasklet中的上下文*/

PID: 4121   TASK: ee14e680  CPU: 0   COMMAND: "tty_trans"
 #0 [<c006aa08>] (crash_kexec) from [<c04d0f00>]
 #1 [<c04d0e6c>] (panic) from [<c006f784>]
 #2 [<c006f63c>] (watchdog_timer_fn) from [<c0044a08>]
 #3 [<c00449b4>] (__run_hrtimer) from [<c0044d10>]
 #4 [<c0044c00>] (hrtimer_interrupt) from [<c0013f7c>]
 #5 [<c0013f48>] (twd_handler) from [<c00727d0>]
  #6 [<c0072750>] (handle_percpu_devid_irq) from [<c006fbd4>]
 #7 [<c006fba8>] (generic_handle_irq) from [<c000f0c8>]
 #8 [<c000f04c>] (handle_IRQ) from [<c0008470>]
 #9 [<c0008430>] (gic_handle_irq) from [<c000dc40>
]
    pc : [<c04d3a58>]    lr : [<c023e764>]    psr: 20000113
    sp : ed57fc70  ip : ed57fc80  fp : ed57fc7c
    r10: 0096c000  r9 : c0662040  r8 : c0673048
    r7 : ee1fe800  r6 : ee1fe968  r5 : 20000113  r4 : 00000005
    r3 : 00000000  r2 : 00000100  r1 : 20000113  r0 : c07bec08
    Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM
#10 [<c04d3a38>] (_raw_spin_unlock_irqrestore) from [<c023e764>]
#11 [<c023e6d8>] (smd_send_intr) from [<c023ef10>]
#12 [<c023ee94>] (smd_ch_irq_tasklet_handler) from [<c002cd60>]
#13 [<c002ccc4>] (tasklet_action) from [<c002d11c>]
#14 [<c002d08c>] (__do_softirq) from [<c002d3a4>]
#15 [<c002d35c>] (irq_exit) from [<c000f0cc>]
#16 [<c000f04c>] (handle_IRQ) from [<c0008470>]
#17 [<c0008430>] (gic_handle_irq) from [<c000dc40>]
    pc : [<c0028188>]    lr : [<c0027af4>]    psr: 60000013
    sp : ed57fdb8  ip : ed57fd90  fp : ed57fe54
    r10: 00000011  r9 : 00000000  r8 : c06a4f87
    r7 : 60000013  r6 : 00000009  r5 : 00000001  r4 : 00000020
    r3 : c06823c8  r2 : c06823c8  r1 : ed57fd50  r0 : 60000013
    Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM
#18 [<c0027d80>] (vprintk) from [<c04d10ec>]
#19 [<c04d10d0>] (printk) from [<c031da08>]
#20 [<c031d994>] (gs_write) from [<c01d84a8>]
#21 [<c01d81d0>] (n_tty_write) from [<c01d43bc>]
#22 [<c01d4230>] (tty_write) from [<c00ab5f8>]
#23 [<c00ab544>] (vfs_write) from [<c00ab748>]
#24 [<c00ab708>] (sys_write) from [<c000e040>]
    pc : [<00016e4c>]    lr : [<000088fc>]    psr: 60000010
    sp : bead1620  ip : 00000000  fp : bead1644
    r10: 00000000  r9 : 00000000  r8 : 00000000
    r7 : 00000004  r6 : 00000000  r5 : 00000000  r4 : 00008be4
    r3 : 00000080  r2 : 00000080  r1 : bead1650  r0 : 00000003
    Flags: nZCv  IRQs on  FIQs on  Mode USER_32  ISA ARM

到这里看怀疑的不对啊,没有找到问题所在。寻找别的方法:

如果使用另一个键盘,是可以启动的,比较log的差别,看是否能找到线索!

[   15.967893:0] ALSA device list:
[   15.971143:0]   #0: ns115-audio
[   15.975909:0] md: Waiting for all devices to be available before autodetect
出问题时没有输出:md: Waiting for all devices to be available before autodetect
以此log为线索:

kernel_init -> prepare_namespace(){
        /*
         * wait for the known devices to complete their probing
         *
         * Note: this is a potential source of long boot delays.
         * For example, it is not atypical to wait 5 seconds here
         * for the touchpad of a laptop to initialize.
         */
        wait_for_device_probe();

        md_run_setup();
    }

/**
 * wait_for_device_probe
 * Wait for device probing to be completed.
 */
void wait_for_device_probe(void)
{
    /* wait for the known devices to complete their probing */
    wait_event(probe_waitqueue, atomic_read(&probe_count) == 0);
    async_synchronize_full();
}

void __init md_run_setup(void)
{
    create_dev("/dev/md0", MKDEV(MD_MAJOR, 0));

    if (raid_noautodetect)
        printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n");
    else
        autodetect_raid();
    md_setup_drive();
}

到这里我们查到了原因:wait_event(probe_waitqueue, atomic_read(&probe_count) == 0);
该函数在进程init里,从crash dump里确认:是一致的。

crash> bt
PID: 1      TASK: ee02ac00  CPU: 0   COMMAND: "swapper/0"
 #0 [<c05529b0>] (__schedule) from [<c0553154>]
 #1 [<c05530cc>] (schedule) from [<c0289658>]
 #2 [<c02895d4>] (wait_for_device_probe) from [<c06e4e8c>]
 #3 [<c06e4e50>] (prepare_namespace) from [<c06e4464>]
 #4 [<c06e42ec>] (kernel_init) from [<c002d80c>]

到这里就查到了问题的原因,是在probe函数里出了问题。

怎么解决那?

从log对比看,正常的只有一个interface,  而出问题有两个interface,
正常的:
[    7.536800:1] usb interface:Class 3, subClass 1
[    7.541314:1] usb interface:protocol 1

异常的:
[    7.536800:1] usb interface:Class 3, subClass 1
[    7.541314:1] usb interface:protocol 1

[    7.570086:0] usb interface:Class 3, subClass 0
[    7.574679:0] usb interface:protocol 0

最直接的解决办法是:过滤掉这个interface, 不 add this device, 就不会有probe函数的调用。

usb_set_configuration(struct usb_device *dev, int configuration)
{
    for (i = 0; i < nintf; ++i) {
        struct usb_interface *intf = cp->interface[i];
        struct usb_host_interface *host_intf;
        struct usb_device *dev;
        dev = interface_to_usbdev(intf);
        host_intf = intf->cur_altsetting;
        
        // idVendor 0x1c4f idP 0xe
        if(dev->descriptor.idVendor == NEWMEN_KPD_VID && dev->descriptor.idProduct == NEWMEN_KPD_PID
            && host_intf->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_KEYBOARD){
            pr_err("ignore one interface@@@@@@@@@@@@\n");
            continue;
        }
    }
}
到此问题解决了。

以此wait_event(probe_waitqueue, atomic_read(&probe_count) == 0)为线索,再仔细看看

wait_event的实现:

/**
 * wait_event - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event(wq, condition)                     \
do {                                    \
    if (condition)                             \
        break;                            \
    __wait_event(wq, condition);                    \
} while (0)

#define __wait_event(wq, condition)                     \
do {                                    \
    DEFINE_WAIT(__wait);                        \
                                    \
    for (;;) {                            \
        prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);    \
        if (condition)                        \
            break;                        \
        schedule();                        \
    }                                \
    finish_wait(&wq, &__wait);                    \
} while (0)

/*只有函数really_probe中操作probe_count*/

static int really_probe(struct device *dev, struct device_driver *drv)
{
    int ret = 0;

    atomic_inc(&probe_count);
    pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
         drv->bus->name, __func__, drv->name, dev_name(dev));
    WARN_ON(!list_empty(&dev->devres_head));

    dev->driver = drv;
    if (driver_sysfs_add(dev)) {
        printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
            __func__, dev_name(dev));
        goto probe_failed;
    }

    if (dev->bus->probe) {
        ret = dev->bus->probe(dev);
        if (ret)
            goto probe_failed;
    } else if (drv->probe) {
        ret = drv->probe(dev);
        if (ret)
            goto probe_failed;
    }

    driver_bound(dev);
    ret = 1;
    pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);
    goto done;

probe_failed:
    devres_release_all(dev);
    driver_sysfs_remove(dev);
    dev->driver = NULL;

    if (ret == -EPROBE_DEFER) {
        /* Driver requested deferred probing */
        dev_info(dev, "Driver %s requests probe deferral\n", drv->name);
        driver_deferred_probe_add(dev);
    } else if (ret != -ENODEV && ret != -ENXIO) {
        /* driver matched but the probe failed */
        printk(KERN_WARNING
               "%s: probe of %s failed with error %d\n",
               drv->name, dev_name(dev), ret);
    } else {
        pr_debug("%s: probe of %s rejects match %d\n",
               drv->name, dev_name(dev), ret);
    }
    /*
     * Ignore errors returned by ->probe so that the next driver can try
     * its luck.
     */
    ret = 0;
done:
    atomic_dec(&probe_count);
    wake_up(&probe_waitqueue);
    return ret;
}

really_probe为什么没有返回?

下面看函数really_probe的调用:
PID: 376    TASK: ee2a2800  CPU: 0   COMMAND: "khubd"
 #0 [<c05529b0>] (__schedule) from [<c0553154>]
 #1 [<c05530cc>] (schedule) from [<c033d818>]
 #2 [<c033d754>] (usb_kill_urb) from [<c03d6c84>]
 #3 [<c03d6bf4>] (usbhid_init_reports) from [<c03d7154>]
 #4 [<c03d6d8c>] (usbhid_start) from [<c03cd620>]
 #5 [<c03cd538>] (hid_device_probe) from [<c02892e0>]
 #6 [<c028920c>] (driver_probe_device) from [<c02894e8>]
 #7 [<c02894a0>] (__device_attach) from [<c02879dc>]
 #8 [<c0287988>] (bus_for_each_drv) from [<c02895a4>]
 #9 [<c0289524>] (device_attach) from [<c02885c4>]

static int __device_attach(struct device_driver *drv, void *data)
{
    struct device *dev = data;

    if (!driver_match_device(drv, dev))
        return 0;

    return driver_probe_device(drv, dev);
}

int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int ret = 0;

    if (!device_is_registered(dev))
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    pm_runtime_get_noresume(dev);
    pm_runtime_barrier(dev);
    ret = really_probe(dev, drv);
    pm_runtime_put_sync(dev);

    return ret;
}

static struct hid_ll_driver usb_hid_driver = {
    .parse = usbhid_parse,
    .start = usbhid_start,
    .stop = usbhid_stop,
    .open = usbhid_open,
    .close = usbhid_close,
    .power = usbhid_power,
    .hidinput_input_event = usb_hidinput_input_event,
};

static int hid_device_probe(struct device *dev)
    -> hid_hw_start(hdev, HID_CONNECT_DEFAULT);
        -> hdev->ll_driver->start(hdev)[usbhid_start]
            -> usbhid_init_reports(hid);
                -> usb_kill_urb
                    ->wait_event(usb_kill_urb_queue, atomic_read(&urb->use_count) == 0);
事情的根本原因在进程khubd的函数really_probe阻塞在hid_device_probe中,导致probe_count != 0;
进程PID: 1      TASK: ee02ac00  CPU: 0   COMMAND: "swapper/0"这样就阻塞在
wait_for_device_probe() -> wait_event(probe_waitqueue, atomic_read(&probe_count) == 0);


进程间通信的实现方法:

这里可以看出两个进程间【一个进程完成某件事情后,告诉另一个进程的方法】;
PID: 376    TASK: ee2a2800  CPU: 0   COMMAND: "khubd"
    atomic_dec(&probe_count);
    wake_up(&probe_waitqueue);

PID: 1      TASK: ee02ac00  CPU: 0   COMMAND: "swapper/0"[名字有点奇怪应该是 init啊]
    wait_event(probe_waitqueue, atomic_read(&probe_count) == 0);
在文件drivers/base/dd.c中,定义了全局变量:
static atomic_t probe_count = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue);
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值