hung 之 softlockup & hardlockup 检测

目录

1. softlockup & hardlockup 的含义

2. softlockup & hardlockup 检测机制

2.1 基本原理

2.2 功能开关

2.3 功能配置

2.4 代码实现

2.4.1 初始化

2.4.1.1 初始化场景

2.4.1.2 初始化方法

2.4.2 softlockup 检测

2.4.3 hardlockup 检测

3. 忽略 lockup 检查

3.1 touch_nmi_watchdog

3.2 touch_all_softlockup_watchdogs

1. softlockup & hardlockup 的含义

softlockup 指的是这样一种场景:由于内核程序设计问题,导致CPU长时间关闭抢占。

hardlockup 指的是这样一种场景:由于内核程序设计问题,导致CPU时钟中断长时间禁用。

softlockup 或 hardlockup 发生时,CPU被当前运行的任务独占,而其他任务得不到调度。

2. softlockup & hardlockup 检测机制

2.1 基本原理

内核检测 softlockup & hardlockup 的机制为 lockup detector。

lockup detector 本质上也是一个看门狗(watchdog)的思路,包括

1. 两条狗,即两个变量 watchdog_touch_ts(softlockup 狗)、hrtimer_interrupts(hardlockup 狗)。

2. 一个定时器(hrtimer),定时器的回调方法中将 hrtimer_interrupts 的值加 1(喂 hardlockup 狗),检查 softlockup 狗的状态,同时在当前 CPU 上添加一个喂狗任务。喂狗任务执行时将 watchdog_touch_ts 更新为当前时间戳(喂 softlockup 狗)。

3. NMI计数器(PMU中断),回调方法中检查 hardlockup 狗的状态。

如果超过一定时间 hrtimer_interrupts 值没变化,则认为发生了 hardlockup;

如果超过一定时间 watchdog_touch_ts 值没变化,则认为发生了 softlockup。

2.2 功能开关

CONFIG_LOCKUP_DETECTOR 控制 lockup detector 功能是否使能。

CONFIG_SOFTLOCKUP_DETECTOR 控制 softlockup detector 功能是否使能。

CONFIG_HARDLOCKUP_DETECTOR_PERF 控制 hardlockup detector 功能是否使能。

下面是我本地的 Android 设备配置(两条 grep 命令都没有任何输出,说明该设备的内核没有开启 lockup detector 相关选项)

yudi:/ # zcat /proc/config.gz |grep CONFIG_LOCKUP_DETECTOR
yudi:/ #

yudi:/ # zcat /proc/config.gz |grep CONFIG_HARDLOCKUP_DETECTOR_PERF
yudi:/ #

2.3 功能配置

lockup detector 功能通过 /proc/sys/kernel/xxx 节点配置,包括

节点 | 功能
/proc/sys/kernel/watchdog | 同时开/关 softlockup 和 hardlockup watchdog,对应变量 watchdog_user_enabled
/proc/sys/kernel/nmi_watchdog | 开/关 hardlockup watchdog,对应变量 nmi_watchdog_user_enabled
/proc/sys/kernel/watchdog_thresh | 表示 hardlockup 阈值,单位为秒(即时钟中断禁用时间超过阈值时,认为发生 hardlockup;softlockup 阈值是它的 2 倍),对应变量 watchdog_thresh
/proc/sys/kernel/watchdog_cpumask | 表示目标 CPU IDs 的掩码,对应变量 watchdog_cpumask(启动检测时会被拷贝到 watchdog_allowed_mask)

可以通过 sysctl 系统调用或者读写 /proc/sys/kernel/xxx 节点的方式获取、修改相关变量的值。

例如,通过写 /proc/sys/kernel/watchdog 节点可以修改 watchdog_user_enabled 的值,从而同时控制 softlockup 与 hardlockup watchdog 功能的开关。

//kernel/sysctl.c

#if defined(CONFIG_LOCKUP_DETECTOR)
        {
                .procname       = "watchdog",
                .data                = &watchdog_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler   = proc_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                .procname        = "watchdog_thresh",
                .data                = &watchdog_thresh,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_watchdog_thresh,
                .extra1                = SYSCTL_ZERO,
                .extra2                = &sixty,
        },
        {
                .procname       = "nmi_watchdog",
                .data                = &nmi_watchdog_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = NMI_WATCHDOG_SYSCTL_PERM,
                .proc_handler   = proc_nmi_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                .procname        = "watchdog_cpumask",
                .data                = &watchdog_cpumask_bits,
                .maxlen                = NR_CPUS,
                .mode                = 0644,
                .proc_handler        = proc_watchdog_cpumask,
        },
...

例如,写 /proc/sys/kernel/watchdog 节点,设置 watchdog_user_enabled 变量的值,如果值改变则调用 proc_watchdog_update 方法更新 watchdog action。

// kernel/watchdog.c

/*
 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
 *
 * caller             | table->data points to      | 'which'
 * -------------------|----------------------------|--------------------------
 * proc_watchdog      | watchdog_user_enabled      | NMI_WATCHDOG_ENABLED |
 *                    |                            | SOFT_WATCHDOG_ENABLED
 * -------------------|----------------------------|--------------------------
 * proc_nmi_watchdog  | nmi_watchdog_user_enabled  | NMI_WATCHDOG_ENABLED
 * -------------------|----------------------------|--------------------------
 * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
 *
 * @which: bit(s) of watchdog_enabled that the calling sysctl node controls
 * @table: sysctl entry; table->data is the int backing the node
 * @write: non-zero for a write to the node, zero for a read
 * @buffer, @lenp, @ppos: passed through unchanged to proc_dointvec_minmax()
 *
 * Returns the result of proc_dointvec_minmax(). The whole operation runs
 * under watchdog_mutex.
 */
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err, old, *param = table->data;

        mutex_lock(&watchdog_mutex);

        if (!write) {
                /*
                 * On read synchronize the userspace interface. This is a
                 * racy snapshot.
                 */
                /* Report 1 if any of the requested bits is currently set. */
                *param = (watchdog_enabled & which) != 0;
                err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        } else {
                /* Remember the old value so a no-op write can be detected. */
                old = READ_ONCE(*param);
                // Step 3: store the value written by userspace into *param
                err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
                if (!err && old != READ_ONCE(*param))
                        proc_watchdog_update(); // Step 4: value changed, reconfigure the detectors
        }
        mutex_unlock(&watchdog_mutex);
        return err;
}

/*
 * /proc/sys/kernel/watchdog
 *
 * sysctl handler for the combined switch: it passes both the NMI (hard) and
 * the soft watchdog bits as 'which', so this node covers both detectors.
 */
// Step 1: entry point invoked when the sysctl node is read or written
int proc_watchdog(struct ctl_table *table, int write,
                  void *buffer, size_t *lenp, loff_t *ppos)
{
        // Step 2: delegate to the common handler with both enable bits
        return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
                                    table, write, buffer, lenp, ppos);
}
 

2.4 代码实现

2.4.1 初始化

2.4.1.1 初始化场景

lockup detector 功能的初始化方法是 __lockup_detector_reconfigure

内核启动或 lockup detector 参数发生变化,调用初始化方法。

内核启动时,在 lockup_detector_setup 方法中调用初始化方法。

/*
 * Create the watchdog thread infrastructure and configure the detector(s).
 *
 * The threads are not unparked as watchdog_allowed_mask is empty.  When
 * the threads are successfully initialized, take the proper locks and
 * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
 *
 * Boot-time (__init) entry point: performs the first call to
 * __lockup_detector_reconfigure() under watchdog_mutex.
 */
static __init void lockup_detector_setup(void)
{
        /*
         * If sysctl is off and watchdog got disabled on the command line,
         * nothing to do here.
         */
        lockup_detector_update_enable();

        /*
         * Skip setup only when there is no sysctl interface to turn the
         * detector on later AND it is currently off, i.e. watchdog_enabled
         * or watchdog_thresh is zero.
         */
        if (!IS_ENABLED(CONFIG_SYSCTL) &&
            !(watchdog_enabled && watchdog_thresh))
                return;

        mutex_lock(&watchdog_mutex);
        __lockup_detector_reconfigure();
        /* Record that the initial configuration has been done. */
        softlockup_initialized = true;
        mutex_unlock(&watchdog_mutex);
}

lockup detector 参数(例如 watchdog_user_enabled)变化时,在 proc_watchdog_update 方法中调用初始化方法。

/*
 * Propagate any changes to the watchdog threads.
 *
 * Called from the sysctl write path (see proc_watchdog_common()) after a
 * parameter actually changed. That caller holds watchdog_mutex.
 */
static void proc_watchdog_update(void)
{
        /* Remove impossible cpus to keep sysctl output clean. */
        cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
        /* Stop and restart the detectors with the new settings. */
        __lockup_detector_reconfigure();
}

在内核启动时,初始化 lockup detector 的方法中,会检查 CONFIG_SYSCTL 是否使能。如果CONFIG_SYSCTL 未使能,且 watchdog_enabled 或 watchdog_thresh 中有至少一个变量没有有效值(即为0),那么 lockup detector 初始化失败,直接返回。

不过,虽然 watchdog_enabled 的缺省值是 0,但是由于 CONFIG_SYSCTL 一般都是使能的,所以通常来说 lockup detector 初始化不会失败。

下面是 watchdog_enabled 和 watchdog_thresh 变量的定义。

/*
 * Bitmask of active detectors; tested against NMI_WATCHDOG_ENABLED and
 * SOFT_WATCHDOG_ENABLED. No initializer, so it starts out as 0.
 */
unsigned long __read_mostly watchdog_enabled;
...
/* Hardlockup threshold in seconds (set_sample_period() converts it to ns). */
int __read_mostly watchdog_thresh = 10;

在 Linux 内核中(与标准 C 一样),具有静态存储期的全局变量如果没有显式初始化,其初始值就是 0。普通的未初始化全局变量通常放在 BSS(Block Started by Symbol)段中,BSS 段在内核启动时会被清零。因此 watchdog_enabled 的默认值是 0

另外 watchdog_enabled 还设置了 __read_mostly 属性,这是一个内核特定的属性:在支持它的体系结构上,它把变量放到专门的 .data..read_mostly 段(而不是 BSS 段),让"读多写少"的变量集中存放,以减少缓存行争用。这只改变变量所在的段,不会改变变量的初始化规则,未显式初始化时初值仍然是 0。

2.4.1.2 初始化方法

在 softlockup detector 使能时(CONFIG_SOFTLOCKUP_DETECTOR 定义)先调用 softlockup_start_all 使能 softlockup detector,然后再调用 watchdog_nmi_start(实际上是一个空函数,真正起作用的是 watchdog_nmi_enable 方法) 使能 hardlockup detector。

否则只调用 watchdog_nmi_start 使能 hardlockup detector。

#ifdef CONFIG_SOFTLOCKUP_DETECTOR
...
/*
 * (Re)configure both detectors; variant built when
 * CONFIG_SOFTLOCKUP_DETECTOR is defined.
 *
 * Everything is stopped first, the sample period and enable state are
 * refreshed, then the detectors are started again. The callers shown in
 * this file hold watchdog_mutex.
 */
static void __lockup_detector_reconfigure(void)
{
        /* Hold the CPU hotplug read lock for the stop/start sequence. */
        cpus_read_lock();
        watchdog_nmi_stop();

        softlockup_stop_all();
        /* Recompute sample_period from the current watchdog_thresh. */
        set_sample_period();
        /* NOTE(review): presumably refreshes watchdog_enabled from the user knobs - confirm */
        lockup_detector_update_enable();
        /* Start the per-CPU timers only if a detector is on and the threshold is non-zero. */
        if (watchdog_enabled && watchdog_thresh)
                softlockup_start_all();

        watchdog_nmi_start();
        cpus_read_unlock();
        /*
         * Must be called outside the cpus locked section to prevent
         * recursive locking in the perf code.
         */
        __lockup_detector_cleanup();
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
/*
 * (Re)configure the detector; variant built when
 * CONFIG_SOFTLOCKUP_DETECTOR is not defined. Only the NMI watchdog is
 * stopped and restarted; there is no softlockup timer to manage.
 */
static void __lockup_detector_reconfigure(void)
{
        /* Hold the CPU hotplug read lock for the stop/start sequence. */
        cpus_read_lock();
        watchdog_nmi_stop();
        lockup_detector_update_enable();
        watchdog_nmi_start();
        cpus_read_unlock();
}
...

2.4.2 softlockup 检测

softlockup detector 初始化时,调用 smp_call_on_cpu 在 watchdog_allowed_mask 标记的所有 CPU(一般是对所有CPU)上执行一次 softlockup_start_fn 方法。

// kernel/watchdog.c
/*
 * Start the softlockup machinery on every CPU selected by watchdog_cpumask.
 *
 * watchdog_cpumask is first copied into watchdog_allowed_mask, then
 * softlockup_start_fn() is run once on each CPU in that mask. The return
 * value of smp_call_on_cpu() is ignored.
 */
static void softlockup_start_all(void)
{
        int cpu;

        cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
        for_each_cpu(cpu, &watchdog_allowed_mask)
                smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}

smp_call_on_cpu 并不是通过跨处理器中断(Inter-Processor Interrupt, IPI)实现的(基于 IPI 的接口是 smp_call_function_single 系列)。它本质上是把一个工作项(work)排入目标 CPU 的系统工作队列(system_wq),由目标 CPU 上的 kworker 线程在进程上下文中执行目标函数,调用者则睡眠等待函数执行完成。这确保了函数能够在指定的 CPU 上执行,而且目标函数本身可以睡眠。

  1. 排队工作项: 调用 smp_call_on_cpu 时,通过 queue_work_on 把工作项排入目标 CPU 的工作队列。
  2. 调度 kworker: 目标 CPU 上的 kworker 线程被调度运行,取出该工作项。
  3. 执行目标函数: kworker 在进程上下文中调用目标函数,传递给 smp_call_on_cpu 的参数也会传递给目标函数。
  4. 等待返回: 目标函数执行完毕后唤醒调用者,smp_call_on_cpu 返回目标函数的返回值。

softlockup_start_fn 方法在当前 CPU 上设置并启动定时器 watchdog_hrtimer,然后更新 watchdog_touch_ts 变量的值(watchdog_touch_ts 变量就是被喂的狗),最后调用 watchdog_nmi_enable 使能 hardlockup detector。

定时器到达时的处理函数设置为 watchdog_timer_fn;

定时器周期设置为 sample_period;

/*
 * Per-CPU start callback handed to smp_call_on_cpu() by
 * softlockup_start_all(). Enables the watchdog on the CPU it runs on.
 * @data is unused. Always returns 0.
 */
static int softlockup_start_fn(void *data)
{
        watchdog_enable(smp_processor_id());
        return 0;
}
/*
 * Arm the watchdog on the current CPU: start the per-CPU hrtimer, set the
 * initial softlockup timestamp and, if requested, enable the NMI watchdog.
 *
 * @cpu: must be the CPU this function is running on (a warning is issued
 *       once otherwise).
 */
static void watchdog_enable(unsigned int cpu)
{
        struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
        struct completion *done = this_cpu_ptr(&softlockup_completion);

        WARN_ON_ONCE(cpu != smp_processor_id());

        /*
         * Start with the completion already "done" so the first
         * watchdog_timer_fn() run sees completion_done() as true and
         * queues a feed job.
         */
        init_completion(done);
        complete(done);

        /*
         * Start the timer first to prevent the NMI watchdog triggering
         * before the timer has a chance to fire.
         */
        // Set up and start the timer; it first fires after sample_period ns
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hrtimer->function = watchdog_timer_fn;
        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
                      HRTIMER_MODE_REL_PINNED_HARD);

        /* Initialize timestamp */
        // Set watchdog_touch_ts to the current timestamp
        update_touch_ts();
        /* Enable the perf event */
        // Enable hardlockup detection if the NMI watchdog bit is set
        if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
                watchdog_nmi_enable(cpu);
}

更新 watchdog_touch_ts,即将 watchdog_touch_ts 变量修改为当前的 timestamp。

/* Commands for resetting the watchdog */
/*
 * Feed the softlockup dog: store the value returned by get_timestamp()
 * into this CPU's watchdog_touch_ts.
 */
static void update_touch_ts(void)
{
        __this_cpu_write(watchdog_touch_ts, get_timestamp());
}

定时器周期 sample_period

sample_period 缺省值是 0,但是在初始化时,会调用 set_sample_period 设置 sample_period 的值。设置为 20s/5 = 4s。

/*
 * Recompute the hrtimer period (sample_period, in ns) as one fifth of the
 * softlockup threshold. With the default watchdog_thresh of 10 this is
 * 20s / 5 = 4s.
 */
static void set_sample_period(void)
{
        /*
         * convert watchdog_thresh from seconds to ns
         * the divide by 5 is to give hrtimer several chances (two
         * or three with the current relation between the soft
         * and hard thresholds) to increment before the
         * hardlockup detector generates a warning
         */
        sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
        /* NOTE(review): presumably forwards the new period to the hardlockup side - confirm */
        watchdog_update_hrtimer_threshold(sample_period);
}
/* Hardlockup threshold in seconds; the softlockup threshold is derived from it. */
int __read_mostly watchdog_thresh = 10;

/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions. So we generally
 * want a higher threshold for soft lockups than for hard lockups. So we couple
 * the thresholds with a factor: we make the soft threshold twice the amount of
 * time the hard threshold is.
 *
 * Returns the softlockup threshold in seconds (20 with the default above).
 */
static int get_softlockup_thresh(void)
{
        return watchdog_thresh * 2;
}

定时器到达时的处理函数 watchdog_timer_fn

定时器周期默认是 4s(sample_period = 20s / 5),softlockup 阈值默认是 20s。

定时器每次到达时,在目标 cpu 上添加一个(异步)喂狗任务,喂狗任务将变量 watchdog_touch_ts 更新为当前时间戳。

如果经过 5 个定时器周期,watchdog_touch_ts 值都没有变化,则说明喂狗任务经过 20s 都没有得到执行,判断发生了 softlockup。 

/* watchdog kicker functions */
/*
 * Per-CPU hrtimer callback, fired every sample_period.
 *
 * Each run bumps the hardlockup counter, queues the softlockup feed job if
 * the previous one has finished, re-arms the timer, and then checks whether
 * this CPU's watchdog_touch_ts is stale enough to report a soft lockup.
 *
 * Returns HRTIMER_NORESTART when the watchdog is disabled, otherwise
 * HRTIMER_RESTART.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
        // Snapshot of this CPU's watchdog_touch_ts
        unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
        struct pt_regs *regs = get_irq_regs();
        int duration;
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

        if (!watchdog_enabled)
                return HRTIMER_NORESTART;

        /* kick the hardlockup detector */
        watchdog_interrupt_count();

        /* kick the softlockup detector */
        // completion_done() tells whether the previous feed job has finished;
        // if it has not, no new feed job is queued.
        // Otherwise mark the feed as pending again and use stop_one_cpu_nowait()
        // to queue a feed job (softlockup_fn) on this CPU.
        // Note: queuing the feed job does not block the current context.
        if (completion_done(this_cpu_ptr(&softlockup_completion))) {
                reinit_completion(this_cpu_ptr(&softlockup_completion));
                stop_one_cpu_nowait(smp_processor_id(),
                                softlockup_fn, NULL,
                                this_cpu_ptr(&softlockup_stop_work));
        }

        /* .. and repeat */
        // Re-arm the timer for the next sample period
        hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

        /*
         * The timestamp was explicitly reset (e.g. by
         * touch_all_softlockup_watchdogs()): skip this check and restart
         * the measurement from now.
         */
        if (touch_ts == SOFTLOCKUP_RESET) {
                if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
                        /*
                         * If the time stamp was touched atomically
                         * make sure the scheduler tick is up to date.
                         */
                        __this_cpu_write(softlockup_touch_sync, false);
                        sched_clock_tick();
                }

                /* Clear the guest paused flag on watchdog reset */
                kvm_check_and_clear_guest_paused();
                update_touch_ts();
                return HRTIMER_RESTART;
        }

        /* check for a softlockup
         * This is done by making sure a high priority task is
         * being scheduled.  The task touches the watchdog to
         * indicate it is getting cpu time.  If it hasn't then
         * this is a good indication some task is hogging the cpu
         */
        // is_softlockup() compares watchdog_touch_ts with the current timestamp;
        // a difference above the threshold means watchdog_touch_ts has not been
        // updated for a long time, i.e. the feed job has not been able to run.
        duration = is_softlockup(touch_ts);

        // The feed job has been starved for too long: dump state, possibly panic
        if (unlikely(duration)) {
                /*
                 * If a virtual machine is stopped by the host it can look to
                 * the watchdog like a soft lockup, check to see if the host
                 * stopped the vm before we issue the warning
                 */
                if (kvm_check_and_clear_guest_paused())
                        return HRTIMER_RESTART;

                /*
                 * Prevent multiple soft-lockup reports if one cpu is already
                 * engaged in dumping all cpu back traces.
                 */
                if (softlockup_all_cpu_backtrace) {
                        if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
                                return HRTIMER_RESTART;
                }

                /* Start period for the next softlockup warning. */
                update_touch_ts();

                pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                        smp_processor_id(), duration,
                        current->comm, task_pid_nr(current));
                print_modules();
                print_irqtrace_events(current);
                /* Prefer the interrupted context's registers when available. */
                if (regs)
                        show_regs(regs);
                else
                        dump_stack();

                if (softlockup_all_cpu_backtrace) {
                        trigger_allbutself_cpu_backtrace();
                        /* Release the bit taken above so other CPUs can report again. */
                        clear_bit_unlock(0, &soft_lockup_nmi_warn);
                }

                add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
                if (softlockup_panic)
                        panic("softlockup: hung tasks");
        }

        return HRTIMER_RESTART;
}

喂狗函数 softlockup_fn,执行

  1. 更新 watchdog_touch_ts 的值
  2. 设置喂狗状态为完成
/*
 * The watchdog thread function - touches the timestamp.
 *
 * It only runs once every sample_period seconds (4 seconds by
 * default) to reset the softlockup timestamp. If this gets delayed
 * for more than 2*watchdog_thresh seconds then the debug-printout
 * triggers in watchdog_timer_fn().
 *
 * Queued by watchdog_timer_fn() through stop_one_cpu_nowait(). @data is
 * unused. Always returns 0.
 */
static int softlockup_fn(void *data)
{
        /* Feed the dog: refresh this CPU's watchdog_touch_ts. */
        update_touch_ts();
        /* Mark the feed as done so the timer callback may queue the next one. */
        complete(this_cpu_ptr(&softlockup_completion));

        return 0;
}

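为什么喂狗任务 softlockup_fn 要通过 stop_one_cpu_nowait 方法放到目标 CPU 上执行,而不是用 smp_call_on_cpu。

1. stop_one_cpu_nowait 的工作机制

stop_one_cpu_nowait 使用的是 cpu stopper 机制(不是工作队列):它把一个 cpu_stop_work 挂到目标 CPU 的 stopper 线程(migration/N)上异步执行,调用者不等待、不睡眠,因此可以在定时器中断上下文中调用。stopper 线程属于优先级最高的 stop 调度类,只要目标 CPU 允许抢占,它就会抢占当前任务马上运行;如果目标 CPU 禁止了抢占,该任务将会被延迟执行,直到抢占再次被允许。

2. CPU 禁止抢占

如果目标 CPU 禁止了抢占,stop_one_cpu_nowait 安排的任务就会受到影响。在 Linux 内核中,抢占(preemption)是指允许高优先级任务中断低优先级任务的执行。当抢占被禁止时,内核不会切换到其他任务,即使有高优先级的任务需要执行。这会导致安排在目标 CPU 上的任务无法及时执行,直到抢占被允许。

当目标 CPU 禁止了抢占时,内核会保持当前任务的执行,直到当前任务显式地放弃 CPU 或抢占再次被允许。这意味着:

        a. 当前任务的独占执行:当前任务将独占 CPU,任何其他任务,包括通过 stop_one_cpu_nowait 安排的任务,都不会得到执行。

        b. 延迟执行:stop_one_cpu_nowait 安排的任务将在目标 CPU 恢复抢占后才有机会执行。

而 smp_call_on_cpu 是通过工作队列(system_wq 的 kworker 线程)在目标 CPU 上执行函数,并且调用者会睡眠等待函数执行完成。一方面,watchdog_timer_fn 运行在 hrtimer 的硬中断上下文中,不能调用会睡眠的接口;另一方面,kworker 是普通优先级的线程,即使 CPU 没有关闭抢占,也可能被更高优先级的任务长期挤占而得不到运行,用它来喂狗容易产生误报。

由于 softlockup 检测的目的就是为了发现(软件设计问题导致)CPU长时间关闭抢占的情况,喂狗任务必须是"只要抢占打开就一定能马上运行"的任务,因此用 smp_call_on_cpu 的方式来喂狗是不行的,而要用最高优先级的 stopper 线程(stop_one_cpu_nowait)。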

2.4.3 hardlockup 检测

使能 hardlockup 的方法是 watchdog_nmi_enable。

/*
 * These functions can be overridden if an architecture implements its
 * own hardlockup detector.
 *
 * watchdog_nmi_enable/disable can be implemented to start and stop when
 * softlockup watchdog threads start and stop. The arch must select the
 * SOFTLOCKUP_DETECTOR Kconfig.
 *
 * Default (__weak) implementation: enables the perf-based hardlockup
 * detector on the calling CPU. @cpu is not used here. Always returns 0.
 */
int __weak watchdog_nmi_enable(unsigned int cpu)
{
        hardlockup_detector_perf_enable();
        return 0;
}

hardlockup_detector_perf_enable 方法由 watchdog_hld 模块实现,并且由宏 CONFIG_HARDLOCKUP_DETECTOR_PERF 控制使能。

// include/linux/nmi.h

/*
 * Interface of the perf-based hardlockup detector.
 *
 * With CONFIG_HARDLOCKUP_DETECTOR_PERF the functions are declared extern;
 * the definitions shown in this article live in kernel/watchdog_hld.c.
 */
#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
extern void arch_touch_nmi_watchdog(void);
extern void hardlockup_detector_perf_stop(void);
extern void hardlockup_detector_perf_restart(void);
extern void hardlockup_detector_perf_disable(void);
extern void hardlockup_detector_perf_enable(void);
extern void hardlockup_detector_perf_cleanup(void);
extern int hardlockup_detector_perf_init(void);
#else
/* Detector not built in: every operation becomes an empty inline stub. */
static inline void hardlockup_detector_perf_stop(void) { }
static inline void hardlockup_detector_perf_restart(void) { }
static inline void hardlockup_detector_perf_disable(void) { }
static inline void hardlockup_detector_perf_enable(void) { }
static inline void hardlockup_detector_perf_cleanup(void) { }
/* Without CONFIG_HAVE_NMI_WATCHDOG, init fails with -ENODEV and touching is a no-op. */
# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
static inline void arch_touch_nmi_watchdog(void) {}
# else
/* With CONFIG_HAVE_NMI_WATCHDOG, init reports success; arch_touch_nmi_watchdog() is not stubbed here. */
static inline int hardlockup_detector_perf_init(void) { return 0; }
# endif
#endif

hardlockup_detector_perf_enable 方法定义在 watchdog_hld.c 中。

调用 hardlockup_detector_event_create 方法。

// kernel/watchdog_hld.c

/**
 * hardlockup_detector_perf_enable - Enable the local event
 *
 * Creates the perf event for the current CPU and enables it. Returns
 * silently if the event cannot be created.
 */
void hardlockup_detector_perf_enable(void)
{
        /* Non-zero means event creation failed; nothing to enable. */
        if (hardlockup_detector_event_create())
                return;

        /* use original value for check */
        /* Old value 0 means this is the first CPU enabled: log the notice once. */
        if (!atomic_fetch_inc(&watchdog_cpus))
                pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");

        perf_event_enable(this_cpu_read(watchdog_ev));
}

hardlockup_detector_event_create 方法。

/*
 * Create the hardlockup perf event for the current CPU and store it in the
 * per-CPU watchdog_ev. watchdog_overflow_callback() is registered as the
 * overflow handler.
 *
 * Returns 0 on success, or the error code from
 * perf_event_create_kernel_counter() on failure.
 */
static int hardlockup_detector_event_create(void)
{
        unsigned int cpu = smp_processor_id();
        struct perf_event_attr *wd_attr;
        struct perf_event *evt;

        wd_attr = &wd_hw_attr;
        /* NOTE(review): presumably converts watchdog_thresh seconds into a PMU sample period - confirm per arch */
        wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

        /* Try to register using hardware perf events */
        evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
                                               watchdog_overflow_callback, NULL);
        if (IS_ERR(evt)) {
                pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
                         PTR_ERR(evt));
                return PTR_ERR(evt);
        }
        this_cpu_write(watchdog_ev, evt);
        return 0;
}

这里的 watchdog_thresh 是 watchdog.c 中定义的变量,默认值是 10s。

将 watchdog_thresh 转换成 CPU周期数,然后调用 perf_event_create_kernel_counter 方法向 PMU(Performance Monitor Unit)注册一个回调,当 PMU 计数超过目标CPU周期数时执行回调方法 watchdog_overflow_callback。

watchdog_overflow_callback 调用 is_hardlockup(定义在 watchdog.c 中)检查定时器中断数(即变量 hrtimer_interrupts,也定义在 watchdog.c 中)是否有变化。

如果没有变化就认为定时器中断被禁用了很长时间,触发 dump stack 甚至 panic。

/* Callback function for perf event subsystem */
/*
 * Overflow handler registered by hardlockup_detector_event_create().
 *
 * Unless the watchdog was touched since the last run, it asks
 * is_hardlockup() whether the hrtimer interrupt count has advanced and
 * reports a hard lockup (once per episode) if it has not.
 *
 * @event: the watchdog perf event
 * @data:  unused
 * @regs:  registers of the interrupted context; may be NULL
 */
static void watchdog_overflow_callback(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs)
{
        /* Ensure the watchdog never gets throttled */
        event->hw.interrupts = 0;

        /* Someone called arch_touch_nmi_watchdog(): consume the flag and skip this check. */
        if (__this_cpu_read(watchdog_nmi_touch) == true) {
                __this_cpu_write(watchdog_nmi_touch, false);
                return;
        }

        if (!watchdog_check_timestamp())
                return;

        /* check for a hardlockup
         * This is done by making sure our timer interrupt
         * is incrementing.  The timer interrupt should have
         * fired multiple times before we overflow'd.  If it hasn't
         * then this is a good indication the cpu is stuck
         */
        if (is_hardlockup()) {
                int this_cpu = smp_processor_id();

                /* only print hardlockups once */
                if (__this_cpu_read(hard_watchdog_warn) == true)
                        return;

                pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
                         this_cpu);
                print_modules();
                print_irqtrace_events(current);
                if (regs)
                        show_regs(regs);
                else
                        dump_stack();

                /*
                 * Perform all-CPU dump only once to avoid multiple hardlockups
                 * generating interleaving traces
                 */
                if (sysctl_hardlockup_all_cpu_backtrace &&
                                !test_and_set_bit(0, &hardlockup_allcpu_dumped))
                        trigger_allbutself_cpu_backtrace();

                if (hardlockup_panic)
                        nmi_panic(regs, "Hard LOCKUP");

                /* Remember that this lockup has been reported. */
                __this_cpu_write(hard_watchdog_warn, true);
                return;
        }

        /* Timer interrupts are advancing again: allow a future report. */
        __this_cpu_write(hard_watchdog_warn, false);
        return;
}

hrtimer_interrupts 变量以及 is_hardlockup 方法的定义。

// kernel/watchdog.c

/* Per-CPU counter, the "hardlockup dog"; compared against a saved copy in is_hardlockup(). */
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);

/* watchdog detector functions */
/*
 * Returns true when this CPU's hrtimer_interrupts has not changed since the
 * previous call. Otherwise saves the current count for the next comparison
 * and returns false.
 */
bool is_hardlockup(void)
{
	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);

	/* Same count as last time: no timer interrupt was counted in between. */
	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
		return true;

	__this_cpu_write(hrtimer_interrupts_saved, hrint);
	return false;
}

hrtimer_interrupts 变量值的更新。

在 softlockup 定时器的回调方法 watchdog_timer_fn 中,更新 hrtimer_interrupts 的值(+1)。

// kernel/watchdog.c

/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
	struct pt_regs *regs = get_irq_regs();
	int duration;
	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

	if (!watchdog_enabled)
		return HRTIMER_NORESTART;

	/* kick the hardlockup detector */
	watchdog_interrupt_count();
...

定时器周期默认是 4s(sample_period),hardlockup 阈值默认是 10s(watchdog_thresh)。

定时器每次到达时,更新 hrtimer_interrupts 的值(+1)。

PMU NMI 大约每 10s 触发一次,正常情况下相邻两次 NMI 之间会有 2~3 次定时器中断。如果相邻两次 NMI 检查时 hrtimer_interrupts 值都没有变化,则说明约 10s 内没有产生时钟中断,判断发生了 hardlockup。 

3. 忽略 lockup 检查

内核模块可以通过调用 touch_nmi_watchdog 或 touch_all_softlockup_watchdogs 方法影响 softlockup、hardlockup 的 check。

touch_nmi_watchdog 可以让内核忽略当前一次的 hardlockup 检查(它同时还会调用 touch_softlockup_watchdog);

touch_all_softlockup_watchdogs 可以让内核忽略当前一次的 softlockup 检查。

3.1 touch_nmi_watchdog

touch_nmi_watchdog 方法。

将变量 watchdog_nmi_touch 置为 true!

// include/linux/nmi.h

/**
 * touch_nmi_watchdog - restart NMI watchdog timeout.
 *
 * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
 * may be used to reset the timeout - for code which intentionally
 * disables interrupts for a long time. This call is stateless.
 *
 * Touches both watchdogs: the NMI (hardlockup) one through
 * arch_touch_nmi_watchdog() and the softlockup one through
 * touch_softlockup_watchdog().
 */
static inline void touch_nmi_watchdog(void)
{
        arch_touch_nmi_watchdog();
        touch_softlockup_watchdog();
}
/*
 * Per-CPU "touched" flag. Set by arch_touch_nmi_watchdog(); read and cleared
 * by watchdog_overflow_callback(), which then skips one hardlockup check.
 */
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);

/* Set watchdog_nmi_touch for the CPU this happens to run on. */
notrace void arch_touch_nmi_watchdog(void)
{
        /*
         * Using __raw here because some code paths have
         * preemption enabled.  If preemption is enabled
         * then interrupts should be enabled too, in which
         * case we shouldn't have to worry about the watchdog
         * going off.
         */
        raw_cpu_write(watchdog_nmi_touch, true);
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

hardlockup 的 PMU nmi 中断回调方法中,首先就会检查一下 watchdog_nmi_touch 的值。

如果 watchdog_nmi_touch 被设置成了 true,就只是将 watchdog_nmi_touch 重新设为 false 就返回了,不继续检查是否发生了 hardlockup(等下一次 PMU nmi 中断再检查)。

// kernel/watchdog_hld.c

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs)
{
        /* Ensure the watchdog never gets throttled */
        event->hw.interrupts = 0;

        if (__this_cpu_read(watchdog_nmi_touch) == true) {
                __this_cpu_write(watchdog_nmi_touch, false);
                return;
        }
        ...

3.2 touch_all_softlockup_watchdogs

touch_all_softlockup_watchdogs 方法。

  1. 将每个 CPU 的 watchdog_touch_ts 变量置为 SOFTLOCKUP_RESET(ULONG_MAX)

  2. 调用 wq_watchdog_touch(-1) 将 workqueue 模块的变量 wq_watchdog_touched 置为当前的 jiffies(表示当前时间的 jiffies 数量)

注意. wq_watchdog_touch(-1) 实际上与 lockup 无关,它影响的是 workqueue 的 watchdog(检测 workqueue 卡死的机制,跟 lockup watchdog 不是一回事)。

// kernel/watchdog.c

/*
 * Reset the softlockup timestamp of every CPU in watchdog_allowed_mask to
 * SOFTLOCKUP_RESET, then touch the workqueue watchdog as well.
 *
 * watchdog_timer_fn() treats SOFTLOCKUP_RESET as "skip this check and
 * restart the measurement".
 */
void touch_all_softlockup_watchdogs(void)
{
        int cpu;

        /*
         * watchdog_mutex cannot be taken here, as this might be called
         * from (soft)interrupt context, so the access to
         * watchdog_allowed_mask might race with a concurrent update.
         *
         * The watchdog time stamp can race against a concurrent real
         * update as well, the only side effect might be a cycle delay for
         * the softlockup check.
         */
        for_each_cpu(cpu, &watchdog_allowed_mask)
                per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
        wq_watchdog_touch(-1);
}

在 softlockup 检测定时器的回调函数 watchdog_timer_fn 中,如果检查到 watchdog_touch_ts 的值是 SOFTLOCKUP_RESET,不会认为是发生了 softlockup,而是把 watchdog_touch_ts 更新为当前时间戳后直接返回(定时器继续运行)。

// kernel/watchdog.c

/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
        unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
        struct pt_regs *regs = get_irq_regs();
        int duration;
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

        if (!watchdog_enabled)
                return HRTIMER_NORESTART;

        /* kick the hardlockup detector */
        watchdog_interrupt_count();

        /* kick the softlockup detector */
        if (completion_done(this_cpu_ptr(&softlockup_completion))) {
                reinit_completion(this_cpu_ptr(&softlockup_completion));
                stop_one_cpu_nowait(smp_processor_id(),
                                softlockup_fn, NULL,
                                this_cpu_ptr(&softlockup_stop_work));
        }

        /* .. and repeat */
        hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

        if (touch_ts == SOFTLOCKUP_RESET) {
                if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
                        /*
                         * If the time stamp was touched atomically
                         * make sure the scheduler tick is up to date.
                         */
                        __this_cpu_write(softlockup_touch_sync, false);
                        sched_clock_tick();
                }

                /* Clear the guest paused flag on watchdog reset */
                kvm_check_and_clear_guest_paused();
                update_touch_ts();
                return HRTIMER_RESTART;
        }
        ... ...

  • 20
    点赞
  • 31
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值