一文带你了解linux timer子系统(一):timer

引言

Linux timer子系统分为两部分:定时和计时,定时(timer)就像日常生活中的闹钟一样,时间到了,就处理对应的事务,而计时(timekeeping)主要是用来统计自开机以来的时间,维护系统的jiffies,以及内核的wall clock。因最近分析一个cpuidle性能问题,阅读了kernel6.10 timer子系统timer代码,在此谈一谈自己的理解。

Arm64 Generic Timer架构

随着技术的发展,arm架构越来越受市场青睐,如今Arm Soc时钟设计普遍采用了Generic timer架构。以arm64为例:

从上图可以看出,Arm64内部有一个counter,因为它是全局的,所以称为system counter,system counter处于always-on domain,按照固定的频率累计计数,而其他的core,也就是图中的PE,共享这个system counter,都使用它作为自己的clock source,因此所有core共享同一个时间。

Linux timer

基本概念

1、clock source

顾名思义,clock source就是指时钟源,是对上图中的counter一种抽象,内核使用struct clocksource结构体来表示一个clock source。

/*
 * struct clocksource - the kernel's description of one clock source.
 *
 * The name, mask, max_cycles and max_idle_ns members correspond to the
 * values reported in the boot message
 *   "clocksource: <name>: mask: ... max_cycles: ..., max_idle_ns: ... ns".
 *
 * NOTE(review): mult/shift presumably form the cycles-to-nanoseconds
 * scaling pair - confirm against include/linux/clocksource.h.
 */
struct clocksource {
        /* Callback returning a u64 for this clocksource (presumably the counter value). */
        u64                     (*read)(struct clocksource *cs);
        u64                     mask;
        u32                     mult;
        u32                     shift;
        u64                     max_idle_ns;
        u32                     maxadj;
        u32                     uncertainty_margin;
        /* Architecture specific data, only present when the arch asks for it. */
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
        struct arch_clocksource_data archdata;
#endif
        u64                     max_cycles;
        const char              *name;
        /* List linkage; which list this is on is not visible in this excerpt. */
        struct list_head        list;
        int                     rating;
        enum clocksource_ids    id;
        enum vdso_clock_mode    vdso_clock_mode;
        unsigned long           flags;

        /* Per-clocksource callbacks; each receives the clocksource itself. */
        int                     (*enable)(struct clocksource *cs);
        void                    (*disable)(struct clocksource *cs);
        void                    (*suspend)(struct clocksource *cs);
        void                    (*resume)(struct clocksource *cs);
        void                    (*mark_unstable)(struct clocksource *cs);
        void                    (*tick_stable)(struct clocksource *cs);

        /* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
        /* Watchdog related data, used by the framework */
        struct list_head        wd_list;
        u64                     cs_last;
        u64                     wd_last;
#endif
        struct module           *owner;
};

如果你留意观察linux kernel的boot log,你会发现类似如下的log:

clocksource: arch_sys_counter: mask: 0x1ffffffffffffff max_cycles: 0x1cd42e208c, max_idle_ns: 881590405314 ns

这就是所谓的arch timer,下文会详细说明其注册过程。

2、clock event device

众所周知,timer有闹钟的功能,一个闹钟时间到了,需要处理相应的事务,所以内核定义了struct clock_event_device结构体:

/*
 * struct clock_event_device - a device that delivers timer events.
 *
 * event_handler is the callback attached to the device. In the flow traced
 * by this article it is tick_handle_periodic() at first and becomes
 * hrtimer_interrupt() once the tick device is switched to oneshot mode.
 *
 * The structure is aligned to a cache line (____cacheline_aligned).
 */
struct clock_event_device {
        void                    (*event_handler)(struct clock_event_device *);
        /* Two ways to program the next event: by an unsigned long delta or by ktime_t. */
        int                     (*set_next_event)(unsigned long evt, struct clock_event_device *);
        int                     (*set_next_ktime)(ktime_t expires, struct clock_event_device *);
        ktime_t                 next_event;
        u64                     max_delta_ns;
        u64                     min_delta_ns;
        /* NOTE(review): presumably the ns <-> device-tick scaling pair - confirm. */
        u32                     mult;
        u32                     shift;
        /* Named to signal that accessors should be used rather than direct access. */
        enum clock_event_state  state_use_accessors;
        unsigned int            features;
        unsigned long           retries;

        /* State transition callbacks, one per target state, plus tick_resume. */
        int                     (*set_state_periodic)(struct clock_event_device *);
        int                     (*set_state_oneshot)(struct clock_event_device *);
        int                     (*set_state_oneshot_stopped)(struct clock_event_device *);
        int                     (*set_state_shutdown)(struct clock_event_device *);
        int                     (*tick_resume)(struct clock_event_device *);

        /* broadcast takes the set of CPUs to target as a cpumask. */
        void                    (*broadcast)(const struct cpumask *mask);
        void                    (*suspend)(struct clock_event_device *);
        void                    (*resume)(struct clock_event_device *);
        unsigned long           min_delta_ticks;
        unsigned long           max_delta_ticks;

        const char              *name;
        int                     rating;
        int                     irq;
        int                     bound_on;
        const struct cpumask    *cpumask;
        /* Linkage used by clockevents_register_device() to add the device to clockevent_devices. */
        struct list_head        list;
        struct module           *owner;
} ____cacheline_aligned;

3、tick device

内核在clock_event_device的基础上做一层简单封装,就有了tick device

/*
 * struct tick_device - thin wrapper around a clock_event_device.
 *
 * Pairs the underlying event device with the mode the tick is operated in
 * (e.g. TICKDEV_MODE_PERIODIC or TICKDEV_MODE_ONESHOT).
 */
struct tick_device {
        struct clock_event_device *evtdev;      /* the wrapped event device */
        enum tick_device_mode mode;             /* current tick mode */
};

 tick_broadcast_device,是一种可以做broadcast的tick device,cpuidle经常需要与之打交道。

4、tick sched 

进程的调度离不开tick(系统滴答),tick sched就是对早期内核tick的一种仿真

/*
 * struct tick_sched - state for the emulated scheduler tick.
 *
 * sched_timer is the hrtimer that tick_setup_sched_timer() initialises with
 * tick_sched_timer as its callback. check_clocks is the flag word that
 * tick_clock_notify() sets and tick_check_oneshot_change() inspects, as
 * described in the article text.
 */
struct tick_sched {
        /* Common flags */
        unsigned long                   flags;

        /* Tick handling: jiffies stall check */
        unsigned int                    stalled_jiffies;
        unsigned long                   last_tick_jiffies;

        /* Tick handling */
        struct hrtimer                  sched_timer;
        ktime_t                         last_tick;
        ktime_t                         next_tick;
        unsigned long                   idle_jiffies;
        ktime_t                         idle_waketime;
        unsigned int                    got_idle_tick;

        /* Idle entry */
        seqcount_t                      idle_sleeptime_seq;
        ktime_t                         idle_entrytime;

        /* Tick stop */
        unsigned long                   last_jiffies;
        u64                             timer_expires_base;
        u64                             timer_expires;
        u64                             next_timer;
        ktime_t                         idle_expires;
        unsigned long                   idle_calls;
        unsigned long                   idle_sleeps;

        /* Idle exit */
        ktime_t                         idle_exittime;
        ktime_t                         idle_sleeptime;
        ktime_t                         iowait_sleeptime;

        /* Full dynticks handling */
        atomic_t                        tick_dep_mask;

        /* Clocksource changes */
        unsigned long                   check_clocks;
};

5、hrtimer

早期内核的低精度定时器基于timer wheel,其精度由tick决定,而tick由HZ决定:如果HZ设置为250,则tick就是4ms。然而随着技术的发展,网络等应用程序对定时精度有了更高的要求,4ms已无法满足,所以有了现在的hrtimer。hrtimer具有更高的精度,可以达到ns级别。

/*
 * struct hrtimer - one high resolution timer.
 *
 * function is the expiry callback; its enum hrtimer_restart return value
 * is handed back to the hrtimer core. base points at the
 * hrtimer_clock_base the timer belongs to.
 */
struct hrtimer {
        /* NOTE(review): presumably queued on base->active (a timerqueue_head) - confirm. */
        struct timerqueue_node          node;
        ktime_t                         _softexpires;
        enum hrtimer_restart            (*function)(struct hrtimer *);
        struct hrtimer_clock_base       *base;
        /* Small per-timer state/flag bytes. */
        u8                              state;
        u8                              is_rel;
        u8                              is_soft;
        u8                              is_hard;
};

而struct hrtimer_clock_base定义如下:

/*
 * struct hrtimer_clock_base - one clock base inside a per-CPU hrtimer base.
 *
 * index, clockid and get_time are filled in statically by the hrtimer_bases
 * definition; cpu_base, seq and active are set up at runtime by
 * hrtimers_prepare_cpu().
 */
struct hrtimer_clock_base {
        struct hrtimer_cpu_base *cpu_base;      /* back pointer to the owning per-CPU base */
        unsigned int            index;          /* HRTIMER_BASE_* value of this base */
        clockid_t               clockid;        /* CLOCK_* id served by this base */
        seqcount_raw_spinlock_t seq;            /* initialised against cpu_base->lock */
        struct hrtimer          *running;
        struct timerqueue_head  active;         /* timer queue, initialised empty */
        ktime_t                 (*get_time)(void);      /* time source for this base */
        ktime_t                 offset;
} __hrtimer_clock_base_align;

struct hrtimer_cpu_base定义如下:

/*
 * struct hrtimer_cpu_base - per-CPU container for all hrtimer clock bases.
 *
 * One instance exists per CPU (see the DEFINE_PER_CPU of hrtimer_bases).
 * hrtimers_prepare_cpu() resets cpu, active_bases, hres_active,
 * hang_detected, next_timer, softirq_next_timer, expires_next,
 * softirq_expires_next and online before the base is used.
 */
struct hrtimer_cpu_base {
        raw_spinlock_t                  lock;
        unsigned int                    cpu;
        unsigned int                    active_bases;
        unsigned int                    clock_was_set_seq;
        /* Single-bit status flags packed into one unsigned int. */
        unsigned int                    hres_active             : 1,
                                        in_hrtirq               : 1,
                                        hang_detected           : 1,
                                        softirq_activated       : 1,
                                        online                  : 1;
        /* Counters that only exist when high resolution timers are configured. */
#ifdef CONFIG_HIGH_RES_TIMERS
        unsigned int                    nr_events;
        unsigned short                  nr_retries;
        unsigned short                  nr_hangs;
        unsigned int                    max_hang_time;
#endif
        /* Extra members that only exist on PREEMPT_RT kernels. */
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                      softirq_expiry_lock;
        atomic_t                        timer_waiters;
#endif
        ktime_t                         expires_next;
        struct hrtimer                  *next_timer;
        ktime_t                         softirq_expires_next;
        struct hrtimer                  *softirq_next_timer;
        /* One clock base per hrtimer_base_type, HRTIMER_MAX_CLOCK_BASES in total. */
        struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;

timer子系统init流程

一切还得从start_kernel()开始说起:

start_kernel()

    .......
    
    tick_init();

    init_timers();

    hrtimers_init();

    timekeeping_init();

    time_init();

    sched_clock_init();

    ........

tick_init()

/**
 * tick_init - initialize the tick control
 *
 * Boot-time only (__init). Runs the broadcast setup first and the nohz
 * setup second.
 */
void __init tick_init(void)
{
        tick_broadcast_init();
        tick_nohz_init();
}
/*
 * tick_broadcast_init - allocate the cpumasks used by tick broadcast.
 *
 * All allocations use GFP_NOWAIT. Three masks are always allocated; the
 * three oneshot-related masks are only allocated when CONFIG_TICK_ONESHOT
 * is enabled. The return values of zalloc_cpumask_var() are not checked
 * here.
 */
void __init tick_broadcast_init(void)
{
        zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
        zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
        zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
#ifdef CONFIG_TICK_ONESHOT
        zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
        zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
        zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
#endif
}

分配了tick_broadcast_mask,tick_broadcast_on,tmpmask,tick_broadcast_oneshot_mask,tick_broadcast_pending_mask,tick_broadcast_force_mask。

init_timers()

/*
 * init_timers - boot-time setup for the timer bases.
 *
 * Initialises the per-CPU timer bases, calls posix_cputimers_init_work(),
 * and installs run_timer_softirq() as the handler for TIMER_SOFTIRQ.
 */
void __init init_timers(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
/*
 * init_timer_cpus - run init_timer_cpu() for every possible CPU.
 *
 * Iterates with for_each_possible_cpu(), so the set covered is the
 * "possible" CPU set rather than only the CPUs currently online.
 */
static void __init init_timer_cpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                init_timer_cpu(cpu);
}
/*
 * init_timer_cpu - initialise all timer bases belonging to one CPU.
 * @cpu: CPU whose per-CPU timer_bases[] entries are set up.
 *
 * Each of the NR_BASES bases gets its owning CPU recorded, its lock
 * initialised, its clock set to the current jiffies value and its next
 * expiry placed NEXT_TIMER_MAX_DELTA after that clock.
 */
static void __init init_timer_cpu(int cpu)
{
        struct timer_base *base;
        int i;

        for (i = 0; i < NR_BASES; i++) {
                base = per_cpu_ptr(&timer_bases[i], cpu);
                base->cpu = cpu;
                raw_spin_lock_init(&base->lock);
                base->clk = jiffies;
                /* Derived from the clk value just stored, not from a second jiffies read. */
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
                timer_base_init_expiry_lock(base);
        }
}

初始化percpu timer_bases[]

hrtimers_init()

/*
 * hrtimers_init - boot-time setup of the hrtimer core.
 *
 * Prepares the hrtimer base of the CPU executing this function
 * (smp_processor_id()) and installs hrtimer_run_softirq() as the handler
 * for HRTIMER_SOFTIRQ. Only the calling CPU is prepared here.
 */
void __init hrtimers_init(void)
{
        hrtimers_prepare_cpu(smp_processor_id());
        open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
/*
 * Functions related to boot-time initialization:
 */
/*
 * hrtimers_prepare_cpu - reset the hrtimer base of one CPU.
 * @cpu: CPU whose per-CPU hrtimer_bases instance is initialised.
 *
 * Links every clock base back to its CPU base, initialises each base's
 * sequence counter and timer queue, then resets the CPU base bookkeeping.
 *
 * Return: always 0.
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        int i;

        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];

                clock_b->cpu_base = cpu_base;
                /* The seqcount is associated with the CPU base's raw spinlock. */
                seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
                timerqueue_init_head(&clock_b->active);
        }

        cpu_base->cpu = cpu;
        cpu_base->active_bases = 0;
        cpu_base->hres_active = 0;
        cpu_base->hang_detected = 0;
        cpu_base->next_timer = NULL;
        cpu_base->softirq_next_timer = NULL;
        /* No timer pending yet: both next-expiry values start at KTIME_MAX. */
        cpu_base->expires_next = KTIME_MAX;
        cpu_base->softirq_expires_next = KTIME_MAX;
        cpu_base->online = 1;
        hrtimer_cpu_base_init_expiry_lock(cpu_base);
        return 0;
}

其中hrtimer_bases定义如下:

/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 *
 * Eight clock bases are statically initialised: MONOTONIC, REALTIME,
 * BOOTTIME and TAI, followed by a *_SOFT variant of each that shares the
 * clockid and get_time callback of its counterpart.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        .clock_base =
        {
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
                /* The *_SOFT bases below mirror the four bases above. */
                {
                        .index = HRTIMER_BASE_MONOTONIC_SOFT,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME_SOFT,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME_SOFT,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI_SOFT,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
        }
};

初始化boot_cpu(一般cpu0)对应的cpu_base

timekeeping_init()

这部分属于计时部分,暂时先略过

time_init()

/*
 * time_init - architecture time initialisation called from start_kernel().
 *
 * Initialises the clock providers, probes the timers, registers the hrtimer
 * based broadcast device and derives lpj_fine from the arch timer rate.
 * Panics when the arch timer reports a rate of zero.
 */
void __init time_init(void)
{
        u32 arch_timer_rate;

        of_clk_init(NULL);
        timer_probe();

        tick_setup_hrtimer_broadcast();

        /* A zero rate means no usable arch timer was set up by timer_probe(). */
        arch_timer_rate = arch_timer_get_rate();
        if (!arch_timer_rate)
                panic("Unable to initialise architected timer.\n");

        /* Calibrate the delay loop directly */
        lpj_fine = arch_timer_rate / HZ;

        pv_time_init();
}
of_clk_init(NULL)

这里提到了__clk_of_table,从System.map内容来看:

timer_probe()
/*
 * timer_probe - run the init function of every matching timer node.
 *
 * Walks all device tree nodes that match __timer_of_table, skips nodes that
 * are not available, and calls the init function stored in the match entry's
 * data pointer. Successfully initialised timers are counted; timers found
 * through the ACPI probe table are added to that count. A critical message
 * is printed when the final count is zero.
 */
void __init timer_probe(void)
{
        struct device_node *np;
        const struct of_device_id *match;
        of_init_fn_1_ret init_func_ret;
        unsigned timers = 0;
        int ret;

        for_each_matching_node_and_match(np, __timer_of_table, &match) {
                if (!of_device_is_available(np))
                        continue;

                /* match->data carries the per-compatible init function. */
                init_func_ret = match->data;

                ret = init_func_ret(np);
                if (ret) {
                        /* -EPROBE_DEFER is skipped silently; other errors are logged. */
                        if (ret != -EPROBE_DEFER)
                                pr_err("Failed to initialize '%pOF': %d\n", np,
                                       ret);
                        continue;
                }

                timers++;
        }

        timers += acpi_probe_device_table(timer);

        if (!timers)
                pr_crit("%s: no matching timers found\n", __func__);
}

这里提到了__timer_of_table,从System.map内容看:

 

这里以armv8_arch_timer为例:

drivers/clocksource/arm_arch_timer.c

 TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init)

因此上面代码中init_func_ret等于arch_timer_of_init()。

/*
 * arch_timer_of_init - device tree init function for the Arm arch timer.
 * @np: device tree node of the timer.
 *
 * Collects the timer interrupts from the node, configures the rate, picks
 * the PPI to use, registers the timer and finally runs the common init.
 *
 * Return: 0 on success or when the node is skipped because a CP15 timer was
 * already seen, -EINVAL when the selected PPI has no interrupt, otherwise
 * the error code of arch_timer_register() or arch_timer_common_init().
 */
static int __init arch_timer_of_init(struct device_node *np)
{
        int i, irq, ret;
        u32 rate;
        bool has_names;

        /* Only the first CP15 timer node is handled; later ones are skipped. */
        if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
                pr_warn("multiple nodes in dt, skipping\n");
                return 0;
        }

        arch_timers_present |= ARCH_TIMER_TYPE_CP15;

        has_names = of_property_read_bool(np, "interrupt-names");

        /*
         * Look each PPI up by name when the node has "interrupt-names",
         * otherwise by index. Only positive results are stored.
         */
        for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) {
                if (has_names)
                        irq = of_irq_get_byname(np, arch_timer_ppi_names[i]);
                else
                        irq = of_irq_get(np, i);
                if (irq > 0)
                        arch_timer_ppi[i] = irq;
        }

        arch_timer_populate_kvm_info();

        rate = arch_timer_get_cntfrq();
        arch_timer_of_configure_rate(rate, np);

        /* c3stop is set unless the node carries the "always-on" property. */
        arch_timer_c3stop = !of_property_read_bool(np, "always-on");

        /* Check for globally applicable workarounds */
        arch_timer_check_ool_workaround(ate_match_dt, np);

        /*
         * If we cannot rely on firmware initializing the timer registers then
         * we should use the physical timers instead.
         */
        if (IS_ENABLED(CONFIG_ARM) &&
            of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
                arch_timer_uses_ppi = ARCH_TIMER_PHYS_SECURE_PPI;
        else
                arch_timer_uses_ppi = arch_timer_select_ppi();

        /* The chosen PPI must have been found in the loop above. */
        if (!arch_timer_ppi[arch_timer_uses_ppi]) {
                pr_err("No interrupt available, giving up\n");
                return -EINVAL;
        }

        /* On some systems, the counter stops ticking when in suspend. */
        arch_counter_suspend_stop = of_property_read_bool(np,
                                                         "arm,no-tick-in-suspend");

        ret = arch_timer_register();
        if (ret)
                return ret;

        /* NOTE(review): presumably defers common init until other timer nodes are probed - confirm. */
        if (arch_timer_needs_of_probing())
                return 0;

        return arch_timer_common_init();
}

1、从dts中获取中断号

2、通过arch_timer_select_ppi()选择中断类型(本文实验环境下为ARCH_TIMER_PHYS_NONSECURE_PPI)

3、arch_timer_register()

        分配一个percpu类型clock_event_device arch_timer_evt

        申请arch timer中断,percpu类型,中断handler=arch_timer_handler_phys

        arch_timer_cpu_pm_init() 注册cpu进出cpuidle notify,用来save和restore cntval

        arch_timer_starting_cpu()

                __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk)

                对percpu arch_timer_evt进行初始化

                clockevents_config_and_register()

                        clockevents_register_device

                                list_add(&dev->list, &clockevent_devices);

                                tick_check_new_device(dev)

                                               tick_setup_device

                                                        初始化tick_device                                              

4、arch_timer_common_init()

                arch_timer_banner(arch_timers_present)//打印arch_timer: cp15 timer(s) running at 1000.00MHz (phys).

                arch_counter_register(arch_timers_present)

                                clocksource_register_hz(&clocksource_counter, arch_timer_rate)

                                                __clocksource_register_scale(cs, 1, hz)

                                                                __clocksource_update_freq_scale(cs, scale, freq)

                                sched_clock_register(arch_timer_read_counter, width, arch_timer_rate)

                                //打印sched_clock: 61 bits at 1000MHz, resolution 1ns, wraps every 4398046511103ns

                arch_timer_arch_init()

tick_setup_hrtimer_broadcast();
/*
 * tick_setup_hrtimer_broadcast - register the hrtimer backed broadcast device.
 *
 * Initialises bctimer as a CLOCK_MONOTONIC hrtimer in HRTIMER_MODE_ABS_HARD
 * mode with bc_handler() as its callback, then registers
 * ce_broadcast_hrtimer as a clock event device.
 */
void tick_setup_hrtimer_broadcast(void)
{
        hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
        bctimer.function = bc_handler;
        clockevents_register_device(&ce_broadcast_hrtimer);
}

  clockevents_register_device

                  list_add(&dev->list, &clockevent_devices);

                  tick_check_new_device(dev)

                                  tick_install_broadcast_device()

                                                  初始化tick_broadcast_device

sched_clock_init()

/*
 * sched_clock_init - boot-time initialisation of sched_clock.
 *
 * Increments the sched_clock_running static branch, then runs
 * generic_sched_clock_init() with local interrupts disabled.
 */
void __init sched_clock_init(void)
{
        static_branch_inc(&sched_clock_running);
        local_irq_disable();
        generic_sched_clock_init();
        local_irq_enable();
}

其中generic_sched_clock_init()定义如下:

/*
 * generic_sched_clock_init - finalise the generic sched_clock and start its
 * update timer.
 *
 * Registers the jiffies based reader when the active reader is still
 * jiffy_sched_clock_read, updates the sched_clock data once, and arms
 * sched_clock_timer with sched_clock_poll() as its callback and cd.wrap_kt
 * as its relative expiry.
 */
void __init generic_sched_clock_init(void)
{
        /*
         * If no sched_clock() function has been provided at that point,
         * make it the final one.
         */
        if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
                sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);

        update_sched_clock();

        /*
         * Start the timer to keep sched_clock() properly updated and
         * sets the initial epoch.
         */
        hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        sched_clock_timer.function = sched_clock_poll;
        hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}

从上面看sched_clock也定义了一个hrtimer,并且function为sched_clock_poll

从periodic切换为oneshot过程

从上面init过程可以看出,一开始arch timer的event_handler为tick_handle_periodic(),具体调用流程为:

tick_handle_periodic()

        tick_periodic(cpu)

                update_process_times()

                        run_local_timers()

                                hrtimer_run_queues()

                                        tick_check_oneshot_change()

在tick_check_oneshot_change()函数中,会判断是否timekeeping_valid_for_hres()和tick_is_oneshot_available()同时为真,如果同时为真,才会使tick_check_oneshot_change()返回1,才会真正开始切换为oneshot模式

[    0.323410] Call trace:
[    0.323533]  dump_backtrace+0x9c/0x100
[    0.323664]  show_stack+0x20/0x38
[    0.323724]  dump_stack_lvl+0x78/0x90
[    0.323785]  dump_stack+0x18/0x28
[    0.323827]  tk_setup_internals.constprop.0+0x3c/0x150
[    0.323887]  change_clocksource+0xe4/0x100
[    0.323936]  multi_cpu_stop+0xa4/0x178
[    0.323980]  cpu_stopper_thread+0x9c/0x130
[    0.324038]  smpboot_thread_fn+0x1c4/0x288
[    0.324091]  kthread+0x124/0x138
[    0.324130]  ret_from_fork+0x10/0x20

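 这里才会把tk->tkr_mono.clock切换为arch_sys_counter,使timekeeping_valid_for_hres()为真。在此之前,tk->tkr_mono.clock指向的是timekeeping_init()中通过clocksource_default_clock()设置的jiffies clocksource(该clocksource没有CLOCK_SOURCE_VALID_FOR_HRES标志),因为在timekeeping_init()时,arch timer的clocksource还没有注册到系统中。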

[    0.325209] Call trace:
[    0.325255]  dump_backtrace+0x9c/0x100
[    0.325301]  show_stack+0x20/0x38
[    0.325338]  dump_stack_lvl+0x78/0x90
[    0.325379]  dump_stack+0x18/0x28
[    0.325415]  tick_clock_notify+0x58/0x100
[    0.325456]  timekeeping_notify+0x4c/0x88
[    0.325499]  __clocksource_select+0x174/0x2e8
[    0.325546]  clocksource_select+0x38/0x50
[    0.325589]  clocksource_done_booting+0x48/0x70
[    0.325641]  do_one_initcall+0x50/0x2a0
[    0.325680]  kernel_init_freeable+0x21c/0x3f0
[    0.325727]  kernel_init+0x28/0x140
[    0.325765]  ret_from_fork+0x10/0x20

 这里会重新设置&ts->check_clocks,tick_check_oneshot_change()函数会再次判断timekeeping_valid_for_hres()是否为真。

注意:在注册tick_broadcast_device bc_timer时,也会调用tick_clock_notify来设置&ts->check_clocks,但因为那时tk->tkr_mono.clock还是jiffies clocksource(不支持高精度模式),timekeeping_valid_for_hres()为假,导致无法满足切换oneshot模式的要求。

hrtimer_switch_to_hres()

        tick_init_highres()

                tick_switch_to_oneshot(hrtimer_interrupt)

                        td->mode = TICKDEV_MODE_ONESHOT

                        dev->event_handler = hrtimer_interrupt

                        clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT)

                         tick_broadcast_switch_to_oneshot()

                                 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT

                                 bc = tick_broadcast_device.evtdev

                                 tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC);

                                         bc->event_handler = tick_handle_oneshot_broadcast;

                                         bc->next_event = KTIME_MAX

                                         if (from_periodic) {

                                                cpumask_copy(tmpmask, tick_broadcast_mask)

                                                cpumask_clear_cpu(cpu, tmpmask)

                                                cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask);

                                                nexttick = tick_get_next_period();

                                                 tick_broadcast_init_next_event(tmpmask, nexttick)

                                        }

        tick_setup_sched_timer()

                hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

                ts->sched_timer.function = tick_sched_timer

                hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update())

                hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);

                hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);

为tick sched定义一个hrtimer, sched_timer,expires为jiffies,一直循环下去,直到cpu的runqueue为空,cpu选择idle进程,从而让自己进入cpu idle,这时为了省电,才会disable sched_timer

内核版本:kernel6.10

实验环境:Aarch64 + Qemu

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值