引言
Linux timer子系统分为两部分:定时和计时,定时(timer)就像日常生活中的闹钟一样,时间到了,就处理对应的事务,而计时(timekeeping)主要是用来统计自开机以来的时间,维护系统的jiffies,以及内核的wall clock。因最近分析一个cpuidle性能问题,阅读了kernel6.10 timer子系统timer代码,在此谈一谈自己的理解。
Arm64 Generic Timer架构
随着技术的发展,arm架构越来越受市场青睐,如今Arm Soc时钟设计普遍采用了Generic timer架构。以arm64为例:
从上图可以看出,Arm64内部有一个counter,因为它是全局的,所以称为system counter,system counter处于always-on domain,按照固定的频率累计计数,而其他的core,也就是图中的PE,共享这个system counter,都使用它作为自己的clock source,因此所有core共享同一个时间。
Linux timer
基本概念
1、clock source
顾名思义,clock source就是指时钟源,是对上图中的counter一种抽象,内核使用struct clocksource结构体来表示一个clock source。
/*
 * struct clocksource - the kernel's abstraction of a clock source, i.e. of a
 * hardware counter. One instance describes one clock source.
 *
 * The values of @name, @mask, @max_cycles and @max_idle_ns are what the boot
 * log line "clocksource: <name>: mask: ... max_cycles: ..., max_idle_ns: ..."
 * reports for a registered clock source.
 */
struct clocksource {
/* Callback that takes the clocksource and returns a u64 (presumably the current counter value -- confirm). */
u64 (*read)(struct clocksource *cs);
u64 mask;
/* NOTE(review): mult/shift look like a cycles-to-nanoseconds scaling pair -- confirm. */
u32 mult;
u32 shift;
u64 max_idle_ns;
u32 maxadj;
u32 uncertainty_margin;
/* Architecture specific data, only present when the architecture opts in. */
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
struct arch_clocksource_data archdata;
#endif
u64 max_cycles;
const char *name;
/* List linkage (presumably into the list of registered clock sources -- confirm). */
struct list_head list;
int rating;
enum clocksource_ids id;
enum vdso_clock_mode vdso_clock_mode;
unsigned long flags;
/* Optional per-clocksource callbacks; each receives the clocksource itself. */
int (*enable)(struct clocksource *cs);
void (*disable)(struct clocksource *cs);
void (*suspend)(struct clocksource *cs);
void (*resume)(struct clocksource *cs);
void (*mark_unstable)(struct clocksource *cs);
void (*tick_stable)(struct clocksource *cs);
/* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
/* Watchdog related data, used by the framework */
struct list_head wd_list;
u64 cs_last;
u64 wd_last;
#endif
struct module *owner;
};
如果你留意观察linux kernel的boot log,你会发现类似如下的log:
clocksource: arch_sys_counter: mask: 0x1ffffffffffffff max_cycles: 0x1cd42e208c, max_idle_ns: 881590405314 ns
这就是所谓的arch timer,下文会详细说明其注册过程。
2、clock event device
众所周知,timer有闹钟的功能,一个闹钟时间到了,需要处理相应的事务,所以内核定义了struct clock_event_device结构体
/*
 * struct clock_event_device - describes a device that can raise an event (an
 * "alarm") when a programmed time is reached.
 */
struct clock_event_device {
/*
 * Handler attached to the device. The tick code described in this document
 * installs tick_handle_periodic() first and hrtimer_interrupt() after the
 * switch to oneshot mode.
 */
void (*event_handler)(struct clock_event_device *);
/* Two ways to program the next event: by an unsigned long value or by a ktime_t expiry. */
int (*set_next_event)(unsigned long evt, struct clock_event_device *);
int (*set_next_ktime)(ktime_t expires, struct clock_event_device *);
ktime_t next_event;
u64 max_delta_ns;
u64 min_delta_ns;
/* NOTE(review): mult/shift look like a ns-to-device-ticks scaling pair -- confirm. */
u32 mult;
u32 shift;
/* As the name says: meant to be read and written through accessor helpers only. */
enum clock_event_state state_use_accessors;
unsigned int features;
unsigned long retries;
/* State transition callbacks, one per target state named in the member. */
int (*set_state_periodic)(struct clock_event_device *);
int (*set_state_oneshot)(struct clock_event_device *);
int (*set_state_oneshot_stopped)(struct clock_event_device *);
int (*set_state_shutdown)(struct clock_event_device *);
int (*tick_resume)(struct clock_event_device *);
/* Callback taking a cpumask (presumably the CPUs to deliver the event to -- confirm). */
void (*broadcast)(const struct cpumask *mask);
void (*suspend)(struct clock_event_device *);
void (*resume)(struct clock_event_device *);
unsigned long min_delta_ticks;
unsigned long max_delta_ticks;
const char *name;
int rating;
int irq;
int bound_on;
const struct cpumask *cpumask;
/* Linkage into the clockevent_devices list (see clockevents_register_device()). */
struct list_head list;
struct module *owner;
} ____cacheline_aligned;
3、tick device
内核在clock_event_device的基础上做一层简单封装,就有了tick device
/*
 * struct tick_device - thin wrapper that pairs a clock event device with the
 * mode the tick is operated in.
 */
struct tick_device {
/* The underlying clock event device. */
struct clock_event_device *evtdev;
/* Operating mode, e.g. TICKDEV_MODE_PERIODIC or TICKDEV_MODE_ONESHOT. */
enum tick_device_mode mode;
};
tick_broadcast_device,是一种可以做broadcast的tick device,cpuidle经常需要与之打交道。
4、tick sched
进程的调度离不开tick(系统滴答),tick sched就是对早期内核tick的一种仿真
/*
 * struct tick_sched - state of the emulated scheduler tick.
 *
 * The periodic tick of early kernels is emulated with the hrtimer
 * @sched_timer, which tick_setup_sched_timer() initialises and starts.
 * The members are grouped by purpose, see the section comments below.
 */
struct tick_sched {
/* Common flags */
unsigned long flags;
/* Tick handling: jiffies stall check */
unsigned int stalled_jiffies;
unsigned long last_tick_jiffies;
/* Tick handling */
struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
ktime_t idle_waketime;
unsigned int got_idle_tick;
/* Idle entry */
seqcount_t idle_sleeptime_seq;
ktime_t idle_entrytime;
/* Tick stop */
unsigned long last_jiffies;
u64 timer_expires_base;
u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
unsigned long idle_calls;
unsigned long idle_sleeps;
/* Idle exit */
ktime_t idle_exittime;
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
/* Full dynticks handling */
atomic_t tick_dep_mask;
/* Clocksource changes */
/* Set by tick_clock_notify(); makes tick_check_oneshot_change() re-evaluate the switch to oneshot mode. */
unsigned long check_clocks;
};
5、hrtimer
早期内核的定时器基于timer wheel,其精度由tick决定,而tick由HZ决定:如果HZ设置为250,则tick就是4ms。然而随着技术的发展,网络等应用程序对定时精度有了更高的要求,4ms已无法满足,所以有了现在的hrtimer。hrtimer具有更高的精度,可以达到ns级别。
/*
 * struct hrtimer - a single high resolution timer.
 */
struct hrtimer {
/* Node used to queue the timer (presumably in the clock base's timerqueue "active" -- confirm). */
struct timerqueue_node node;
ktime_t _softexpires;
/* Timer callback; receives the timer and returns an enum hrtimer_restart value. */
enum hrtimer_restart (*function)(struct hrtimer *);
/* The clock base this timer belongs to. */
struct hrtimer_clock_base *base;
u8 state;
/* NOTE(review): the is_* members look like boolean mode markers (relative / soft / hard) -- confirm. */
u8 is_rel;
u8 is_soft;
u8 is_hard;
};
而struct hrtimer_clock_base定义如下:
/*
 * struct hrtimer_clock_base - one clock base of a CPU's hrtimer base.
 *
 * Instances live in hrtimer_cpu_base::clock_base[]. @index, @clockid and
 * @get_time are set statically in the hrtimer_bases definition; @cpu_base,
 * @seq and @active are initialised in hrtimers_prepare_cpu().
 */
struct hrtimer_clock_base {
/* Back pointer to the per-CPU base that contains this clock base. */
struct hrtimer_cpu_base *cpu_base;
/* HRTIMER_BASE_* value of this base. */
unsigned int index;
/* CLOCK_* id served by this base. */
clockid_t clockid;
/* Sequence counter associated with cpu_base->lock. */
seqcount_raw_spinlock_t seq;
struct hrtimer *running;
/* Timer queue head of this base. */
struct timerqueue_head active;
/* Returns the current time of the clock, e.g. ktime_get() for CLOCK_MONOTONIC. */
ktime_t (*get_time)(void);
ktime_t offset;
} __hrtimer_clock_base_align;
struct hrtimer_cpu_base定义如下:
/*
 * struct hrtimer_cpu_base - per-CPU container of all hrtimer clock bases.
 *
 * Defined per CPU as hrtimer_bases; hrtimers_prepare_cpu() resets most of the
 * members below for a given CPU.
 */
struct hrtimer_cpu_base {
/* Lock of this base; also the lock associated with each clock base's seqcount. */
raw_spinlock_t lock;
/* Number of the CPU this base belongs to. */
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
/* Single-bit state flags. */
unsigned int hres_active : 1,
in_hrtirq : 1,
hang_detected : 1,
softirq_activated : 1,
online : 1;
/* Counters that only exist with high resolution timer support. */
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int nr_events;
unsigned short nr_retries;
unsigned short nr_hangs;
unsigned int max_hang_time;
#endif
/* Extra members that only exist on PREEMPT_RT kernels. */
#ifdef CONFIG_PREEMPT_RT
spinlock_t softirq_expiry_lock;
atomic_t timer_waiters;
#endif
/* Initialised to KTIME_MAX / NULL by hrtimers_prepare_cpu(). */
ktime_t expires_next;
struct hrtimer *next_timer;
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
/* The clock bases of this CPU, indexed by hrtimer base type. */
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;
timer子系统init流程
一切还得从start_kernel()开始说起:
start_kernel()
.......
tick_init();
init_timers();
hrtimers_init();
timekeeping_init();
time_init();
sched_clock_init();
........
tick_init()
/**
 * tick_init - initialize the tick control
 *
 * Runs the broadcast initialisation first and the nohz initialisation
 * second; it does nothing else itself.
 */
void __init tick_init(void)
{
tick_broadcast_init();
tick_nohz_init();
}
/*
 * tick_broadcast_init - allocate the cpumasks used by the tick broadcast code.
 *
 * Every mask is allocated with zalloc_cpumask_var() and GFP_NOWAIT. The
 * three oneshot related masks are only allocated when CONFIG_TICK_ONESHOT is
 * enabled. The return values of the allocations are not checked here.
 */
void __init tick_broadcast_init(void)
{
zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
#ifdef CONFIG_TICK_ONESHOT
zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
#endif
}
分配了tick_broadcast_mask,tick_broadcast_on,tmpmask,tick_broadcast_oneshot_mask,tick_broadcast_pending_mask,tick_broadcast_force_mask。
init_timers()
/*
 * init_timers - boot-time setup of the timer code.
 *
 * Initialises the timer bases of every possible CPU, runs
 * posix_cputimers_init_work() and registers run_timer_softirq() as the
 * handler of TIMER_SOFTIRQ.
 */
void __init init_timers(void)
{
init_timer_cpus();
posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
/* Call init_timer_cpu() once for every possible CPU. */
static void __init init_timer_cpus(void)
{
int cpu;
for_each_possible_cpu(cpu)
init_timer_cpu(cpu);
}
/*
 * init_timer_cpu - initialise all NR_BASES per-CPU timer bases of @cpu.
 * @cpu: CPU whose timer_bases[] entries are initialised
 */
static void __init init_timer_cpu(int cpu)
{
struct timer_base *base;
int i;
for (i = 0; i < NR_BASES; i++) {
base = per_cpu_ptr(&timer_bases[i], cpu);
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
/* The base clock starts at the current jiffies value. */
base->clk = jiffies;
/* Initial next expiry: NEXT_TIMER_MAX_DELTA past the base clock. */
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
初始化percpu timer_bases[]
hrtimers_init()
/*
 * hrtimers_init - boot-time setup of the hrtimer code.
 *
 * Prepares the hrtimer base of the CPU that executes this function only,
 * then registers hrtimer_run_softirq() as the handler of HRTIMER_SOFTIRQ.
 */
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
/*
 * Functions related to boot-time initialization:
 */
/*
 * hrtimers_prepare_cpu - reset the per-CPU hrtimer base of @cpu.
 * @cpu: CPU whose hrtimer_bases instance is initialised
 *
 * Return: always 0.
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
/* Link every clock base to its CPU base and give it an empty timer queue. */
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
clock_b->cpu_base = cpu_base;
seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
timerqueue_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
cpu_base->active_bases = 0;
cpu_base->hres_active = 0;
cpu_base->hang_detected = 0;
/* No timer is pending yet: no next timer, expiry values at KTIME_MAX. */
cpu_base->next_timer = NULL;
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->online = 1;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
其中hrtimer_bases定义如下:
/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 *
 * The initialiser lists eight clock bases: MONOTONIC, REALTIME, BOOTTIME
 * and TAI, followed by their *_SOFT counterparts, which use the same
 * clockid and get_time function as the first four.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
.get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
/* The *_SOFT bases start here. */
{
.index = HRTIMER_BASE_MONOTONIC_SOFT,
.clockid = CLOCK_MONOTONIC,
.get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME_SOFT,
.clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME_SOFT,
.clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI_SOFT,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
}
};
初始化boot_cpu(一般cpu0)对应的cpu_base
timekeeping_init()
这部分属于计时部分,暂时先略过
time_init()
/*
 * time_init - architecture level timer initialisation at boot.
 *
 * Initialises the clocks, probes the timers, sets up the hrtimer based
 * broadcast device and then reads the architected timer rate. A rate of zero
 * is fatal (panic). The rate is also used to set lpj_fine before
 * pv_time_init() is called.
 */
void __init time_init(void)
{
u32 arch_timer_rate;
of_clk_init(NULL);
timer_probe();
tick_setup_hrtimer_broadcast();
arch_timer_rate = arch_timer_get_rate();
if (!arch_timer_rate)
panic("Unable to initialise architected timer.\n");
/* Calibrate the delay loop directly */
lpj_fine = arch_timer_rate / HZ;
pv_time_init();
}
of_clk_init(NULL)
这里提到了__clk_of_table,从System.map内容来看:
timer_probe()
/*
 * timer_probe - run the init function of every timer described in the
 * device tree, then add the timers found through the ACPI probe table.
 *
 * Walks all nodes that match __timer_of_table, skips nodes that are not
 * available and calls the init function stored in the ->data member of the
 * matching table entry. Only init functions that return 0 are counted. If
 * the final count (device tree plus ACPI) is zero, a critical message is
 * printed.
 */
void __init timer_probe(void)
{
struct device_node *np;
const struct of_device_id *match;
of_init_fn_1_ret init_func_ret;
unsigned timers = 0;
int ret;
for_each_matching_node_and_match(np, __timer_of_table, &match) {
if (!of_device_is_available(np))
continue;
/* The match entry carries the timer's init function in ->data. */
init_func_ret = match->data;
ret = init_func_ret(np);
if (ret) {
/* -EPROBE_DEFER is not reported as an error; every other failure is. */
if (ret != -EPROBE_DEFER)
pr_err("Failed to initialize '%pOF': %d\n", np,
ret);
continue;
}
timers++;
}
timers += acpi_probe_device_table(timer);
if (!timers)
pr_crit("%s: no matching timers found\n", __func__);
}
这里提到了__timer_of_table,从System.map内容看:
这里以armv8_arch_timer为例:
drivers/clocksource/arm_arch_timer.c
/* Declares the timer entry for the "arm,armv8-timer" compatible string; its init function is arch_timer_of_init(). */
TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init)
因此上面代码中init_func_ret等于arch_timer_of_init()。
/*
 * arch_timer_of_init - device tree init function of the architected timer.
 * @np: device tree node of the timer
 *
 * Collects the timer interrupts from @np, configures the rate, selects the
 * interrupt (PPI) to use and registers the timer.
 *
 * Return: 0 on success, when a CP15 timer node was already handled, or when
 * arch_timer_needs_of_probing() is true; -EINVAL if the selected PPI has no
 * interrupt; otherwise the result of arch_timer_register() or
 * arch_timer_common_init().
 */
static int __init arch_timer_of_init(struct device_node *np)
{
int i, irq, ret;
u32 rate;
bool has_names;
/* Only the first CP15 timer node is used; further nodes are skipped with a warning. */
if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
pr_warn("multiple nodes in dt, skipping\n");
return 0;
}
arch_timers_present |= ARCH_TIMER_TYPE_CP15;
/* Look the interrupts up by name if "interrupt-names" is present, by index otherwise. */
has_names = of_property_read_bool(np, "interrupt-names");
for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) {
if (has_names)
irq = of_irq_get_byname(np, arch_timer_ppi_names[i]);
else
irq = of_irq_get(np, i);
/* Only positive results are stored; other entries of arch_timer_ppi[] are left untouched. */
if (irq > 0)
arch_timer_ppi[i] = irq;
}
arch_timer_populate_kvm_info();
rate = arch_timer_get_cntfrq();
arch_timer_of_configure_rate(rate, np);
/* c3stop is set unless the node carries the "always-on" property. */
arch_timer_c3stop = !of_property_read_bool(np, "always-on");
/* Check for globally applicable workarounds */
arch_timer_check_ool_workaround(ate_match_dt, np);
/*
 * If we cannot rely on firmware initializing the timer registers then
 * we should use the physical timers instead.
 */
if (IS_ENABLED(CONFIG_ARM) &&
of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
arch_timer_uses_ppi = ARCH_TIMER_PHYS_SECURE_PPI;
else
arch_timer_uses_ppi = arch_timer_select_ppi();
/* Without an interrupt for the selected PPI the timer cannot be used. */
if (!arch_timer_ppi[arch_timer_uses_ppi]) {
pr_err("No interrupt available, giving up\n");
return -EINVAL;
}
/* On some systems, the counter stops ticking when in suspend. */
arch_counter_suspend_stop = of_property_read_bool(np,
"arm,no-tick-in-suspend");
ret = arch_timer_register();
if (ret)
return ret;
/* In this case arch_timer_common_init() is not called from here. */
if (arch_timer_needs_of_probing())
return 0;
return arch_timer_common_init();
}
1、从dts中获取中断号
2、选择中断类型为ARCH_TIMER_PHYS_NONSECURE_PPI
3、arch_timer_register()
分配一个percpu类型clock_event_device arch_timer_evt
申请arch timer中断,percpu类型,中断handler=arch_timer_handler_phys
arch_timer_cpu_pm_init() 注册CPU PM notifier(cpu进出cpuidle等低功耗状态时回调),用来save和restore CNTKCTL寄存器(saved_cntkctl)
arch_timer_starting_cpu()
__arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk)
对percpu arch_timer_evt进行初始化
clockevents_config_and_register()
clockevents_register_device
list_add(&dev->list, &clockevent_devices);
tick_check_new_device(dev)
tick_setup_device
初始化tick_device
4、arch_timer_common_init()
arch_timer_banner(arch_timers_present)//打印arch_timer: cp15 timer(s) running at 1000.00MHz (phys).
arch_counter_register(arch_timers_present)
clocksource_register_hz(&clocksource_counter, arch_timer_rate)
__clocksource_register_scale(cs, 1, hz)
__clocksource_update_freq_scale(cs, scale, freq)
sched_clock_register(arch_timer_read_counter, width, arch_timer_rate)
//打印sched_clock: 61 bits at 1000MHz, resolution 1ns, wraps every 4398046511103ns
arch_timer_arch_init()
tick_setup_hrtimer_broadcast();
/*
 * tick_setup_hrtimer_broadcast - set up the hrtimer based broadcast device.
 *
 * Initialises the hrtimer bctimer (CLOCK_MONOTONIC, absolute, hard mode)
 * with bc_handler as its callback and registers ce_broadcast_hrtimer as a
 * clock event device.
 */
void tick_setup_hrtimer_broadcast(void)
{
hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
bctimer.function = bc_handler;
clockevents_register_device(&ce_broadcast_hrtimer);
}
clockevents_register_device
list_add(&dev->list, &clockevent_devices);
tick_check_new_device(dev)
tick_install_broadcast_device()
初始化tick_broadcast_device
sched_clock_init()
/*
 * sched_clock_init - boot-time initialisation of sched_clock.
 *
 * Increments the sched_clock_running static branch and then runs
 * generic_sched_clock_init() with local interrupts disabled.
 */
void __init sched_clock_init(void)
{
static_branch_inc(&sched_clock_running);
local_irq_disable();
generic_sched_clock_init();
local_irq_enable();
}
其中generic_sched_clock_init()定义如下:
/*
 * generic_sched_clock_init - finalise the generic sched_clock and start the
 * hrtimer sched_clock_timer, whose callback is sched_clock_poll().
 */
void __init generic_sched_clock_init(void)
{
/*
 * If no sched_clock() function has been provided at that point,
 * make it the final one.
 */
if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock();
/*
 * Start the timer to keep sched_clock() properly updated and
 * sets the initial epoch.
 */
hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
sched_clock_timer.function = sched_clock_poll;
/* First expiry is cd.wrap_kt from now (relative mode). */
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
从上面看sched_clock也定义了一个hrtimer,并且function为sched_clock_poll
从periodic切换为oneshot过程
从上面init过程可以看出,一开始arch timer的event_handler为tick_handle_periodic(),具体调用流程为:
tick_handle_periodic()
tick_periodic(cpu)
update_process_times()
run_local_timers()
hrtimer_run_queues()
tick_check_oneshot_change()
在tick_check_oneshot_change()函数中,会判断是否timekeeping_valid_for_hres()和tick_is_oneshot_available()同时为真,如果同时为真,才会使tick_check_oneshot_change()返回1,才会真正开始切换为oneshot模式
[ 0.323410] Call trace:
[ 0.323533] dump_backtrace+0x9c/0x100
[ 0.323664] show_stack+0x20/0x38
[ 0.323724] dump_stack_lvl+0x78/0x90
[ 0.323785] dump_stack+0x18/0x28
[ 0.323827] tk_setup_internals.constprop.0+0x3c/0x150
[ 0.323887] change_clocksource+0xe4/0x100
[ 0.323936] multi_cpu_stop+0xa4/0x178
[ 0.323980] cpu_stopper_thread+0x9c/0x130
[ 0.324038] smpboot_thread_fn+0x1c4/0x288
[ 0.324091] kthread+0x124/0x138
[ 0.324130] ret_from_fork+0x10/0x20
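这里才会把timekeeping_valid_for_hres()所检查的tk->tkr_mono.clock切换为arch_sys_counter。在此之前,tk->tkr_mono.clock并不是NULL,而是timekeeping_init()通过clocksource_default_clock()设置的默认jiffies clocksource;它没有CLOCK_SOURCE_VALID_FOR_HRES标志,所以timekeeping_valid_for_hres()一直为假。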
[ 0.325209] Call trace:
[ 0.325255] dump_backtrace+0x9c/0x100
[ 0.325301] show_stack+0x20/0x38
[ 0.325338] dump_stack_lvl+0x78/0x90
[ 0.325379] dump_stack+0x18/0x28
[ 0.325415] tick_clock_notify+0x58/0x100
[ 0.325456] timekeeping_notify+0x4c/0x88
[ 0.325499] __clocksource_select+0x174/0x2e8
[ 0.325546] clocksource_select+0x38/0x50
[ 0.325589] clocksource_done_booting+0x48/0x70
[ 0.325641] do_one_initcall+0x50/0x2a0
[ 0.325680] kernel_init_freeable+0x21c/0x3f0
[ 0.325727] kernel_init+0x28/0x140
[ 0.325765] ret_from_fork+0x10/0x20
这里会重新设置&ts->check_clocks,tick_check_oneshot_change()函数会再次判断timekeeping_valid_for_hres()是否为真。
注意:即使在注册tick_broadcast_device bc_timer时,也会调用tick_clock_notify来设置&ts->check_clocks,但因为那时tk->tkr_mono.clock仍是不支持高精度模式的默认jiffies clocksource,timekeeping_valid_for_hres()为假,导致无法满足切换oneshot模式的要求。
hrtimer_switch_to_hres()
tick_init_highres()
tick_switch_to_oneshot(hrtimer_interrupt)
td->mode = TICKDEV_MODE_ONESHOT
dev->event_handler = hrtimer_interrupt
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT)
tick_broadcast_switch_to_oneshot()
tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT
bc = tick_broadcast_device.evtdev
tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC);
bc->event_handler = tick_handle_oneshot_broadcast;
bc->next_event = KTIME_MAX
if (from_periodic) {
cpumask_copy(tmpmask, tick_broadcast_mask)
cpumask_clear_cpu(cpu, tmpmask)
cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask);
nexttick = tick_get_next_period();
tick_broadcast_init_next_event(tmpmask, nexttick)
}
tick_setup_sched_timer()
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
ts->sched_timer.function = tick_sched_timer
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update())
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
为tick sched定义一个hrtimer,即sched_timer,其expires按一个jiffy(TICK_NSEC)的周期向前推进,一直循环下去,直到cpu的runqueue为空,cpu选择idle进程,从而让自己进入cpu idle,这时为了省电,才会disable sched_timer
内核版本:kernel6.10
实验环境:Aarch64 + Qemu