内核在启动时通过RTC获取一个起始时间,然后就可以利用TSC、HPET等机制维护自己的时间。
1.时钟源
内核把每一个可用于计时的时钟抽象为时钟源(clocksource)结构:
/**
* struct clocksource - hardware abstraction for a free running counter
* Provides mostly state-free accessors to the underlying hardware.
* This is the structure used for system time.
*
* @name: ptr to clocksource name
* @list: list head for registration
* @rating: rating value for selection (higher is better)
* To avoid rating inflation the following
* list should give you a guide as to how
* to assign your clocksource a rating
* 1-99: Unfit for real use
* Only available for bootup and testing purposes.
* 100-199: Base level usability.
* Functional for real use, but not desired.
* 200-299: Good.
* A correct and usable clocksource.
* 300-399: Desired.
* A reasonably fast and accurate clocksource.
* 400-499: Perfect
* The ideal clocksource. A must-use where
* available.
* @read: returns a cycle value, passes clocksource as argument
* @enable: optional function to enable the clocksource
* @disable: optional function to disable the clocksource
* @mask: bitmask for two's complement
* subtraction of non 64 bit counters
* @mult: cycle to nanosecond multiplier
* @shift: cycle to nanosecond divisor (power of two)
* @max_idle_ns: max idle time permitted by the clocksource (nsecs)
* @flags: flags describing special properties
* @vread: vsyscall based read
* @suspend: suspend function for the clocksource, if necessary
* @resume: resume function for the clocksource, if necessary
*/
struct clocksource {
/*
* Hotpath data, fits in a single cache line when the
* clocksource itself is cacheline aligned.
*/
cycle_t (*read)(struct clocksource *cs);
cycle_t cycle_last;
cycle_t mask;
u32 mult;
u32 shift;
u64 max_idle_ns;
#ifdef CONFIG_IA64
void *fsys_mmio; /* used by fsyscall asm code */
#define CLKSRC_FSYS_MMIO_SET(mmio, addr) ((mmio) = (addr))
#else
#define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0)
#endif
const char *name; //name of the clock source
struct list_head list; //all clock sources are linked through this list
int rating;
cycle_t (*vread)(void); //vsyscall based read of the counter
int (*enable)(struct clocksource *cs);
void (*disable)(struct clocksource *cs);
unsigned long flags;
void (*suspend)(struct clocksource *cs);
void (*resume)(struct clocksource *cs);
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
/* Watchdog related data, used by the framework */
struct list_head wd_list;
cycle_t cs_last;
cycle_t wd_last;
#endif
} ____cacheline_aligned;
时钟源(clocksource)的初始化:
/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 */
void __init timekeeping_init(void)
{
struct clocksource *clock;
unsigned long flags;
struct timespec now, boot;
/* First read the current system time from the persistent clock (RTC) */
read_persistent_clock(&now);
read_boot_clock(&boot);
write_seqlock_irqsave(&xtime_lock, flags);
ntp_init();
/* Select a clock source */
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
timekeeper_setup_internals(clock);
/* Set the global time variables; raw_time is the former monotonic time */
xtime.tv_sec = now.tv_sec;
xtime.tv_nsec = now.tv_nsec;
raw_time.tv_sec = 0;
raw_time.tv_nsec = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
boot.tv_sec = xtime.tv_sec;
boot.tv_nsec = xtime.tv_nsec;
}
set_normalized_timespec(&wall_to_monotonic,
-boot.tv_sec, -boot.tv_nsec);
total_sleep_time.tv_sec = 0;
total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
}
我们来看一下选中了哪个时钟源:
/*
 * Default clock source selected at boot; __weak lets an
 * architecture override this to return a different clock.
 */
struct clocksource * __init __weak clocksource_default_clock(void)
{
return &clocksource_jiffies;
}
/* Lowest-priority fallback clocksource, advanced by the timer tick (jiffies). */
struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.read = jiffies_read,
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
};
选中了jiffies时钟源,每一次时钟中断会加一,此时钟源的rating是1,也就是所有时钟源中优先级最低的,只有系统中其它时钟源都失效时才会使用该时钟源。在之后的启动过程中,内核会初始化并注册其它优先级更高的时钟源。我们先来看在start_kernel中与时钟相关的初始化操作:
asmlinkage void __init start_kernel(void)
{
	/*
	 * Interrupts are still disabled. Do necessary setups, then
	 * enable them
	 */
	tick_init();		/* initialize the tick control; registers the notifier with the clockevents framework */
	boot_cpu_init();
	/* ... */
	mm_init();
	init_IRQ();
	prio_tree_init();
	init_timers();
	hrtimers_init();
	softirq_init();
	timekeeping_init();	/* Initializes the clocksource and common timekeeping values */
	time_init();		/* Initialize TSC and delay the periodic timer init to late x86_late_time_init() so ioremap works. */
	/* ... */
	profile_init();
	if (late_time_init)	/* time_init() only set this pointer; the real work (x86_late_time_init) runs here */
		late_time_init();
	sched_clock_init();
	/* ... */
	rest_init();
}
我们看到,在timekeeping_init()后,立即就调用time_init()初始化TSC时钟源。
/* * Initialize TSC and delay the periodic timer init to * late x86_late_time_init() so ioremap works. */ void __init time_init(void) { late_time_init = x86_late_time_init; } static __init void x86_late_time_init(void) { x86_init.timers.timer_init(); //调用hpet_time_init tsc_init(); //内核需要知道CPU的时钟周期,tsc_init()会计算并打印CPU当前主频 } /* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) //HPET clock event device对象初始化,下午详述 setup_pit_timer(); //这也是个相当重要的函数,我们稍后会将讲。 setup_default_timer_irq(); } void __init setup_default_timer_irq(void) { setup_irq(0, &irq0); //设置时钟中断处理函数 } /** * setup_irq - setup an interrupt * @irq: Interrupt line to setup * @act: irqaction for the interrupt * * Used to statically setup interrupts in the early boot process. */ int setup_irq(unsigned int irq, struct irqaction *act) { int retval; struct irq_desc *desc = irq_to_desc(irq); chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); return retval; } static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, .name = "timer" }; /* * Default timer interrupt handler for PIT/HPET */ static irqreturn_t timer_interrupt(int irq, void *dev_id) //默认的时钟中断处理函数 { /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); global_clock_event->event_handler(global_clock_event); //重要函数,稍后讲述。 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ if (MCA_bus) outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; }
2.tickless机制
tickless机制确保CPU在idle状态不会产生不必要的时钟中断,从而保证CPU尽量处于节能状态。
采用tickless机制后,时钟中断周期不固定,因此需要在运行中频繁地操作时钟中断设备,来配置下一次发生中断的时机。为屏蔽各种设备的操作差异,内核使用clock_event_device结构来表示一个设备:
/** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low * level handler of the event source * @set_next_event: set next event function * @next_event: local storage for the next event in oneshot mode * @max_delta_ns: maximum delta value in ns * @min_delta_ns: minimum delta value in ns * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) * @mode: operating mode assigned by the management code * @features: features * @retries: number of forced programming retries * @set_mode: set mode function * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code */ struct clock_event_device { void (*event_handler)(struct clock_event_device *); //中断处理函数 int (*set_next_event)(unsigned long evt, //定时器下一次发生时钟中断的时机 struct clock_event_device *); ktime_t next_event; //下一次该设备发出中断的时间。 /* *每一次中断后要设置下一次中断的时机,max和min分别表示该设备的最大和最小中断周期。 */ u64 max_delta_ns; u64 min_delta_ns; u32 mult; u32 shift; enum clock_event_mode mode; unsigned int features; //表示设备的功能 unsigned long retries; void (*broadcast)(const struct cpumask *mask); void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); unsigned long min_delta_ticks; unsigned long max_delta_ticks; const char *name; //设备名 int rating; int irq; //中断请求号 const struct cpumask *cpumask; //CPU掩码 struct list_head list; } ____cacheline_aligned;
用cat /proc/timer_list查看我的电脑中的clock event list:
Tick Device: mode: 1
Broadcast device
Clock Event Device: hpet
max_delta_ns: 149983013276
min_delta_ns: 13409
mult: 61496111
shift: 32
mode: 3
next_event: 9223372036854775807 nsecs
set_next_event: hpet_legacy_next_event
set_mode: hpet_legacy_set_mode
event_handler: tick_handle_oneshot_broadcast
retries: 0
tick_broadcast_mask: 00000000
tick_broadcast_oneshot_mask: 00000000
Tick Device: mode: 1
Per CPU device: 0
Clock Event Device: lapic
max_delta_ns: 103098591113
min_delta_ns: 1000
mult: 89461669
shift: 32
mode: 3
next_event: 7025048000000 nsecs
set_next_event: lapic_next_event
set_mode: lapic_timer_setup
event_handler: hrtimer_interrupt
retries: 0
Tick Device: mode: 1
Per CPU device: 1
Clock Event Device: lapic
max_delta_ns: 103098591113
min_delta_ns: 1000
mult: 89461669
shift: 32
mode: 3
next_event: 7025048000000 nsecs
set_next_event: lapic_next_event
set_mode: lapic_timer_setup
event_handler: hrtimer_interrupt
retries: 0
上面三个clock event device对象,其中lapic是两个CPU的本地时钟,当CPU进入一定的节能状态时,lapic时钟也会停止,此时需要另外的时钟接替处理原本属于lapic的时钟事件。这个过程被称为broadcast,因此有一个tick_broadcast_device指向当前使用的broadcast clock event device。在我的电脑中,tick_broadcast_device指向HPET时钟,也就是说当lapic停止时,HPET接替处理所有的时钟事件,它检查mask中的全部CPU的时钟队列,为它们处理到期的时钟。为了维护CPU和clock event device对象的对应关系,内核定义了一个tick_device结构:
/* Per-CPU binding between a CPU and the clock event device it currently uses, plus that device's tick mode. */
struct tick_device { struct clock_event_device *evtdev; enum tick_device_mode mode; };
下面来看clock event device的初始化工作:
/**
 * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
 */
int __init hpet_enable(void)
{
	unsigned long hpet_period;
	unsigned int id;
	u64 freq;
	int i;

	if (!is_hpet_capable())		/* probe whether the hardware supports HPET */
		return 0;

	hpet_set_mapping();		/* set up the fixmap memory mapping for the HPET */

	/*
	 * Read the period and check for a sane value:
	 */
	hpet_period = hpet_readl(HPET_PERIOD);

	/*
	 * AMD SB700 based systems with spread spectrum enabled use a
	 * SMM based HPET emulation to provide proper frequency
	 * setting. The SMM code is initialized with the first HPET
	 * register access and takes some time to complete. During
	 * this time the config register reads 0xffffffff. We check
	 * for max. 1000 loops whether the config register reads a non
	 * 0xffffffff value to make sure that HPET is up and running
	 * before we go further. A counting loop is safe, as the HPET
	 * access takes thousands of CPU cycles. On non SB700 based
	 * machines this check is only done once and has no side
	 * effects.
	 */
	for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
		if (i == 1000) {
			printk(KERN_WARNING
			       "HPET config register value = 0xFFFFFFFF. "
			       "Disabling HPET\n");
			goto out_nohpet;
		}
	}

	if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
		goto out_nohpet;

	/*
	 * The period is a femto seconds value. Convert it to a
	 * frequency.
	 */
	freq = FSEC_PER_SEC;
	do_div(freq, hpet_period);
	hpet_freq = freq;

	/*
	 * Read the HPET ID register to retrieve the IRQ routing
	 * information and the number of channels
	 */
	id = hpet_readl(HPET_ID);
	hpet_print_config();

#ifdef CONFIG_HPET_EMULATE_RTC
	/*
	 * The legacy routing mode needs at least two channels, tick timer
	 * and the rtc emulation channel.
	 */
	if (!(id & HPET_ID_NUMBER))
		goto out_nohpet;
#endif

	/* Register a clocksource object for the HPET (used for timekeeping) */
	if (hpet_clocksource_register())
		goto out_nohpet;

	/* Register a clock event device object for the HPET (used for clock interrupts) */
	if (id & HPET_ID_LEGSUP) {
		hpet_legacy_clockevent_register();
		return 1;
	}
	return 0;

out_nohpet:
	hpet_clear_mapping();
	hpet_address = 0;
	return 0;
}
我们重点来关注为HPET注册clock event device操作:
/*
 * The hpet clock event device
 */
static struct clock_event_device hpet_clockevent = {
	.name		= "hpet",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_mode	= hpet_legacy_set_mode,
	.set_next_event	= hpet_legacy_next_event,
	.irq		= 0,
	.rating		= 50,
};
static void hpet_legacy_clockevent_register(void) { /* Start HPET legacy interrupts */ hpet_enable_legacy_int(); /* * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); clockevents_config_and_register(&hpet_clockevent, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hpet_clockevent; /global_clock_event指向当前使用的全局clock event device对象。 printk(KERN_DEBUG "hpet clockevent registered\n"); } /** * clockevents_config_and_register - Configure and register a clock event device * @dev: device to register * @freq: The clock frequency * @min_delta: The minimum clock ticks to program in oneshot mode * @max_delta: The maximum clock ticks to program in oneshot mode * * min/max_delta can be 0 for devices which do not support oneshot mode. */ void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta) { dev->min_delta_ticks = min_delta; dev->max_delta_ticks = max_delta; clockevents_config(dev, freq); clockevents_register_device(dev); //******************************** } /** * clockevents_register_device - register a clock event device * @dev: device to register */ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); //把注册的clock event device 对象添加到全局的clock_devices链表中。 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); //请求CLOCK_EVT_NOTIFY_ADD的notify事件。 下文详述。 clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); }
这里需要解释一下clockevents_do_notify操作。在start_kernel中,有一个操作tick_init():
static struct notifier_block tick_notifier = {
	.notifier_call = tick_notify,
};

/**
 * tick_init - initialize the tick control
 *
 * Register the notifier with the clockevents framework
 */
void __init tick_init(void)
{
	clockevents_register_notifier(&tick_notifier);	/* install the global notifier object */
}
所以clockevents_do_notify的notify请求最终由tick_notify()函数处理:对于CLOCK_EVT_NOTIFY_ADD事件,最终将调用tick_check_new_device操作来响应。不同的CPU可以使用不同的clock event device,每一个CPU都有一个tick_device的结构来表示当前CPU使用的clock event device对象。tick_check_new_device的作用是通知当前CPU现在有个新的clock event device对象可以用了:
/*
 * Notification about clock event devices
 */
static int tick_notify(struct notifier_block *nb, unsigned long reason,
		       void *dev)
{
	switch (reason) {
	case CLOCK_EVT_NOTIFY_ADD:
		return tick_check_new_device(dev);

	case CLOCK_EVT_NOTIFY_BROADCAST_ON:
	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
		tick_broadcast_on_off(reason, dev);
		break;

	case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
	case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
		tick_broadcast_oneshot_control(reason);
		break;

	case CLOCK_EVT_NOTIFY_CPU_DYING:
		tick_handover_do_timer(dev);
		break;

	case CLOCK_EVT_NOTIFY_CPU_DEAD:
		tick_shutdown_broadcast_oneshot(dev);
		tick_shutdown_broadcast(dev);
		tick_shutdown(dev);
		break;

	case CLOCK_EVT_NOTIFY_SUSPEND:
		tick_suspend();
		tick_suspend_broadcast();
		break;

	case CLOCK_EVT_NOTIFY_RESUME:
		tick_resume();
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}
/* * Check, if the new registered device should be used. */static int tick_check_new_device(struct clock_event_device *newdev){struct clock_event_device *curdev;struct tick_device *td;int cpu, ret = NOTIFY_OK;unsigned long flags;raw_spin_lock_irqsave(&tick_device_lock, flags);cpu = smp_processor_id();if (!cpumask_test_cpu(cpu, newdev->cpumask))goto out_bc;td = &per_cpu(tick_cpu_device, cpu);curdev = td->evtdev;/* cpu local device ? */if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {/* * If the cpu affinity of the device interrupt can not * be set, ignore it. */if (!irq_can_set_affinity(newdev->irq))goto out_bc;/* * If we have a cpu local device already, do not replace it * by a non cpu local device */if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))goto out_bc;}/* * If we have an active device, then check the rating and the oneshot * feature. */if (curdev) {/* * Prefer one shot capable devices ! */if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))goto out_bc;/* * Check the rating */if (curdev->rating >= newdev->rating)goto out_bc;}/* * Replace the eventually existing device by the new * device. If the current device is the broadcast device, do * not give it back to the clockevents layer ! */if (tick_is_broadcast_device(curdev)) {clockevents_shutdown(curdev);curdev = NULL;}clockevents_exchange_device(curdev, newdev);tick_setup_device(td, newdev, cpu, cpumask_of(cpu));if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)tick_oneshot_notify();raw_spin_unlock_irqrestore(&tick_device_lock, flags);return NOTIFY_STOP;out_bc:/* * Can the new device be used as a broadcast device ? */if (tick_check_broadcast_device(newdev))ret = NOTIFY_STOP;raw_spin_unlock_irqrestore(&tick_device_lock, flags);return ret;}
如果tick_check_new_device函数发现需要使用新注册的设备作为该CPU的当前设备,会调用tick_setup_device函数:
需要指出的是,对于周期模式,只需对设备进行一次初始化,而对与ONESHOT模式,每一次中断后,都需要设置设备下一次中断的时机。/* * Setup the tick device */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, const struct cpumask *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; /* * First device setup ? */ /* 如果当前CPU还没有clock event device,就默认该新设备为周期性设备。 */ if (!td->evtdev) { /* * If no cpu took the do_timer update, assign it to * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); //计算中断周期 } /* * Startup in periodic mode first. */ td->mode = TICKDEV_MODE_PERIODIC; } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; /* * When the device is not per cpu, pin the interrupt to the * current cpu: */ /* 根据设备的工作模式,进行设置 */ if (!cpumask_equal(newdev->cpumask, cpumask)) irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current * device is registered as a placeholder for broadcast mode. * This allows us to handle this x86 misfeature in a generic * way. */ if (tick_device_uses_broadcast(newdev, cpu)) //如果设备支持broadcast功能 return; if (td->mode == TICKDEV_MODE_PERIODIC) //如果设备的工作模式是周期性的,则设置event_handler函数指针,并配置设备的中断周期 tick_setup_periodic(newdev, 0); else tick_setup_oneshot(newdev, handler, next_event); //如果设备的工作模式是ONESHOT模式的,则设置event_handler和next_event,并设置设备下一次发出中断的时机。 }
Local APIC clock event device的初始化:
对于lapic的初始化,启动CPU(或者主CPU)是调用setup_boot_APIC_clock完成的:
/*
 * The local apic timer can be used for any function which is CPU local.
 */
static struct clock_event_device lapic_clockevent = {
	.name		= "lapic",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
			  CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
	.shift		= 32,
	.set_mode	= lapic_timer_setup,
	.set_next_event	= lapic_next_event,
	.broadcast	= lapic_timer_broadcast,
	.rating		= 100,
	.irq		= -1,
};
对于非启动CPU(或从CPU)则调用setup_secondary_APIC_clock()完成:
/*
 * Setup the boot APIC
 *
 * Calibrate and verify the result.
 */
void __init setup_boot_APIC_clock(void)
{
	/*
	 * The local apic timer can be disabled via the kernel
	 * commandline or from the CPU detection code. Register the lapic
	 * timer as a dummy clock event source on SMP systems, so the
	 * broadcast mechanism is used. On UP systems simply ignore it.
	 */
	if (disable_apic_timer) {
		pr_info("Disabling APIC timer\n");
		/* No broadcast on UP ! */
		if (num_possible_cpus() > 1) {
			lapic_clockevent.mult = 1;
			setup_APIC_timer();
		}
		return;
	}

	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
		    "calibrating APIC timer ...\n");

	if (calibrate_APIC_clock()) {
		/* No broadcast on UP ! */
		if (num_possible_cpus() > 1)
			setup_APIC_timer();
		return;
	}

	/*
	 * If nmi_watchdog is set to IO_APIC, we need the
	 * PIT/HPET going. Otherwise register lapic as a dummy
	 * device.
	 */
	lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;

	/* Setup the lapic or request the broadcast */
	setup_APIC_timer();
}
/* Secondary (non-boot) CPUs just register their local APIC timer. */
void __cpuinit setup_secondary_APIC_clock(void) { setup_APIC_timer(); }
/*
 * Setup the local APIC timer for this CPU. Copy the initialized values
 * of the boot CPU and register the clock event in the framework.
 */
static void __cpuinit setup_APIC_timer(void)
{
	struct clock_event_device *levt = &__get_cpu_var(lapic_events);

	if (this_cpu_has(X86_FEATURE_ARAT)) {
		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
		/* Make LAPIC timer preferrable over percpu HPET */
		lapic_clockevent.rating = 150;
	}

	memcpy(levt, &lapic_clockevent, sizeof(*levt));
	levt->cpumask = cpumask_of(smp_processor_id());

	clockevents_register_device(levt);
}