1 内核时间管理的相关组件
1.1 clocksource 和 clock_event_device
1.1.1 简介
外部时钟设备的主要作用是提供精确的计时功能和定期产生中断的功能,内核内部把提供精确计时的功能抽象为clocksource对象,把定期产生中断的功能抽象为clock_event_device对象。
《Linux内核精析》12.2.1 clocksource概述
《深入 LINUX 内核架构》P716
1.1.2 常见的外部时钟设备
TSC,HPET,ACPI PMT
《精通Linux内核开发》10.1 时间表示
《深入理解linux内核》P229
1.1.3 调试
/sys/devices/system/clocksource/
/sys/devices/system/clockevents/
1.2 timekeeping模块
1.2.1 简介
struct timekeeper定义在include/linux/timekeeper_internal.h中,保存了各种计时值。它是维护并操纵不同时间线的计时数据的主要数据结构,比如单调时间和原始时间
《Linux内核精析》12.3.2 timeval和timespec
1.2.2 数据结构
tk_core
//kernel/time/timekeeping.c
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
static struct {
seqcount_t seq; /* explicitly initialized to SEQCNT_ZERO below */
struct timekeeper timekeeper; /* no explicit initializer; static storage, so it starts zeroed */
} tk_core ____cacheline_aligned = {
.seq = SEQCNT_ZERO(tk_core.seq),
};
struct timekeeper;
//include/linux/timekeeper_internal.h
/**
* struct timekeeper - Structure holding internal timekeeping values.
* @tkr_mono: The readout base structure for CLOCK_MONOTONIC
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
* @xtime_sec: Current CLOCK_REALTIME time in seconds
* ......
*/
struct timekeeper {
struct tk_read_base tkr_mono;
struct tk_read_base tkr_raw;
u64 xtime_sec;
unsigned long ktime_sec;
/* The remaining members are omitted from this excerpt. */
......
};
struct tk_read_base;
//include/linux/timekeeper_internal.h
/**
* struct tk_read_base - base structure for timekeeping readout
* @clock: Current clocksource used for timekeeping.
* @mask: Bitmask for two's complement subtraction of non 64bit clocks
* @cycle_last: @clock cycle value at last update
* @mult: (NTP adjusted) multiplier for scaled math conversion
* @shift: Shift value for scaled math conversion
* @xtime_nsec: Shifted (fractional) nano seconds offset for readout
* @base: ktime_t (nanoseconds) base time for readout
* @base_real: Nanoseconds base value for clock REALTIME readout
* ......
*/
struct tk_read_base {
struct clocksource *clock;
u64 mask;
u64 cycle_last;
u32 mult; /* key variable for time synchronization */
u32 shift;
u64 xtime_nsec;
ktime_t base;
u64 base_real;
};
1.2.3 struct timekeeper中时间变量的更新流程
tick_sched_timer();
-> tick_sched_do_timer();
-> tick_do_update_jiffies64();
-> update_wall_time();
-> timekeeping_advance();
-> accumulate_nsecs_to_secs();
-> tk->xtime_sec++;
-> timekeeping_update();
-> tk_update_ktime_data(tk);
-> tk->ktime_sec = seconds;
-> tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
2 计算时间的流逝
时钟源硬件会产生固定周期的物理信号送给外部时钟设备,时钟设备硬件可以记录收到了多少个周期的时钟信号。
内核代码读取时钟设备硬件记录的周期数,然后将其转换成时间,周期数转换成时间的算法如下:
/*
 * Convert a clocksource cycle count to nanoseconds with scaled math:
 * the cycle count is multiplied by @mult in 64-bit arithmetic and the
 * product is shifted right by @shift. The result is returned as s64.
 */
static inline s64 clocksource_cyc2ns(u64 cycles, u32 mult, u32 shift)
{
	u64 scaled = cycles * mult;

	return scaled >> shift;
}
时钟源硬件并不总是精确的,它们的频率可能不一样。这个时钟变化会导致时间漂移。在这种情况下,可以调整mult变量来弥补这个时间漂移。
《精通Linux内核开发》10.2 硬件抽象
3 内核时间同步的关键变量:mult
应用层的时间同步程序如何修改内核的mult变量
应用层的时间同步程序(chronyd, phc2sys等)最终都会调用内核的do_adjtimex()来进行时间调整,这个流程会修改mult变量,如下:
do_adjtimex();
-> __do_adjtimex();
-> ntp_update_frequency();
-> tick_length += new_base - tick_length_base;
-> timekeeping_advance();
-> timekeeping_adjust(); //Adjust the multiplier to correct NTP error
-> tk->ntp_tick = ntp_tick_length();
-> mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
tk->xtime_remainder, tk->cycle_interval);
-> timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);
-> tk->tkr_mono.mult += mult_adj;
4 抓取实际的内核数据进行验证
4.1 查看当前clocksource的频率
当前系统的clocksource是TSC,如下:
# cat /sys/devices/system/clocksource/clocksource0/current_clocksource
tsc
TSC时钟源的频率是 2419.200 MHz,信息如下:
# dmesg | grep -i TSC
[ 0.000000] tsc: Detected 2400.000 MHz processor
[ 0.000000] tsc: Detected 2419.200 MHz TSC
[ 0.044651] TSC deadline timer available
[ 0.159823] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x22df1149949, max_idle_ns: 440795312789 ns
[ 0.778100] clocksource: Switched to clocksource tsc-early
[ 0.805072] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x22df1149949, max_idle_ns: 440795312789 ns
[ 0.805086] clocksource: Switched to clocksource tsc
4.2 使用kprobe模块抓取内核的mult和shift变量
4.2.1 查看tk_core结构体对象的地址
数据结构关系如下,要想抓取mult和shift变量,我们需要首先获取tk_core。
通过/proc/kallsyms文件中查看到的tk_core地址为0xffffffffae4a0100,信息如下:
# cat /proc/kallsyms | grep tk_core
ffffffffae4a0100 b tk_core
4.2.2 实现kprobe模块
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/timekeeper_internal.h>
#include "your_kernel_src/kernel/time/tick-internal.h"
#include "your_kernel_src/kernel/time/ntp_internal.h"
#include "your_kernel_src/kernel/time/timekeeping_internal.h"
/* Capacity of the buffer that holds the name of the probed symbol. */
#define MAX_SYMBOL_LEN 64
/* Name of the symbol to probe; defaults to "do_adjtimex". */
static char symbol[MAX_SYMBOL_LEN] = "do_adjtimex";
/* Expose the buffer as the string module parameter "symbol" (mode 0644). */
module_param_string(symbol, symbol, sizeof(symbol), 0644);
/*
 * Local mirror of the anonymous struct type behind the kernel's static
 * tk_core object (kernel/time/timekeeping.c). The member order must match
 * the kernel definition, because this type is overlaid on tk_core's address.
 */
struct test_tk_core {
seqcount_t seq;
struct timekeeper timekeeper;
};
/*
 * Address of the kernel's tk_core object as listed in /proc/kallsyms.
 * The integer constant is cast explicitly: initializing a pointer from a
 * bare integer is a constraint violation (-Wint-conversion).
 * NOTE(review): the address belongs to one particular running kernel and
 * presumably changes per build/boot (e.g. with KASLR) - re-check it in
 * /proc/kallsyms before building and loading the module.
 */
struct test_tk_core *tk_core = (struct test_tk_core *)0xffffffffae4a0100UL;
/*
 * For each probe you need to allocate a kprobe structure.
 * The probe location is given by name through the `symbol` buffer; the
 * pre/post handlers are filled in later by kprobe_init().
 */
static struct kprobe kp = {
.symbol_name = symbol,
};
/* kprobe pre_handler: called just before the probed instruction is executed */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
printk("--------------- clocksource name:%s, mult=%u, shift=%u \n",
tk_core->timekeeper.tkr_mono.clock->name,
tk_core->timekeeper.tkr_mono.mult,
tk_core->timekeeper.tkr_mono.shift);
/* A dump_stack() here will give a stack backtrace */
return 0;
}
/* kprobe post_handler: called after the probed instruction is executed */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
printk("--------------- clocksource name:%s, mult=%u, shift=%u \n",
tk_core->timekeeper.tkr_mono.clock->name,
tk_core->timekeeper.tkr_mono.mult,
tk_core->timekeeper.tkr_mono.shift);
}
/*
 * kprobe fault handler: logs the probe address and the trap number.
 *
 * @p:      the kprobe whose address is reported
 * @regs:   register state (unused here)
 * @trapnr: trap number, printed in the log line
 *
 * Returns 0 because the fault is not handled here.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/* The format string must end in "\n", not a literal 'n'. */
	pr_info("fault_handler: p->addr = 0x%p, trap #%d\n", p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}
/*
 * Module entry point: installs the pre/post handlers into kp and registers
 * the probe. Returns 0 on success, or the negative value returned by
 * register_kprobe() on failure.
 */
static int __init kprobe_init(void)
{
	int rc;

	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	/* Not installed in this example: kp.fault_handler = handler_fault; */

	rc = register_kprobe(&kp);
	if (rc >= 0) {
		pr_info("Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	pr_err("register_kprobe failed, returned %d\n", rc);
	return rc;
}
/*
 * Module exit point: unregisters the probe and then logs the address it
 * was planted at.
 */
static void __exit kprobe_exit(void)
{
	struct kprobe *probe = &kp;

	unregister_kprobe(probe);
	pr_info("kprobe at %p unregistered\n", probe->addr);
}
/* Register kprobe_init()/kprobe_exit() as the module's init and exit functions. */
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
将上面代码编译成内核模块(.ko),然后insmod安装即可。
4.3 对kprobe模块抓取到的数据进行分析
4.3.1 本机时间准确时
本机时间准确时,使用dmesg看到kprobe模块抓取的信息如下:
[21821.544394] --------------- clocksource name:tsc, mult=6935128, shift=24
[21821.544395] --------------- clocksource name:tsc, mult=6935128, shift=24
时钟频率是2419.200 MHz
(2419200000 * 6935128) >> 24 = 1 000 014 642 ns(即1秒内的2419200000个周期被换算成约1.000015秒)
4.3.2 将本机时间调慢7分钟
将本机时间比标准时间调慢7分钟,使用dmesg看到kprobe模块抓取的信息如下:
[20967.796255] --------------- clocksource name:tsc, mult=7628528, shift=24
[20967.796257] --------------- clocksource name:tsc, mult=7628528, shift=24
(2419200000 * 7628528) >> 24 = 1 099 999 841 ns
4.3.3 将本机时间调快6分钟
将本机时间比标准时间调快6分钟,使用dmesg看到kprobe模块抓取的信息如下:
[21149.432284] --------------- clocksource name:tsc, mult=6241523, shift=24
[21149.432288] --------------- clocksource name:tsc, mult=6241523, shift=24
(2419200000 * 6241523) >> 24 = 899 999 883 ns
5 总结
当linux内核记录的时间比标准时间慢时,时间同步程序会修改内核的mult变量,让内核时间走的快一些;
当linux内核记录的时间比标准时间快时,时间同步程序会修改内核的mult变量,让内核时间走的慢一些。