Time Management of Linux

Code review and reading notes for Professional Linux Kernel Architecture.

Code version: linux-3.2-rc1




/**
 * struct hrtimer - the basic hrtimer structure
 * @node:   timerqueue node, which also manages node.expires,
 *      the absolute expiry time in the hrtimers internal
 *      representation. The time is related to the clock on
 *      which the timer is based. Is setup by adding
 *      slack to the _softexpires value. For non range timers
 *      identical to _softexpires.
 * @_softexpires: the absolute earliest expiry time of the hrtimer.
 *      The time which was given as expiry time when the timer
 *      was armed.
 * @function:   timer expiry callback function
 * @base:   pointer to the timer base (per cpu and per clock)
 * @state:  state information (See bit values above)
 * @start_site: timer statistics field to store the site where the timer
 *      was started
 * @start_comm: timer statistics field to store the name of the process which
 *      started the timer
 * @start_pid: timer statistics field to store the pid of the task which
 *      started the timer
 *
 * The hrtimer structure must be initialized by hrtimer_init()
 */
struct hrtimer {
    struct timerqueue_node      node;
    ktime_t             _softexpires;
    enum hrtimer_restart        (*function)(struct hrtimer *);
    struct hrtimer_clock_base   *base;
    unsigned long           state;
#ifdef CONFIG_TIMER_STATS
    int             start_pid;
    void                *start_site;
    char                start_comm[16];
#endif
};
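
As the comment above says, an hrtimer must be initialized by hrtimer_init() before use.
A minimal usage sketch (hypothetical module-side code, not taken from the kernel tree;
the names demo_timer/demo_timer_cb and the 100ms period are made up for illustration):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_cb(struct hrtimer *t)
{
    /* do the periodic work here, then push the expiry 100ms into the future */
    hrtimer_forward_now(t, ktime_set(0, 100 * NSEC_PER_MSEC));
    return HRTIMER_RESTART;     /* HRTIMER_NORESTART would let the timer die */
}

static void demo_timer_setup(void)
{
    hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    demo_timer.function = demo_timer_cb;
    hrtimer_start(&demo_timer, ktime_set(0, 100 * NSEC_PER_MSEC),
              HRTIMER_MODE_REL);
}
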
struct timerqueue_node {
    struct rb_node node;
    ktime_t expires;
};

struct timerqueue_head {
    struct rb_root head;
    struct timerqueue_node *next;
};

/**
 * struct hrtimer_clock_base - the timer base for a specific clock
 * @cpu_base:       per cpu clock base
 * @index:      clock type index for per_cpu support when moving a
 *          timer to a base on another cpu.
 * @clockid:        clock id for per_cpu support
 * @active:     red black tree root node for the active timers
 * @resolution:     the resolution of the clock, in nanoseconds
 * @get_time:       function to retrieve the current time of the clock
 * @softirq_time:   the time when running the hrtimer queue in the softirq
 * @offset:     offset of this clock to the monotonic base
 */
struct hrtimer_clock_base {
    struct hrtimer_cpu_base *cpu_base;
    int         index;
    clockid_t       clockid;
    struct timerqueue_head  active;
    ktime_t         resolution;
    ktime_t         (*get_time)(void);
    ktime_t         softirq_time;
    ktime_t         offset;
};
/*
 * struct hrtimer_cpu_base - the per cpu clock bases
 * @lock:       lock protecting the base and associated clock bases
 *          and timers
 * @active_bases:   Bitfield to mark bases with active timers (bit i == 1 indicates
 *          that hrtimer_clock_base i has active timers)
 * @expires_next:   absolute time of the next event which was scheduled
 *          via clock_set_next_event()
 * @hres_active:    State of high resolution mode
 * @hang_detected:  The last hrtimer interrupt detected a hang
 * @nr_events:      Total number of hrtimer interrupt events
 * @nr_retries:     Total number of hrtimer interrupt retries
 * @nr_hangs:       Total number of hrtimer interrupt hangs
 * @max_hang_time:  Maximum time spent in hrtimer_interrupt
 * @clock_base:     array of clock bases for this cpu
 */
struct hrtimer_cpu_base {
    raw_spinlock_t          lock;
    unsigned long           active_bases;
#ifdef CONFIG_HIGH_RES_TIMERS
    ktime_t             expires_next;
    int             hres_active;
    int             hang_detected;
    unsigned long           nr_events;
    unsigned long           nr_retries;
    unsigned long           nr_hangs;
    ktime_t             max_hang_time;
#endif  
    struct hrtimer_clock_base   clock_base[HRTIMER_MAX_CLOCK_BASES];
};  


--- Implementation of the low-resolution clock
tick_handle_periodic
    tick_periodic
        do_timer
        update_process_times
        profile_tick
        
update_process_times
    run_local_timers
        hrtimer_run_queues
        raise_softirq // raise the timer-management softirq, handled by run_timer_softirq
     
run_timer_softirq
    hrtimer_run_pending
    __run_timers  // run the handlers of all expired timers
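
For contrast with hrtimers, here is a minimal sketch of a classical timer-wheel timer,
the kind that __run_timers expires; the names demo_timer/demo_timeout and the HZ/10
timeout are hypothetical:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timeout(unsigned long data)
{
    /* runs once in softirq context when the timer expires */
}

static void demo_timer_setup(void)
{
    setup_timer(&demo_timer, demo_timeout, 0);
    mod_timer(&demo_timer, jiffies + HZ / 10);  /* roughly 100ms from now */
}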
    
--- The generic clock event subsystem

A clock event device allows an event to be registered that fires at a specified point in the future.
In contrast to a full-featured timer, it can store only a single event.
/**
 * struct clock_event_device - clock event device descriptor
 * @event_handler:  Assigned by the framework to be called by the low
 *          level handler of the event source
 * @set_next_event: set next event function using a clocksource delta
 * @set_next_ktime: set next event function using a direct ktime value
 * @next_event:     local storage for the next event in oneshot mode
 * @max_delta_ns:   maximum delta value in ns
 * @min_delta_ns:   minimum delta value in ns
 // If the current time is t, the next clock event can occur in [t + min_delta_ns, t + max_delta_ns].
 * @mult:       nanosecond to cycles multiplier
 * @shift:      nanoseconds to cycles divisor (power of two)
 * @mode:       operating mode assigned by the management code
 * @features:       features
 * @retries:        number of forced programming retries
 * @set_mode:       set mode function
 * @broadcast:      function to broadcast events
 * @min_delta_ticks:    minimum delta value in ticks stored for reconfiguration
 * @max_delta_ticks:    maximum delta value in ticks stored for reconfiguration
 * @name:       ptr to clock event name
 * @rating:     variable to rate clock event devices
 * @irq:        IRQ number (only for non CPU local devices)
 * @cpumask:        cpumask to indicate for which CPUs this device works
 * @list:       list head for the management code
 */
struct clock_event_device {
    void            (*event_handler)(struct clock_event_device *);
    int         (*set_next_event)(unsigned long evt,
                          struct clock_event_device *);
    int         (*set_next_ktime)(ktime_t expires,
                          struct clock_event_device *);
    ktime_t         next_event;
    u64         max_delta_ns;
    u64         min_delta_ns;
    u32         mult;
    u32         shift;
    enum clock_event_mode   mode;
    unsigned int        features;
    unsigned long       retries;

    void            (*broadcast)(const struct cpumask *mask);
    void            (*set_mode)(enum clock_event_mode mode,
                        struct clock_event_device *);
    unsigned long       min_delta_ticks;
    unsigned long       max_delta_ticks;

    const char      *name;
    int         rating;
    int         irq;
    const struct cpumask    *cpumask;
    struct list_head    list;
} ____cacheline_aligned;
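
The @mult/@shift pair above converts a nanosecond delta into device cycles, and the delta
is clamped to [min_delta_ns, max_delta_ns] before the device is programmed; the kernel
performs the equivalent arithmetic when it programs the next event. A stand-alone sketch
(the helper name delta_ns_to_cycles is made up for illustration):

/* Hypothetical helper; illustrates only the ns -> cycles conversion. */
static unsigned long delta_ns_to_cycles(struct clock_event_device *dev, s64 delta_ns)
{
    unsigned long long clc;

    if (delta_ns > (s64) dev->max_delta_ns)
        delta_ns = dev->max_delta_ns;
    if (delta_ns < (s64) dev->min_delta_ns)
        delta_ns = dev->min_delta_ns;

    clc = (unsigned long long) delta_ns * dev->mult;
    clc >>= dev->shift;
    return (unsigned long) clc;     /* value handed to dev->set_next_event() */
}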


How are time values read from the various clock sources the machine provides?
For this purpose, the kernel defines an abstraction for clock sources.
/**
 * struct clocksource - hardware abstraction for a free running counter
 *  Provides mostly state-free accessors to the underlying hardware.
 *  This is the structure used for system time.
 *
 * @name:       ptr to clocksource name
 * @list:       list head for registration
 * @rating:     rating value for selection (higher is better)
 *          To avoid rating inflation the following
 *          list should give you a guide as to how
 *          to assign your clocksource a rating
 *          1-99: Unfit for real use
 *              Only available for bootup and testing purposes.
 *          100-199: Base level usability.
 *              Functional for real use, but not desired.
 *          200-299: Good.
 *              A correct and usable clocksource.
 *          300-399: Desired.
 *              A reasonably fast and accurate clocksource.
 *          400-499: Perfect
 *              The ideal clocksource. A must-use where
 *              available.
 * @read:       returns a cycle value, passes clocksource as argument // reads the current cycle counter value
 * @enable:     optional function to enable the clocksource
 * @disable:        optional function to disable the clocksource
 * @mask:       bitmask for two's complement
 *          subtraction of non 64 bit counters
 * @mult:       cycle to nanosecond multiplier
 * @shift:      cycle to nanosecond divisor (power of two)
 * @max_idle_ns:    max idle time permitted by the clocksource (nsecs)
 * @flags:      flags describing special properties
 * @archdata:       arch-specific data
 * @suspend:        suspend function for the clocksource, if necessary
 * @resume:     resume function for the clocksource, if necessary
 */
struct clocksource {
    /*
     * Hotpath data, fits in a single cache line when the
     * clocksource itself is cacheline aligned.
     */
    cycle_t (*read)(struct clocksource *cs);
    cycle_t cycle_last; // last cycle value read by the timekeeping code
    cycle_t mask;
    u32 mult;
    u32 shift;
    u64 max_idle_ns;

#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
    struct arch_clocksource_data archdata;
#endif

    const char *name;
    struct list_head list;
    int rating;
    int (*enable)(struct clocksource *cs);
    void (*disable)(struct clocksource *cs);
    unsigned long flags;
    void (*suspend)(struct clocksource *cs);
    void (*resume)(struct clocksource *cs);
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
    /* Watchdog related data, used by the framework */
    struct list_head wd_list;
    cycle_t cs_last;
    cycle_t wd_last;
#endif
} ____cacheline_aligned;



------------ How is a clock source used?
First, the clock source has to be registered with the kernel; clocksource_register() takes
care of this. The clock source is simply added to the global clocksource_list, which keeps
all available clock sources sorted by rating.
Similarly, all struct clock_event_device instances are kept on the global clockevent_devices
list; clockevents_register_device() registers a device and adds it to that list.
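
Reading the time from a registered clock source then amounts to: read the counter, form
the cycle delta since the last read (masked so non-64-bit counters wrap correctly), and
scale it to nanoseconds with @mult/@shift. A sketch of that arithmetic (the helper name is
hypothetical; update_wall_time() below computes the same delta):

/* Hypothetical illustration of how a clocksource delta becomes nanoseconds. */
static s64 clocksource_delta_to_ns(struct clocksource *cs, cycle_t cycle_last)
{
    cycle_t now, delta;

    now   = cs->read(cs);                   /* current counter value */
    delta = (now - cycle_last) & cs->mask;  /* two's complement wrap handling */

    return ((s64) delta * cs->mult) >> cs->shift;   /* cycles -> ns */
}
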
================

/*
 * hrtimer_interrupt - Scan the clock_base array of the current CPU's @hrtimer_cpu_base,
 *          execute the callback functions of the expired timers found in each base's
 *          red-black tree, and reprogram the clock event device of the current CPU to
 *          raise an interrupt when the next timer is due.
 * -------------------------------------------------------
 * High resolution timer interrupt
 * Called with interrupts disabled
 *
 * This function is called once a high-resolution clock is up and running and the
 * transition to high-resolution mode is completely finished.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
    struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
    ktime_t expires_next, now, entry_time, delta;
    int i, retries = 0;

    BUG_ON(!cpu_base->hres_active);
    /* Total number of hrtimer interrupt events update */
    cpu_base->nr_events++;
    dev->next_event.tv64 = KTIME_MAX;    /* no event programmed yet; reprogrammed below via tick_program_event() */
    
     /* Get current time.  */
    entry_time = now = ktime_get();
retry:
    expires_next.tv64 = KTIME_MAX;

    raw_spin_lock(&cpu_base->lock);
    /*
     * We set expires_next to KTIME_MAX here with cpu_base->lock
     * held to prevent that a timer is enqueued in our queue via
     * the migration code. This does not affect enqueueing of
     * timers which run their callback and need to be requeued on
     * this CPU.
     */
    cpu_base->expires_next.tv64 = KTIME_MAX;
    /* Iterate over all clock bases (monotonic, realtime and boottime) */
    for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
        struct hrtimer_clock_base *base;
        struct timerqueue_node *node;
        ktime_t basenow;
        /* find out the next active hrtimer_clock_base */
        if (!(cpu_base->active_bases & (1 << i)))
            continue;
         
        base = cpu_base->clock_base + i;
          /* Adjust the 'current time' so that @basenow is expressed in this clock base's time */
        basenow = ktime_add(now, base->offset);
        /* scan the @base's red-black hrtimer tree */
        while ((node = timerqueue_getnext(&base->active))) {
            struct hrtimer *timer;
               /* get the timer */
            timer = container_of(node, struct hrtimer, node);

            /*
             * The immediate goal for using the softexpires is
             * minimizing wakeups, not running timers at the
             * earliest interrupt after their soft expiration.
             * This allows us to avoid using a Priority Search
             * Tree, which can answer a stabbing querry for
             * overlapping intervals and instead use the simple
             * BST we already have.
             * We don't add extra wakeups by delaying timers that
             * are right-of a not yet expired timer, because that
             * timer will have to trigger a wakeup anyway.
            */
             /* If the timer's soft expiry time lies in the future, processing can be stopped */
            if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
                ktime_t expires;
                   /* calculate the next expires time */
                expires = ktime_sub(hrtimer_get_expires(timer),
                            base->offset);
                if (expires.tv64 < expires_next.tv64)
                    expires_next = expires;/* Save the next expires time which used to
                                                           * reprogram the clock event device  */
                break;
            }
        /*
         *  Remove @timer from its base's hrtimer red-black tree, execute @timer's
         *  callback function and, if requested, restart @timer by inserting it back into
         *  the base's hrtimer red-black tree.
         */
            __run_hrtimer(timer, &basenow);
        }
    }

    /*
     * Store the new expiry value so the migration code can verify
     * against it.
     */
    cpu_base->expires_next = expires_next;
    raw_spin_unlock(&cpu_base->lock);

    /* Reprogramming necessary ? */
    if (expires_next.tv64 == KTIME_MAX ||
      /* Reprogram the clock event device of this cpu */
        !tick_program_event(expires_next, 0)) {
        cpu_base->hang_detected = 0;
        return;
    }
    /*
     * The next timer was already expired due to:
     * - tracing
     * - long lasting callbacks
     * - being scheduled away when running in a VM
     *
     * We need to prevent that we loop forever in the hrtimer
     * interrupt routine. We give it 3 attempts to avoid
     * overreacting on some spurious event.
     */
    now = ktime_get();
    cpu_base->nr_retries++;
    if (++retries < 3)
        goto retry;
    /*
     * Give the system a chance to do something else than looping
     * here. We stored the entry time, so we know exactly how long
     * we spent here. We schedule the next event this amount of
     * time away.
     */
    cpu_base->nr_hangs++;
    cpu_base->hang_detected = 1;
    delta = ktime_sub(now, entry_time);
    /* update the maximum time spent in hrtimer_interrupt */
    if (delta.tv64 > cpu_base->max_hang_time.tv64)
        cpu_base->max_hang_time = delta;
    /*
     * Limit it to a sensible value as we enforce a longer
     * delay. Give the CPU at least 100ms to catch up.
     */
    if (delta.tv64 > 100 * NSEC_PER_MSEC)
        expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
    else
        expires_next = ktime_add(now, delta);
    tick_program_event(expires_next, 1);
    printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
            ktime_to_ns(delta));
}

/*
 * __run_hrtimer - remove @timer from its base's hrtimer red-black tree, execute @timer's
 *           callback function and, if the callback requests it, restart @timer by inserting
 *           it back into the base's hrtimer red-black tree.
 */
static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
{
    struct hrtimer_clock_base *base = timer->base;
    struct hrtimer_cpu_base *cpu_base = base->cpu_base;
    enum hrtimer_restart (*fn)(struct hrtimer *);
    int restart;
     /* IRQs are disabled */
    WARN_ON(!irqs_disabled());
    
    debug_deactivate(timer);
    /* remove @timer from @base's hrtimer red-black tree,
    * and update @base->active accordingly
    */
    __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
    /* Update the statistics for a timer. */
    timer_stats_account_hrtimer(timer);
    fn = timer->function;

    /*
     * Because we run timers from hardirq context, there is no chance
     * they get migrated to another cpu, therefore its safe to unlock
     * the timer base.
     */
    raw_spin_unlock(&cpu_base->lock);
    trace_hrtimer_expire_entry(timer, now);
    /* execute the timer's callback function */
    restart = fn(timer);
    trace_hrtimer_expire_exit(timer);
    raw_spin_lock(&cpu_base->lock);
    /*
     * Note: We clear the CALLBACK bit after enqueue_hrtimer and
     * we do not reprogramm the event hardware. Happens either in
     * hrtimer_start_range_ns() or in hrtimer_interrupt()
     */
    if (restart != HRTIMER_NORESTART) {
        BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
         /* (re)start a timer:  The timer is inserted in expiry order into the
         * @base->active red black tree
         */
        enqueue_hrtimer(timer, base);
    }

    WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
     /* Clear the callback flag */
    timer->state &= ~HRTIMER_STATE_CALLBACK;
}

---------------------------------
 Call tree:
    run_local_timers  ----  Called by the local, per-CPU timer interrupt on SMP.
             hrtimer_run_queues();
             raise_softirq(TIMER_SOFTIRQ);
/*
 *  
 * hrtimer_run_queues - Similar to hrtimer_interrupt()
 *
 * Called from hardirq context every jiffy
 * High-Resolution Timers in Low-Resolution Mode
 */
void hrtimer_run_queues(void)
{
    struct timerqueue_node *node;
    struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
    struct hrtimer_clock_base *base;
    int index, gettime = 1;
    /* Is the high resolution mode active ? */
    if (hrtimer_hres_active())
        return;
    /*  scan the clock_base array */
    for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
        base = &cpu_base->clock_base[index];
        if (!timerqueue_getnext(&base->active))
            continue;

        if (gettime) {
             /*
            * Get the coarse grained time at the softirq based on xtime and
            * wall_to_monotonic.
             */
            hrtimer_get_softirq_time(cpu_base);
            gettime = 0;
        }

        raw_spin_lock(&cpu_base->lock);
        while ((node = timerqueue_getnext(&base->active))) {
            struct hrtimer *timer;

            timer = container_of(node, struct hrtimer, node);
         /* If the timer's expiry time lies in the future, processing can be stopped */
            if (base->softirq_time.tv64 <=
                    hrtimer_get_expires_tv64(timer))
                break;
        /*
         *  Remove @timer from its base's hrtimer red-black tree, execute @timer's
         *  callback function and, if requested, restart @timer by inserting it back into
         *  the base's hrtimer red-black tree.
         */
            __run_hrtimer(timer, &base->softirq_time);
        }
        raw_spin_unlock(&cpu_base->lock);
    }
}


/*
 * Get the coarse grained time at the softirq based on xtime and
 * wall_to_monotonic.
 */
static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
    ktime_t xtim, mono, boot;
    struct timespec xts, tom, slp;

    get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);

    xtim = timespec_to_ktime(xts);
    mono = ktime_add(xtim, timespec_to_ktime(tom));
    boot = ktime_add(mono, timespec_to_ktime(slp));
    base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
    base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
    base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
}

/**
 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
 *    and sleep offsets.
 * @xtim:   pointer to timespec to be set with xtime
 * @wtom:   pointer to timespec to be set with wall_to_monotonic
 * @sleep:  pointer to timespec to be set with time in suspend
 */
void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
                struct timespec *wtom, struct timespec *sleep)
{   
    unsigned long seq;

    do {
        seq = read_seqbegin(&xtime_lock);
        *xtim = xtime;
        *wtom = wall_to_monotonic;
        *sleep = total_sleep_time;
    } while (read_seqretry(&xtime_lock, seq));
}

======15.4.4=======  Periodic Tick Emulation
The clock event handler in high-resolution mode is hrtimer_interrupt().
This implies that tick_handle_periodic() does not provide the periodic tick anymore,
so equivalent functionality needs to be made available based on high-resolution
timers. The implementation is (nearly) identical between the situations with and
without dynamic ticks.

/**
 * struct tick_sched - sched tick emulation and no idle tick control/stats
 * @sched_timer:    hrtimer to schedule the periodic tick in high
 *          resolution mode
 * @idle_tick:      Store the last idle tick expiry time when the tick
 *          timer is modified for idle sleeps. This is necessary
 *          to resume the tick timer operation in the timeline
 *          when the CPU returns from idle
 * @tick_stopped:   Indicator that the idle tick has been stopped
 * @idle_jiffies:   jiffies at the entry to idle for idle time accounting
 * @idle_calls:     Total number of idle calls
 * @idle_sleeps:    Number of idle calls, where the sched tick was stopped
 * @idle_entrytime: Time when the idle call was entered
 * @idle_waketime:  Time when the idle was interrupted
 * @idle_exittime:  Time when the idle state was left
 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
 * @iowait_sleeptime:   Sum of the time slept in idle with sched tick stopped, with IO outstanding
 * @sleep_length:   Duration of the current idle sleep
 * @do_timer_lst:   CPU was the last one doing do_timer before going idle
 */
struct tick_sched {
    struct hrtimer          sched_timer;
    unsigned long           check_clocks;
    enum tick_nohz_mode     nohz_mode;
    ktime_t             idle_tick;
    int             inidle;
    int             tick_stopped;
    unsigned long           idle_jiffies;
    unsigned long           idle_calls;
    unsigned long           idle_sleeps;
    int             idle_active;
    ktime_t             idle_entrytime;
    ktime_t             idle_waketime;
    ktime_t             idle_exittime;
    ktime_t             idle_sleeptime;
    ktime_t             iowait_sleeptime;
    ktime_t             sleep_length;
    unsigned long           last_jiffies;
    unsigned long           next_jiffies;
    ktime_t             idle_expires;
    int             do_timer_last;
};

 /*
 * tick_sched_timer - update jiffies_64, increment the wall time and
 *          update the avenrun load, reset the software watchdog, manage
 *          process-specific time elements and rearm the @timer
 * --------------------------------
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled and timer->base->cpu_base->lock held.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
    struct tick_sched *ts =
        container_of(timer, struct tick_sched, sched_timer);
    struct pt_regs *regs = get_irq_regs();
     /* get the current time */
    ktime_t now = ktime_get();
    int cpu = smp_processor_id();

#ifdef CONFIG_NO_HZ
    /*
     * Check if the do_timer duty was dropped. We don't care about
     * concurrency: This happens only when the cpu in charge went
     * into a long sleep. If two cpus happen to assign themself to
     * this duty, then the jiffies update is still serialized by
     * xtime_lock.
     */
    if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
        tick_do_timer_cpu = cpu;
#endif
    /* Check, if the jiffies need an update */
    if (tick_do_timer_cpu == cpu)
     /*  update jiffies_64,  increment the wall time and
     *  update the avenrun load
     */
        tick_do_update_jiffies64(now);

    /*
     * Do not call, when we are not in irq context and have
     * no valid regs pointer
     */
    if (regs) {
        /*
         * When we are idle and the tick is stopped, we have to touch
         * the watchdog as we might not schedule for a really long
         * time. This happens on complete idle SMP systems while
         * waiting on the login prompt. We also increment the "start of
         * idle" jiffy stamp so the idle accounting adjustment we do
         * when we go busy again does not account too much ticks.
         */
        if (ts->tick_stopped) {
              /* reset the software watchdog */
            touch_softlockup_watchdog();
            ts->idle_jiffies++;
        }
         /* Used to manage process-specific time elements */
        update_process_times(user_mode(regs));
        profile_tick(CPU_PROFILING);
    }
     /* advance the timer's expiry in multiples of tick_period so that it lies after @now */
    hrtimer_forward(timer, now, tick_period);

    return HRTIMER_RESTART;
}

/*
 * tick_do_update_jiffies64 - update jiffies_64,  increment the wall time and
 *                                           update the avenrun load
 * -----------------
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
    unsigned long ticks = 0;
    ktime_t delta;

    /*
     * Do a quick check without holding xtime_lock:
     */
    delta = ktime_sub(now, last_jiffies_update);
    /* jiffies update is NOT needed */
    if (delta.tv64 < tick_period.tv64)
        return;

    /* Reevalute with xtime_lock held */
    write_seqlock(&xtime_lock);

    delta = ktime_sub(now, last_jiffies_update);
    /* jiffies update is needed */
    if (delta.tv64 >= tick_period.tv64) {

        delta = ktime_sub(delta, tick_period);
         /* Remember the last updating time of jiffies64 */
        last_jiffies_update = ktime_add(last_jiffies_update,
                        tick_period);

        /* Slow path for long timeouts */
        /* This will happen when we missed some ticks */
        if (unlikely(delta.tv64 >= tick_period.tv64)) {
            s64 incr = ktime_to_ns(tick_period);
               /* (ticks + 1) is  number of ticks  we missed */
            ticks = ktime_divns(delta, incr);
              /* Remember the last updating time of jiffies64 */
            last_jiffies_update = ktime_add_ns(last_jiffies_update,
                               incr * ticks);
        }
        /* update jiffies_64,  increment the wall time and update the avenrun load */
        do_timer(++ticks);

        /* Keep the tick_next_period variable up to date */
        tick_next_period = ktime_add(last_jiffies_update, tick_period);
    }
    write_sequnlock(&xtime_lock);
}


/*
 * do_timer - update jiffies_64,  increment the wall time and update the avenrun load
 *--------------------------
 * The 64-bit jiffies value is not atomic - you MUST NOT read it
 * without sampling the sequence number in xtime_lock.
 * jiffies is defined in the linker script...
 */
void do_timer(unsigned long ticks)
{   
    jiffies_64 += ticks;
     /* Uses the current clocksource to increment the wall time */
    update_wall_time();
    /* update the avenrun load */
    calc_global_load(ticks);
}
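
Since jiffies_64 is not updated atomically on 32-bit systems, readers must sample it under
the xtime_lock seqlock, the same pattern shown in get_xtime_and_monotonic_and_sleep_offset()
above. This is roughly what the kernel's get_jiffies_64() does on such systems:

u64 get_jiffies_64(void)
{
    unsigned long seq;
    u64 ret;

    do {
        seq = read_seqbegin(&xtime_lock);       /* start a consistent read section */
        ret = jiffies_64;
    } while (read_seqretry(&xtime_lock, seq));  /* retry if a writer interfered */

    return ret;
}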

/* Structure holding internal timekeeping values. */
struct timekeeper {
    /* Current clocksource used for timekeeping. */
    struct clocksource *clock;
    /* The shift value of the current clocksource. */
    int shift;
    
    /* Number of clock cycles in one NTP interval. */
    cycle_t cycle_interval;
    /* Number of clock shifted nano seconds in one NTP interval. */
    u64 xtime_interval;
    /* shifted nano seconds left over when rounding cycle_interval */
    s64 xtime_remainder;
    /* Raw nano seconds accumulated per NTP interval. */
    u32 raw_interval;

    /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
    u64 xtime_nsec;
    /* Difference between accumulated time and NTP time in ntp
     * shifted nano seconds. */
    s64 ntp_error;
    /* Shift conversion between clock shifted nano seconds and
     * ntp shifted nano seconds. */
    int ntp_error_shift;
    /* NTP adjusted clock multiplier */
    u32 mult;
};

/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Called from the timer interrupt, must hold a write on xtime_lock.
 */
static void update_wall_time(void)
{
    struct clocksource *clock;
    cycle_t offset;
    int shift = 0, maxshift;

    /* Make sure we're fully resumed: */
    if (unlikely(timekeeping_suspended))
        return;

    clock = timekeeper.clock;
    /* number of cycles elapsed since the wall time was last updated */
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
    offset = timekeeper.cycle_interval;
#else
    offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
    timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;

    /*
     * With NO_HZ we may have to accumulate many cycle_intervals
     * (think "ticks") worth of time at once. To do this efficiently,
     * we calculate the largest doubling multiple of cycle_intervals
     * that is smaller then the offset. We then accumulate that
     * chunk in one go, and then try to consume the next smaller
     * doubled multiple.
     */
    shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
    shift = max(0, shift);
    /* Bound shift to one less then what overflows tick_length */
    maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
    shift = min(shift, maxshift);
    while (offset >= timekeeper.cycle_interval) {
        offset = logarithmic_accumulation(offset, shift);
        if(offset < timekeeper.cycle_interval<<shift)
            shift--;
    }

    /* correct the clock when NTP error is too big */
    timekeeping_adjust(offset);

    /*
     * Since in the loop above, we accumulate any amount of time
     * in xtime_nsec over a second into xtime.tv_sec, its possible for
     * xtime_nsec to be fairly small after the loop. Further, if we're
     * slightly speeding the clocksource up in timekeeping_adjust(),
     * its possible the required corrective factor to xtime_nsec could
     * cause it to underflow.
     *
     * Now, we cannot simply roll the accumulated second back, since
     * the NTP subsystem has been notified via second_overflow. So
     * instead we push xtime_nsec forward by the amount we underflowed,
     * and add that amount into the error.
     *
     * We'll correct this error next time through this function, when
     * xtime_nsec is not as small.
     */
    if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
        s64 neg = -(s64)timekeeper.xtime_nsec;
        timekeeper.xtime_nsec = 0;
        timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
    }


    /*
     * Store full nanoseconds into xtime after rounding it up and
     * add the remainder to the error difference.
     */
    xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
    timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
    timekeeper.ntp_error += timekeeper.xtime_nsec <<
                timekeeper.ntp_error_shift;

      /*
     * Finally, make sure that after the rounding
     * xtime.tv_nsec isn't larger then NSEC_PER_SEC
     */
    if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
        xtime.tv_nsec -= NSEC_PER_SEC;
        xtime.tv_sec++;
        second_overflow();
    }

    /* check to see if there is a new clocksource to use */
    update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
                timekeeper.mult);
}

/*
 * calc_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 */
void calc_global_load(unsigned long ticks)
{   
    long active;

    calc_global_nohz(ticks);

    if (time_before(jiffies, calc_load_update + 10))
        return;

    active = atomic_long_read(&calc_load_tasks);
    active = active > 0 ? active * FIXED_1 : 0;

    avenrun[0] = calc_load(avenrun[0], EXP_1, active);
    avenrun[1] = calc_load(avenrun[1], EXP_5, active);
    avenrun[2] = calc_load(avenrun[2], EXP_15, active);

    calc_load_update += LOAD_FREQ;
}
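
calc_load() maintains a fixed-point exponential moving average: each avenrun[] slot is
decayed by a precomputed per-interval factor (EXP_1, EXP_5, EXP_15 for the 1/5/15-minute
averages) and the current number of active tasks is blended in. A sketch of the idea,
assuming the kernel's FSHIFT = 11 fixed-point format:

#define FSHIFT   11                  /* number of fractional bits */
#define FIXED_1  (1 << FSHIFT)       /* 1.0 in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                   unsigned long active)
{
    load *= exp;                        /* decay the old average */
    load += active * (FIXED_1 - exp);   /* blend in the new sample */
    return load >> FSHIFT;              /* back to the fixed-point load value */
}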

---------  Managing Process Times
update_process_times() is used to manage process-specific time elements
and is invoked from the local tick.

struct task_struct {
   ...
  cputime_t utime, stime, utimescaled, stimescaled;
  ...
  }
/*
 *  update_process_times - Used to manage process-specific time elements
  * ------------------------------
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
    struct task_struct *p = current;
    int cpu = smp_processor_id();

    /* Note: this timer irq context must be accounted for as well. */
    /* Account a single tick of cpu time */
    account_process_tick(p, user_tick);
     /* Activate and expire low-resolution timers */
    run_local_timers();
    
    rcu_check_callbacks(cpu, user_tick);
    printk_tick();
#ifdef CONFIG_IRQ_WORK
    if (in_irq())
        irq_work_run();
#endif
    scheduler_tick();
     /* check for and fire expired POSIX CPU timers of the current task */
    run_posix_cpu_timers(p);
}


/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{   
    cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
    struct rq *rq = this_rq();
    
    if (sched_clock_irqtime) {
        irqtime_account_process_tick(p, user_tick, rq);
        return;
    }
    
    if (steal_account_process_tick())
        return;
    
    if (user_tick)
        account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
    else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
        account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
                    one_jiffy_scaled);
    else
        account_idle_time(cputime_one_jiffy);
}

========= Switching to High-Resolution Timers

All systems initially work in low-resolution mode and without dynamic ticks;
they switch to a different combination only later, once the required hardware is initialized.

While low-resolution timers are active, the pending high-resolution timers are processed
from the timer softirq by hrtimer_run_pending(). Before the queues are run, the function
checks whether a clock event device suitable for high-resolution timers is present in the
system. If so, the switch to high-resolution mode is performed.

Function tick_handle_periodic() is called on the next event of the tick device.
The following is the call tree:

tick_handle_periodic
    tick_periodic
        do_timer
        update_process_times   ++++
        profile_tick
        
update_process_times
    run_local_timers
        hrtimer_run_queues
        raise_softirq // raise the timer-management softirq, handled by run_timer_softirq  +++
     
run_timer_softirq
    hrtimer_run_pending   ++
    __run_timers  // run the handlers of all expired timers

 hrtimer_run_pending
    hrtimer_switch_to_hres  // Switch to high resolution mode  +
    
hrtimer_switch_to_hres
    tick_init_highres
            tick_switch_to_oneshot(hrtimer_interrupt)
      /* Set resolution in clock base */
     tick_setup_sched_timer
     retrigger_next_event

/*
 * Called from timer softirq every jiffy, expire hrtimers:
 *
 * For HRT its the fall back code to run the softirq in the timer
 * softirq context in case the hrtimer initialization failed or has
 * not been done yet.
 */
void hrtimer_run_pending(void)
{
    if (hrtimer_hres_active())
        return;

    /*   
     * This _is_ ugly: We have to check in the softirq context,
     * whether we can switch to highres and / or nohz mode. The
     * clocksource switch happens in the timer interrupt with
     * xtime_lock held. Notification from there only sets the
     * check bit in the tick_oneshot code, otherwise we might
     * deadlock vs. xtime_lock.
     */
    if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
        hrtimer_switch_to_hres();
}


/*
 * Switch to high resolution mode
 */
static int hrtimer_switch_to_hres(void)
{
    int i, cpu = smp_processor_id();
    struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
    unsigned long flags;
    
    /* HRT is already active */
    if (base->hres_active)
        return 1;

    local_irq_save(flags);

    /* switch to high resolution mode, hrtimer_interrupt() is installed as event handler */
    if (tick_init_highres()) {
        local_irq_restore(flags);
        printk(KERN_WARNING "Could not switch to high resolution "
                    "mode on CPU %d\n", cpu);
        return 0;
    }
    base->hres_active = 1;
    /* Set resolution in clock base */
    for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
        base->clock_base[i].resolution = KTIME_HIGH_RES;
     /* setup the tick emulation timer */
    tick_setup_sched_timer();

    /* "Retrigger" the interrupt to get things going     */
    retrigger_next_event(NULL);
      /* High-resolution support is now active */
    
    
    local_irq_restore(flags);
    return 1;
}

/**     
 * tick_init_highres - switch to high resolution mode
 *  
 * Called with interrupts disabled.
 */
int tick_init_highres(void)
{   
    return tick_switch_to_oneshot(hrtimer_interrupt);
}

/**
 * tick_switch_to_oneshot - switch to oneshot mode
 */
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
    struct tick_device *td = &__get_cpu_var(tick_cpu_device);
    struct clock_event_device *dev = td->evtdev;

    if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
            !tick_device_is_functional(dev)) {

        printk(KERN_INFO "Clockevents: "
               "could not switch to one-shot mode:");
        if (!dev) {
            printk(" no tick device\n");
        } else {
            if (!tick_device_is_functional(dev))
                printk(" %s is not functional.\n", dev->name);
            else
                printk(" %s does not support one-shot mode.\n",
                       dev->name);
        }
        return -EINVAL;
    }

    td->mode = TICKDEV_MODE_ONESHOT;
    dev->event_handler = handler;
     /* set the operating mode of a clock event device to oneshot mode */
    clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
    /*  Select oneshot operating mode for the broadcast device */
    tick_broadcast_switch_to_oneshot();
    return 0;
}

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
    struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
    ktime_t now = ktime_get();

    /*
     * Emulate tick processing via per-CPU hrtimers:
     * ----------------------------
     * Initialize the  hrtimer @ts->sched_timer which used to schedule the
     * periodic tick  to CLOCK_MONOTONIC clock.
     */
    hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
     /* Set the hrtimer's callback function to tick_sched_timer(), which is used to
     * update jiffies_64, increment the wall time, update the avenrun load,
     * reset the software watchdog, manage process-specific time elements and rearm
     * the timer
     */
    ts->sched_timer.function = tick_sched_timer;
    
    /* Get the next period (per cpu) */
    hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
    
    for (;;) {
        hrtimer_forward(&ts->sched_timer, now, tick_period);
        hrtimer_start_expires(&ts->sched_timer,
                      HRTIMER_MODE_ABS_PINNED);
        /* Check, if the timer was already in the past */
        /* Check, if the timer is active */
        if (hrtimer_active(&ts->sched_timer))
            break;
        now = ktime_get();
    }

#ifdef CONFIG_NO_HZ
    if (tick_nohz_enabled)
        ts->nohz_mode = NOHZ_MODE_HIGHRES;
#endif
}

/**
 * hrtimer_init - initialize a timer to the given clock
 * @timer:  the timer to be initialized
 * @clock_id:   the clock to be used
 * @mode:   timer mode abs/rel
 */
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
          enum hrtimer_mode mode)
{       
    debug_init(timer, clock_id, mode);
    __hrtimer_init(timer, clock_id, mode);
}       
EXPORT_SYMBOL_GPL(hrtimer_init);

/*
 * Retrigger next event is called after clock was set
 *      
 * Called with interrupts disabled via on_each_cpu()
 */
static void retrigger_next_event(void *arg)
{   
    struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
    struct timespec realtime_offset, xtim, wtm, sleep;
                    
    if (!hrtimer_hres_active())
        return;
    
    /* Optimized out for !HIGH_RES */
    /* get xtime, wall_to_monotonic,   and sleep offsets. */
    get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
    set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
    
    /* Adjust CLOCK_REALTIME offset */
    raw_spin_lock(&base->lock);
    base->clock_base[HRTIMER_BASE_REALTIME].offset =
        timespec_to_ktime(realtime_offset);
    base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
        timespec_to_ktime(sleep);

    hrtimer_force_reprogram(base, 0);
    raw_spin_unlock(&base->lock);
}
====15.5.1&2=======  Dynamic Ticks for Low-Resolution Systems
/**
 * struct tick_sched - sched tick emulation and no idle tick control/stats
 * @sched_timer:    hrtimer to schedule the periodic tick in high
 *          resolution mode
 * @idle_tick:      Store the last idle tick expiry time when the tick
 *          timer is modified for idle sleeps. This is necessary
 *          to resume the tick timer operation in the timeline
 *          when the CPU returns from idle
 * @tick_stopped:   Indicator that the idle tick has been stopped
 * @idle_jiffies:   jiffies at the entry to idle for idle time accounting
 * @idle_calls:     Total number of idle calls
 * @idle_sleeps:    Number of idle calls, where the sched tick was stopped
 * @idle_entrytime: Time when the idle call was entered
 * @idle_waketime:  Time when the idle was interrupted
 * @idle_exittime:  Time when the idle state was left
 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
 * @iowait_sleeptime:   Sum of the time slept in idle with sched tick stopped, with IO outstanding
 * @sleep_length:   Duration of the current idle sleep
 * @do_timer_lst:   CPU was the last one doing do_timer before going idle
 */
struct tick_sched {
    struct hrtimer          sched_timer;
    unsigned long           check_clocks;
    enum tick_nohz_mode     nohz_mode;
    ktime_t             idle_tick;
    int             inidle;
    int             tick_stopped;
    unsigned long           idle_jiffies;
    unsigned long           idle_calls;
    unsigned long           idle_sleeps;
    int             idle_active;
    ktime_t             idle_entrytime;
    ktime_t             idle_waketime;
    ktime_t             idle_exittime;
    ktime_t             idle_sleeptime;
    ktime_t             iowait_sleeptime;
    ktime_t             sleep_length;
    unsigned long           last_jiffies;
    unsigned long           next_jiffies;//Stores the jiffy value at which the next timer will expire.
    ktime_t             idle_expires;// Stores when the next classical timer is due to expire.
                  // In contrast to the value above, the resolution of the value is as good as
                  // possible and not in jiffies.
                  
    int             do_timer_last;
};

   --- Dynamic ticks implementation for the low-resolution clock
tick_handle_periodic
    tick_periodic
        do_timer
        update_process_times
        profile_tick
        
update_process_times
    run_local_timers
        hrtimer_run_queues
        raise_softirq // raise the timer-management softirq, handled by run_timer_softirq
     
run_timer_softirq
    hrtimer_run_pending
    __run_timers  // run the handlers of all expired timers
    
 hrtimer_run_pending
           tick_check_oneshot_change
                    tick_nohz_switch_to_nohz //   switch to nohz mode
                             
tick_nohz_switch_to_nohz
        tick_switch_to_oneshot(tick_nohz_handler)
                    
                    
 /*
 * Called from timer softirq every jiffy, expire hrtimers:
 *
 * For HRT its the fall back code to run the softirq in the timer
 * softirq context in case the hrtimer initialization failed or has
 * not been done yet.
 */
void hrtimer_run_pending(void)
{
    if (hrtimer_hres_active())
        return;

    /*
     * This _is_ ugly: We have to check in the softirq context,
     * whether we can switch to highres and / or nohz mode. The
     * clocksource switch happens in the timer interrupt with
     * xtime_lock held. Notification from there only sets the
     * check bit in the tick_oneshot code, otherwise we might
     * deadlock vs. xtime_lock.
     */
    if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
        hrtimer_switch_to_hres();
}   
   
/**
 * Check, if a change happened, which makes oneshot possible.
 *
 * Called cyclic from the hrtimer softirq (driven by the timer
 * softirq) allow_nohz signals, that we can switch into low-res nohz
 * mode, because high resolution timers are disabled (either compile
 * or runtime).
 */
int tick_check_oneshot_change(int allow_nohz)
{
    struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

    if (!test_and_clear_bit(0, &ts->check_clocks))
        return 0;

    if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
        return 0;

    if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
        return 0;

    if (!allow_nohz)
        return 1;
     /* switch to nohz mode */
    tick_nohz_switch_to_nohz();
    return 0;
}

/**
 * tick_nohz_switch_to_nohz - switch to nohz mode
 */
static void tick_nohz_switch_to_nohz(void)
{
    struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
    ktime_t next;

    if (!tick_nohz_enabled)
        return;

    local_irq_disable();
    /*
    *  Set the tick device's clock_event_device's mode to oneshot,
    *  and change clock_event_device's event_handler to 'tick_nohz_handler'
    */
    if (tick_switch_to_oneshot(tick_nohz_handler)) {
        local_irq_enable();
        return;
    }   

    ts->nohz_mode = NOHZ_MODE_LOWRES;
    
    /*
     * Recycle the hrtimer in ts, so we can share the
     * hrtimer_forward with the highres code.
     */
    hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
    /* Get the next period */
    next = tick_init_jiffy_update();

    for (;;) {
        hrtimer_set_expires(&ts->sched_timer, next);
        if (!tick_program_event(next, 0))
            break;
        next = ktime_add(next, tick_period);
    }
    local_irq_enable();
}

/**
 * tick_switch_to_oneshot - switch to oneshot mode
 */
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
   /*
   *  tick_cpu_device is a per-CPU variable containing one instance of
   *  struct tick_device for each CPU in the system
   */
    struct tick_device *td = &__get_cpu_var(tick_cpu_device);
    struct clock_event_device *dev = td->evtdev;
    
    if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
            !tick_device_is_functional(dev)) {
    
        printk(KERN_INFO "Clockevents: "
               "could not switch to one-shot mode:");
        if (!dev) {
            printk(" no tick device\n");
        } else {
            if (!tick_device_is_functional(dev))
                printk(" %s is not functional.\n", dev->name);
            else
                printk(" %s does not support one-shot mode.\n",
                       dev->name);
        }
        return -EINVAL;
    }

    td->mode = TICKDEV_MODE_ONESHOT;
    dev->event_handler = handler;
     /* change the mode to oneshot */
    clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
     /* Select oneshot operating mode for the broadcast device */
    tick_broadcast_switch_to_oneshot();
    return 0;
}
----------- The Dynamic Tick Handler  'tick_nohz_handler'

It needs to assume two responsibilities:
    1. Perform all actions required for the tick mechanism
    2. Reprogram the tick device such that the next tick expires at the right time

The role of the global tick device is assumed by one particular CPU, and
the handler needs to check if the current CPU is the responsible one. However,
the situation is a bit more complicated with dynamic ticks. If a CPU goes into a long
sleep, then it cannot be responsible for the global tick anymore,
and drops the duty. If this is the case, the next CPU whose tick handler is called
must assume the duty.

/*
 * The nohz low res interrupt handler
 */
static void tick_nohz_handler(struct clock_event_device *dev)
{
    struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
    struct pt_regs *regs = get_irq_regs();
    int cpu = smp_processor_id();
    ktime_t now = ktime_get();

    dev->next_event.tv64 = KTIME_MAX;

    /*
     * Check if the do_timer duty was dropped. We don't care about
     * concurrency: This happens only when the cpu in charge went
     * into a long sleep. If two cpus happen to assign themself to
     * this duty, then the jiffies update is still serialized by
     * xtime_lock.
     */
    if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
        tick_do_timer_cpu = cpu;

    /* Check, if the jiffies need an update */
    if (tick_do_timer_cpu == cpu)
        tick_do_update_jiffies64(now);

    /*
     * When we are idle and the tick is stopped, we have to touch
     * the watchdog as we might not schedule for a really long
     * time. This happens on complete idle SMP systems while
     * waiting on the login prompt. We also increment the "start
     * of idle" jiffy stamp so the idle accounting adjustment we
     * do when we go busy again does not account too much ticks.
     */
    if (ts->tick_stopped) {
        touch_softlockup_watchdog();
        ts->idle_jiffies++;
    }

    update_process_times(user_mode(regs));
    profile_tick(CPU_PROFILING);

    while (tick_nohz_reprogram(ts, now)) {
        now = ktime_get();
        tick_do_update_jiffies64(now);
    }
}

/*
 * tick_do_update_jiffies64 - update jiffies_64,  increment the wall time and
 *                                           update the avenrun load
 * -----------------
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
    unsigned long ticks = 0;
    ktime_t delta;

    /*
     * Do a quick check without holding xtime_lock:
     */
     /*
     * Compute the difference between the current time and
     * last_jiffies_update (the time of the last jiffies update)
     */
    delta = ktime_sub(now, last_jiffies_update);
    /* jiffies update is NOT needed */
    if (delta.tv64 < tick_period.tv64)
        return;

    /* Reevalute with xtime_lock held */
    write_seqlock(&xtime_lock);

    delta = ktime_sub(now, last_jiffies_update);
    /* jiffies update is needed */
    if (delta.tv64 >= tick_period.tv64) {

        delta = ktime_sub(delta, tick_period);
         /* Remember the last updating time of jiffies64 */
        last_jiffies_update = ktime_add(last_jiffies_update,
                        tick_period);

        /* Slow path for long timeouts */
        /* This will happen when we missed more than one tick */
        if (unlikely(delta.tv64 >= tick_period.tv64)) {
            s64 incr = ktime_to_ns(tick_period);
               /* (ticks + 1) is  number of ticks  we missed */
            ticks = ktime_divns(delta, incr);
              /* Remember the last updating time of jiffies64 */
            last_jiffies_update = ktime_add_ns(last_jiffies_update,
                               incr * ticks);
        }
        /* update jiffies_64,  increment the wall time and update the avenrun load */
        do_timer(++ticks);

        /* Keep the tick_next_period variable up to date */
        tick_next_period = ktime_add(last_jiffies_update, tick_period);
    }
    write_sequnlock(&xtime_lock);
}

=========15.5.3======== Dynamic Ticks for High-Resolution Systems

Since clock event devices run in one-shot mode anyway if the kernel uses high timer
resolution, support for dynamic ticks is much easier than in the low-resolution case.

 /*
 * tick_sched_timer - update jiffies_64, increment the wall time and
 *          update the avenrun load, reset the software watchdog, manage
 *          process-specific time elements and rearm the @timer
 * --------------------------------
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled and timer->base->cpu_base->lock held.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
 ...
#ifdef CONFIG_NO_HZ
    /*
     * Check if the do_timer duty was dropped. We don't care about
     * concurrency: This happens only when the cpu in charge went
     * into a long sleep. If two cpus happen to assign themself to
     * this duty, then the jiffies update is still serialized by
     * xtime_lock.
     */
    if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
        tick_do_timer_cpu = cpu;
#endif
 ...
}

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
...
#ifdef CONFIG_NO_HZ
    if (tick_nohz_enabled)
        ts->nohz_mode = NOHZ_MODE_HIGHRES;
#endif
}

=====15.5.4 Stopping and Starting Periodic Ticks

A natural possibility to stop ticks is when the idle task is scheduled:
this proves that a processor really does not have anything better to do.
tick_nohz_stop_sched_tick() is provided by the dynamic tick framework to
stop ticks. Note that the same function is used independently of low and high
resolution.

The idle task is implemented in an architecture-specific way, and not
all architectures have been updated to support disabling the periodic tick
yet. Architectures differ in some details, but the general principle is the
same.

void cpu_idle(void)
{
   ...
    /* endless idle loop with no priority at all */
    while (1) {
      tick_nohz_stop_sched_tick(1);
        while (!need_resched()) {
            ...
                if (cpuidle_idle_call())
                    pm_idle();
             ...
        }
      ...
        tick_nohz_restart_sched_tick();
       ...
    }
}



/**
 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 * Called either from the idle loop or from irq_exit() when an idle period was
 * just interrupted by an interrupt which did not cause a reschedule.
 */
void tick_nohz_stop_sched_tick(int inidle)
{
    unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
    struct tick_sched *ts;
    ktime_t last_update, expires, now;
    struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
    u64 time_delta;
    int cpu;

    local_irq_save(flags);

    cpu = smp_processor_id();
    ts = &per_cpu(tick_cpu_sched, cpu);

    /*
     * Call to tick_nohz_start_idle stops the last_update_time from being
     * updated. Thus, it must not be called in the event we are called from
     * irq_exit() with the prior state different than idle.
     */
    if (!inidle && !ts->inidle)
        goto end;
    /*
     * Set ts->inidle unconditionally. Even if the system did not
     * switch to NOHZ mode the cpu frequency governers rely on the
     * update of the idle time accounting in tick_nohz_start_idle().
     */
    ts->inidle = 1;

    now = tick_nohz_start_idle(cpu, ts);

    /*
     * If this cpu is offline and it is the one which updates
     * jiffies, then give up the assignment and let it be taken by
     * the cpu which runs the tick timer next. If we don't drop
     * this here the jiffies might be stale and do_timer() never
     * invoked.
     */
    if (unlikely(!cpu_online(cpu))) {
        if (cpu == tick_do_timer_cpu)
            tick_do_timer_cpu = TICK_DO_TIMER_NONE;
    }

    if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
        goto end;

    if (need_resched())
        goto end;
    if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
        static int ratelimit;

        if (ratelimit < 10) {
            printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
                   (unsigned int) local_softirq_pending());
            ratelimit++;
        }
        goto end;
    }

    ts->idle_calls++;
    /* Read jiffies and the time when jiffies were updated last */
    do {
        seq = read_seqbegin(&xtime_lock);
        last_update = last_jiffies_update;
        last_jiffies = jiffies;
        time_delta = timekeeping_max_deferment();
    } while (read_seqretry(&xtime_lock, seq));

    if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
        arch_needs_cpu(cpu)) {
        next_jiffies = last_jiffies + 1;
        delta_jiffies = 1;
   } else {
        /* Get the next timer wheel timer */
        next_jiffies = get_next_timer_interrupt(last_jiffies);
        delta_jiffies = next_jiffies - last_jiffies;
    }
    /*
     * Do not stop the tick, if we are only one off
     * or if the cpu is required for rcu
     */
    if (!ts->tick_stopped && delta_jiffies == 1)
        goto out;

    /* Schedule the tick, if we are at least one jiffie off */
    if ((long)delta_jiffies >= 1) {

        /*
         * If this cpu is the one which updates jiffies, then
         * give up the assignment and let it be taken by the
         * cpu which runs the tick timer next, which might be
         * this cpu as well. If we don't drop this here the
         * jiffies might be stale and do_timer() never
         * invoked. Keep track of the fact that it was the one
         * which had the do_timer() duty last. If this cpu is
         * the one which had the do_timer() duty last, we
         * limit the sleep time to the timekeeping
          * max_deferment value which we retrieved
         * above. Otherwise we can sleep as long as we want.
         */
        if (cpu == tick_do_timer_cpu) {
            tick_do_timer_cpu = TICK_DO_TIMER_NONE;
            ts->do_timer_last = 1;
        } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
            time_delta = KTIME_MAX;
            ts->do_timer_last = 0;
        } else if (!ts->do_timer_last) {
            time_delta = KTIME_MAX;
        }

        /*
         * calculate the expiry time for the next timer wheel
         * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
         * that there is no timer pending or at least extremely
         * far into the future (12 days for HZ=1000). In this
         * case we set the expiry to the end of time.
         */
        if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
            /*
             * Calculate the time delta for the next timer event.
             * If the time delta exceeds the maximum time delta
             * permitted by the current clocksource then adjust
             * the time delta accordingly to ensure the
             * clocksource does not wrap.
             */
            time_delta = min_t(u64, time_delta,
                       tick_period.tv64 * delta_jiffies);
        }

        if (time_delta < KTIME_MAX)
            expires = ktime_add_ns(last_update, time_delta);
        else
            expires.tv64 = KTIME_MAX;

        /* Skip reprogram of event if its not changed */
        if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
            goto out;

        /*
         * nohz_stop_sched_tick can be called several times before
         * the nohz_restart_sched_tick is called. This happens when
         * interrupts arrive which do not cause a reschedule. In the
         * first call we save the current tick time, so we can restart
         * the scheduler tick in nohz_restart_sched_tick.
         */
        if (!ts->tick_stopped) {
            select_nohz_load_balancer(1);

            ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
            ts->tick_stopped = 1;
            ts->idle_jiffies = last_jiffies;
            rcu_enter_nohz();
        }
        ts->idle_sleeps++;

        /* Mark expires */
        ts->idle_expires = expires;

        /*
         * If the expiration time == KTIME_MAX, then
         * in this case we simply stop the tick timer.
         */
        if (unlikely(expires.tv64 == KTIME_MAX)) {
            if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                hrtimer_cancel(&ts->sched_timer);
            goto out;
        }

        if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
            hrtimer_start(&ts->sched_timer, expires,
                      HRTIMER_MODE_ABS_PINNED);
            /* Check, if the timer was already in the past */
            if (hrtimer_active(&ts->sched_timer))
                goto out;
        } else if (!tick_program_event(expires, 0))
                goto out;
        /*
         * We are past the event already. So we crossed a
         * jiffie boundary. Update jiffies and raise the
         * softirq.
         */
        tick_do_update_jiffies64(ktime_get());
    }
    raise_softirq_irqoff(TIMER_SOFTIRQ);
out:
    ts->next_jiffies = next_jiffies;
    ts->last_jiffies = last_jiffies;
    ts->sleep_length = ktime_sub(dev->next_event, now);
end:
    local_irq_restore(flags);
}
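
To make the expiry arithmetic above concrete: the sleep length is "until the
next timer wheel event, but never longer than the clocksource allows the
timekeeping code to defer". A minimal standalone sketch of that clamp, using
made-up values for HZ, the jiffies delta and the deferment limit (the real
values come from get_next_timer_interrupt() and timekeeping_max_deferment()):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
    /* Hypothetical example values; the kernel reads these at run time. */
    uint64_t hz = 1000;                        /* CONFIG_HZ */
    uint64_t tick_period = NSEC_PER_SEC / hz;  /* 1 ms per jiffy */
    uint64_t delta_jiffies = 250;              /* next timer wheel timer is 250 jiffies away */
    uint64_t max_deferment = 200000000ULL;     /* clocksource can defer at most 200 ms */

    /* Same clamp as in tick_nohz_stop_sched_tick(): sleep until the next
     * timer wheel event, but never longer than the clocksource allows. */
    uint64_t time_delta = delta_jiffies * tick_period;
    if (time_delta > max_deferment)
        time_delta = max_deferment;

    printf("next timer wheel event in %llu ns, clamped sleep %llu ns\n",
           (unsigned long long)(delta_jiffies * tick_period),
           (unsigned long long)time_delta);
    return 0;
}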

----------------
tick_nohz_restart_sched_tick
        tick_do_update_jiffies64
         /* Account idle time */
         /* Set tick_sched->tick_stopped = 0 */
         /* program the next tick event */
         
/**
 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
 *
 * Restart the idle tick when the CPU is woken up from idle
 */
void tick_nohz_restart_sched_tick(void)
{
    int cpu = smp_processor_id();
    struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
    unsigned long ticks;
#endif
    ktime_t now;

    local_irq_disable();
    if (ts->idle_active || (ts->inidle && ts->tick_stopped))
        now = ktime_get();

    if (ts->idle_active)
        tick_nohz_stop_idle(cpu, now);

    if (!ts->inidle || !ts->tick_stopped) {
        ts->inidle = 0;
        local_irq_enable();
        return;
    }

    ts->inidle = 0;

    rcu_exit_nohz();

    /* Update jiffies first */
    select_nohz_load_balancer(0);
    tick_do_update_jiffies64(now);

#ifndef CONFIG_VIRT_CPU_ACCOUNTING
    /*
     * We stopped the tick in idle. Update process times would miss the
     * time we slept as update_process_times does only a 1 tick
     * accounting. Enforce that this is accounted to idle !
     */
    ticks = jiffies - ts->idle_jiffies;
    /*
     * We might be one off. Do not randomly account a huge number of ticks!
     */
    if (ticks && ticks < LONG_MAX)
        account_idle_ticks(ticks);
#endif

    touch_softlockup_watchdog();
    /*
     * Cancel the scheduled timer and restore the tick
     */
    ts->tick_stopped  = 0;
    ts->idle_exittime = now;

    tick_nohz_restart(ts, now);

    local_irq_enable();
}

=========== 15.6 Broadcast Mode
On some architectures, clock event devices will go to sleep when certain
power-saving modes are active. Thankfully, systems do not have only a single
clock event device, so another device that still works can stand in for the
stopped ones. The global variable tick_broadcast_device contains the
tick_device instance for the broadcast device.

In such a power-saving state the per-CPU (local APIC) devices are not
functional, but the broadcast event device still is.
tick_handle_periodic_broadcast is used as its event handler. It deals with
both periodic and one-shot modes of the broadcast device, so this need not
concern us any further. The handler is activated once per tick_period.

The broadcast handler uses tick_do_periodic_broadcast. The function invokes
the event_handler method of the nonfunctional device on the current CPU.
The handler cannot distinguish whether it was invoked from a real clock
interrupt or from the broadcast device, and is thus executed as if the
underlying event device were functional.
 
 Calling tree:
 
 tick_do_periodic_broadcast
        /* Determine affected CPUs */
      tick_do_broadcast
                /* Remove current CPU from the mask */
                /* Call event_handler for the current CPU  */
        /* More CPUs in broadcast mask ? */
                 /* Call broadcast method */
 
 /*
 * Periodic broadcast:
 * - invoke the broadcast handlers
 */
static void tick_do_periodic_broadcast(void)
{
    raw_spin_lock(&tick_broadcast_lock);

    cpumask_and(to_cpumask(tmpmask),
            cpu_online_mask, tick_get_broadcast_mask());
    tick_do_broadcast(to_cpumask(tmpmask));
    
    raw_spin_unlock(&tick_broadcast_lock);
}

/*
 * Broadcast the event to the cpus, which are set in the mask (mangled).
 */
static void tick_do_broadcast(struct cpumask *mask)
{
    int cpu = smp_processor_id();
    struct tick_device *td;

    /*
     * Check, if the current cpu is in the mask
     */
    if (cpumask_test_cpu(cpu, mask)) {
          /* Remove cpu from the mask */
        cpumask_clear_cpu(cpu, mask);
        td = &per_cpu(tick_cpu_device, cpu);
          /* Call event_handler for current CPU */
        td->evtdev->event_handler(td->evtdev);
    }
    /* More CPUs in broadcast mask ? */
    if (!cpumask_empty(mask)) {
        /*
         * It might be necessary to actually check whether the devices
         * have different broadcast functions. For now, just use the
         * one of the first device. This works as long as we have this
         * misfeature only on x86 (lapic)
         */
        td = &per_cpu(tick_cpu_device, cpumask_first(mask));
         /* Call broadcast method */
        td->evtdev->broadcast(mask);
    }
}
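
The dispatch logic of tick_do_broadcast() can be pictured with a small
standalone sketch. The bitmask, device array and broadcast_ipi() below are
simplified stand-ins for cpumask, tick_cpu_device and evtdev->broadcast(),
not the kernel types:

#include <stdio.h>

#define NR_CPUS 4

struct fake_tick_device {
    void (*event_handler)(int cpu);
};

static void local_tick_handler(int cpu)
{
    printf("CPU %d: local event_handler runs\n", cpu);
}

/* Stand-in for td->evtdev->broadcast(mask): kick the remaining CPUs. */
static void broadcast_ipi(unsigned int mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1u << cpu))
            printf("  broadcast IPI sent to CPU %d\n", cpu);
}

static void do_broadcast(unsigned int mask, int current_cpu,
                         struct fake_tick_device *devices)
{
    /* If the current CPU is in the mask, call its handler directly ... */
    if (mask & (1u << current_cpu)) {
        mask &= ~(1u << current_cpu);
        devices[current_cpu].event_handler(current_cpu);
    }
    /* ... and broadcast to whichever CPUs are left. */
    if (mask)
        broadcast_ipi(mask);
}

int main(void)
{
    struct fake_tick_device devices[NR_CPUS];

    for (int i = 0; i < NR_CPUS; i++)
        devices[i].event_handler = local_tick_handler;

    /* CPUs 0, 2 and 3 have their local clock event device asleep. */
    do_broadcast(0x0D, 0, devices);
    return 0;
}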


============= hpet
/* Notification for clock events */
static RAW_NOTIFIER_HEAD(clockevents_chain);

1. //kernel/time/tick-common.c
tick_init() is simple: it calls clockevents_register_notifier() to register the
element tick_notifier on the clockevents_chain notifier chain. The callback of
this element specifies what should be done when the clock event device
configuration changes (for example, when a new clock event device is added);
that callback is tick_notify (see the simplified notifier-chain sketch after
tick_notify below).

static struct notifier_block tick_notifier = {
    .notifier_call = tick_notify,
};
void __init tick_init(void)
{
    clockevents_register_notifier(&tick_notifier); /* 1. register the tick callback on the clockevents_chain notifier chain */
}
    
    
/*
 * Notification about clock event devices
 */
static int tick_notify(struct notifier_block *nb, unsigned long reason,
                   void *dev)
{
    switch (reason) {

    case CLOCK_EVT_NOTIFY_ADD:
        return tick_check_new_device(dev); /* interrupt-related: decides whether the newly added device should be used */
...
}
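
The notifier-chain mechanism behind tick_init()/tick_notify can be pictured
with a minimal, single-threaded sketch. notifier_register() and
notifier_call_chain() below are simplified stand-ins for the kernel's
raw_notifier helpers, not the real implementation:

#include <stdio.h>

struct notifier_block {
    int (*notifier_call)(struct notifier_block *nb, unsigned long reason, void *dev);
    struct notifier_block *next;
};

static struct notifier_block *clockevents_chain;    /* head of the chain */

static void notifier_register(struct notifier_block *nb)
{
    nb->next = clockevents_chain;
    clockevents_chain = nb;
}

static void notifier_call_chain(unsigned long reason, void *dev)
{
    for (struct notifier_block *nb = clockevents_chain; nb; nb = nb->next)
        nb->notifier_call(nb, reason, dev);
}

#define CLOCK_EVT_NOTIFY_ADD 0

static int tick_notify(struct notifier_block *nb, unsigned long reason, void *dev)
{
    if (reason == CLOCK_EVT_NOTIFY_ADD)
        printf("tick_notify: new clock event device \"%s\" added\n", (char *)dev);
    return 0;
}

static struct notifier_block tick_notifier = { .notifier_call = tick_notify };

int main(void)
{
    notifier_register(&tick_notifier);                  /* what tick_init() does */
    notifier_call_chain(CLOCK_EVT_NOTIFY_ADD, "hpet");  /* what device registration triggers */
    return 0;
}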

/*
 * Check, if the new registered device should be used.
 */
static int tick_check_new_device(struct clock_event_device *newdev)
{
    ...
}

2.
    * Initialize the software timer data structures on the current CPU (see section 3.2)
    * Register the element timers_nb on the cpu_chain notifier chain; its callback
      initializes the software timer data structures on the designated CPU
    * Register the timer softirq handler
//kernel/timer.c
void __init init_timers(void)
{   
    int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
                (void *)(long)smp_processor_id()); /* initialize the software timer data
                                                    * structures on this CPU */

    init_timer_stats();

    BUG_ON(err != NOTIFY_OK);
    register_cpu_notifier(&timers_nb); /* register timers_nb on the cpu_chain notifier chain;
                                        * its callback initializes the software timer data
                                        * structures on the designated CPU */
    open_softirq(TIMER_SOFTIRQ, run_timer_softirq); /* register the timer softirq handler */
}   

3.
/**
 * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
 */
int __init hpet_enable(void) // arch/x86/kernel/hpet.c
{
    ...
    if (hpet_clocksource_register())    /* initialize the clocksource (a struct clocksource)
                                         * and add it to the clocksource list, clocksource_list */
        goto out_nohpet;

    if (id & HPET_ID_LEGSUP) {
        hpet_legacy_clockevent_register();
        /* Initializes the clock event device (a struct clock_event_device) and notifies
         * the clockevents_chain notifier chain that a clock event device is being added
         * to the system. Once the notification callbacks have run, the device is added
         * to the clock event device list, clockevent_devices. */
        return 1;
    }
    ...
}
 
 static int hpet_clocksource_register(void)
{
    u64 start, now;
    cycle_t t1;

    /* Start the counter */
    hpet_restart_counter();

    /* Verify whether hpet counter works */
    t1 = hpet_readl(HPET_COUNTER);
    rdtscll(start);

    /*
     * We don't know the TSC frequency yet, but waiting for
     * 200000 TSC cycles is safe:
     * 4 GHz == 50us
     * 1 GHz == 200us
     */
    do {
        rep_nop();
        rdtscll(now);
    } while ((now - start) < 200000UL);

    if (t1 == hpet_readl(HPET_COUNTER)) {
        printk(KERN_WARNING
               "HPET counter not counting. HPET disabled\n");
        return -ENODEV;
    }

    clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); /* register the clocksource together with its frequency in Hz */
    return 0;
}

/*
 * Common hpet info
 */
static unsigned long hpet_freq;

static struct clocksource clocksource_hpet = {
    .name       = "hpet",
    .rating     = 250,
    .read       = read_hpet,
    .mask       = HPET_MASK,
    .flags      = CLOCK_SOURCE_IS_CONTINUOUS,
    .resume     = hpet_resume_counter,
#ifdef CONFIG_X86_64
    .archdata   = { .vclock_mode = VCLOCK_HPET }, /* vDSO/vsyscall detail, ignored here */
#endif
};  
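
The timekeeping core never uses the raw HPET counter value directly; cycles
read from a clocksource are converted to nanoseconds with the clocksource's
mult/shift pair, roughly ns = (cycles * mult) >> shift. A standalone sketch
with hypothetical mult/shift values sized for a 14.318180 MHz HPET:

#include <stdio.h>
#include <stdint.h>

/* Generic clocksource conversion: ns = (cycles * mult) >> shift,
 * where mult / 2^shift approximates the nanoseconds per cycle. */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
    return (cycles * mult) >> shift;
}

int main(void)
{
    /* Hypothetical values for an HPET running at 14.318180 MHz:
     * one cycle is ~69.84 ns, so with shift = 24,
     * mult ~= 69.84 * 2^24 ~= 1171742576. */
    uint32_t shift = 24;
    uint32_t mult  = 1171742576u;

    uint64_t cycles = 143182;   /* roughly 10 ms worth of HPET cycles */
    printf("%llu cycles -> %llu ns\n",
           (unsigned long long)cycles,
           (unsigned long long)cyc2ns(cycles, mult, shift));
    return 0;
}

Because the multiplication is done in 64 bits, the conversion is only valid up
to a bounded number of cycles; that bound is essentially what
timekeeping_max_deferment() reports, and it is what limited the idle sleep
length in tick_nohz_stop_sched_tick() earlier.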

/*  
 * Clock source related code
 */
static cycle_t read_hpet(struct clocksource *cs)
{              
    return (cycle_t)hpet_readl(HPET_COUNTER);
}

static void hpet_resume_counter(struct clocksource *cs)
{   
    hpet_resume_device();
    hpet_restart_counter();
}   

--------------------   event
static void hpet_legacy_clockevent_register(void)
{
    /* Start HPET legacy interrupts */
    hpet_enable_legacy_int();

    /*
     * Start hpet with the boot cpu mask and make it
     * global after the IO_APIC has been initialized.
     */
    hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
    clockevents_config_and_register(&hpet_clockevent, hpet_freq,
                    HPET_MIN_PROG_DELTA, 0x7FFFFFFF); // Configure and register a clock event device
    global_clock_event = &hpet_clockevent; /* global_clock_event now points to the hpet clock event device (hpet_clockevent) */
    printk(KERN_DEBUG "hpet clockevent registered\n");
}
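
clockevents_config_and_register() is given the device frequency plus the
minimum and maximum programmable delta in device cycles (here
HPET_MIN_PROG_DELTA and 0x7FFFFFFF, the 32-bit comparator limit). Conceptually,
when the core later wants an event "delta nanoseconds from now" it converts
that into device cycles and clamps it into this range. A rough standalone
sketch of that conversion (not the kernel's mult/shift implementation, and
min_delta below is a made-up stand-in for HPET_MIN_PROG_DELTA):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Convert a nanosecond delta into device cycles and clamp it into the
 * programmable range of the clock event device. */
static uint64_t delta_ns_to_cycles(uint64_t delta_ns, uint64_t freq_hz,
                                   uint64_t min_delta, uint64_t max_delta)
{
    uint64_t cycles = delta_ns * freq_hz / NSEC_PER_SEC;

    if (cycles < min_delta)
        cycles = min_delta;
    if (cycles > max_delta)
        cycles = max_delta;
    return cycles;
}

int main(void)
{
    uint64_t hpet_freq = 14318180;   /* 14.318180 MHz, a typical legacy HPET */
    uint64_t min_delta = 128;        /* made-up stand-in for HPET_MIN_PROG_DELTA */
    uint64_t max_delta = 0x7FFFFFFF;

    /* An event 1 ms from now. */
    printf("1 ms   -> %llu cycles\n",
           (unsigned long long)delta_ns_to_cycles(1000000, hpet_freq,
                                                  min_delta, max_delta));
    /* A huge delta gets clamped to the 32-bit comparator range. */
    printf("10 min -> %llu cycles\n",
           (unsigned long long)delta_ns_to_cycles(600ULL * NSEC_PER_SEC, hpet_freq,
                                                  min_delta, max_delta));
    return 0;
}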

/*
 * The hpet clock event device
 */
static struct clock_event_device hpet_clockevent = {
    .name       = "hpet",
    .features   = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
    .set_mode   = hpet_legacy_set_mode,
    .set_next_event = hpet_legacy_next_event,
    .irq        = 0,
    .rating     = 50,
};

/* Set the operating mode (periodic, one-shot, shutdown, ...) of the legacy
 * HPET clock event device; the trailing 0 selects timer 0, the legacy timer. */
static void hpet_legacy_set_mode(enum clock_event_mode mode,
            struct clock_event_device *evt)
{
    hpet_set_mode(mode, evt, 0);
}

/* Program the next one-shot event on the legacy HPET clock event device
 * (timer 0); delta is given in device cycles. */
static int hpet_legacy_next_event(unsigned long delta,
            struct clock_event_device *evt)
{
    return hpet_next_event(delta, evt, 0);
}
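
In one-shot mode hpet_next_event() essentially reads the main counter, adds
delta and writes the result into the timer's comparator register; if the
counter has already raced past the comparator when it is re-read, -ETIME is
returned so the caller can retry with a larger delta. A simplified simulation
of that race check, with a fake software counter instead of real MMIO accesses:

#include <stdio.h>
#include <stdint.h>

#define ETIME 62

/* Fake free-running 32-bit counter standing in for the HPET main counter. */
static uint32_t fake_counter;

static uint32_t read_counter(void)
{
    /* Pretend a few cycles elapse on every read. */
    return fake_counter += 7;
}

static int program_next_event(uint32_t delta, uint32_t *comparator)
{
    uint32_t cnt = read_counter();

    cnt += delta;
    *comparator = cnt;      /* in the kernel: hpet_writel(cnt, HPET_Tn_CMP(timer)) */

    /* If the counter already passed the comparator, this event will never
     * fire; report -ETIME so the caller retries. */
    return (int32_t)(read_counter() - cnt) >= 0 ? -ETIME : 0;
}

int main(void)
{
    uint32_t cmp;

    if (program_next_event(3, &cmp) == -ETIME)      /* delta too small: we lose the race */
        printf("delta too small, event already in the past\n");

    if (program_next_event(100000, &cmp) == 0)      /* comfortably in the future */
        printf("event programmed at counter value %u\n", cmp);

    return 0;
}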


============ Interrupts
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
    /* Keep nmi watchdog up to date */
    inc_irq_stat(irq0_irqs);

    global_clock_event->event_handler(global_clock_event);//hpet_clockevent.event_handler()

    /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
    if (MCA_bus)
        outb_p(inb_p(0x61)| 0x80, 0x61);

    return IRQ_HANDLED;
}       
        
static struct irqaction irq0  = {
    .handler = timer_interrupt,
    .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
    .name = "timer"
};      
        
void __init setup_default_timer_irq(void)
{       
    setup_irq(0, &irq0);
}   
        
/* Default timer init function */
void __init hpet_time_init(void)
{           
    if (!hpet_enable())
        setup_pit_timer();
    setup_default_timer_irq(); /* register the timer interrupt (IRQ 0) */
}
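
The periodic/one-shot split advertised via CLOCK_EVT_FEAT_PERIODIC and
CLOCK_EVT_FEAT_ONESHOT has a convenient userspace analogue in timerfd, which
can help when experimenting: the same descriptor delivers either a repeating
tick or a single expiry, depending on whether it_interval is set. A small
Linux-only example (error handling kept minimal):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/timerfd.h>

int main(void)
{
    int fd = timerfd_create(CLOCK_MONOTONIC, 0);
    uint64_t expirations;

    if (fd < 0)
        return 1;

    /* Periodic: fires every 100 ms, like a classic tick. */
    struct itimerspec periodic = {
        .it_interval = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 },
        .it_value    = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 },
    };
    timerfd_settime(fd, 0, &periodic, NULL);

    for (int i = 0; i < 3; i++) {
        if (read(fd, &expirations, sizeof(expirations)) != sizeof(expirations))
            return 1;
        printf("periodic tick %d (%llu expirations)\n", i,
               (unsigned long long)expirations);
    }

    /* One-shot: it_interval is zero, so the timer fires exactly once. */
    struct itimerspec oneshot = {
        .it_value = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000 },
    };
    timerfd_settime(fd, 0, &oneshot, NULL);
    if (read(fd, &expirations, sizeof(expirations)) == sizeof(expirations))
        printf("one-shot event fired\n");

    close(fd);
    return 0;
}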